Example #1
 def handler(offset, batch):
     _values = [
         value(start + offset + i + 1, row)
         for i, row in enumerate(batch)
     ]
     DBSession.execute(model.__table__.insert(), _values)
     values.extend(_values)
Example #2
def update(args):
    count = 0
    assert args.json

    iid = int(DBSession.execute(
        "select max(cast(id as integer)) from identifier").fetchone()[0]) + 1
    pk = DBSession.execute(
        "select max(pk) from identifier").fetchone()[0] + 1

    langs = {}
    for gid, name in args.json['wikipedia'].items():
        if gid not in langs:
            langs[gid] = Languoid.get(gid)
        langs[gid].update_jsondata(wikipedia=name.split('/')[-1])

    for gid, codes in args.json['multitree'].items():
        l = langs[gid]
        lcodes = [i.name for i in l.identifiers if i.type == 'multitree']

        for code in set(codes):
            if code not in lcodes:
                identifier = DBSession.query(common.Identifier)\
                    .filter(common.Identifier.type == 'multitree')\
                    .filter(common.Identifier.name == code)\
                    .first()
                if not identifier:
                    identifier = common.Identifier(
                        pk=pk, id=str(iid), name=code, type='multitree')
                    iid += 1
                    pk += 1
                count += 1
                DBSession.add(
                    common.LanguageIdentifier(language=l, identifier=identifier))

    print count, 'new multitree identifiers'
Example #3
def insert(db, table, model, value, start=0, order=None):
    log.info('migrating %s ...' % table)
    sql = 'select * from %s' % table
    values = []
    if not order:
        log.info(sql)
        values = [
            value(start + i + 1, row) for i, row in enumerate(db.execute(sql))
        ]
        DBSession.execute(model.__table__.insert(), values)
    else:

        def handler(offset, batch):
            _values = [
                value(start + offset + i + 1, row)
                for i, row in enumerate(batch)
            ]
            DBSession.execute(model.__table__.insert(), _values)
            values.extend(_values)

        order = [order] if isinstance(order, basestring) else order
        select(db, '%s order by %s' % (sql, ', '.join(order)), handler)

    DBSession.execute('COMMIT')
    log.info('... done')
    return values
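
A minimal usage sketch for the insert() helper above, assuming a legacy source engine `db`, a target model `models2.Macroarea`, and a `slug` helper as used in the full migration example near the end of this list; the value factory receives a 1-based running counter and the legacy row and returns the dict handed to `model.__table__.insert()`.

# Hypothetical call, mirroring how the migration script further down drives insert():
rows = insert(
    db,                   # legacy SQLAlchemy engine/connection to read from
    'macroarea',          # legacy table name, interpolated into 'select * from ...'
    models2.Macroarea,    # target model whose __table__ receives the bulk insert
    lambda i, row: dict(pk=row['id'], id=slug(row['name']), name=row['name']),
    order='id',           # passing an order triggers the batched code path
)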
Example #4
 def handler(offset, batch):
     svalues = []
     rvalues = []
     for row in batch:
         jsondata = json.loads(row['jsondata'] or "{}")
         jsondata['bibtexkey'] = row['bibtexkey']
         dicts = {
             's':
             dict(pk=row['id'],
                  polymorphic_type='base',
                  id=str(row['id']),
                  name='%(author)s %(year)s' % row,
                  description=row['title'],
                  bibtex_type=getattr(EntryType, row['type']),
                  jsondata=jsondata),
             'r':
             dict(pk=row['id']),
         }
         for model, map_ in {
                 's': {
                     'author': None,
                     'yearstring': 'year',
                     'year': 'year_int',
                     'startpage': 'startpage_int',
                     'numberofpages': 'pages_int',
                     'pages': None,
                     'edition': None,
                     'school': None,
                     'address': None,
                     'url': None,
                     'note': None,
                     'number': None,
                     'series': None,
                     'editor': None,
                     'booktitle': None,
                     'journal': None,
                     'volume': None,
                     'publisher': None,
                 },
                 'r': {
                     'endpage': 'endpage_int',
                     'inlg': None,
                     'inlg_code': None,
                     'subject': None,
                     'subject_headings': None,
                     'keywords': None,
                     'normalizedauthorstring': None,
                     'normalizededitorstring': None,
                     'ozbib_id': None,
                 }
         }.items():
             for okey, nkey in map_.items():
                 dicts[model][nkey or okey] = row[okey]
         svalues.append(dicts['s'])
         rvalues.append(dicts['r'])
     DBSession.execute(common.Source.__table__.insert(), svalues)
     DBSession.execute(models2.Ref.__table__.insert(), rvalues)
Example #5
 def handler(offset, batch):
     svalues = []
     rvalues = []
     for row in batch:
         jsondata = json.loads(row['jsondata'] or "{}")
         jsondata['bibtexkey'] = row['bibtexkey']
         dicts = {
             's': dict(
                 pk=row['id'],
                 polymorphic_type='base',
                 id=str(row['id']),
                 name='%(author)s %(year)s' % row,
                 description=row['title'],
                 bibtex_type=getattr(EntryType, row['type']),
                 jsondata=jsondata),
             'r': dict(pk=row['id']),
         }
         for model, map_ in {
             's': {
                 'author': None,
                 'yearstring': 'year',
                 'year': 'year_int',
                 'startpage': 'startpage_int',
                 'numberofpages': 'pages_int',
                 'pages': None,
                 'edition': None,
                 'school': None,
                 'address': None,
                 'url': None,
                 'note': None,
                 'number': None,
                 'series': None,
                 'editor': None,
                 'booktitle': None,
                 'journal': None,
                 'volume': None,
                 'publisher': None,
             },
             'r': {
                 'endpage': 'endpage_int',
                 'inlg': None,
                 'inlg_code': None,
                 'subject': None,
                 'subject_headings': None,
                 'keywords': None,
                 'normalizedauthorstring': None,
                 'normalizededitorstring': None,
                 'ozbib_id': None,
             }
         }.items():
             for okey, nkey in map_.items():
                 dicts[model][nkey or okey] = row[okey]
         svalues.append(dicts['s'])
         rvalues.append(dicts['r'])
     DBSession.execute(common.Source.__table__.insert(), svalues)
     DBSession.execute(models2.Ref.__table__.insert(), rvalues)
Example #6
File: util.py Project: somiyagawa/asjp
def dataset_detail_html(context=None, request=None, **kw):
    """
    #unique language names:  6895

    #Ethnologue families:  223

    #Glottolog families:  381

    #languages with unique ISO codes:  4424  match [a-z]{3}!

    asjp=# select count(*) from (select distinct name from identifier where type = 'iso639-3') as s;
-[ RECORD 1 ]
count | 4401

    #words in the database (not counting synonyms):  238976 and counting synonyms: ...
    """
    stats = {
        'wordlists': DBSession.query(common.Language.pk).count(),
        'ethnologue_families': DBSession.query(models.Doculect.ethnologue_family)
        .distinct().count(),
        'glottolog_families': DBSession.query(models.Doculect.glottolog_family)
        .distinct().count(),
        'iso_langs': DBSession.query(common.Identifier.name)
        .filter(common.Identifier.type == common.IdentifierType.iso.value).distinct()
        .count(),
        'synsets': DBSession.execute(
            'select count(*) from (select distinct valueset_pk from value) as s')
        .fetchone()[0],
        'words': DBSession.query(common.Value.pk).count(),
    }
    return stats
Example #7
File: util.py Project: clld/asjp
def dataset_detail_html(context=None, request=None, **kw):
    """
    #unique language names:  6895

    #Ethnologue families:  223

    #Glottolog families:  381

    #languages with unique ISO codes:  4424  match [a-z]{3}!

    asjp=# select count(*) from (select distinct name from identifier where type = 'iso639-3') as s;
-[ RECORD 1 ]
count | 4401

    #words in the database (not counting synonyms):  238976 and counting synonyms: ...
    """
    stats = {
        'wordlists': DBSession.query(common.Language.pk).count(),
        'ethnologue_families': DBSession.query(models.Doculect.ethnologue_family)
        .distinct().count(),
        'glottolog_families': DBSession.query(models.Doculect.glottolog_family)
        .distinct().count(),
        'iso_langs': DBSession.query(common.Identifier.name)
        .filter(common.Identifier.type == common.IdentifierType.iso.value).distinct()
        .count(),
        'synsets': DBSession.execute(
            'select count(*) from (select distinct valueset_pk from value) as s')
        .fetchone()[0],
        'words': DBSession.query(common.Value.pk).count(),
        'missing_iso': len(missing_iso()),
    }
    return {k: '{0:,}'.format(n) for k, n in stats.items()}
Example #8
def dataset_detail_html(context=None, request=None, **kw):
    res = dict((row[0], row[1]) for row in DBSession.execute("select source, count(pk) from inventory group by source"))
    res["inventory_count"] = DBSession.query(Inventory).count()
    res["segment_count"] = DBSession.query(Parameter).count()
    res["language_count"] = DBSession.query(Language).count()
    res["contributors"] = (
        DBSession.query(Contributor)
        .order_by(Contributor.name)
        .options(joinedload(Contributor.contribution_assocs), joinedload(Contributor.references))
        .all()
    )
    res["sources"] = {
        k: Source.get(k)
        for k in [
            "moisikesling2011",
            "ipa2005",
            "hayes2009",
            "moran2012a",
            "moranetal2012",
            "cysouwetal2012",
            "mccloyetal2013",
        ]
    }
    res["descriptions"] = {c.id: desc(request, c.description, res["sources"]) for c in res["contributors"]}
    return res
Example #9
def extract_data(endangerment):  # pragma: no cover
    status = {}
    lpks = DBSession.query(common.Language.pk) \
        .filter(common.Language.active == True) \
        .filter(common.Language.latitude != None) \
        .filter(Languoid.level == LanguoidLevel.language) \
        .order_by(common.Language.pk).all()
    print(len(lpks))

    sql = """\
select ls.source_pk, count(ls.language_pk) from languagesource as ls, ref as r 
where ls.source_pk = r.pk and r.ca_doctype_trigger is null and r.ca_language_trigger is null 
group by source_pk 
    """
    lcounts = {r[0]: r[1] for r in DBSession.execute(sql)}

    # loop over active, established languages with geo-coords
    for i, lpk in enumerate(lpks):
        l = DBSession.query(common.Language).filter(common.Language.pk == lpk).one()
        # let's collect the relevant sources in a way that allows computation of med.
        # Note: we limit refs to the ones without computerized assignments.
        sources = list(DBSession.query(Ref).join(common.LanguageSource) \
                       .filter(common.LanguageSource.language_pk == lpk) \
                       .filter(Ref.ca_doctype_trigger == None) \
                       .filter(Ref.ca_language_trigger == None) \
                       .options(joinedload(Ref.doctypes)))
        sources = sorted([Source(s, lcounts.get(s.pk, 0)) for s in sources])

        # keep the overall med
        # note: this source may not be included in the potential meds computed
        # below,
        # e.g. because it may not have a year.
        med = sources[0].__json__() if sources else None

        # now we have to compute meds respecting a cut-off year.
        # to do so, we collect eligible sources per year and then
        # take the med of this collection.
        potential_meds = []

        # we only have to loop over publication years within all sources, because
        # only in these years something better might have come along.
        for year in set(s.year for s in sources if s.year):
            # let's see if something better was published!
            eligible = [s for s in sources if s.year and s.year <= year]
            if eligible:
                potential_meds.append(sorted(eligible)[0])

        # we store the precomputed sources information as jsondata:
        status[l.id] = [
            med,
            [s.__json__() for s in
             sorted(set(potential_meds), key=lambda s: -s.year)],
            endangerment.get(l.id, {}).get('source')
        ]
        if i and i % 1000 == 0:
            print(i)
            DBSession.close()

    return status
Example #10
def extract_data(endangerment):  # pragma: no cover
    status = {}
    lpks = DBSession.query(common.Language.pk) \
        .filter(common.Language.active == True) \
        .filter(common.Language.latitude != None) \
        .filter(Languoid.level == LanguoidLevel.language) \
        .order_by(common.Language.pk).all()
    print(len(lpks))

    sql = """\
select ls.source_pk, count(ls.language_pk) from languagesource as ls, ref as r 
where ls.source_pk = r.pk and r.ca_doctype_trigger is null and r.ca_language_trigger is null 
group by source_pk 
    """
    lcounts = {r[0]: r[1] for r in DBSession.execute(sql)}

    # loop over active, established languages with geo-coords
    for i, lpk in enumerate(lpks):
        l = DBSession.query(common.Language).filter(common.Language.pk == lpk).one()
        # let's collect the relevant sources in a way that allows computation of med.
        # Note: we limit refs to the ones without computerized assignments.
        sources = list(DBSession.query(Ref).join(common.LanguageSource) \
                       .filter(common.LanguageSource.language_pk == lpk) \
                       .filter(Ref.ca_doctype_trigger == None) \
                       .filter(Ref.ca_language_trigger == None) \
                       .options(joinedload(Ref.doctypes)))
        sources = sorted([Source(s, lcounts.get(s.pk, 0)) for s in sources])

        # keep the overall med
        # note: this source may not be included in the potential meds computed
        # below,
        # e.g. because it may not have a year.
        med = sources[0].__json__() if sources else None

        # now we have to compute meds respecting a cut-off year.
        # to do so, we collect eligible sources per year and then
        # take the med of this collection.
        potential_meds = []

        # we only have to loop over publication years within all sources, because
        # only in these years something better might have come along.
        for year in set(s.year for s in sources if s.year):
            # let's see if something better was published!
            eligible = [s for s in sources if s.year and s.year <= year]
            if eligible:
                potential_meds.append(sorted(eligible)[0])

        # we store the precomputed sources information as jsondata:
        status[l.id] = [
            med,
            [s.__json__() for s in
             sorted(set(potential_meds), key=lambda s: -s.year)],
            endangerment.get(l.id, {}).get('source')
        ]
        if i and i % 1000 == 0:
            print(i)
            DBSession.close()

    return status
Example #11
def prime_cache(args):
    #
    # add precalculated scores for meanings and semantic fields:
    #
    icons_dir = path(wold2.__file__).dirname().joinpath('static')
    for vocab in DBSession.query(models.Vocabulary):
        figure(figsize=(0.4, 0.4))
        axes([0.1, 0.1, 0.8, 0.8])
        coll = pie((100, ), colors=('#' + vocab.color, ))
        coll[0][0].set_linewidth(0.5)
        savefig(icons_dir.joinpath('%s.png' % vocab.color), transparent=True)

        words = DBSession.query(models.Word.borrowed_score)\
            .join(common.Unit.language)\
            .join(models.WoldLanguage.vocabulary)\
            .filter(models.Vocabulary.pk == vocab.pk)\
            .filter(models.Word.borrowed_score != None)
        vocab.count_words = words.count()
        vocab.borrowed_score = sum(score[0] for score in words) / float(
            vocab.count_words)
        vocab.count_core_list_counterparts = DBSession.query(models.Counterpart)\
            .join(common.Value.valueset)\
            .join(common.ValueSet.parameter)\
            .filter(common.ValueSet.contribution_pk == vocab.pk)\
            .filter(models.Meaning.core_list == True)\
            .count()

    for meaning in DBSession.query(models.Meaning):
        meaning.representation = DBSession.query(models.Counterpart)\
            .join(common.ValueSet)\
            .filter(common.ValueSet.parameter_pk == meaning.pk)\
            .count()

    for type_ in ['borrowed', 'age', 'simplicity']:
        attr = '%s_score' % type_

        for row in DBSession.execute(models.score_per_meaning_query(type_)):
            meaning = DBSession.query(models.Meaning).get(row['meaning_pk'])
            setattr(meaning, attr, row[attr])

        for row in DBSession.execute(
                models.score_per_semanticfield_query(type_)):
            sf = DBSession.query(models.SemanticField).get(
                row['semantic_field_pk'])
            setattr(sf, attr, row[1])
Example #12
 def select_leafs(pk):
     l, tc = Languoid.__table__.alias(
         'l'), TreeClosureTable.__table__.alias('tc')
     return [
         r['l_hid'] for r in DBSession.execute(
             select([l, tc], use_labels=True).where(
                 and_(l.c.pk == tc.c.child_pk, l.c.hid != None, l.c.status
                      != LanguoidStatus.provisional, tc.c.parent_pk == pk)))
     ]
Example #13
def prime_cache(args):
    #
    # add precalculated scores for meanings and semantic fields:
    #
    icons_dir = path(wold2.__file__).dirname().joinpath('static')
    for vocab in DBSession.query(models.Vocabulary):
        figure(figsize=(0.4, 0.4))
        axes([0.1, 0.1, 0.8, 0.8])
        coll = pie((100,), colors=('#' + vocab.color,))
        coll[0][0].set_linewidth(0.5)
        savefig(icons_dir.joinpath('%s.png' % vocab.color), transparent=True)

        words = DBSession.query(models.Word.borrowed_score)\
            .join(common.Unit.language)\
            .join(models.WoldLanguage.vocabulary)\
            .filter(models.Vocabulary.pk == vocab.pk)\
            .filter(models.Word.borrowed_score != None)
        vocab.count_words = words.count()
        vocab.borrowed_score = sum(score[0] for score in words) / float(vocab.count_words)
        vocab.count_core_list_counterparts = DBSession.query(models.Counterpart)\
            .join(common.Value.valueset)\
            .join(common.ValueSet.parameter)\
            .filter(common.ValueSet.contribution_pk == vocab.pk)\
            .filter(models.Meaning.core_list == True)\
            .count()

    for meaning in DBSession.query(models.Meaning):
        meaning.representation = DBSession.query(models.Counterpart)\
            .join(common.ValueSet)\
            .filter(common.ValueSet.parameter_pk == meaning.pk)\
            .count()

    for type_ in ['borrowed', 'age', 'simplicity']:
        attr = '%s_score' % type_

        for row in DBSession.execute(models.score_per_meaning_query(type_)):
            meaning = DBSession.query(models.Meaning).get(row['meaning_pk'])
            setattr(meaning, attr, row[attr])

        for row in DBSession.execute(models.score_per_semanticfield_query(type_)):
            sf = DBSession.query(models.SemanticField).get(row['semantic_field_pk'])
            setattr(sf, attr, row[1])
Example #14
def prime_cache(args):
    """If data needs to be denormalized for lookup, do that here.
    This procedure should be separate from the db initialization, because
    it will have to be run periodically whenever data has been updated.
    """
    DBSession.execute('delete from closuretable')
    SQL = ClosureTable.__table__.insert()

    # store a mapping of pk to father_pk for all languoids:
    father_map = {r[0]: r[1] for r in DBSession.execute('select pk, father_pk from treefeature')}

    # we compute the ancestry for each single languoid
    for pk, father_pk in father_map.items():
        depth = 1

        # now follow up the line of ancestors
        while father_pk:
            DBSession.execute(SQL, dict(child_pk=pk, parent_pk=father_pk, depth=depth))
            depth += 1
            father_pk = father_map[father_pk]
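
Once the closure table built above is filled, ancestry lookups need only one query. A sketch under the same assumptions as the snippet above (the same DBSession and closuretable; `languoid_pk` is a hypothetical primary key):

# Fetch the pks of all ancestors of one languoid, nearest parent first.
ancestors = [
    r[0] for r in DBSession.execute(
        'select parent_pk from closuretable where child_pk = :pk order by depth',
        dict(pk=languoid_pk),
    )
]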
Example #15
def insert(db, table, model, value, start=0, order=None):
    log.info('migrating %s ...' % table)
    sql = 'select * from %s' % table
    values = []
    if not order:
        log.info(sql)
        values = [value(start + i + 1, row) for i, row in enumerate(db.execute(sql))]
        DBSession.execute(model.__table__.insert(), values)
    else:
        def handler(offset, batch):
            _values = [value(start + offset + i + 1, row) for i, row in enumerate(batch)]
            DBSession.execute(model.__table__.insert(), _values)
            values.extend(_values)

        order = [order] if isinstance(order, basestring) else order
        select(db, '%s order by %s' % (sql, ', '.join(order)), handler)

    DBSession.execute('COMMIT')
    log.info('... done')
    return values
Example #16
def update(args):
    count = 0
    assert args.json

    iid = int(
        DBSession.execute("select max(cast(id as integer)) from identifier").
        fetchone()[0]) + 1
    pk = DBSession.execute("select max(pk) from identifier").fetchone()[0] + 1

    langs = {}
    for gid, name in args.json['wikipedia'].items():
        if gid not in langs:
            langs[gid] = Languoid.get(gid)
        langs[gid].update_jsondata(wikipedia=name.split('/')[-1])

    for gid, codes in args.json['multitree'].items():
        l = langs[gid]
        lcodes = [i.name for i in l.identifiers if i.type == 'multitree']

        for code in set(codes):
            if code not in lcodes:
                identifier = DBSession.query(common.Identifier)\
                    .filter(common.Identifier.type == 'multitree')\
                    .filter(common.Identifier.name == code)\
                    .first()
                if not identifier:
                    identifier = common.Identifier(pk=pk,
                                                   id=str(iid),
                                                   name=code,
                                                   type='multitree')
                    iid += 1
                    pk += 1
                count += 1
                DBSession.add(
                    common.LanguageIdentifier(language=l,
                                              identifier=identifier))

    print count, 'new multitree identifiers'
Example #17
File: adapters.py Project: clld/lexibank
 def template_context(self, ctx, req):
     pairs = [(r[0], r[1]) for r in DBSession.execute(SQL.format(ctx.pk))]
     values = {
         v.pk: v for v in DBSession.query(Value)
         .filter(Value.pk.in_(list(chain(*pairs))))
         .options(
             joinedload_all(Value.valueset, ValueSet.language, LexibankLanguage.family),
             joinedload(Value.valueset, ValueSet.parameter),
             joinedload(Value.valueset, ValueSet.contribution))
         }
     distinct_families = defaultdict(set)
     for v1, v2 in pairs:
         distinct_families[values[v2].valueset.parameter_pk].add(
             values[v2].valueset.language.family_pk)
     return {'pairs': pairs, 'values': values, 'families': distinct_families}
Example #18
 def template_context(self, ctx, req):
     pairs = [(r[0], r[1]) for r in DBSession.execute(SQL.format(ctx.pk))]
     values = {
         v.pk: v for v in DBSession.query(Value)
         .filter(Value.pk.in_(list(chain(*pairs))))
         .options(
             joinedload_all(Value.valueset, ValueSet.language, LexiRumahLanguage.family),
             joinedload(Value.valueset, ValueSet.parameter),
             joinedload(Value.valueset, ValueSet.contribution))
         }
     distinct_families = defaultdict(set)
     for v1, v2 in pairs:
         distinct_families[values[v2].valueset.parameter_pk].add(
             values[v2].valueset.language.family_pk)
     return {'pairs': pairs, 'values': values, 'families': distinct_families}
Example #19
 def select_leafs(pk):
     l, tc = Languoid.__table__.alias("l"), TreeClosureTable.__table__.alias("tc")
     return [
         r["l_hid"]
         for r in DBSession.execute(
             select([l, tc], use_labels=True).where(
                 and_(
                     l.c.pk == tc.c.child_pk,
                     l.c.hid != None,
                     l.c.status != LanguoidStatus.provisional,
                     tc.c.parent_pk == pk,
                 )
             )
         )
     ]
Example #20
def ldstatus(ppk):
    sql = """\
select
  l.id, v.domainelement_pk, vs.source, l.jsondata::json->>'meds'
from
  language as l, languoid as ll, valueset as vs, value as v, parameter as p
where
  l.jsondata::json->>'meds' is not null and l.pk = vs.language_pk and vs.parameter_pk = p.pk
  and v.valueset_pk = vs.pk and vs.parameter_pk = {0}
  and ll.pk = l.pk and ll.category in ('Spoken L1 Language', 'Sign Language')
""".format(ppk)
    res = {}
    for lid, aespk, aes_source, meds in DBSession.execute(sql):
        meds = json.loads(meds)
        res[lid] = (aespk, meds[0] if meds else None, meds, aes_source)
    return res
Example #21
File: util.py Project: muhalisu/phoible
def dataset_detail_html(context=None, request=None, **kw):
    res = dict(
        (row[0], row[1]) for row in
        DBSession.execute("select source, count(pk) from inventory group by source"))
    res['inventory_count'] = DBSession.query(Inventory).count()
    res['segment_count'] = DBSession.query(Parameter).count()
    res['language_count'] = DBSession.query(Language).count()
    res['contributors'] = DBSession.query(Contributor).order_by(Contributor.name).options(
        joinedload(Contributor.contribution_assocs),
        joinedload(Contributor.references)).all()
    res['sources'] = {
        k: Source.get(k) for k in
        ['moisikesling2011', 'ipa2005', 'hayes2009', 'moran2012a', 'moranetal2012',
         'cysouwetal2012', 'mccloyetal2013']}
    res['descriptions'] = {c.id: desc(request, c.description, res['sources'])
                           for c in res['contributors']}
    return res
Example #22
    def jqtree(self, icon_map=None):
        tree_ = []
        children_map = {}
        children_of_self = [c.pk for c in self.children]

        for row in DBSession.execute("""\
select
    ll.father_pk, c.child_pk, l.id, l.name, l.latitude, ll.hid, ll.level, ll.status, ll.child_language_count, c.depth
from
    treeclosuretable as c, language as l, languoid as ll, language as l2
where
    l.active is true
    and c.parent_pk = l2.pk and c.child_pk = l.pk and c.child_pk = ll.pk
    and c.parent_pk = %s
order by
    l2.name, c.depth, l.name;""" % (self.family_pk or self.pk,)):
            fpk, cpk, id_, name, lat, hid, level, status, clc, depth = row
            if hid and len(hid) != 3:
                hid = None

            label = name
            if clc:
                label += ' (%s)' % clc
            #label = '%s [%s]' % (name, id_)
            #if level == 'language' and hid and len(hid) == 3:
            #    label += '[%s]' % hid
            node = {'id': id_, 'pk': cpk, 'iso': hid, 'level': level, 'status': status, 'label': label, 'children': []}
            if icon_map and id_ == self.id and lat:
                node['map_marker'] = icon_map[cpk]
            if cpk in children_of_self:
                node['child'] = True
                if icon_map and (level == 'family' or lat):
                    node['map_marker'] = icon_map[cpk]
            children_map[cpk] = node['children']

            if not fpk:
                tree_.append(node)
            else:
                if fpk not in children_map:
                    # this can be the case for dialects attached to inactive nodes
                    continue
                children_map[fpk].append(node)
        return tree_
Example #23
def dump(fn = "gbdump.tsv"):
    import io
#    dumpsql = """
#select l.id, p.id, v.name, v.description, s.name
#from value as v, language as l, parameter as p, valueset as vs LEFT OUTER JOIN valuesetreference as vsref ON vsref.valueset_pk = vs.pk LEFT OUTER JOIN source as s ON vsref.source_pk = s.pk
#where v.valueset_pk = vs.pk and vs.language_pk = l.pk and vs.parameter_pk = p.pk
#    """
    #datatriples = grp2([((v[0], v[1], v[2], v[3] or ""), v[4] or "") for v in DBSession.execute(dumpsql)])
    #dump = [xs + ("; ".join(refs),) for (xs, refs) in datatriples.iteritems()]
    dumpsql = """
select l.name, l.id, p.id, v.name, v.description, vs.source
from value as v, language as l, parameter as p, valueset as vs
where v.valueset_pk = vs.pk and vs.language_pk = l.pk and vs.parameter_pk = p.pk
    """
    dump = [v for v in DBSession.execute(dumpsql)]
    tab = lambda rows: u''.join([u'\t'.join(row) + u"\n" for row in rows])
    txt = tab([("Language_Name", "Language_ID", "Feature", "Value", "Comment", "Source")] + dump)
    with io.open(fn, 'w', encoding="utf-8") as fp:
        fp.write(txt)
Example #24
File: util.py Project: clld/kba
def dataset_detail_html(context=None, request=None, **kw):
    stats = {
        'wordlists':
        DBSession.query(common.Language.pk).count(),
        #'ethnologue_families': DBSession.query(
        #    models.Doculect.ethnologue_family).distinct().count(),
        #'glottolog_families': DBSession.query(
        #    models.Doculect.glottolog_family).distinct().count(),
        'iso_langs':
        DBSession.query(common.Identifier.name).filter(
            common.Identifier.type ==
            common.IdentifierType.iso.value).distinct().count(),
        'synsets':
        DBSession.execute(
            'select count(*) from (select distinct valueset_pk from '
            'value) as '
            's').fetchone()[0],
        'words':
        DBSession.query(common.Value.pk).count(),
        # 'missing_iso': len(missing_iso()),
    }
    return {k: '{0:,}'.format(n) for k, n in stats.items()}
Example #25
def _freeze(table, fpath):
    def conv(v, col):
        if v is None:
            return ''
        if isinstance(col.type, DeclEnumType):  # pragma: no cover
            return v.value
        if isinstance(col.type, JSONEncodedDict):
            return json.dumps(v)
        if isinstance(v, (datetime, date)):
            return v.isoformat()
        return v

    keys = [col.name for col in table.columns]
    cols = {col.name: col for col in table.columns}
    rows = [keys]

    for row in DBSession.execute(select([table])):
        rows.append([conv(row[key], cols[key]) for key in keys])

    if len(rows) > 1:
        with UnicodeWriter(fpath) as writer:
            writer.writerows(rows)
Example #26
def _freeze(table, fpath):
    def conv(v, col):
        if v is None:
            return ''
        if isinstance(col.type, DeclEnumType):  # pragma: no cover
            return v.value
        if isinstance(col.type, JSONEncodedDict):
            return json.dumps(v)
        if isinstance(v, (datetime, date)):
            return v.isoformat()
        return v

    keys = [col.name for col in table.columns]
    cols = {col.name: col for col in table.columns}
    rows = [keys]

    for row in DBSession.execute(select([table])):
        rows.append([conv(row[key], cols[key]) for key in keys])

    if len(rows) > 1:
        with UnicodeWriter(fpath) as writer:
            writer.writerows(rows)
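
A hypothetical driver for the _freeze() helper above; the declarative `Base` and the dump directory are assumptions, not part of the original snippet. _freeze() itself skips tables without rows, since it only writes when there is data beyond the header.

import os

dump_dir = 'dump'
if not os.path.isdir(dump_dir):
    os.makedirs(dump_dir)
for table in Base.metadata.sorted_tables:  # Base: the application's declarative base
    _freeze(table, os.path.join(dump_dir, '%s.csv' % table.name))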
Example #27
def prime(args):
    """If data needs to be denormalized for lookup, do that here.
    This procedure should be separate from the db initialization, because
    it will have to be run periodically whenever data has been updated.
    """
    #
    # Now that we loaded all languoids and refs, we can compute the MED values.
    #
    meds = defaultdict(list)
    for lpk, spk, sid, sname, med_type, year, pages in DBSession.execute("""\
select
  l.pk, r.pk, s.id, s.name, r.med_type, s.year_int, r.med_pages
from
  languagesource as ls,
  language as l,
  source as s,
  ref as r
where
  ls.active = TRUE and l.pk = ls.language_pk and s.pk = ls.source_pk and s.pk = r.pk
order by
  l.id, r.med_index desc, r.med_pages, coalesce(s.year_int, 0), s.pk
"""):
        meds[lpk].append((spk, sid, sname, med_type, year, pages))  # The last one is the overall MED

    # Now weed out the "newer but worse" sources:
    for lpk, sources in {k: reversed(v) for k, v in meds.items()}.items():
        relevant, lastyear = [], 10000
        for spk, sid, sname, med_type, year, pages in sources:
            if year and year < lastyear:  # If year is more recent, this is a "newer but worse" item
                relevant.append((spk, sid, sname, med_type, year, pages))
                lastyear = year
        meds[lpk] = relevant

    med_param = common.Parameter.get('med')
    med_domain = {de.id: de for de in med_param.domain}
    contrib = common.Contribution.get('glottolog')

    for l in DBSession.query(common.Language).filter(common.Language.pk.in_(list(meds.keys()))):
        l.update_jsondata(meds=[
            (sid, med_type, year, pages, sname) for spk, sid, sname, med_type, year, pages in meds[l.pk]])
        if not meds[l.pk]:
            continue

        med = meds[l.pk][0]
        # Record the overall MED as value for the 'med' Parameter:
        vs = common.ValueSet(
            id=idjoin('med', l.id),
            contribution=contrib,
            parameter=med_param,
            language=l,
        )
        DBSession.add(common.Value(
            id=idjoin('med', l.id),
            name=getattr(args.repos.med_types, med[3]).name,
            domainelement=med_domain[idjoin('med', med[3])],
            valueset=vs,
        ))
        DBSession.flush()
        DBSession.add(common.ValueSetReference(source_pk=med[0], valueset_pk=vs.pk))

    recreate_treeclosure()

    macroareas = {r[0]: (r[1], r[2]) for r in DBSession.execute("""\
select de.pk, de.id, de.name
from domainelement as de, parameter as p
where de.parameter_pk = p.pk and p.id = 'macroarea'
""")}

    for lid, lpk, cpk, ppk, mas in DBSession.execute("""\
select
  l.id, l.pk, vs.contribution_pk, vs.parameter_pk, array_agg(distinct v.domainelement_pk)
from
  language as l,
  treeclosuretable as t,
  parameter as p,
  valueset as vs,
  value as v
where
  l.pk = t.parent_pk and
  t.child_pk = vs.language_pk and
  vs.parameter_pk = p.pk and
  p.id = 'macroarea' and
  v.valueset_pk = vs.pk and
  l.pk not in (
    select language_pk 
    from valueset as _vs, parameter as _p 
    where _vs.parameter_pk = _p.pk and _p.id = 'macroarea'
  )
group by l.id, l.pk, vs.contribution_pk, vs.parameter_pk"""):
        for i, mapk in enumerate(mas):
            if i == 0:
                vs = common.ValueSet(
                    id=idjoin('macroarea', lid),
                    language_pk=lpk,
                    parameter_pk=ppk,
                    contribution_pk=cpk)
            DBSession.add(common.Value(
                id=idjoin(macroareas[mapk][0], lid),
                name=macroareas[mapk][1],
                domainelement_pk=mapk,
                valueset=vs))

    for vs in DBSession.query(common.ValueSet)\
            .join(common.Language)\
            .join(common.Parameter)\
            .filter(common.Parameter.id == 'macroarea')\
            .options(joinedload(common.ValueSet.values), joinedload(common.ValueSet.language)):
        vs.language.macroareas = ', '.join([macroareas[v.domainelement_pk][1] for v in vs.values])

    for row in list(DBSession.execute(
        "select pk, pages, pages_int, startpage_int from source where pages_int < 0"
    )):
        raise ValueError(row)

    version = assert_release(args.repos.repos)
    with jsonlib.update(gc2version(args), indent=4) as legacy:
        for lang in DBSession.query(common.Language):
            if lang.id not in legacy:
                lang.update_jsondata(new=True)
                legacy[lang.id] = version

    valuesets = {
        r[0]: r[1] for r in DBSession.query(common.ValueSet.id, common.ValueSet.pk)}
    refs = {
        r[0]: r[1]
        for r in DBSession.query(models.Refprovider.id, models.Refprovider.ref_pk)}

    for vsid, vspk in valuesets.items():
        if vsid.startswith('macroarea-'):
            DBSession.add(common.ValueSetReference(
                source_pk=refs[args.repos.macroareas.__defaults__['reference_id']],
                valueset_pk=vspk))

    for vs in DBSession.query(common.ValueSet)\
            .join(common.Parameter)\
            .filter(common.Parameter.id == 'aes'):
        if vs.jsondata['reference_id']:
            DBSession.add(common.ValueSetReference(
                source_pk=refs[vs.jsondata['reference_id']], valueset_pk=vs.pk))

    for lang in args.repos.languoids():
        if lang.category == args.repos.language_types.bookkeeping.category:
            continue
        clf = lang.classification_comment
        if clf:
            for pid, attr_ in [('sc', 'sub'), ('fc', 'family')]:
                if getattr(clf, attr_ + 'refs'):
                    if split_items(lang.cfg['classification'][attr_ + 'refs']) != \
                            split_items(lang.cfg['classification'].get(attr_)):
                        vspk = valuesets['{0}-{1}'.format(pid, lang.id)]
                        for ref in getattr(clf, attr_ + 'refs'):
                            spk = refs.get(ref.key)
                            if spk:
                                DBSession.add(
                                    common.ValueSetReference(source_pk=spk, valueset_pk=vspk))
Example #28
def load(args):
    fts.index('fts_index', models.Ref.fts, DBSession.bind)
    DBSession.execute("CREATE EXTENSION IF NOT EXISTS unaccent WITH SCHEMA public;")

    dataset = common.Dataset(
        id='glottolog',
        name="Glottolog {0}".format(args.args[0]),
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="https://shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain='glottolog.org',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})

    data = Data()
    for i, (id_, name) in enumerate([
        ('hammarstroem', 'Harald Hammarström'),
        ('bank', 'Sebastian Bank'),
        ('forkel', 'Robert Forkel'),
        ('haspelmath', 'Martin Haspelmath'),
    ]):
        ed = data.add(common.Contributor, id_, id=id_, name=name)
        common.Editor(dataset=dataset, contributor=ed, ord=i + 1)
    DBSession.add(dataset)

    clf = data.add(common.Contribution, 'clf', id='clf', name='Classification')
    DBSession.add(common.ContributionContributor(
        contribution=clf, contributor=data['Contributor']['hammarstroem']))

    for pid, pname in [
        ('fc', 'Family classification'),
        ('sc', 'Subclassification'),
        ('vitality', 'Degree of endangerment'),
    ]:
        data.add(common.Parameter, pid, id=pid, name=pname)

    legacy = jsonlib.load(gc2version(args))
    for gc, version in legacy.items():
        data.add(models.LegacyCode, gc, id=gc, version=version)

    glottolog = args.repos
    for ma in Macroarea:
        data.add(
            models.Macroarea,
            ma.name,
            id=ma.name,
            name=ma.value,
            description=ma.description)

    for country in glottolog.countries:
        data.add(models.Country, country.id, id=country.id, name=country.name)

    lgcodes, mas, countries, lgsources = {}, {}, {}, defaultdict(list)
    languoids = list(glottolog.languoids())
    nodemap = {l.id: l for l in languoids}
    for lang in languoids:
        for ref in lang.sources:
            lgsources['{0.provider}#{0.bibkey}'.format(ref)].append(lang.id)
        load_languoid(data, lang, nodemap)
        mas[lang.id] = [ma.name for ma in lang.macroareas]
        countries[lang.id] = [c.id for c in lang.countries]
        lgcodes[lang.id] = lang.id
        if lang.hid:
            lgcodes[lang.hid] = lang.id
        if lang.iso:
            lgcodes[lang.iso] = lang.id

    for gc in glottolog.glottocodes:
        if gc not in data['Languoid'] and gc not in legacy:
            common.Config.add_replacement(gc, None, model=common.Language)

    for obj in jsonlib.load(glottolog.references_path('replacements.json')):
        common.Config.add_replacement(
            '{0}'.format(obj['id']),
            '{0}'.format(obj['replacement']) if obj['replacement'] else None,
            model=common.Source)

    DBSession.flush()
    for lid, maids in mas.items():
        for ma in maids:
            DBSession.add(models.Languoidmacroarea(
                languoid_pk=data['Languoid'][lid].pk,
                macroarea_pk=data['Macroarea'][ma].pk))

    for lid, cids in countries.items():
        for cid in cids:
            DBSession.add(models.Languoidcountry(
                languoid_pk=data['Languoid'][lid].pk,
                country_pk=data['Country'][cid].pk))

    for doctype in glottolog.hhtypes:
        data.add(
            models.Doctype, doctype.id, id=doctype.id,
            name=doctype.name,
            description=doctype.description,
            abbr=doctype.abbv,
            ord=doctype.rank)

    for bib in glottolog.bibfiles:
        data.add(
            models.Provider,
            bib.id,
            id=bib.id,
            name=bib.title,
            description=bib.description,
            abbr=bib.abbr,
            url=bib.url)
    DBSession.flush()

    s = time()
    for i, entry in enumerate(
            BibFile(glottolog.build_path('monster-utf8.bib')).iterentries()):
        if i % 10000 == 0:
            args.log.info('{0}: {1:.3}'.format(i, time() - s))
            s = time()
        ref = load_ref(data, entry, lgcodes, lgsources)
        if 'macro_area' in entry.fields:
            for ma in split_text(entry.fields['macro_area'], separators=',;', strip=True):
                ma = 'North America' if ma == 'Middle America' else ma
                ma = Macroarea.get('Papunesia' if ma == 'Papua' else ma)
                DBSession.add(models.Refmacroarea(
                    ref_pk=ref.pk, macroarea_pk=data['Macroarea'][ma.name].pk))
Example #29
def create(args):
    args.log.info('starting migration ...')
    data = Data()
    db = create_engine('postgresql://robert@/glottolog2')

    with transaction.manager:
        sn = data.add(common.Contributor,
                      'sn',
                      id='sn',
                      name='Sebastian Nordhoff')
        hh = data.add(common.Contributor,
                      'hh',
                      id='hh',
                      name='Harald Hammarström')
        rf = data.add(common.Contributor,
                      'rf',
                      id='rf',
                      name='Robert Forkel',
                      url="https://github.com/xrotwang")
        mh = data.add(common.Contributor,
                      'mh',
                      id='mh',
                      name='Martin Haspelmath')
        contrib = data.add(common.Contribution,
                           'c',
                           id='classification',
                           name='Classification')
        data.add(common.ContributionContributor,
                 'hh',
                 contribution=contrib,
                 contributor=hh)
        params = dict(
            fc=data.add(common.Parameter,
                        'fc',
                        id='fc',
                        name='Family classification'),
            sc=data.add(common.Parameter,
                        'sc',
                        id='sc',
                        name='Subclassification'),
        )

        dataset = data.add(
            common.Dataset,
            'd',
            id='glottolog',
            name='Glottolog 2.0',
            description='',
            published=datetime.date(2013, 8, 15),
            domain='glottolog.org',
            contact='*****@*****.**',
            jsondata={
                'license_icon':
                'http://i.creativecommons.org/l/by-sa/3.0/88x31.png',
                'license_name':
                'Creative Commons Attribution-ShareAlike 3.0 Unported License'
            })
        for i, ed in enumerate([sn, hh, rf, mh]):
            DBSession.add(
                common.Editor(dataset=dataset, contributor=ed, ord=i + 1))

        valuesets = {}

        def create_languoid(row, father_pk=None):
            glottocode = {
                'akun1242': 'akun1241'
            }.get(row['alnumcode'], row['alnumcode'])
            attrs = dict(
                pk=row['id'],
                id=glottocode,
                name=row['primaryname'],
                description=row['globalclassificationcomment'],
                level=getattr(models2.LanguoidLevel, row['level']),
                status=getattr(models2.LanguoidStatus,
                               (row['status'] or '').replace(' ', '_'), None),
                father_pk=father_pk,
                created=row['updated'],
                jsondata={} if not row['hname'] else {'hname': row['hname']},
            )
            for attr in [
                    'active',
                    'updated',
                    'hid',
                    'latitude',
                    'longitude',
            ]:
                attrs[attr] = row[attr]
            l = data.add(models2.Languoid, row['id'], **attrs)
            for type_ in params:
                id_ = '%s%s' % (type_, row['id'])
                vs = data.add(common.ValueSet,
                              id_,
                              id=id_,
                              description=row['classificationcomment'] if type_
                              == 'fc' else row['subclassificationcomment'],
                              language=l,
                              parameter=params[type_],
                              contribution=contrib)
                data.add(common.Value,
                         id_,
                         id=id_,
                         name='%s - %s' % (row['level'], row['status']),
                         valueset=vs)
                DBSession.flush()
                valuesets[id_] = vs.pk
            return str(row['id'])

        level = 0
        parents = [
            create_languoid(row) for row in db.execute(
                'select * from languoidbase where father_id is null')
        ]
        while parents:
            args.log.info('level: %s' % level)
            level += 1
            parents = [
                create_languoid(
                    row, father_pk=data['Languoid'][row['father_id']].pk)
                for row in db.execute(
                    'select * from languoidbase where father_id in (%s)' %
                    ','.join(parents))
            ]

    def handler(offset, batch):
        svalues = []
        rvalues = []
        for row in batch:
            jsondata = json.loads(row['jsondata'] or "{}")
            jsondata['bibtexkey'] = row['bibtexkey']
            dicts = {
                's':
                dict(pk=row['id'],
                     polymorphic_type='base',
                     id=str(row['id']),
                     name='%(author)s %(year)s' % row,
                     description=row['title'],
                     bibtex_type=getattr(EntryType, row['type']),
                     jsondata=jsondata),
                'r':
                dict(pk=row['id']),
            }
            for model, map_ in {
                    's': {
                        'author': None,
                        'yearstring': 'year',
                        'year': 'year_int',
                        'startpage': 'startpage_int',
                        'numberofpages': 'pages_int',
                        'pages': None,
                        'edition': None,
                        'school': None,
                        'address': None,
                        'url': None,
                        'note': None,
                        'number': None,
                        'series': None,
                        'editor': None,
                        'booktitle': None,
                        'journal': None,
                        'volume': None,
                        'publisher': None,
                    },
                    'r': {
                        'endpage': 'endpage_int',
                        'inlg': None,
                        'inlg_code': None,
                        'subject': None,
                        'subject_headings': None,
                        'keywords': None,
                        'normalizedauthorstring': None,
                        'normalizededitorstring': None,
                        'ozbib_id': None,
                    }
            }.items():
                for okey, nkey in map_.items():
                    dicts[model][nkey or okey] = row[okey]
            svalues.append(dicts['s'])
            rvalues.append(dicts['r'])
        DBSession.execute(common.Source.__table__.insert(), svalues)
        DBSession.execute(models2.Ref.__table__.insert(), rvalues)

    select(db, 'select * from refbase order by id', handler)
    DBSession.execute('COMMIT')

    for table, model, value, order in [
        ('macroarea', models2.Macroarea,
         lambda i, row: dict(pk=row['id'],
                             id=slug(row['name']),
                             name=row['name'],
                             description=row['description']), None),
        ('country', models2.Country,
         lambda i, row: dict(pk=row['id'], id=row['alpha2'], name=row['name']),
         None),
        ('provider', models2.Provider,
         lambda i, row: dict(pk=row['id'],
                             id=slug(row['name']),
                             name=row['description'],
                             description=row['comment'],
                             abbr=row['abbr'],
                             url=row['url'],
                             refurl=row['refurl'],
                             bibfield=row['bibfield']), None),
        ('doctype', models2.Doctype,
         lambda i, row: dict(pk=row['id'],
                             id=slug(row['name']),
                             abbr=row['abbr'],
                             name=row['name'],
                             description=row['description']), None),
        ('refprovider', models2.Refprovider, lambda i, row: dict(
            pk=i, provider_pk=row['provider_id'], ref_pk=row['refbase_id']),
         ('provider_id', 'refbase_id')),
        ('refdoctype', models2.Refdoctype, lambda i, row: dict(
            pk=i, doctype_pk=row['doctype_id'], ref_pk=row['refbase_id']),
         ('doctype_id', 'refbase_id')),
    ]:
        insert(db, table, model, value, order=order)

    names = dict(
        (int(d['id']), d['pk'])
        for d in insert(db,
                        'namebase',
                        common.Identifier,
                        lambda i, row: dict(pk=i,
                                            id=str(row['id']),
                                            name=row['namestring'],
                                            type='name',
                                            description=row['nameprovider'],
                                            lang=row['inlg'] if row['inlg'] and
                                            len(row['inlg']) <= 3 else 'en'),
                        order='id'))

    codes = dict(
        (int(d['id']), d['pk'])
        for d in insert(db,
                        'codebase',
                        common.Identifier,
                        lambda i, row: dict(pk=i,
                                            id=str(row['id']),
                                            name=row['codestring'],
                                            type=common.IdentifierType.iso.
                                            value if row['codeprovider'] ==
                                            'ISO' else row['codeprovider']),
                        start=len(names),
                        order='id'))

    res = insert(
        db, 'nodecodes', common.LanguageIdentifier,
        lambda i, row: dict(pk=i,
                            language_pk=row['languoidbase_id'],
                            identifier_pk=codes[row['codebase_id']]))

    insert(db,
           'nodenames',
           common.LanguageIdentifier,
           lambda i, row: dict(pk=i,
                               language_pk=row['languoidbase_id'],
                               identifier_pk=names[row['namebase_id']]),
           start=len(res))

    for table, model, value in [
        ('languoidmacroarea', models2.Languoidmacroarea,
         lambda i, row: dict(pk=i,
                             languoid_pk=row['languoidbase_id'],
                             macroarea_pk=row['macroarea_id'])),
        ('languoidcountry', models2.Languoidcountry,
         lambda i, row: dict(pk=i,
                             languoid_pk=row['languoidbase_id'],
                             country_pk=row['country_id'])),
        ('noderefs', common.LanguageSource,
         lambda i, row: dict(pk=i,
                             language_pk=row['languoidbase_id'],
                             source_pk=row['refbase_id'])),
        ('refmacroarea', models2.Refmacroarea, lambda i, row: dict(
            pk=i, macroarea_pk=row['macroarea_id'], ref_pk=row['refbase_id'])),
        ('refcountry', models2.Refcountry, lambda i, row: dict(
            pk=i, country_pk=row['country_id'], ref_pk=row['refbase_id'])),
        ('spuriousreplacements', models2.Superseded,
         lambda i, row: dict(pk=i,
                             languoid_pk=row['languoidbase_id'],
                             replacement_pk=row['replacement_id'],
                             description=row['relation'])),
        ('justification', common.ValueSetReference, lambda i, row: dict(
            pk=i,
            valueset_pk=valuesets['%s%s' % ('fc' if row[
                'type'] == 'family' else 'sc', row['languoidbase_id'])],
            source_pk=row['refbase_id'],
            description=row['pages'])),
    ]:
        insert(db, table, model, value)
Example #30
def main(args):
    fts.index('fts_index', Word.fts, DBSession.bind)
    DBSession.execute("CREATE EXTENSION IF NOT EXISTS unaccent WITH SCHEMA public;")

    data = Data()

    dataset = common.Dataset(
        id=dictionaria.__name__,
        name="Dictionaria",
        description="The Dictionary Journal",
        published=date(2017, 3, 30),
        contact='*****@*****.**',
        domain='dictionaria.clld.org',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="https://shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})

    for i, (id_, name) in enumerate([
        ('haspelmathmartin', 'Martin Haspelmath'),
        ('moselulrike', 'Ulrike Mosel'),
        ('stiebelsbarbara', 'Barbara Stiebels')
    ]):
        ed = data.add(common.Contributor, id_, id=id_, name=name)
        common.Editor(dataset=dataset, contributor=ed, ord=i + 1)
    DBSession.add(dataset)

    for id_, name in LGR_ABBRS.items():
        DBSession.add(common.GlossAbbreviation(id=id_, name=name))

    comparison_meanings = {}

    print('loading concepts ...')

    glosses = set()
    concepticon = Concepticon(
        REPOS.joinpath('..', '..', 'concepticon', 'concepticon-data'))
    if not args.no_concepts:
        for conceptset in concepticon.conceptsets.values():
            if conceptset.gloss in glosses:
                continue
            glosses.add(conceptset.gloss)
            cm = data.add(
                ComparisonMeaning,
                conceptset.id,
                id=conceptset.id,
                name=conceptset.gloss.lower(),
                description=conceptset.definition,
                concepticon_url='http://concepticon.clld.org/parameters/%s' % conceptset.id)
            comparison_meanings[cm.id] = cm

    DBSession.flush()

    print('... done')

    comparison_meanings = {k: v.pk for k, v in comparison_meanings.items()}
    submissions = []

    for submission in REPOS.joinpath(
            'submissions-internal' if args.internal else 'submissions').glob('*'):
        if not submission.is_dir():
            continue

        try:
            submission = Submission(submission)
        except ValueError:
            continue

        md = submission.md
        if md is None:
            continue

        if not md['date_published']:
            continue

        id_ = submission.id
        if args.dict and args.dict != id_ and args.dict != 'all':
            continue
        lmd = md['language']
        props = md.get('properties', {})
        props.setdefault('custom_fields', [])
        props['metalanguage_styles'] = {}
        for v, s in zip(props.get('metalanguages', {}).values(),
                        ['success', 'info', 'warning', 'important']):
            props['metalanguage_styles'][v] = s
        props['custom_fields'] = ['lang-' + f if f in props['metalanguage_styles'] else f
                                  for f in props['custom_fields']]

        language = data['Variety'].get(lmd['glottocode'])
        if not language:
            language = data.add(
                Variety, lmd['glottocode'], id=lmd['glottocode'], name=lmd['name'])

        md['date_published'] = md['date_published'] or date.today().isoformat()
        if '-' not in md['date_published']:
            md['date_published'] = md['date_published'] + '-01-01'
        dictionary = data.add(
            Dictionary,
            id_,
            id=id_,
            number=md.get('number'),
            name=props.get('title', lmd['name'] + ' dictionary'),
            description=submission.description,
            language=language,
            published=date(*map(int, md['date_published'].split('-'))),
            jsondata=props)

        for i, spec in enumerate(md['authors']):
            if not isinstance(spec, dict):
                cname, address = spec, None
                spec = {}
            else:
                cname, address = spec['name'], spec.get('affiliation')
            name = HumanName(cname)
            cid = slug('%s%s' % (name.last, name.first))
            contrib = data['Contributor'].get(cid)
            if not contrib:
                contrib = data.add(
                    common.Contributor,
                    cid,
                    id=cid,
                    name=cname,
                    address=address,
                    url=spec.get('url'),
                    email=spec.get('email'))
            DBSession.add(common.ContributionContributor(
                ord=i + 1,
                primary=True,
                contributor=contrib,
                contribution=dictionary))

        submissions.append((dictionary.id, language.id, submission))
    transaction.commit()

    for did, lid, submission in submissions:
        #if submission.id != 'sidaama':
        #    continue
        transaction.begin()
        print('loading %s ...' % submission.id)
        dictdata = Data()
        lang = Variety.get(lid)
        submission.load_examples(Dictionary.get(did), dictdata, lang)
        submission.dictionary.load(
            submission,
            dictdata,
            Dictionary.get(did),
            lang,
            comparison_meanings,
            OrderedDict(submission.md.get('properties', {}).get('labels', [])))
        transaction.commit()
        print('... done')

    transaction.begin()
    load_families(
        Data(),
        [v for v in DBSession.query(Variety) if re.match('[a-z]{4}[0-9]{4}', v.id)],
        glottolog_repos='../../glottolog3/glottolog')
Example #31
def prime_cache(args):
    """If data needs to be denormalized for lookup, do that here.
    This procedure should be separate from the db initialization, because
    it will have to be run periodically whenever data has been updated.
    """
    compute_language_sources()
    return  # note: everything below this early return is currently disabled
    from time import time
    _s = time()

    def checkpoint(s, msg=None):
        n = time()
        print(n - s, msg or '')
        return n

    sql = """
select p.id, l.id, v.name from value as v, valueset as vs, language as l, parameter as p
where v.valueset_pk = vs.pk and vs.language_pk = l.pk and vs.parameter_pk = p.pk
    """
    datatriples = [(v[0], v[1], v[2]) for v in DBSession.execute(sql)]
    _s = checkpoint(_s, '%s values loaded' % len(datatriples))

    flv = dict([(feature, dict(lvs)) for (feature, lvs) in grp(datatriples).items()])
    _s = checkpoint(_s, 'triples grouped')

    clfps = list(get_clf_paths([row[0] for row in DBSession.execute("select id from language")]))
    _s = checkpoint(_s, '%s clfps loaded' % len(clfps))

    features = {f.id: f for f in DBSession.query(Feature)}
    for (f, lv) in flv.items():
        features[f].representation = len(lv)
    DBSession.flush()
    _s = checkpoint(_s, 'representation assigned')

    families = {f.id: f for f in DBSession.query(Family)}
    if False:
        fs = feature_stability(datatriples, clfps)
        _s = checkpoint(_s, 'feature_stability computed')

        for (f, (s, transitions, stationarity_p, synchronic_p)) in fs:
            print(f)
            stability = Stability(
                id=f.replace("GB", "S"),
                feature=features[f],
                parsimony_stability_value=s["stability"],
                parsimony_retentions=s["retentions"],
                parsimony_transitions=s["transitions"],
                jsondata={'diachronic_p': stationarity_p, "synchronic_p": synchronic_p})
            DBSession.add(stability)
            for (i, (fam, (fromnode, tonode), (ft, tt))) in enumerate(transitions):
                DBSession.add(Transition(
                    id="%s: %s->%s" % (f, fromnode, tonode),
                    stability=stability,
                    fromnode=get_name(fromnode),
                    tonode=get_name(tonode),
                    fromvalue=ft,
                    tovalue=tt,
                    family=families[fam],
                    retention_innovation="Retention" if ft == tt else "Innovation"))
        DBSession.flush()
        _s = checkpoint(_s, 'stability and transitions loaded')

    imps = feature_dependencies(datatriples)
    _s = checkpoint(_s, 'feature_dependencies computed')
    if True:
        (H, V) = dependencies_graph([(v, f1, f2) for ((v, dstats), f1, f2) in imps])
        _s = checkpoint(_s, 'dependencies_graph written')

        for (i, ((v, dstats), f1, f2)) in enumerate(imps):
            combinatory_status = ("primary" if (f1, f2) in H else ("epiphenomenal" if v > 0.0 else None)) if H else "N/A"
            DBSession.add(Dependency(
                id="%s->%s" % (f1, f2),
                strength=v,
                feature1=features[f1],
                feature2=features[f2],
                representation=dstats["representation"],
                combinatory_status=combinatory_status,
                jsondata=dstats))
        DBSession.flush()
        _s = checkpoint(_s, 'dependencies loaded')

    coordinates = {
        lg.id: (lg.longitude, lg.latitude)
        for lg in DBSession.query(common.Language)
        .filter(common.Language.longitude != None)
        .filter(common.Language.latitude != None)}
    deepfams = deep_families(datatriples, clfps, coordinates=coordinates)
    _s = checkpoint(_s, '%s deep_families computed' % len(deepfams))

    missing_families = set()
    data = Data()
    for ((l1, l2), support_value, significance, supports, f1c, f2c) in deepfams:
        dname = "proto-%s x proto-%s" % (glottolog_names[l1], glottolog_names[l2])
        kmdistance = havdist(f1c, f2c)
        (f1lon, f1lat) = f1c if f1c else (None, None)
        (f2lon, f2lat) = f2c if f2c else (None, None)

        for li in [l1, l2]:
            if li not in families:
                missing_families.add(li)

        deepfam = DeepFamily(
            id=dname,
            support_value=support_value,
            significance=significance,
            family1=families.get(l1),
            family2=families.get(l2),
            family1_latitude=f1lat,
            family1_longitude=f1lon,
            family2_latitude=f2lat,
            family2_longitude=f2lon,
            geographic_plausibility=kmdistance)
        DBSession.add(deepfam)
        for (f, v1, v2, historical_score, independent_score, support_score) in supports:
            vid = ("%s: %s %s %s" % (f, v1, "==" if v1 == v2 else "!=", v2)).replace(".", "")
            #vname = ("%s|%s" % (v1, v2)).replace(".", "")
            #print vid, vname
            if vid not in data["Support"]:
                data.add(
                    Support, vid,
                    id=vid,
                    historical_score=historical_score,
                    independent_score=independent_score,
                    support_score=support_score,
                    value1=v1,
                    value2=v2,
                    feature=features[f])
            DBSession.add(HasSupport(
                id=dname + "-" + vid,
                deepfamily=deepfam,
                support=data["Support"][vid]))
    print('missing_families:')
    print(missing_families)
    DBSession.flush()
    _s = checkpoint(_s, 'deep_families loaded')

    compute_language_sources()
Example #32
def main(args):
    active_only = not args.all
    coords = dict(
        (r[0], r[1:]) for r in dsv.rows(args.data_file('coordinates.tab')))
    codes = dict((row[0], row[1]) for row in DBSession.execute(
        "select ll.hid, l.pk from languoid as ll, language as l where ll.pk = l.pk and ll.hid is not null"
    ))

    maxid = DBSession.execute(
        "select pk from languoid order by pk desc limit 1").fetchone()[0]
    gcs = {}

    lnames = {}
    for row in DBSession.execute("select pk, name from language"):
        lnames[row[0]] = row[1]

    # dict mapping branches (i.e. tuples of sub-family names) to dicts of H-languages
    families = OrderedDict()

    # dict mapping identifiers of H-languages to branches
    languages = OrderedDict()
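    # For illustration (hypothetical data), the two structures have roughly this shape:
    #   families[('Austronesian', 'Malayo-Polynesian')] = {'abc': <H-language entry>, ...}
    #   languages['abc'] = [('Austronesian', 'Malayo-Polynesian'), status, name, comment]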

    parse_families(args.data_file('lff.txt'), families, languages)

    # handle isolates / collapse families with exactly one leaf:
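    # For illustration (hypothetical): a one-leaf branch ('Basque',) is treated as an
    # isolate (its language keeps no parent branch), while a one-leaf branch
    # ('Nuclear-X', 'X-proper') is collapsed into its parent branch ('Nuclear-X',).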
    isolate_names = {}
    collapsed_names = {}
    for key in families.keys():
        if len(families[key]) == 1:
            if len(key) == 1:
                # isolate
                languages[families[key].keys()[0]][0] = None
                isolate_names[key[0]] = families[key].keys()[0]  # map name to code
            else:
                languages[families[key].keys()[0]][0] = key[:-1]
                collapsed_names[key[-1]] = families[key].keys()[0]
            del families[key]

    # we also want to be able to lookup families by name
    names = {}
    for branch in families:
        name = branch[-1]
        if name in names:
            names[name].append(branch)
        else:
            names[name] = [branch]

    # now add the unclassifiable, unattested, un-whatever
    parse_families(args.data_file('lof.txt'), families, languages)

    ncodes = {}
    languoids = []
    for code in languages:
        if code not in codes:
            maxid += 1
            ncodes[code] = maxid
            hnode, status, name, comment = languages[code]
            # we have to insert a new H-language!
            attrs = languoid(
                maxid,
                'language',
                hid=code,
                id=glottocode(unicode(name), DBSession, gcs),
                name=name,
                hname=name,
                status=status,
                globalclassificationcomment=comment or None,
            )
            print '++', attrs
            if coords.get(code):
                attrs['longitude'], attrs['latitude'] = map(
                    float, coords.get(code))
            languoids.append(attrs)

    urnodes = {}
    rnodes = {}
    for family in families:
        leafs = families[family]
        assert family[0] not in ['Speech Register', 'Spurious']
        leafs = tuple(
            sorted(code for code in families[family].keys() if code in codes))
        assert leafs
        if leafs in rnodes:
            # special case: there may be additional "Unclassified something" nodes in
            # branch without any changes in the set of leafs.
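            # e.g. (hypothetical): ('Family X', 'Unclassified Family X') covers exactly
            # the same set of leafs as the previously seen branch ('Family X',).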
            assert [n for n in family if n.startswith('Unclassified')]
            fset, rset = set(family), set(rnodes[leafs])
            assert rset.issubset(fset)
            assert leafs not in urnodes
            urnodes[leafs] = family
            #if len(family) > rnodes[leafs]:
            #    rnodes[leafs] = family
        else:
            rnodes[leafs] = family

    #
    # at this point rnodes is a consolidated mapping of sets of H-Languages to branches in
    # the family tree.
    #
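    # For illustration (hypothetical): rnodes[('abc', 'def', 'ghi')] == ('Family X', 'Subfamily Y')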

    # for set comparisons we compute a list of actual sets of leafs as well
    leafsets = [set(t) for t in sorted(rnodes.keys(), key=lambda s: len(s))]

    todo = []

    # dict mapping (id, name, level) tuples for gl languoids of level family to tuples of leafs
    glnodes = {}
    #
    # note: all languoids with level null have children, thus are not dialects!
    #
    sql = "select l.pk, l.name, ll.level, ll.father_pk from languoid as ll, language as l where ll.pk = l.pk and ll.level = 'family' or ll.level is null"
    if active_only:
        sql = "select l.pk, l.name, ll.level, ll.father_pk from languoid as ll, language as l where ll.pk = l.pk and ll.level = 'family' and l.active = true"

    for row in DBSession.execute(sql).fetchall():
        leafs = [
            r[0] for r in DBSession.execute(
                "select distinct l.hid from treeclosuretable as t, languoid as l where t.child_pk = l.pk and t.parent_pk = %s and l.hid is not null and l.status != 'provisional'"
                % row[0])
        ]
        if leafs:
            glnodes[(row[0], row[2], row[1], row[3])] = tuple(sorted(leafs))
        else:
            # families without leafs will be marked as retired
            if row[1] in names and len(names[row[1]]) == 1:
                # unique family name, good enough for a match!?
                todo.append(Migration(row[0], None, pointer=names[row[1]][0]))
            else:
                todo.append(Migration(row[0], None))

    # note: for legacy gl nodes, we map leaf-tuples to lists of matching nodes!
    rglnodes = {}
    for node, leafs in glnodes.items():
        if leafs in rglnodes:
            rglnodes[leafs].append(node)
        else:
            rglnodes[leafs] = [node]

    # now we look for matches between old and new classification:
    for leafs, nodes in rglnodes.items():
        assert leafs
        assert nodes
        todo.extend(match_nodes(leafs, nodes, rnodes, urnodes, leafsets,
                                names))

    # compile a mapping for exact matches:
    branch_to_pk = {}
    for m in todo:
        if m.hid:
            assert m.hid not in branch_to_pk
            branch_to_pk[m.hid] = m.pk

    new = 0
    for hnode in sorted(families.keys(), key=lambda b: (len(b), b)):
        # loop through branches breadth first to determine what's to be inserted
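        # (sorting by (branch length, branch) guarantees e.g. that ('A',) is handled before
        # ('A', 'B'), so a new family can look up its father's pk in branch_to_pk below)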
        if hnode not in branch_to_pk:
            t = tuple(sorted(families[hnode].keys()))
            if t in rglnodes:
                # the "Unclassified subfamily" special case from above:
                assert [n for n in hnode if n.startswith('Unclassified')]
                # make sure the existing glottolog family for the set of leafs is mapped
                # to some other node in the new classification:
                assert rglnodes[t][0][0] in [m.pk for m in todo if m.hid]

            maxid += 1
            attrs = languoid(
                maxid,
                'family',
                id=glottocode(unicode(hnode[-1]), DBSession, gcs),
                name=hnode[-1],
                hname=hnode[-1],
            )
            branch_to_pk[hnode] = maxid
            lnames[maxid] = hnode[-1]
            if len(hnode) > 1:
                attrs['father_pk'] = branch_to_pk[tuple(list(hnode)[:-1])]
                assert attrs['father_pk']
            print '++', attrs
            new += 1
            languoids.append(attrs)

    # now on to the updates for families:
    matches, migrations, nomatches = 0, 0, 0
    for m in todo:
        attrs = languoid(m.pk, 'family', name=lnames[m.pk])
        if m.hid:
            #print '==', lnames[m.pk].encode('utf8'), '->', ', '.join(m.hid).encode('utf8')
            matches += 1

            if len(m.hid) > 1:
                attrs['father_pk'] = branch_to_pk[tuple(list(m.hid)[:-1])]
            if getattr(m, 'rename', False):
                attrs['name'] = m.hid[-1]
            attrs['hname'] = m.hid[-1]
        else:
            attrs['active'] = False
            if getattr(m, 'pointer', False):
                print '~~', lnames[m.pk].encode('utf8'), '->', ', '.join(
                    m.pointer).encode('utf8')
                migrations += 1

                attrs['replacement'] = branch_to_pk[m.pointer]
            else:
                print '--', lnames[m.pk].encode('utf8'), '->'
                nomatches += 1
        languoids.append(attrs)

    print matches, 'matches'
    print migrations, 'migrations'
    print nomatches, 'nomatches'
    print new, 'new nodes'

    risolate_names = dict(zip(isolate_names.values(), isolate_names.keys()))
    rcollapsed_names = dict(
        zip(collapsed_names.values(), collapsed_names.keys()))

    # and updates of father_pks for languages:
    for l in languages:
        hnode, status, name, comment = languages[l]
        id_ = codes.get(l, ncodes.get(l))
        attrs = languoid(id_, 'language', status=status)
        if hnode:
            attrs['father_pk'] = branch_to_pk[hnode]
        attrs['globalclassificationcomment'] = comment or None
        # look for hnames!
        if l in risolate_names:
            attrs['hname'] = risolate_names[l]
        if l in rcollapsed_names:
            attrs['hname'] = rcollapsed_names[l]
        languoids.append(attrs)

    for row in DBSession.execute(
            "select l.pk, ll.hid, l.name from languoid as ll, language as l where ll.pk = l.pk and ll.hid like '%NOCODE_%'"
    ).fetchall():
        if row[1] not in languages:
            # languoids with Harald's private code that are no longer in use
            attrs = languoid(row[0],
                             'language',
                             status='retired',
                             active=False,
                             father_pk=None)
            languoids.append(attrs)

    with open(args.data_file('languoids.json'), 'w') as fp:
        json.dump(languoids, fp)
Example #33
def main(args):  # pragma: no cover
    stats = Counter(new=0, updated=0, skipped=0)
    changes = {}

    with transaction.manager:
        update_providers(args)
        DBSession.flush()
        provider_map = get_map(Provider)
        macroarea_map = get_map(Macroarea)
        doctype_map = get_map(Doctype)

        languoid_map = {}
        for l in DBSession.query(Languoid):
            if l.hid:
                languoid_map[l.hid] = l
            languoid_map[l.id] = l

        for i, rec in enumerate(get_bib(args)):
            if i and i % 1000 == 0:
                print i, 'records done', stats['updated'] + stats['new'], 'changed'

            if len(rec.keys()) < 6:
                # not enough information!
                stats.update(['skipped'])
                continue

            changed = False
            assert rec.get('glottolog_ref_id')
            id_ = int(rec.get('glottolog_ref_id'))

            ref = DBSession.query(Source).get(id_)
            update = True if ref else False

            kw = {
                'pk': id_,
                'bibtex_type': rec.genre,
                'id': str(id_),
                'jsondata': {'bibtexkey': rec.id},
            }

            for source, target in FIELD_MAP.items():
                if target is None:
                    continue
                value = rec.get(source)
                if value:
                    value = unescape(value)
                    if target:
                        kw[target] = CONVERTER.get(source, lambda x: x)(value)
                    else:
                        kw['jsondata'][source] = value

            if kw['jsondata'].get('hhtype'):
                trigger = ca_trigger(kw['jsondata']['hhtype'])
                if trigger:
                    kw['ca_doctype_trigger'], kw['jsondata']['hhtype'] = trigger

            # try to extract numeric year, startpage, endpage, numberofpages, ...
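            # e.g. (hypothetical input, assuming the two patterns behave as described below):
            # a year field '1999 [2002]' yields year_int == 2002, while 'ca. 1884' falls back
            # to the first 4-digit number, 1884.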
            if kw.get('year'):
                # prefer years in brackets over the first 4-digit number.
                match = PREF_YEAR_PATTERN.search(kw.get('year'))
                if match:
                    kw['year_int'] = int(match.group('year'))
                else:
                    match = YEAR_PATTERN.search(kw.get('year'))
                    if match:
                        kw['year_int'] = int(match.group('year'))

            if kw.get('publisher'):
                p = kw.get('publisher')
                if ':' in p:
                    address, publisher = [s.strip() for s in kw['publisher'].split(':', 1)]
                    if 'address' not in kw or kw['address'] == address:
                        kw['address'], kw['publisher'] = address, publisher
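            # (the split above turns e.g. the hypothetical value 'Berlin: Mouton de Gruyter'
            # into address 'Berlin' and publisher 'Mouton de Gruyter')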

            if rec.get('numberofpages'):
                try:
                    kw['pages_int'] = int(rec.get('numberofpages').strip())
                except ValueError:
                    pass

            if kw.get('pages'):
                start, end, number = compute_pages(kw['pages'])
                if start is not None:
                    kw['startpage_int'] = start
                if end is not None:
                    kw['endpage_int'] = end
                if number is not None and 'pages_int' not in kw:
                    kw['pages_int'] = number

            for k in kw.keys():
                v = kw[k]
                if isinstance(v, basestring):
                    v = v.strip() or None
                kw[k] = v

            if update:
                for k in kw.keys():
                    if k == 'pk':
                        continue
                    v = getattr(ref, k)
                    if kw[k] != v:
                        if k == 'jsondata':
                            d = {k: v for k, v in ref.jsondata.items()
                                 if k in NONREF_JSONDATA}
                            d.update(**kw[k])
                            ref.jsondata = d
                        else:
                            #print k, '--', v
                            #print k, '++', kw[k]
                            setattr(ref, k, kw[k])
                            changed = True
                            if ref.id in changes:
                                changes[ref.id][k] = ('%s' % v, '%s' % kw[k])
                            else:
                                changes[ref.id] = {k: ('%s' % v, '%s' % kw[k])}
            else:
                changed = True
                ref = Ref(name='%s %s' % (kw.get('author', 'na'), kw.get('year', 'nd')), **kw)

            ref.description = ref.title or ref.booktitle
            originator = ref.author or ref.editor or 'Anonymous'
            ref.name = '%s %s' % (originator, ref.year or 'n.d.')

            a, r = update_relationship(
                ref.macroareas,
                [macroarea_map[name] for name in
                 set(filter(None, [s.strip() for s in kw['jsondata'].get('macro_area', '').split(',')]))])
            changed = changed or a or r

            src = [s.strip() for s in kw['jsondata'].get('src', '').split(',')]
            prv = {provider_map[slug(s)] for s in src if s}
            if set(ref.providers) != prv:
                ref.providers = list(prv)
                changed = True

            a, r = update_relationship(
                ref.doctypes,
                [doctype_map[m.group('name')] for m in
                 DOCTYPE_PATTERN.finditer(kw['jsondata'].get('hhtype', ''))])
            changed = changed or a or r

            if not update:
                stats.update(['new'])
                DBSession.add(ref)
            elif changed:
                stats.update(['updated'])

    args.log.info('%s' % stats)

    DBSession.execute("update source set description = title where description is null and title is not null;")
    DBSession.execute("update source set description = booktitle where description is null and booktitle is not null;")

    for row in list(DBSession.execute(
            "select pk, pages, pages_int, startpage_int from source where pages_int < 0")):
        pk, pages, number, start = row
        _start, _end, _number = compute_pages(pages)
        if _number > 0 and _number != number:
            DBSession.execute(
                "update source set pages_int = %s, startpage_int = %s where pk = %s" %
                (_number, _start, pk))
            DBSession.execute(
                "update ref set endpage_int = %s where pk = %s" %
                (_end, pk))

    jsondump(changes, args.data_dir.joinpath('references', 'changes.json'))
Example #34
def main(args):
    active_only = not args.all
    codes = dict((row[0], row[1]) for row in
                 DBSession.execute("select ll.hid, l.pk from languoid as ll, language as l where ll.pk = l.pk and ll.hid is not null"))

    maxid = DBSession.execute(
        "select pk from languoid order by pk desc limit 1").fetchone()[0]
    gcs = {}

    lnames = {}
    for row in DBSession.execute("select pk, name from language"):
        lnames[row[0]] = row[1]

    # dict mapping branches (i.e. tuples of sub-family names) to dicts of H-languages
    families = OrderedDict()

    # dict mapping identifiers of H-languages to branches
    languages = OrderedDict()

    parse_families(data_file(args, 'lff.txt'), families, languages)

    # handle isolates / collapse families with exactly one leaf:
    isolate_names = {}
    collapsed_names = {}
    for key in families.keys():
        if len(families[key]) == 1:
            if len(key) == 1:
                # isolate
                languages[families[key].keys()[0]][0] = None
                isolate_names[key[0]] = families[key].keys()[0]  # map name to code
            else:
                languages[families[key].keys()[0]][0] = key[:-1]
                collapsed_names[key[-1]] = families[key].keys()[0]
            del families[key]

    # we also want to be able to lookup families by name
    names = {}
    for branch in families:
        name = branch[-1]
        if name in names:
            names[name].append(branch)
        else:
            names[name] = [branch]

    # now add the unclassifiable, unattested, un-whatever
    parse_families(data_file(args, 'lof.txt'), families, languages)

    ncodes = {}
    languoids = []
    for code in languages:
        if code not in codes:
            maxid += 1
            ncodes[code] = maxid
            hnode, status, name, comment = languages[code]
            # we have to insert a new H-language!
            attrs = languoid(
                maxid,
                'language',
                hid=code,
                id=glottocode(unicode(name), DBSession, gcs),
                name=name,
                hname=name,
                status=status,
                globalclassificationcomment=comment or None,
            )
            print '++', attrs
            languoids.append(attrs)

    urnodes = {}
    rnodes = {}
    for family in families:
        #leafs = families[family]
        assert family[0] not in ['Speech Register', 'Spurious']
        leafs = tuple(sorted(code for code in families[family].keys() if code in codes))
        try:
            assert leafs
        except:
            print 'Family with only new languages!!'
            print family
            continue
            #raise
        if leafs in rnodes:
            # so we have already seen this exact set of leaves.
            #
            # special case: there may be additional "Unclassified something" nodes in
            # branch without any changes in the set of leafs ...
            try:
                assert [n for n in family if n.startswith('Unclassified')]
            except:
                print family
                print leafs
                # ... or the full leafset contains new languages
                assert [code for code in families[family[:-1]].keys() if code in ncodes]
            fset, rset = set(family), set(rnodes[leafs])
            assert rset.issubset(fset)
            assert leafs not in urnodes
            urnodes[leafs] = family
            #if len(family) > rnodes[leafs]:
            #    rnodes[leafs] = family
        else:
            rnodes[leafs] = family

    #
    # at this point rnodes is a consolidated mapping of sets of H-Languages to branches in
    # the family tree.
    #

    # for set comparisons we compute a list of actual sets of leafs as well
    leafsets = [set(t) for t in sorted(rnodes.keys(), key=lambda s: len(s))]

    todo = []

    # dict mapping (id, name, level) tuples for gl languoids of level family to tuples of leafs
    glnodes = {}
    #
    # note: all languoids with level null have children, thus are not dialects!
    #
    sql = "select l.pk, l.name, ll.level, ll.father_pk from languoid as ll, language as l where ll.pk = l.pk and ll.level = 'family' or ll.level is null"
    if active_only:
        sql = "select l.pk, l.name, ll.level, ll.father_pk from languoid as ll, language as l where ll.pk = l.pk and ll.level = 'family' and l.active = true"

    for row in DBSession.execute(sql).fetchall():
        leafs = [r[0] for r in DBSession.execute(
            "select distinct l.hid from treeclosuretable as t, languoid as l where t.child_pk = l.pk and t.parent_pk = %s and l.hid is not null and l.status != 'provisional'"
            % row[0])]
        if leafs:
            glnodes[(row[0], row[2], row[1], row[3])] = tuple(sorted(leafs))
        else:
            # families without leafs will be marked as retired
            if row[1] in names and len(names[row[1]]) == 1:
                # unique family name, good enough for a match!?
                todo.append(Migration(row[0], None, pointer=names[row[1]][0]))
            else:
                todo.append(Migration(row[0], None))

    # note: for legacy gl nodes, we map leaf-tuples to lists of matching nodes!
    rglnodes = {}
    for node, leafs in glnodes.items():
        if leafs in rglnodes:
            rglnodes[leafs].append(node)
        else:
            rglnodes[leafs] = [node]

    # now we look for matches between old and new classification:
    for leafs, nodes in rglnodes.items():
        assert leafs
        assert nodes
        todo.extend(match_nodes(leafs, nodes, rnodes, urnodes, leafsets, names))

    # compile a mapping for exact matches:
    branch_to_pk = {}
    for m in todo:
        if m.hid:
            if m.hid in branch_to_pk:
                if branch_to_pk[m.hid] != m.pk:
                    # compare names:
                    if lnames[m.pk] == m.hid[-1]:
                        print '#### type1'
                        branch_to_pk[m.hid] = m.pk
                    elif lnames[branch_to_pk[m.hid]] == m.hid[-1]:
                        print '#### type2'
                        pass
                    else:
                        print m.hid
                        print m.hid[-1]
                        print lnames[m.pk]
                        print branch_to_pk[m.hid]
                        print m.pk
                        raise ValueError
            else:
                #assert m.hid not in branch_to_pk
                branch_to_pk[m.hid] = m.pk

    new = 0
    for hnode in sorted(families.keys(), key=lambda b: (len(b), b)):
        # loop through branches breadth first to determine what's to be inserted
        if hnode not in branch_to_pk:
            t = tuple(sorted(families[hnode].keys()))
            if t in rglnodes:
                # the "Unclassified subfamily" special case from above:
                try:
                    assert [n for n in hnode if n.startswith('Unclassified')]
                except:
                    # or the "new language inserted higher up" case!
                    assert [code for code in families[hnode[:-1]].keys() if code in ncodes]
                    #print hnode
                    #print t
                    #raise
                # make sure the existing glottolog family for the set of leafs is mapped
                # to some other node in the new classification:
                assert rglnodes[t][0][0] in [m.pk for m in todo if m.hid]

            maxid += 1
            attrs = languoid(
                maxid,
                'family',
                id=glottocode(unicode(hnode[-1]), DBSession, gcs),
                name=hnode[-1],
                hname=hnode[-1],
            )
            branch_to_pk[hnode] = maxid
            lnames[maxid] = hnode[-1]
            if len(hnode) > 1:
                attrs['father_pk'] = branch_to_pk[tuple(list(hnode)[:-1])]
                assert attrs['father_pk']
            print '++', attrs
            new += 1
            languoids.append(attrs)

    # now on to the updates for families:
    matches, migrations, nomatches = 0, 0, 0
    for m in todo:
        attrs = languoid(m.pk, 'family', name=lnames[m.pk])
        if m.hid:
            #print '==', lnames[m.pk].encode('utf8'), '->', ', '.join(m.hid).encode('utf8')
            matches += 1

            if len(m.hid) > 1:
                attrs['father_pk'] = branch_to_pk[tuple(list(m.hid)[:-1])]
            if getattr(m, 'rename', False):
                attrs['name'] = m.hid[-1]
            attrs['hname'] = m.hid[-1]
        else:
            attrs['active'] = False
            if getattr(m, 'pointer', False):
                print '~~', lnames[m.pk].encode('utf8'), '->', ', '.join(m.pointer).encode('utf8')
                migrations += 1

                attrs['replacement'] = branch_to_pk[m.pointer]
            else:
                print '--', lnames[m.pk].encode('utf8'), '->'
                nomatches += 1
        languoids.append(attrs)

    print matches, 'matches'
    print migrations, 'migrations'
    print nomatches, 'nomatches'
    print new, 'new nodes'

    risolate_names = dict(zip(isolate_names.values(), isolate_names.keys()))
    rcollapsed_names = dict(zip(collapsed_names.values(), collapsed_names.keys()))

    # and updates of father_pks for languages:
    for l in languages:
        hnode, status, name, comment = languages[l]
        id_ = codes.get(l, ncodes.get(l))
        attrs = languoid(id_, 'language', status=status)
        if hnode:
            attrs['father_pk'] = branch_to_pk[hnode]
        attrs['globalclassificationcomment'] = comment or None
        # look for hnames!
        if l in risolate_names:
            attrs['hname'] = risolate_names[l]
        if l in rcollapsed_names:
            attrs['hname'] = rcollapsed_names[l]
        languoids.append(attrs)

    for row in DBSession.execute(
        "select l.pk, ll.hid, l.name from languoid as ll, language as l where ll.pk = l.pk and ll.hid like '%NOCODE_%'"
    ).fetchall():
        if row[1] not in languages:
            # languoids with Harald's private code that are no longer in use
            attrs = languoid(
                row[0], 'language', status='retired', active=False, father_pk=None)
            languoids.append(attrs)

    with open(data_file(args, 'languoids.json'), 'w') as fp:
        json.dump(languoids, fp)
Example #35
def main(args):
    stats = Counter(new=0, matches=0, migrations=0, nomatches=0)
    l, ll = Language.__table__.alias('l'), Languoid.__table__.alias('ll')
    gl_languoids = list(
        DBSession.execute(
            select([l, ll],
                   use_labels=True).where(l.c.pk == ll.c.pk)).fetchall())

    # we collect a list of changes which we will store in a JSON file.
    changes = []

    hid_to_pk = {
        row['ll_hid']: row['l_pk']
        for row in gl_languoids if row['ll_hid']
    }
    max_languoid_pk = max(*[row['l_pk'] for row in gl_languoids])
    new_glottocodes = {}
    pk_to_name = {row['l_pk']: row['l_name'] for row in gl_languoids}

    # dict mapping branches (i.e. tuples of sub-family names) to dicts of H-languages
    hh_families = OrderedDict()

    # dict mapping identifiers (i.e. hid) of H-languages to branches
    hh_languages = OrderedDict()

    parse_families(args.data_dir.joinpath('languoids', 'lff.txt'), hh_families,
                   hh_languages)

    # handle isolates / collapse families with exactly one leaf:
    isolate_names = {}
    collapsed_names = {}
    for key in hh_families.keys():
        if len(hh_families[key]) == 1:
            if len(key) == 1:
                # isolate
                hh_languages[hh_families[key].keys()[0]][0] = None
                isolate_names[key[0]] = hh_families[key].keys()[0]  # map name to code
            else:
                hh_languages[hh_families[key].keys()[0]][0] = key[:-1]
                collapsed_names[key[-1]] = hh_families[key].keys()[0]
            del hh_families[key]

    # now add the unclassifiable, unattested, un-whatever
    parse_families(args.data_dir.joinpath('languoids', 'lof.txt'), hh_families,
                   hh_languages)

    # we also want to be able to lookup families by name
    fname_to_branches = defaultdict(list)
    for branch in hh_families:
        fname_to_branches[branch[-1]].append(branch)

    new_hid_to_pk = {}
    for code, (hnode, status, name) in hh_languages.items():
        if code not in hid_to_pk:
            # we have to insert a new H-language!
            max_languoid_pk += 1
            new_hid_to_pk[code] = max_languoid_pk

            if name in pk_to_name.values():
                args.log.warn('new code {1} for existing name {0}'.format(
                    name, code))
            changes.append(
                languoid(max_languoid_pk,
                         'language',
                         hid=code,
                         id=glottocode(unicode(name), DBSession,
                                       new_glottocodes),
                         name=name,
                         hname=name,
                         status=status))
            stats.update(['new_languages'])

    duplicate_leafset_to_branch = {}
    leafset_to_branch = {}
    for family, langs in hh_families.items():
        leafs = get_leafset(hid for hid in langs.keys() if hid in hid_to_pk)
        if not leafs:
            args.log.info('Family with only new languages: %s, %s' %
                          (family, langs))
            continue

        if leafs in leafset_to_branch:
            # so we have already seen this exact set of leaves.
            #
            # special case: there may be additional "Unclassified something" nodes in
            # branch without any changes in the set of leafs ...
            if not [n for n in family if n.startswith('Unclassified')]:
                # ... or the full leafset contains new languages
                assert [
                    hid for hid in hh_families[family[:-1]].keys()
                    if hid in new_hid_to_pk
                ]
            fset, rset = set(family), set(leafset_to_branch[leafs])
            assert rset.issubset(fset)
            assert leafs not in duplicate_leafset_to_branch
            duplicate_leafset_to_branch[leafs] = family
        else:
            leafset_to_branch[leafs] = family

    #
    # at this point leafset_to_branch is a consolidated mapping of sets of H-Languages
    # to branches in the new family tree.
    #

    # for set comparisons we compute a list of actual sets (not tuples) of leafs
    # ordered by length.
    leafsets = [
        set(t) for t in sorted(leafset_to_branch.keys(), key=lambda s: len(s))
    ]

    todo = []

    gl_family_to_leafset = {}

    def select_leafs(pk):
        l, tc = Languoid.__table__.alias('l'), TreeClosureTable.__table__.alias('tc')
        return [
            r['l_hid'] for r in DBSession.execute(
                select([l, tc], use_labels=True).where(
                    and_(l.c.pk == tc.c.child_pk, l.c.hid != None, l.c.status
                         != LanguoidStatus.provisional, tc.c.parent_pk == pk)))
        ]

    for row in gl_languoids:
        if row['ll_level'] == LanguoidLevel.family and row['l_active']:
            leafs = get_leafset(select_leafs(row['l_pk']))
            assert leafs
            glnode = GLNode(row['l_pk'], row['l_name'], row['ll_level'].name,
                            row['ll_father_pk'],
                            row['l_jsondata'].get('hname'))
            gl_family_to_leafset[glnode] = leafs

    # note: for legacy gl nodes, we map leaf-tuples to lists of matching nodes!
    leafset_to_gl_family = defaultdict(list)
    for node, leafs in gl_family_to_leafset.items():
        leafset_to_gl_family[leafs].append(node)

    # now we look for matches between old and new classification:
    for leafs, nodes in leafset_to_gl_family.items():
        todo.extend(
            match_nodes(args, leafs, nodes, leafset_to_branch,
                        duplicate_leafset_to_branch, leafsets,
                        fname_to_branches))

    # compile a mapping for exact matches:
    branch_to_pk = {}
    for m in todo:
        if m.hid:
            if m.hid in branch_to_pk:
                if branch_to_pk[m.hid] != m.pk:
                    # compare names:
                    if pk_to_name[m.pk] == m.hid[-1]:
                        args.log.info('#### type1')
                        branch_to_pk[m.hid] = m.pk
                    elif pk_to_name[branch_to_pk[m.hid]] == m.hid[-1]:
                        args.log.info('#### type2')
                    else:
                        raise ValueError
            else:
                branch_to_pk[m.hid] = m.pk

    for hnode in sorted(hh_families.keys(), key=lambda b: (len(b), b)):
        # loop through branches breadth first to determine what's to be inserted
        if hnode not in branch_to_pk:
            t = get_leafset(hh_families[hnode].keys())
            if t in leafset_to_gl_family:
                # the "Unclassified subfamily" special case from above:
                if not [n for n in hnode if n.startswith('Unclassified')]:
                    assert [
                        hid for hid in hh_families[hnode[:-1]].keys()
                        if hid in new_hid_to_pk
                    ]
                # make sure the existing glottolog family for the set of leafs is mapped
                # to some other node in the new classification:
                assert leafset_to_gl_family[t][0].pk in [
                    m.pk for m in todo if m.hid
                ]

            max_languoid_pk += 1
            branch_to_pk[hnode] = max_languoid_pk
            pk_to_name[max_languoid_pk] = hnode[-1]
            attrs = languoid(
                max_languoid_pk,
                'family',
                id=glottocode(unicode(hnode[-1]), DBSession, new_glottocodes),
                name=hnode[-1],
                hname=hnode[-1],
            )
            if len(hnode) > 1:
                attrs['father_pk'] = branch_to_pk[tuple(list(hnode)[:-1])]
                assert attrs['father_pk']
            stats.update(['new'])
            changes.append(attrs)

    # now on to the updates for families:
    for m in todo:
        attrs = languoid(m.pk, 'family', name=pk_to_name[m.pk])
        if m.hid:
            stats.update(['matches'])
            if len(m.hid) > 1:
                attrs['father_pk'] = branch_to_pk[tuple(list(m.hid)[:-1])]
            if getattr(m, 'rename', False):
                attrs['name'] = m.hid[-1]
            attrs['hname'] = m.hid[-1]
        else:
            attrs['active'] = False  # mark the languoid as obsolete.
            if getattr(m, 'pointer', False):
                print '~~', m.pk, pk_to_name[m.pk].encode('utf8'), '->', \
                    ', '.join(m.pointer).encode('utf8')
                stats.update(['migrations'])
                attrs['replacement'] = branch_to_pk[m.pointer]
            else:
                stats.update(['nomatches'])
        changes.append(attrs)

    args.log.info('%s' % stats)

    risolate_names = dict(zip(isolate_names.values(), isolate_names.keys()))
    rcollapsed_names = dict(
        zip(collapsed_names.values(), collapsed_names.keys()))

    # and updates of father_pks for languages:
    for l, (hnode, status, name) in hh_languages.items():
        id_ = hid_to_pk.get(l)
        if not id_:
            id_ = new_hid_to_pk.get(l)
            attrs = languoid(id_, 'language', status=status)
        else:
            attrs = languoid(id_, 'language', status=status)
            # In case of existing languoids, we don't change the active flag!
            del attrs['active']
        if id_ in pk_to_name and name != pk_to_name[id_]:
            if slug(pk_to_name[id_]) == slug(name):
                attrs['name'] = name
        if hnode:
            attrs['father_pk'] = branch_to_pk[hnode]
        # look for hnames!
        if l in risolate_names:
            attrs['hname'] = risolate_names[l]
        if l in rcollapsed_names:
            attrs['hname'] = rcollapsed_names[l]
        changes.append(attrs)

    for row in gl_languoids:
        hid = row['ll_hid']
        if hid and 'NOCODE' in hid and hid not in hh_languages:
            # languoids with Harald's private code that are no longer in use
            changes.append(
                languoid(row['l_pk'],
                         'language',
                         status='retired',
                         active=False,
                         father_pk=None))

    jsondump(changes,
             args.data_dir.joinpath('languoids', 'changes.json'),
             indent=4)
Example #36
File: util.py  Project: kublaj/glottolog3
def recreate_treeclosure():
    DBSession.execute('delete from treeclosuretable')
    SQL = TreeClosureTable.__table__.insert()
    ltable = Languoid.__table__

    # we compute the ancestry for each single languoid
    for lid, fid in DBSession.execute('select pk, father_pk from languoid').fetchall():
        depth = 0
        DBSession.execute(SQL, dict(child_pk=lid, parent_pk=lid, depth=depth))
        tlf = None

        # now follow up the line of ancestors
        while fid:
            tlf = fid
            depth += 1
            DBSession.execute(SQL, dict(child_pk=lid, parent_pk=fid, depth=depth))
            fid = DBSession.execute(
                sql.select([ltable.c.father_pk]).where(ltable.c.pk == fid)
            ).fetchone()[0]

        DBSession.execute(
            'UPDATE languoid SET family_pk = :tlf WHERE pk = :lid', locals())

    # we also pre-compute counts of descendants for each languoid:
    for level in ['language', 'dialect', 'family']:
        DBSession.execute("""\
UPDATE languoid SET child_%(level)s_count = (
    SELECT count(*)
    FROM treeclosuretable as t, languoid as l
    WHERE languoid.pk = t.parent_pk
    AND languoid.pk != t.child_pk AND t.child_pk = l.pk AND l.level = '%(level)s'
)""" % locals())

    DBSession.execute('COMMIT')
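The function above rebuilds treeclosuretable by walking each languoid's chain of father_pk links, inserting one row per (child, ancestor, depth) pair plus a depth-0 self row, and finally recording the top-level ancestor as family_pk. A minimal, database-free sketch of the same closure computation (hypothetical data and helper name, not part of glottolog3) looks like this:

from collections import OrderedDict

def tree_closure(parent):
    """Yield (child, ancestor, depth) rows for a {node: father-or-None} map."""
    for node in parent:
        depth = 0
        yield node, node, depth          # the depth-0 self row (child == ancestor)
        ancestor = node
        while parent.get(ancestor) is not None:
            ancestor = parent[ancestor]
            depth += 1
            yield node, ancestor, depth  # one row per ancestor on the path to the root

parent = OrderedDict([
    ('indo1319', None),          # a top-level family
    ('germ1287', 'indo1319'),    # ... with a daughter subfamily
    ('stan1293', 'germ1287'),    # ... and a node below that
])
rows = list(tree_closure(parent))
assert ('stan1293', 'indo1319', 2) in rows

The aggregate UPDATE statements above then derive child_language_count, child_dialect_count and child_family_count from exactly these rows.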
Example #37
def load(args):
    glottolog = args.repos
    fts.index('fts_index', models.Ref.fts, DBSession.bind)
    DBSession.execute("CREATE EXTENSION IF NOT EXISTS unaccent WITH SCHEMA public;")
    version = assert_release(glottolog.repos)
    dataset = common.Dataset(
        id='glottolog',
        name="{0} {1}".format(glottolog.publication.web.name, version),
        publisher_name=glottolog.publication.publisher.name,
        publisher_place=glottolog.publication.publisher.place,
        publisher_url=glottolog.publication.publisher.url,
        license=glottolog.publication.license.url,
        domain=purl.URL(glottolog.publication.web.url).domain(),
        contact=glottolog.publication.web.contact,
        jsondata={'license_icon': 'cc-by.png', 'license_name': glottolog.publication.license.name},
    )
    data = Data()

    for e in glottolog.editors.values():
        if e.current:
            ed = data.add(common.Contributor, e.id, id=e.id, name=e.name)
            common.Editor(dataset=dataset, contributor=ed, ord=int(e.ord))
    DBSession.add(dataset)

    contrib = data.add(common.Contribution, 'glottolog', id='glottolog', name='Glottolog')
    DBSession.add(common.ContributionContributor(
        contribution=contrib, contributor=data['Contributor']['hammarstroem']))

    #
    # Add Parameters:
    #
    add = functools.partial(add_parameter, data)
    add('fc', name='Family classification')
    add('sc', name='Subclassification')
    add('aes',
        args.repos.aes_status.values(),
        name=args.repos.aes_status.__defaults__['name'],
        pkw=dict(
            jsondata=dict(
                reference_id=args.repos.aes_status.__defaults__['reference_id'],
                sources=[attr.asdict(v) for v in args.repos.aes_sources.values()],
                scale=[attr.asdict(v) for v in args.repos.aes_status.values()])),
        dekw=lambda de: dict(name=de.name, number=de.ordinal, jsondata=dict(icon=de.icon)),
    )
    add('med',
        args.repos.med_types.values(),
        name='Most Extensive Description',
        dekw=lambda de: dict(
            name=de.name, description=de.description, number=de.rank, jsondata=dict(icon=de.icon)),
    )
    add('macroarea',
        args.repos.macroareas.values(),
        pkw=dict(
            description=args.repos.macroareas.__defaults__['description'],
            jsondata=dict(reference_id=args.repos.macroareas.__defaults__['reference_id'])),
        dekw=lambda de: dict(
            name=de.name,
            description=de.description,
            jsondata=dict(geojson=read_macroarea_geojson(args.repos, de.name, de.description)),
        ),
    )
    add('ltype',
        args.repos.language_types.values(),
        name='Language Type',
        dekw=lambda de: dict(name=de.category, description=de.description),
        delookup='category',
    )
    add('country',
        args.repos.countries,
        dekw=lambda de: dict(name=de.id, description=de.name),
    )

    legacy = jsonlib.load(gc2version(args))
    for gc, version in legacy.items():
        data.add(models.LegacyCode, gc, id=gc, version=version)

    #
    # Now load languoid data, keeping track of relations that can only be inserted later.
    #
    lgsources = defaultdict(list)
    # Note: We rely on languoids() yielding languoids in the "right" order, i.e. such that top-level
    # nodes will precede nested nodes. This order must be preserved using an `OrderedDict`:
    nodemap = OrderedDict([(l.id, l) for l in glottolog.languoids()])
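    # (presumably so that load_languoid can resolve a languoid's parent from nodes that
    # have already been inserted)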
    lgcodes = {k: v.id for k, v in args.repos.languoids_by_code(nodemap).items()}
    for lang in nodemap.values():
        for ref in lang.sources:
            lgsources['{0.provider}#{0.bibkey}'.format(ref)].append(lang.id)
        load_languoid(glottolog, data, lang, nodemap)

    for gc in glottolog.glottocodes:
        if gc not in data['Languoid'] and gc not in legacy:
            common.Config.add_replacement(gc, None, model=common.Language)

    for obj in jsonlib.load(glottolog.references_path('replacements.json')):
        common.Config.add_replacement(
            '{0}'.format(obj['id']),
            '{0}'.format(obj['replacement']) if obj['replacement'] else None,
            model=common.Source)

    DBSession.flush()

    for doctype in glottolog.hhtypes:
        data.add(
            models.Doctype, doctype.id, id=doctype.id,
            name=doctype.name,
            description=doctype.description,
            abbr=doctype.abbv,
            ord=doctype.rank)

    for bib in glottolog.bibfiles:
        data.add(
            models.Provider,
            bib.id,
            id=bib.id,
            name=bib.title,
            description=bib.description,
            abbr=bib.abbr,
            url=bib.url)
    DBSession.flush()

    s = time()
    for i, entry in enumerate(
            BibFile(glottolog.build_path('monster-utf8.bib'), api=glottolog).iterentries()):
        if i % 10000 == 0:
            args.log.info('{0}: {1:.3}'.format(i, time() - s))
            s = time()
        ref = load_ref(data, entry, lgcodes, lgsources)
        if 'macro_area' in entry.fields:
            mas = []
            for ma in split_text(entry.fields['macro_area'], separators=',;', strip=True):
                ma = 'North America' if ma == 'Middle America' else ma
                ma = glottolog.macroareas.get('Papunesia' if ma == 'Papua' else ma)
                mas.append(ma.name)
            ref.macroareas = ', '.join(mas)
Example #38
File: initdb.py  Project: clld/glottolog3
def load(args):
    glottolog = args.repos
    fts.index('fts_index', models.Ref.fts, DBSession.bind)
    DBSession.execute("CREATE EXTENSION IF NOT EXISTS unaccent WITH SCHEMA public;")
    version = assert_release(glottolog.repos)
    dataset = common.Dataset(
        id='glottolog',
        name="Glottolog {0}".format(version),
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="https://shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain='glottolog.org',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})

    data = Data()
    for i, (id_, name) in enumerate([
        ('hammarstroem', 'Harald Hammarström'),
        ('forkel', 'Robert Forkel'),
        ('haspelmath', 'Martin Haspelmath'),
    ]):
        ed = data.add(common.Contributor, id_, id=id_, name=name)
        common.Editor(dataset=dataset, contributor=ed, ord=i + 1)
    DBSession.add(dataset)

    clf = data.add(common.Contribution, 'clf', id='clf', name='Classification')
    DBSession.add(common.ContributionContributor(
        contribution=clf, contributor=data['Contributor']['hammarstroem']))

    for pid, pname in [
        ('fc', 'Family classification'),
        ('sc', 'Subclassification'),
        ('vitality', 'Degree of endangerment'),
    ]:
        data.add(common.Parameter, pid, id=pid, name=pname)

    legacy = jsonlib.load(gc2version(args))
    for gc, version in legacy.items():
        data.add(models.LegacyCode, gc, id=gc, version=version)

    for ma in Macroarea:
        data.add(
            models.Macroarea,
            ma.name,
            id=ma.name,
            name=ma.value,
            description=ma.description)

    for country in glottolog.countries:
        data.add(models.Country, country.id, id=country.id, name=country.name)

    lgcodes, mas, countries, lgsources = {}, {}, {}, defaultdict(list)
    languoids = list(glottolog.languoids())
    nodemap = {l.id: l for l in languoids}
    for lang in languoids:
        for ref in lang.sources:
            lgsources['{0.provider}#{0.bibkey}'.format(ref)].append(lang.id)
        load_languoid(data, lang, nodemap)
        mas[lang.id] = [ma.name for ma in lang.macroareas]
        countries[lang.id] = [c.id for c in lang.countries]
        lgcodes[lang.id] = lang.id
        if lang.hid:
            lgcodes[lang.hid] = lang.id
        if lang.iso:
            lgcodes[lang.iso] = lang.id

    for gc in glottolog.glottocodes:
        if gc not in data['Languoid'] and gc not in legacy:
            common.Config.add_replacement(gc, None, model=common.Language)

    for obj in jsonlib.load(glottolog.references_path('replacements.json')):
        common.Config.add_replacement(
            '{0}'.format(obj['id']),
            '{0}'.format(obj['replacement']) if obj['replacement'] else None,
            model=common.Source)

    DBSession.flush()
    for lid, maids in mas.items():
        for ma in maids:
            DBSession.add(models.Languoidmacroarea(
                languoid_pk=data['Languoid'][lid].pk,
                macroarea_pk=data['Macroarea'][ma].pk))

    for lid, cids in countries.items():
        for cid in cids:
            DBSession.add(models.Languoidcountry(
                languoid_pk=data['Languoid'][lid].pk,
                country_pk=data['Country'][cid].pk))

    for doctype in glottolog.hhtypes:
        data.add(
            models.Doctype, doctype.id, id=doctype.id,
            name=doctype.name,
            description=doctype.description,
            abbr=doctype.abbv,
            ord=doctype.rank)

    for bib in glottolog.bibfiles:
        data.add(
            models.Provider,
            bib.id,
            id=bib.id,
            name=bib.title,
            description=bib.description,
            abbr=bib.abbr,
            url=bib.url)
    DBSession.flush()

    s = time()
    for i, entry in enumerate(
            BibFile(glottolog.build_path('monster-utf8.bib')).iterentries()):
        if i % 10000 == 0:
            args.log.info('{0}: {1:.3}'.format(i, time() - s))
            s = time()
        ref = load_ref(data, entry, lgcodes, lgsources)
        if 'macro_area' in entry.fields:
            for ma in split_text(entry.fields['macro_area'], separators=',;', strip=True):
                ma = 'North America' if ma == 'Middle America' else ma
                ma = Macroarea.get('Papunesia' if ma == 'Papua' else ma)
                DBSession.add(models.Refmacroarea(
                    ref_pk=ref.pk, macroarea_pk=data['Macroarea'][ma.name].pk))
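For illustration only, here is a minimal, self-contained sketch of the macro_area normalization applied in the loop above; the sample field value is invented, and re.split stands in for the clldutils split_text call used in the real code.

import re

# Invented sample value of a bibtex 'macro_area' field, for illustration only.
macro_area_field = 'Middle America; Papua, Eurasia'

# Split on the same separators used above (',' and ';') and strip whitespace.
names = [s.strip() for s in re.split('[,;]', macro_area_field) if s.strip()]

# Apply the same renamings as in the loader: legacy labels are folded into
# the current macroarea names.
normalized = [
    'Papunesia' if n == 'Papua' else ('North America' if n == 'Middle America' else n)
    for n in names]

assert normalized == ['North America', 'Papunesia', 'Eurasia']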
Example #39
0
def prime_cache(args):
    """If data needs to be denormalized for lookup, do that here.
    This procedure should be separate from the db initialization, because
    it will have to be run periodically whenever data has been updated.
    """
    labels = {}
    for type_, cls in [('source', common.Source), ('unit', common.Unit)]:
        labels[type_] = defaultdict(dict)
        for r in DBSession.query(cls.id, cls.name):
            sid, _, lid = r[0].partition('-')
            labels[type_][sid][lid] = r[1]

    link_map = {'entry': 'unit', 'source': 'source'}

    for d in DBSession.query(Dictionary):
        if d.description:
            soup = BeautifulSoup(markdown(d.description), 'html.parser')
            for a in soup.find_all('a', href=True):
                if a['href'] in link_map:
                    type_ = link_map[a['href']]
                    if a.string in labels[type_][d.id]:
                        a['href'] = args.env['request'].route_url(
                            type_, id='{0}-{1}'.format(d.id, a.string))
                        a.string = labels[type_][d.id][a.string]
                        if type_ == 'unit':
                            a['class'] = 'lemma'
            d.description, d.toc = toc(soup)

    for meaning in DBSession.query(ComparisonMeaning).options(
        joinedload_all(common.Parameter.valuesets, common.ValueSet.values)
    ):
        meaning.representation = sum([len(vs.values) for vs in meaning.valuesets])
        if meaning.representation == 0:
            meaning.active = False

    def joined(iterable):
        return ' / '.join(sorted(nfilter(set(iterable))))

    q = DBSession.query(Word)\
        .order_by(Word.dictionary_pk, common.Unit.name, common.Unit.pk)\
        .options(joinedload(Word.meanings), joinedload(Word.dictionary))
    for _, words in groupby(q, lambda u: u.name):
        words = list(words)
        for i, word in enumerate(words):
            word.description = ' / '.join(m.name for m in word.meanings)
            word.comparison_meanings = joined(m.reverse for m in word.meanings)
            word.semantic_domain = joined(m.semantic_domain for m in word.meanings)
            word.number = i + 1 if len(words) > 1 else 0

            for suffix in ['1', '2']:
                alt_t, alt_l = [], []
                for m in word.meanings:
                    if getattr(m, 'alt_translation' + suffix):
                        alt_l.append(getattr(m, 'alt_translation_language' + suffix))
                        alt_t.append(getattr(m, 'alt_translation' + suffix))
                if alt_t and len(set(alt_l)) == 1:
                    DBSession.add(common.Unit_data(
                        object_pk=word.pk, key='lang-' + alt_l.pop(), value=join(alt_t)))

    def count_unit_media_files(contrib, mtype, cls=common.Unit_files):
        if cls == common.Unit_files:
            return DBSession.query(common.Unit_files)\
                .join(Word, common.Unit_files.object_pk == Word.pk)\
                .filter(Word.dictionary_pk == contrib.pk)\
                .filter(common.Unit_files.mime_type.ilike(mtype + '/%'))\
                .count() + \
                DBSession.query(Meaning_files)\
                .join(Meaning, Meaning_files.object_pk == Meaning.pk)\
                .join(Word, Meaning.word_pk == Word.pk)\
                .filter(Word.dictionary_pk == contrib.pk)\
                .filter(Meaning_files.mime_type.ilike(mtype + '/%'))\
                .count()
        return DBSession.query(common.Sentence_files)\
            .join(common.Sentence, common.Sentence_files.object_pk == common.Sentence.pk)\
            .filter(Example.dictionary_pk == contrib.pk)\
            .filter(common.Sentence_files.mime_type.ilike(mtype + '/%'))\
            .count()

    for d in DBSession.query(Dictionary).options(joinedload(Dictionary.words)):
        d.count_words = len(d.words)
        sds = set(chain(*[w.semantic_domain_list for w in d.words]))
        d.semantic_domains = join(sorted(sds))
        d.count_audio = count_unit_media_files(d, 'audio')
        d.count_example_audio = count_unit_media_files(d, 'audio', cls=common.Sentence_files)
        d.count_image = count_unit_media_files(d, 'image')

        word_pks = [w.pk for w in d.words]
        choices = {}
        for col in d.jsondata.get('custom_fields', []):
            values = [
                r[0] for r in DBSession.query(common.Unit_data.value)
                .filter(common.Unit_data.object_pk.in_(word_pks))
                .filter(common.Unit_data.key == col)
                .distinct()]
            if len(values) < 40:
                choices[col] = sorted(values)
        d.update_jsondata(choices=choices)

    DBSession.execute("""
    UPDATE word
      SET example_count = s.c 
      FROM (
        SELECT m.word_pk AS wpk, count(ms.sentence_pk) AS c
        FROM meaning AS m, meaningsentence AS ms
        WHERE m.pk = ms.meaning_pk
        GROUP BY m.word_pk
      ) AS s
      WHERE word.pk = s.wpk
    """)
Example #40
0
def prime_cache(cfg):
    """If data needs to be denormalized for lookup, do that here.
    This procedure should be separate from the db initialization, because
    it will have to be run periodically whenever data has been updated.
    """
    for meaning in DBSession.query(ComparisonMeaning).options(
        joinedload_all(common.Parameter.valuesets, common.ValueSet.values)
    ):
        meaning.representation = sum([len(vs.values) for vs in meaning.valuesets])
        if meaning.representation == 0:
            meaning.active = False

    q = DBSession.query(Word)\
        .order_by(Word.dictionary_pk, common.Unit.name, common.Unit.pk)\
        .options(joinedload(Word.meanings), joinedload(Word.dictionary))
    for _, words in groupby(q, lambda u: u.name):
        words = list(words)
        for i, word in enumerate(words):
            word.description = ' / '.join(m.name for m in word.meanings)
            word.comparison_meanings = ' / '.join(nfilter(m.reverse for m in word.meanings))
            word.semantic_domain = ' / '.join(nfilter(m.semantic_domain for m in word.meanings))
            word.number = i + 1 if len(words) > 1 else 0

            for suffix in ['1', '2']:
                alt_t, alt_l = [], []
                for m in word.meanings:
                    if getattr(m, 'alt_translation' + suffix):
                        alt_l.append(getattr(m, 'alt_translation_language' + suffix))
                        alt_t.append(getattr(m, 'alt_translation' + suffix))
                if alt_t and len(set(alt_l)) == 1:
                    DBSession.add(common.Unit_data(
                        object_pk=word.pk, key='lang-' + alt_l.pop(), value=join(alt_t)))

    def count_unit_media_files(contrib, mtype):
        return DBSession.query(common.Unit_files)\
            .join(Word, common.Unit_files.object_pk == Word.pk)\
            .filter(Word.dictionary_pk == contrib.pk)\
            .filter(common.Unit_files.mime_type.ilike(mtype + '/%'))\
            .count()

    for d in DBSession.query(Dictionary).options(joinedload(Dictionary.words)):
        d.count_words = len(d.words)
        sds = set(chain(*[w.semantic_domain_list for w in d.words]))
        d.semantic_domains = join(sorted(sds))
        d.count_audio = count_unit_media_files(d, 'audio')
        d.count_image = count_unit_media_files(d, 'image')

        word_pks = [w.pk for w in d.words]
        choices = {}
        for col in d.jsondata.get('custom_fields', []):
            values = [
                r[0] for r in DBSession.query(common.Unit_data.value)
                .filter(common.Unit_data.object_pk.in_(word_pks))
                .filter(common.Unit_data.key == col)
                .distinct()]
            if len(values) < 40:
                choices[col] = sorted(values)
        d.update_jsondata(choices=choices)

    DBSession.execute("""
    UPDATE word
      SET example_count = s.c 
      FROM (
        SELECT m.word_pk AS wpk, count(ms.sentence_pk) AS c
        FROM meaning AS m, meaningsentence AS ms
        WHERE m.pk = ms.meaning_pk
        GROUP BY m.word_pk
      ) AS s
      WHERE word.pk = s.wpk
    """)
Example #41
0
def main(args):
    fts.index('fts_index', Word.fts, DBSession.bind)
    DBSession.execute("CREATE EXTENSION IF NOT EXISTS unaccent WITH SCHEMA public;")

    if DBSession.bind.dialect.name == 'postgresql':
        Index('ducet', collkey(common.Unit.name)).create(DBSession.bind)
    data = Data()

    dataset = common.Dataset(
        id=dictionaria.__name__,
        name="Dictionaria",
        description="The Dictionary Journal",
        published=date(2017, 3, 30),
        contact='*****@*****.**',
        domain='dictionaria.clld.org',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="https://shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})

    for i, (id_, name) in enumerate([
        ('haspelmathmartin', 'Martin Haspelmath'),
        ('stiebelsbarbara', 'Barbara Stiebels')
    ]):
        ed = data.add(common.Contributor, id_, id=id_, name=name)
        common.Editor(dataset=dataset, contributor=ed, ord=i + 1)
    DBSession.add(dataset)

    for id_, name in LGR_ABBRS.items():
        DBSession.add(common.GlossAbbreviation(id=id_, name=name))

    comparison_meanings = {}

    print('loading concepts ...')

    glosses = set()
    concepticon = Concepticon(
        REPOS.joinpath('..', '..', 'concepticon', 'concepticon-data'))
    if not args.no_concepts:
        for conceptset in concepticon.conceptsets.values():
            if conceptset.gloss in glosses:
                continue
            glosses.add(conceptset.gloss)
            cm = data.add(
                ComparisonMeaning,
                conceptset.id,
                id=conceptset.id,
                name=conceptset.gloss.lower(),
                description=conceptset.definition,
                concepticon_url='http://concepticon.clld.org/parameters/%s' % conceptset.id)
            comparison_meanings[cm.id] = cm

    DBSession.flush()

    print('... done')

    comparison_meanings = {k: v.pk for k, v in comparison_meanings.items()}
    submissions = []

    for submission in REPOS.joinpath(
            'submissions-internal' if args.internal else 'submissions').glob('*'):
        if not submission.is_dir():
            continue

        try:
            submission = Submission(submission)
        except ValueError:
            continue

        md = submission.md
        if md is None:
            print('no md', submission.id)
            continue

        if not md['date_published']:
            print('no date', submission.id)
            continue

        id_ = submission.id
        if args.dict and args.dict != id_ and args.dict != 'all':
            print('not selected', submission.id)
            continue
        lmd = md['language']
        props = md.get('properties', {})
        props.setdefault('custom_fields', [])
        props['metalanguage_styles'] = {}
        for v, s in zip(props.get('metalanguages', {}).values(),
                        ['success', 'info', 'warning', 'important']):
            props['metalanguage_styles'][v] = s
        props['custom_fields'] = ['lang-' + f if f in props['metalanguage_styles'] else f
                                  for f in props['custom_fields']]
        props.setdefault('choices', {})

        language = data['Variety'].get(lmd['glottocode'])
        if not language:
            language = data.add(
                Variety, lmd['glottocode'], id=lmd['glottocode'], name=lmd['name'])

        md['date_published'] = md['date_published'] or date.today().isoformat()
        if '-' not in md['date_published']:
            md['date_published'] = md['date_published'] + '-01-01'
        dictionary = data.add(
            Dictionary,
            id_,
            id=id_,
            number=md.get('number'),
            name=props.get('title', lmd['name'] + ' dictionary'),
            description=submission.description,
            language=language,
            published=date(*map(int, md['date_published'].split('-'))),
            doi=md.get('doi'),
            jsondata=props)

        for i, spec in enumerate(md['authors']):
            if not isinstance(spec, dict):
                cname, address = spec, None
                spec = {}
            else:
                cname, address = spec['name'], spec.get('affiliation')
            name = HumanName(cname)
            cid = slug('%s%s' % (name.last, name.first))
            contrib = data['Contributor'].get(cid)
            if not contrib:
                contrib = data.add(
                    common.Contributor,
                    cid,
                    id=cid,
                    name=cname,
                    address=address,
                    url=spec.get('url'),
                    email=spec.get('email'))
            DBSession.add(common.ContributionContributor(
                ord=i + 1,
                primary=spec.get('primary', True),
                contributor=contrib,
                contribution=dictionary))

        submissions.append((dictionary.id, language.id, submission))
    transaction.commit()

    for did, lid, submission in submissions:
        transaction.begin()
        print('loading %s ...' % submission.id)
        dictdata = Data()
        lang = Variety.get(lid)
        submission.load_sources(Dictionary.get(did), dictdata)
        submission.load_examples(Dictionary.get(did), dictdata, lang)
        submission.dictionary.load(
            submission,
            dictdata,
            Dictionary.get(did),
            lang,
            comparison_meanings,
            OrderedDict(submission.md.get('properties', {}).get('labels', [])))
        transaction.commit()
        print('... done')

    transaction.begin()
    load_families(
        Data(),
        [v for v in DBSession.query(Variety) if re.match('[a-z]{4}[0-9]{4}', v.id)],
        glottolog_repos='../../glottolog/glottolog')
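For illustration only: a sketch of how the contributor ids above (e.g. 'haspelmathmartin') are derived from author names. HumanName comes from the nameparser package; the local slug helper is an assumption standing in for the project's own slug function.

import re
from nameparser import HumanName

def slug(s):
    # Stand-in (assumption) for the project's slug helper: lowercase, alphanumeric only.
    return re.sub('[^a-z0-9]', '', s.lower())

name = HumanName('Martin Haspelmath')
cid = slug('%s%s' % (name.last, name.first))
assert cid == 'haspelmathmartin'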
Example #42
0
File: initdb.py Project: clld/glottolog3
def prime(args):
    """If data needs to be denormalized for lookup, do that here.
    This procedure should be separate from the db initialization, because
    it will have to be run periodically whenever data has been updated.
    """
    recreate_treeclosure()

    for lpk, mas in DBSession.execute("""\
select
  l.pk, array_agg(distinct lma.macroarea_pk)
from
  language as l,
  treeclosuretable as t,
  languoidmacroarea as lma,
  macroarea as ma
where
  l.pk = t.parent_pk and
  t.child_pk = lma.languoid_pk and
  lma.macroarea_pk = ma.pk and
  l.pk not in (select languoid_pk from languoidmacroarea)
group by l.pk"""):
        for mapk in mas:
            DBSession.add(models.Languoidmacroarea(languoid_pk=lpk, macroarea_pk=mapk))

    for row in list(DBSession.execute(
        "select pk, pages, pages_int, startpage_int from source where pages_int < 0"
    )):
        pk, pages, number, start = row
        _start, _end, _number = compute_pages(pages)
        if _number > 0 and _number != number:
            DBSession.execute(
                "update source set pages_int = %s, startpage_int = %s where pk = %s" %
                (_number, _start, pk))
            DBSession.execute(
                "update ref set endpage_int = %s where pk = %s" %
                (_end, pk))

    version = assert_release(args.repos.repos)
    with jsonlib.update(gc2version(args), indent=4) as legacy:
        for lang in DBSession.query(common.Language):
            if lang.id not in legacy:
                lang.update_jsondata(new=True)
                legacy[lang.id] = version

    def items(s):
        if not s:
            return set()
        r = []
        for ss in set(s.strip().split()):
            if '**:' in ss:
                ss = ss.split('**:')[0] + '**'
            if ss.endswith(','):
                ss = ss[:-1].strip()
            r.append(ss)
        return set(r)

    refs = {
        r[0]: r[1]
        for r in DBSession.query(models.Refprovider.id, models.Refprovider.ref_pk)}
    valuesets = {
        r[0]: r[1] for r in DBSession.query(common.ValueSet.id, common.ValueSet.pk)}

    for lang in args.repos.languoids():
        if lang.category == models.BOOKKEEPING:
            continue
        clf = lang.classification_comment
        if clf:
            if clf.subrefs:
                if items(lang.cfg['classification']['subrefs']) != \
                        items(lang.cfg['classification'].get('sub')):
                    vspk = valuesets['sc-{0}'.format(lang.id)]
                    for ref in clf.subrefs:
                        spk = refs.get(ref.key)
                        DBSession.add(
                            common.ValueSetReference(source_pk=spk, valueset_pk=vspk))
            if clf.familyrefs:
                if items(lang.cfg['classification']['familyrefs']) != \
                        items(lang.cfg['classification'].get('family')):
                    vspk = valuesets['fc-{0}'.format(lang.id)]
                    for ref in clf.familyrefs:
                        spk = refs.get(ref.key)
                        if spk:
                            DBSession.add(
                                common.ValueSetReference(source_pk=spk, valueset_pk=vspk))
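For illustration only: the real compute_pages helper is defined elsewhere in the codebase; the sketch below is an assumed approximation of its contract, returning (start, end, number of pages) for a simple 'start-end' page string.

import re

def compute_pages_sketch(pages):
    """Assumed approximation of compute_pages: return (start, end, number)
    for a simple 'start-end' page string, (None, None, None) otherwise."""
    m = re.match(r'\s*(\d+)\s*-+\s*(\d+)\s*$', pages or '')
    if not m:
        return None, None, None
    start, end = int(m.group(1)), int(m.group(2))
    return start, end, end - start + 1

assert compute_pages_sketch('33-46') == (33, 46, 14)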
Example #43
0
def main(args):  # pragma: no cover
    bib = Database.from_file(args.data_file(args.version, 'refs.bib'), encoding='utf8')
    count = 0
    skipped = 0
    changes = {}

    with transaction.manager:
        update_providers(args)
        DBSession.flush()
        provider_map = get_map(Provider)
        macroarea_map = get_map(Macroarea)
        doctype_map = get_map(Doctype)

        known_ids = set(r[0] for r in DBSession.query(Ref.pk))

        languoid_map = {}
        for l in DBSession.query(Languoid):
            if l.hid:
                languoid_map[l.hid] = l
            languoid_map[l.id] = l

        for i, rec in enumerate(bib):
            if i and i % 1000 == 0:
                print i, 'records done', count, 'changed'

            if len(rec.keys()) < 6:
                # not enough information!
                skipped += 1
                continue

            changed = False
            assert rec.get('glottolog_ref_id')
            id_ = int(rec.get('glottolog_ref_id'))
            if args.mode != 'update' and id_ in known_ids:
                continue
            ref = DBSession.query(Source).get(id_)
            update = True if ref else False

            kw = {
                'pk': id_,
                'bibtex_type': rec.genre,
                'id': str(id_),
                'jsondata': {'bibtexkey': rec.id},
            }

            for source, target in FIELD_MAP.items():
                if target is None:
                    continue
                value = rec.get(source)
                if value:
                    value = unescape(value)
                    if target:
                        kw[target] = CONVERTER.get(source, lambda x: x)(value)
                    else:
                        kw['jsondata'][source] = value

            if kw['jsondata'].get('hhtype'):
                trigger = ca_trigger(kw['jsondata']['hhtype'])
                if trigger:
                    kw['ca_doctype_trigger'], kw['jsondata']['hhtype'] = trigger

            # try to extract numeric year, startpage, endpage, numberofpages, ...
            if kw.get('year'):
                # prefer years in brackets over the first 4-digit number.
                match = PREF_YEAR_PATTERN.search(kw.get('year'))
                if match:
                    kw['year_int'] = int(match.group('year'))
                else:
                    match = YEAR_PATTERN.search(kw.get('year'))
                    if match:
                        kw['year_int'] = int(match.group('year'))

            if kw.get('publisher'):
                p = kw.get('publisher')
                if ':' in p:
                    address, publisher = [s.strip() for s in kw['publisher'].split(':', 1)]
                    if not 'address' in kw or kw['address'] == address:
                        kw['address'], kw['publisher'] = address, publisher

            if rec.get('numberofpages'):
                try:
                    kw['pages_int'] = int(rec.get('numberofpages').strip())
                except ValueError:
                    pass

            if kw.get('pages'):
                start, end, number = compute_pages(kw['pages'])
                if start is not None:
                    kw['startpage_int'] = start
                if end is not None:
                    kw['endpage_int'] = end
                if number is not None and 'pages_int' not in kw:
                    kw['pages_int'] = number

            if update:
                for k in kw.keys():
                    if k == 'pk':
                        continue
                    v = getattr(ref, k)
                    if kw[k] != v:
                        if k == 'jsondata':
                            d = ref.jsondata or {}
                            d.update(**kw[k])
                            for s, t in FIELD_MAP.items():
                                if t is None and s in d:
                                    del d[s]
                            ref.jsondata = d
                        else:
                            print k, '--', v
                            print k, '++', kw[k]
                            setattr(ref, k, kw[k])
                            changed = True
                            if ref.id in changes:
                                changes[ref.id][k] = ('%s' % v, '%s' % kw[k])
                            else:
                                changes[ref.id] = {k: ('%s' % v, '%s' % kw[k])}
            else:
                changed = True
                ref = Ref(name='%s %s' % (kw.get('author', 'na'), kw.get('year', 'nd')), **kw)

            ref.description = ref.title or ref.booktitle
            ref.name = '%s %s' % (ref.author or 'n.a.', ref.year or 'n.d.')

            def append(attr, obj):
                if obj and obj not in attr:
                    attr.append(obj)
                    return True

            a, r = update_relationship(
                ref.macroareas,
                [macroarea_map[name] for name in
                 set(filter(None, [s.strip() for s in kw['jsondata'].get('macro_area', '').split(',')]))])
            changed = changed or a or r

            for name in set(filter(None, [s.strip() for s in kw['jsondata'].get('src', '').split(',')])):
                result = append(ref.providers, provider_map[slug(name)])
                changed = changed or result

            a, r = update_relationship(
                ref.doctypes,
                [doctype_map[m.group('name')] for m in
                 DOCTYPE_PATTERN.finditer(kw['jsondata'].get('hhtype', ''))])
            changed = changed or a or r

            if not update:
                DBSession.add(ref)

            if changed:
                count += 1
                ref.doctypes_str = ', '.join(o.id for o in ref.doctypes)
                ref.providers_str = ', '.join(o.id for o in ref.providers)

        print count, 'records updated or imported'
        print skipped, 'records skipped because of lack of information'

    DBSession.execute("update source set description = title where description is null and title is not null;")
    DBSession.execute("update source set description = booktitle where description is null and booktitle is not null;")

    for row in list(DBSession.execute(
            "select pk, pages, pages_int, startpage_int from source where pages_int < 0")):
        pk, pages, number, start = row
        _start, _end, _number = compute_pages(pages)
        if _number > 0 and _number != number:
            DBSession.execute(
                "update source set pages_int = %s, startpage_int = %s where pk = %s" %
                (_number, _start, pk))
            DBSession.execute(
                "update ref set endpage_int = %s where pk = %s" %
                (_end, pk))

    return changes
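For illustration only: a self-contained approximation of the year extraction above. The two regexes are assumptions standing in for PREF_YEAR_PATTERN and YEAR_PATTERN, which are defined elsewhere in the module.

import re

# Assumed stand-ins for PREF_YEAR_PATTERN / YEAR_PATTERN defined elsewhere:
# prefer a 4-digit year in [brackets], else take the first 4-digit number.
PREF_YEAR = re.compile(r'\[(?P<year>(1|2)\d{3})\]')
ANY_YEAR = re.compile(r'(?P<year>(1|2)\d{3})')

def year_int(year_field):
    match = PREF_YEAR.search(year_field) or ANY_YEAR.search(year_field)
    return int(match.group('year')) if match else None

assert year_int('1998 [2005]') == 2005
assert year_int('ca. 1998') == 1998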
Example #44
0
def prime_cache(args):
    DBSession.execute('delete from treeclosuretable')
    SQL = models2.TreeClosureTable.__table__.insert()
    ltable = models2.Languoid.__table__

    # we compute the ancestry for each single languoid
    for lid, fid in DBSession.execute(
            'select pk, father_pk from languoid').fetchall():
        depth = 0
        DBSession.execute(SQL, dict(child_pk=lid, parent_pk=lid, depth=depth))

        # now follow up the line of ancestors
        while fid:
            depth += 1
            DBSession.execute(SQL,
                              dict(child_pk=lid, parent_pk=fid, depth=depth))
            fid = DBSession.execute(
                sql.select([ltable.c.father_pk
                            ]).where(ltable.c.pk == fid)).fetchone()[0]

    # we also pre-compute counts of descendants for each languoid:
    for level in ['language', 'dialect', 'family']:
        DBSession.execute("""\
UPDATE languoid SET child_%(level)s_count = (
    SELECT count(*)
    FROM treeclosuretable as t, languoid as l
    WHERE languoid.pk = t.parent_pk
    AND languoid.pk != t.child_pk AND t.child_pk = l.pk AND l.level = '%(level)s'
)""" % locals())

    DBSession.execute('COMMIT')
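For illustration only: a pure-Python sketch of the closure-table construction above, run over an invented parent-pointer mapping instead of the languoid table.

# Invented parent-pointer tree: pk -> father_pk (None for roots).
father = {1: None, 2: 1, 3: 2}

closure = []  # rows of (child_pk, parent_pk, depth), as inserted above
for lid, fid in father.items():
    depth = 0
    closure.append((lid, lid, depth))  # every node is its own ancestor at depth 0
    # follow the line of ancestors up to the root
    while fid:
        depth += 1
        closure.append((lid, fid, depth))
        fid = father[fid]

# Node 3 has ancestors 3 (depth 0), 2 (depth 1) and 1 (depth 2).
assert sorted(r for r in closure if r[0] == 3) == [(3, 1, 2), (3, 2, 1), (3, 3, 0)]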
Example #45
0
 def handler(offset, batch):
     _values = [value(start + offset + i + 1, row) for i, row in enumerate(batch)]
     DBSession.execute(model.__table__.insert(), _values)
     values.extend(_values)
Example #46
0
def create(args):
    args.log.info('starting migration ...')
    data = Data()
    db = create_engine('postgresql://robert@/glottolog2')

    with transaction.manager:
        sn = data.add(common.Contributor, 'sn', id='sn', name='Sebastian Nordhoff')
        hh = data.add(common.Contributor, 'hh', id='hh', name='Harald Hammarström')
        rf = data.add(common.Contributor, 'rf', id='rf', name='Robert Forkel', url="https://github.com/xrotwang")
        mh = data.add(common.Contributor, 'mh', id='mh', name='Martin Haspelmath')
        contrib = data.add(common.Contribution, 'c', id='classification', name='Classification')
        data.add(common.ContributionContributor, 'hh', contribution=contrib, contributor=hh)
        params = dict(
            fc=data.add(common.Parameter, 'fc', id='fc', name='Family classification'),
            sc=data.add(common.Parameter, 'sc', id='sc', name='Subclassification'),
        )

        dataset = data.add(
            common.Dataset, 'd',
            id='glottolog',
            name='Glottolog 2.0',
            description='',
            published=datetime.date(2013, 8, 15),
            domain='glottolog.org',
            contact='*****@*****.**',
            jsondata={
                'license_icon': 'http://i.creativecommons.org/l/by-sa/3.0/88x31.png',
                'license_name':
                    'Creative Commons Attribution-ShareAlike 3.0 Unported License'})
        for i, ed in enumerate([sn, hh, rf, mh]):
            DBSession.add(common.Editor(dataset=dataset, contributor=ed, ord=i + 1))

        valuesets = {}

        def create_languoid(row, father_pk=None):
            glottocode = {'akun1242': 'akun1241'}.get(row['alnumcode'], row['alnumcode'])
            attrs = dict(
                pk=row['id'],
                id=glottocode,
                name=row['primaryname'],
                description=row['globalclassificationcomment'],
                level=getattr(models2.LanguoidLevel, row['level']),
                status=getattr(models2.LanguoidStatus, (row['status'] or '').replace(' ', '_'), None),
                father_pk=father_pk,
                created=row['updated'],
                jsondata={} if not row['hname'] else {'hname': row['hname']},
            )
            for attr in [
                'active',
                'updated',
                'hid',
                'latitude',
                'longitude',
            ]:
                attrs[attr] = row[attr]
            l = data.add(models2.Languoid, row['id'], **attrs)
            for type_ in params:
                id_ = '%s%s' % (type_, row['id'])
                vs = data.add(
                    common.ValueSet, id_,
                    id=id_,
                    description=row['classificationcomment'] if type_ == 'fc' else row['subclassificationcomment'],
                    language=l,
                    parameter=params[type_],
                    contribution=contrib)
                data.add(common.Value, id_, id=id_, name='%s - %s' % (row['level'], row['status']), valueset=vs)
                DBSession.flush()
                valuesets[id_] = vs.pk
            return str(row['id'])

        level = 0
        parents = [create_languoid(row) for row
                   in db.execute('select * from languoidbase where father_id is null')]
        while parents:
            args.log.info('level: %s' % level)
            level += 1
            parents = [
                create_languoid(row, father_pk=data['Languoid'][row['father_id']].pk)
                for row in db.execute(
                    'select * from languoidbase where father_id in (%s)'
                    % ','.join(parents))]

    def handler(offset, batch):
        svalues = []
        rvalues = []
        for row in batch:
            jsondata = json.loads(row['jsondata'] or "{}")
            jsondata['bibtexkey'] = row['bibtexkey']
            dicts = {
                's': dict(
                    pk=row['id'],
                    polymorphic_type='base',
                    id=str(row['id']),
                    name='%(author)s %(year)s' % row,
                    description=row['title'],
                    bibtex_type=getattr(EntryType, row['type']),
                    jsondata=jsondata),
                'r': dict(pk=row['id']),
            }
            for model, map_ in {
                's': {
                    'author': None,
                    'yearstring': 'year',
                    'year': 'year_int',
                    'startpage': 'startpage_int',
                    'numberofpages': 'pages_int',
                    'pages': None,
                    'edition': None,
                    'school': None,
                    'address': None,
                    'url': None,
                    'note': None,
                    'number': None,
                    'series': None,
                    'editor': None,
                    'booktitle': None,
                    'journal': None,
                    'volume': None,
                    'publisher': None,
                },
                'r': {
                    'endpage': 'endpage_int',
                    'inlg': None,
                    'inlg_code': None,
                    'subject': None,
                    'subject_headings': None,
                    'keywords': None,
                    'normalizedauthorstring': None,
                    'normalizededitorstring': None,
                    'ozbib_id': None,
                }
            }.items():
                for okey, nkey in map_.items():
                    dicts[model][nkey or okey] = row[okey]
            svalues.append(dicts['s'])
            rvalues.append(dicts['r'])
        DBSession.execute(common.Source.__table__.insert(), svalues)
        DBSession.execute(models2.Ref.__table__.insert(), rvalues)

    select(db, 'select * from refbase order by id', handler)
    DBSession.execute('COMMIT')

    for table, model, value, order in [
        ('macroarea', models2.Macroarea, lambda i, row: dict(
            pk=row['id'],
            id=slug(row['name']),
            name=row['name'],
            description=row['description']),
         None),
        ('country', models2.Country, lambda i, row: dict(
            pk=row['id'], id=row['alpha2'], name=row['name']),
         None),
        ('provider', models2.Provider, lambda i, row: dict(
            pk=row['id'],
            id=slug(row['name']),
            name=row['description'],
            description=row['comment'],
            abbr=row['abbr'],
            url=row['url'],
            refurl=row['refurl'],
            bibfield=row['bibfield']),
         None),
        ('doctype', models2.Doctype, lambda i, row: dict(
            pk=row['id'],
            id=slug(row['name']),
            abbr=row['abbr'],
            name=row['name'],
            description=row['description']),
         None),
        ('refprovider', models2.Refprovider, lambda i, row: dict(
            pk=i, provider_pk=row['provider_id'], ref_pk=row['refbase_id']),
         ('provider_id', 'refbase_id')),
        ('refdoctype', models2.Refdoctype, lambda i, row: dict(
            pk=i, doctype_pk=row['doctype_id'], ref_pk=row['refbase_id']),
         ('doctype_id', 'refbase_id')),
    ]:
        insert(db, table, model, value, order=order)

    names = dict(
        (int(d['id']), d['pk']) for d in
        insert(
            db,
            'namebase',
            common.Identifier,
            lambda i, row: dict(
                pk=i,
                id=str(row['id']),
                name=row['namestring'],
                type='name',
                description=row['nameprovider'],
                lang=row['inlg'] if row['inlg'] and len(row['inlg']) <= 3 else 'en'),
            order='id'))

    codes = dict(
        (int(d['id']), d['pk']) for d in
        insert(
            db,
            'codebase',
            common.Identifier, lambda i, row: dict(
                pk=i,
                id=str(row['id']),
                name=row['codestring'],
                type=common.IdentifierType.iso.value if row['codeprovider'] == 'ISO' else row['codeprovider']),
            start=len(names),
            order='id'))

    res = insert(
        db,
        'nodecodes',
        common.LanguageIdentifier,
        lambda i, row: dict(
            pk=i, language_pk=row['languoidbase_id'], identifier_pk=codes[row['codebase_id']]))

    insert(
        db,
        'nodenames',
        common.LanguageIdentifier,
        lambda i, row: dict(
            pk=i, language_pk=row['languoidbase_id'], identifier_pk=names[row['namebase_id']]),
        start=len(res))

    for table, model, value in [
        (
            'languoidmacroarea',
            models2.Languoidmacroarea,
            lambda i, row: dict(
                pk=i, languoid_pk=row['languoidbase_id'], macroarea_pk=row['macroarea_id'])),
        (
            'languoidcountry',
            models2.Languoidcountry,
            lambda i, row: dict(
                pk=i, languoid_pk=row['languoidbase_id'], country_pk=row['country_id'])),
        (
            'noderefs',
            common.LanguageSource,
            lambda i, row: dict(
                pk=i, language_pk=row['languoidbase_id'], source_pk=row['refbase_id'])),
        (
            'refmacroarea',
            models2.Refmacroarea,
            lambda i, row: dict(
                pk=i, macroarea_pk=row['macroarea_id'], ref_pk=row['refbase_id'])),
        (
            'refcountry',
            models2.Refcountry,
            lambda i, row: dict(
                pk=i, country_pk=row['country_id'], ref_pk=row['refbase_id'])),
        (
            'spuriousreplacements',
            models2.Superseded,
            lambda i, row: dict(
                pk=i,
                languoid_pk=row['languoidbase_id'],
                replacement_pk=row['replacement_id'],
                description=row['relation'])),

        (
            'justification',
            common.ValueSetReference,
            lambda i, row: dict(
                pk=i,
                valueset_pk=valuesets['%s%s' % (
                    'fc' if row['type'] == 'family' else 'sc', row['languoidbase_id'])],
                source_pk=row['refbase_id'],
                description=row['pages'])),
    ]:
        insert(db, table, model, value)
Example #47
0
def main(args):
    stats = Counter(new=0, matches=0, migrations=0, nomatches=0)
    l, ll = Language.__table__.alias("l"), Languoid.__table__.alias("ll")
    gl_languoids = list(DBSession.execute(select([l, ll], use_labels=True).where(l.c.pk == ll.c.pk)).fetchall())

    # we collect a list of changes which we will store in a JSON file.
    changes = []

    hid_to_pk = {row["ll_hid"]: row["l_pk"] for row in gl_languoids if row["ll_hid"]}
    max_languoid_pk = max(*[row["l_pk"] for row in gl_languoids])
    new_glottocodes = {}
    pk_to_name = {row["l_pk"]: row["l_name"] for row in gl_languoids}

    # dict mapping branches (i.e. tuples of sub-family names) to dicts of H-languages
    hh_families = OrderedDict()

    # dict mapping identifiers (i.e. hid) of H-languages to branches
    hh_languages = OrderedDict()

    parse_families(args.data_dir.joinpath("languoids", "lff.txt"), hh_families, hh_languages)

    # handle isolates / collapse families with exactly one leaf:
    isolate_names = {}
    collapsed_names = {}
    for key in hh_families.keys():
        if len(hh_families[key]) == 1:
            if len(key) == 1:
                # isolate
                hh_languages[hh_families[key].keys()[0]][0] = None
                isolate_names[key[0]] = hh_families[key].keys()[0]  # map name to code
            else:
                hh_languages[hh_families[key].keys()[0]][0] = key[:-1]
                collapsed_names[key[-1]] = hh_families[key].keys()[0]
            del hh_families[key]

    # now add the unclassifiabble, unattested, un-whatever
    parse_families(args.data_dir.joinpath("languoids", "lof.txt"), hh_families, hh_languages)

    # we also want to be able to lookup families by name
    fname_to_branches = defaultdict(list)
    for branch in hh_families:
        fname_to_branches[branch[-1]].append(branch)

    new_hid_to_pk = {}
    for code, (hnode, status, name) in hh_languages.items():
        if code not in hid_to_pk:
            # we have to insert a new H-language!
            max_languoid_pk += 1
            new_hid_to_pk[code] = max_languoid_pk

            if name in pk_to_name.values():
                args.log.warn("new code {1} for existing name {0}".format(name, code))
            changes.append(
                languoid(
                    max_languoid_pk,
                    "language",
                    hid=code,
                    id=glottocode(unicode(name), DBSession, new_glottocodes),
                    name=name,
                    hname=name,
                    status=status,
                )
            )
            stats.update(["new_languages"])

    duplicate_leafset_to_branch = {}
    leafset_to_branch = {}
    for family, langs in hh_families.items():
        leafs = get_leafset(hid for hid in langs.keys() if hid in hid_to_pk)
        if not leafs:
            args.log.info("Family with only new languages: %s, %s" % (family, langs))
            continue

        if leafs in leafset_to_branch:
            # so we have already seen this exact set of leaves.
            #
            # special case: there may be additional "Unclassified something" nodes in
            # branch without any changes in the set of leafs ...
            if not [n for n in family if n.startswith("Unclassified")]:
                # ... or the full leafset contains new languages
                assert [hid for hid in hh_families[family[:-1]].keys() if hid in new_hid_to_pk]
            fset, rset = set(family), set(leafset_to_branch[leafs])
            assert rset.issubset(fset)
            assert leafs not in duplicate_leafset_to_branch
            duplicate_leafset_to_branch[leafs] = family
        else:
            leafset_to_branch[leafs] = family

    #
    # at this point leafset_to_branch is a consolidated mapping of sets of H-Languages
    # to branches in the new family tree.
    #

    # for set comparisons we compute a list of actual sets (not tuples) of leafs
    # ordered by length.
    leafsets = [set(t) for t in sorted(leafset_to_branch.keys(), key=lambda s: len(s))]

    todo = []

    gl_family_to_leafset = {}

    def select_leafs(pk):
        l, tc = Languoid.__table__.alias("l"), TreeClosureTable.__table__.alias("tc")
        return [
            r["l_hid"]
            for r in DBSession.execute(
                select([l, tc], use_labels=True).where(
                    and_(
                        l.c.pk == tc.c.child_pk,
                        l.c.hid != None,
                        l.c.status != LanguoidStatus.provisional,
                        tc.c.parent_pk == pk,
                    )
                )
            )
        ]

    for row in gl_languoids:
        if row["ll_level"] == LanguoidLevel.family and row["l_active"]:
            leafs = get_leafset(select_leafs(row["l_pk"]))
            assert leafs
            glnode = GLNode(
                row["l_pk"], row["l_name"], row["ll_level"].name, row["ll_father_pk"], row["l_jsondata"].get("hname")
            )
            gl_family_to_leafset[glnode] = leafs

    # note: for legacy gl nodes, we map leaf-tuples to lists of matching nodes!
    leafset_to_gl_family = defaultdict(list)
    for node, leafs in gl_family_to_leafset.items():
        leafset_to_gl_family[leafs].append(node)

    # now we look for matches between old and new classification:
    for leafs, nodes in leafset_to_gl_family.items():
        todo.extend(
            match_nodes(args, leafs, nodes, leafset_to_branch, duplicate_leafset_to_branch, leafsets, fname_to_branches)
        )

    # compile a mapping for exact matches:
    branch_to_pk = {}
    for m in todo:
        if m.hid:
            if m.hid in branch_to_pk:
                if branch_to_pk[m.hid] != m.pk:
                    # compare names:
                    if pk_to_name[m.pk] == m.hid[-1]:
                        args.log.info("#### type1")
                        branch_to_pk[m.hid] = m.pk
                    elif pk_to_name[branch_to_pk[m.hid]] == m.hid[-1]:
                        args.log.info("#### type2")
                    else:
                        raise ValueError
            else:
                branch_to_pk[m.hid] = m.pk

    for hnode in sorted(hh_families.keys(), key=lambda b: (len(b), b)):
        # loop through branches breadth first to determine what's to be inserted
        if hnode not in branch_to_pk:
            t = get_leafset(hh_families[hnode].keys())
            if t in leafset_to_gl_family:
                # the "Unclassified subfamily" special case from above:
                if not [n for n in hnode if n.startswith("Unclassified")]:
                    assert [hid for hid in hh_families[hnode[:-1]].keys() if hid in new_hid_to_pk]
                # make sure, the existing glottolog family for the set of leafs is mapped
                # to some other node in the new classification:
                assert leafset_to_gl_family[t][0].pk in [m.pk for m in todo if m.hid]

            max_languoid_pk += 1
            branch_to_pk[hnode] = max_languoid_pk
            pk_to_name[max_languoid_pk] = hnode[-1]
            attrs = languoid(
                max_languoid_pk,
                "family",
                id=glottocode(unicode(hnode[-1]), DBSession, new_glottocodes),
                name=hnode[-1],
                hname=hnode[-1],
            )
            if len(hnode) > 1:
                attrs["father_pk"] = branch_to_pk[tuple(list(hnode)[:-1])]
                assert attrs["father_pk"]
            stats.update(["new"])
            changes.append(attrs)

    # now on to the updates for families:
    for m in todo:
        attrs = languoid(m.pk, "family", name=pk_to_name[m.pk])
        if m.hid:
            stats.update(["matches"])
            if len(m.hid) > 1:
                attrs["father_pk"] = branch_to_pk[tuple(list(m.hid)[:-1])]
            if getattr(m, "rename", False):
                attrs["name"] = m.hid[-1]
            attrs["hname"] = m.hid[-1]
        else:
            attrs["active"] = False  # mark the languoid as obsolete.
            if getattr(m, "pointer", False):
                print "~~", m.pk, pk_to_name[m.pk].encode("utf8"), "->", ", ".join(m.pointer).encode("utf8")
                stats.update(["migrations"])
                attrs["replacement"] = branch_to_pk[m.pointer]
            else:
                stats.update(["nomatches"])
        changes.append(attrs)

    args.log.info("%s" % stats)

    risolate_names = dict(zip(isolate_names.values(), isolate_names.keys()))
    rcollapsed_names = dict(zip(collapsed_names.values(), collapsed_names.keys()))

    # and updates of father_pks for languages:
    for l, (hnode, status, name) in hh_languages.items():
        id_ = hid_to_pk.get(l)
        if not id_:
            id_ = new_hid_to_pk.get(l)
            attrs = languoid(id_, "language", status=status)
        else:
            attrs = languoid(id_, "language", status=status)
            # In case of existing languoids, we don't change the active flag!
            del attrs["active"]
        if id_ in pk_to_name and name != pk_to_name[id_]:
            if slug(pk_to_name[id_]) == slug(name):
                attrs["name"] = name
        if hnode:
            attrs["father_pk"] = branch_to_pk[hnode]
        # look for hnames!
        if l in risolate_names:
            attrs["hname"] = risolate_names[l]
        if l in rcollapsed_names:
            attrs["hname"] = rcollapsed_names[l]
        changes.append(attrs)

    for row in gl_languoids:
        hid = row["ll_hid"]
        if hid and "NOCODE" in hid and hid not in hh_languages:
            # languoids with Harald's private code that are no longer in use
            changes.append(languoid(row["l_pk"], "language", status="retired", active=False, father_pk=None))

    jsondump(changes, args.data_dir.joinpath("languoids", "changes.json"), indent=4)
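For illustration only: a sketch of the leaf-set matching idea used above, where families in the old and new classification are matched when their sets of leaf language codes coincide; the codes and family names here are invented.

# Invented leaf sets: old (glottolog) families and new (lff.txt) branches.
old_families = {'Family A': frozenset({'aaa', 'bbb'}), 'Family B': frozenset({'ccc'})}
new_branches = {('Family A2',): frozenset({'aaa', 'bbb'}), ('X', 'Y'): frozenset({'ddd'})}

# Index new branches by their leaf set, then look up each old family.
leafset_to_branch = {leafs: branch for branch, leafs in new_branches.items()}
matches = {
    name: leafset_to_branch.get(leafs)
    for name, leafs in old_families.items()}

# 'Family A' is matched to the renamed branch; 'Family B' has no counterpart.
assert matches == {'Family A': ('Family A2',), 'Family B': None}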
Example #48
0
File: iso.py Project: kublaj/glottolog3
def update(args):
    author = 'ISO 639-3 Registration Authority'
    pid = 'iso6393'
    dtid = 'overview'
    dt = Doctype.get(dtid)
    provider = Provider.get(pid, default=None)
    if provider is None:
        provider = Provider(
            id=pid,
            abbr=pid,
            name=author,
            description="Change requests submitted to the ISO 639-3 registration authority.")
    iid = max(int(DBSession.execute(
        "select max(cast(id as integer)) from source").fetchone()[0]), 500000)
    pk = int(DBSession.execute("select max(pk) from source").fetchone()[0])
    for crno, affected in args.json['changerequests'].items():
        year, serial = crno.split('-')
        title = 'Change Request Number %s' % crno
        ref = Ref.get(title, key='title', default=None)

        if not ref:
            iid += 1
            pk += 1
            ref = Ref(
                pk=pk,
                id=str(iid),
                name='%s %s' % (author, year),
                bibtex_type=EntryType.misc,
                number=crno,
                description=title,
                year=year,
                year_int=int(year),
                title=title,
                author=author,
                address='Dallas',
                publisher='SIL International',
                url='http://www.sil.org/iso639-3/cr_files/%s.pdf' % crno,
                doctypes_str=dtid,
                providers_str=pid,
                language_note=', '.join('%(Language Name)s [%(Affected Identifier)s]' % spec for spec in affected),
                jsondata=dict(hhtype=dtid, src=pid))
            ref.doctypes.append(dt)
            ref.providers.append(provider)

        for spec in affected:
            lang = Languoid.get(spec['Affected Identifier'], key='hid', default=None)
            if lang and lang not in ref.languages:
                ref.languages.append(lang)
        DBSession.add(ref)

    transaction.commit()
    transaction.begin()

    matched = 0
    near = 0
    max_identifier_pk = DBSession.query(
        Identifier.pk).order_by(desc(Identifier.pk)).first()[0]
    families = []
    for family in DBSession.query(Languoid)\
            .filter(Languoid.level == LanguoidLevel.family)\
            .filter(Language.active == True)\
            .all():
        isoleafs = set()
        for row in DBSession.query(TreeClosureTable.child_pk, Languoid.hid)\
            .filter(family.pk == TreeClosureTable.parent_pk)\
            .filter(Languoid.pk == TreeClosureTable.child_pk)\
            .filter(Languoid.hid != None)\
            .filter(Languoid.level == LanguoidLevel.language)\
            .filter(Languoid.status == LanguoidStatus.established)\
            .all():
            if len(row[1]) == 3:
                isoleafs.add(row[1])
        families.append((family, isoleafs))

    families = sorted(families, key=lambda p: len(p[1]))

    for mid, leafs in args.json['macrolanguages'].items():
        leafs = set(leafs)
        found = False
        for family, isoleafs in families:
            if leafs == isoleafs:
                if mid not in [c.name for c in family.identifiers
                               if c.type == IdentifierType.iso.value]:
                    family.codes.append(Identifier(
                        id=str(max_identifier_pk + 1),
                        name=mid,
                        type=IdentifierType.iso.value))
                    max_identifier_pk += 1
                matched += 1
                found = True
                break
            elif leafs.issubset(isoleafs):
                print '~~~', family.name, '-->', mid, 'distance:', len(leafs), len(isoleafs)
                near += 1
                found = True
                break
        if not found:
            print '---', mid, leafs

    print 'matched', matched, 'of', len(args.json['macrolanguages']), 'macrolangs'
    print near
Example #49
0
def prime(args):
    """If data needs to be denormalized for lookup, do that here.
    This procedure should be separate from the db initialization, because
    it will have to be run periodically whenever data has been updated.
    """
    recreate_treeclosure()

    for lpk, mas in DBSession.execute("""\
select
  l.pk, array_agg(distinct lma.macroarea_pk)
from
  language as l,
  treeclosuretable as t,
  languoidmacroarea as lma,
  macroarea as ma
where
  l.pk = t.parent_pk and
  t.child_pk = lma.languoid_pk and
  lma.macroarea_pk = ma.pk and
  l.pk not in (select languoid_pk from languoidmacroarea)
group by l.pk"""):
        for mapk in mas:
            DBSession.add(models.Languoidmacroarea(languoid_pk=lpk, macroarea_pk=mapk))

    for row in list(DBSession.execute(
        "select pk, pages, pages_int, startpage_int from source where pages_int < 0"
    )):
        pk, pages, number, start = row
        _start, _end, _number = compute_pages(pages)
        if _number > 0 and _number != number:
            DBSession.execute(
                "update source set pages_int = %s, startpage_int = %s where pk = %s" %
                (_number, _start, pk))
            DBSession.execute(
                "update ref set endpage_int = %s where pk = %s" %
                (_end, pk))

    with jsonlib.update(gc2version(args), indent=4) as legacy:
        for lang in DBSession.query(common.Language):
            if lang.id not in legacy:
                lang.update_jsondata(new=True)
                legacy[lang.id] = args.args[0]

    def items(s):
        if not s:
            return set()
        r = []
        for ss in set(s.strip().split()):
            if '**:' in ss:
                ss = ss.split('**:')[0] + '**'
            if ss.endswith(','):
                ss = ss[:-1].strip()
            r.append(ss)
        return set(r)

    refs = {
        r[0]: r[1]
        for r in DBSession.query(models.Refprovider.id, models.Refprovider.ref_pk)}
    valuesets = {
        r[0]: r[1] for r in DBSession.query(common.ValueSet.id, common.ValueSet.pk)}

    for lang in args.repos.languoids():
        if lang.category == models.BOOKKEEPING:
            continue
        clf = lang.classification_comment
        if clf:
            if clf.subrefs:
                if items(lang.cfg['classification']['subrefs']) != \
                        items(lang.cfg['classification'].get('sub')):
                    vspk = valuesets['sc-{0}'.format(lang.id)]
                    for ref in clf.subrefs:
                        spk = refs.get(ref.key)
                        DBSession.add(
                            common.ValueSetReference(source_pk=spk, valueset_pk=vspk))
Example #50
0
def update(args):
    author = 'ISO 639-3 Registration Authority'
    pid = 'iso6393'
    dtid = 'overview'
    dt = Doctype.get(dtid)
    provider = Provider.get(pid, default=None)
    if provider is None:
        provider = Provider(
            id=pid,
            abbr=pid,
            name=author,
            description=
            "Change requests submitted to the ISO 639-3 registration authority."
        )
    iid = max(
        int(
            DBSession.execute(
                "select max(cast(id as integer)) from source").fetchone()[0]),
        500000)
    pk = int(DBSession.execute("select max(pk) from source").fetchone()[0])
    for crno, affected in args.json['changerequests'].items():
        year, serial = crno.split('-')
        title = 'Change Request Number %s' % crno
        ref = Ref.get(title, key='title', default=None)

        if not ref:
            iid += 1
            pk += 1
            ref = Ref(pk=pk,
                      id=str(iid),
                      name='%s %s' % (author, year),
                      bibtex_type=EntryType.misc,
                      number=crno,
                      description=title,
                      year=year,
                      year_int=int(year),
                      title=title,
                      author=author,
                      address='Dallas',
                      publisher='SIL International',
                      url='http://www.sil.org/iso639-3/cr_files/%s.pdf' % crno,
                      language_note=', '.join(
                          '%(Language Name)s [%(Affected Identifier)s]' % spec
                          for spec in affected),
                      jsondata=dict(hhtype=dtid, src=pid))
            ref.doctypes.append(dt)
            ref.providers.append(provider)

        for spec in affected:
            lang = Languoid.get(spec['Affected Identifier'],
                                key='hid',
                                default=None)
            if lang and lang not in ref.languages:
                ref.languages.append(lang)
        DBSession.add(ref)

    transaction.commit()
    transaction.begin()

    matched = 0
    near = 0
    max_identifier_pk = DBSession.query(Identifier.pk).order_by(
        desc(Identifier.pk)).first()[0]
    families = []
    for family in DBSession.query(Languoid)\
            .filter(Languoid.level == LanguoidLevel.family)\
            .filter(Language.active == True)\
            .all():
        isoleafs = set()
        for row in DBSession.query(TreeClosureTable.child_pk, Languoid.hid)\
            .filter(family.pk == TreeClosureTable.parent_pk)\
            .filter(Languoid.pk == TreeClosureTable.child_pk)\
            .filter(Languoid.hid != None)\
            .filter(Languoid.level == LanguoidLevel.language)\
            .filter(Languoid.status == LanguoidStatus.established)\
            .all():
            if len(row[1]) == 3:
                isoleafs.add(row[1])
        families.append((family, isoleafs))

    families = sorted(families, key=lambda p: len(p[1]))

    for mid, leafs in args.json['macrolanguages'].items():
        leafs = set(leafs)
        found = False
        for family, isoleafs in families:
            if leafs == isoleafs:
                if mid not in [
                        c.name for c in family.identifiers
                        if c.type == IdentifierType.iso.value
                ]:
                    family.codes.append(
                        Identifier(id=str(max_identifier_pk + 1),
                                   name=mid,
                                   type=IdentifierType.iso.value))
                    max_identifier_pk += 1
                matched += 1
                found = True
                break
            elif leafs.issubset(isoleafs):
                print '~~~', family.name, '-->', mid, 'distance:', len(leafs), len(isoleafs)
                near += 1
                found = True
                break
        if not found:
            print '---', mid, leafs

    print 'matched', matched, 'of', len(args.json['macrolanguages']), 'macrolangs'
    print 'near matches:', near
Example #51
def main(args):
    data = Data(created=utc.localize(datetime(2013, 11, 15)),
                updated=utc.localize(datetime(2013, 12, 12)))
    icons = issues.Icons()
    #print icons

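    # Clear all previously imported rows so the import starts from a clean slate.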
    DBSession.execute("delete from Language")
    DBSession.execute("delete from Unit")
    DBSession.execute("delete from featuredomain")
    DBSession.execute("delete from family")
    DBSession.execute("delete from source")
    DBSession.execute("delete from parameter")
    DBSession.execute("delete from feature")
    DBSession.execute("delete from domainelement")
    DBSession.execute("delete from valueset")
    DBSession.execute("delete from value")
    DBSession.execute("delete from lsivalue")
    DBSession.execute("delete from dataset")
    DBSession.execute("delete from contributor")
    DBSession.execute("delete from lsilanguage")
    DBSession.execute("delete from contribution")
    DBSession.execute("delete from designer")

    DBSession.flush()

    dtab = partial(_dtab, args.data_file())

    #Languages

    #print args.data_file()
    #tabfns = ['%s' % fn.basename() for fn in args.data_file().files('nts_*.tab')]
    #tabfns = ['nts_18.tab']
    ##tabfns = os.listdir('/Users/virk/shafqat/postDoc-Swe/project/clld/clld/lsi/lsi/data')[1:]
    tabfns = os.listdir(
        '/Users/virk/shafqat/postDoc-Swe/project/clld/clld/lsi/lsi/lsi_data'
    )[1:]
    #print tabfns
    args.log.info("Sheets found: %s" % tabfns)
    ldps = []
    lgs = {}
    nfeatures = Counter()
    nlgs = Counter()

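    # Read every sheet, keep only rows with a proper ISO code, skip DRS and
    # dotted feature ids, and count non-'?' datapoints per language and per feature.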
    for fn in tabfns:
        for ld in dtab(fn):

            # Skip rows for languages which do not have an ISO code.
            if ld['language_id'] in ('qgr', '---', ''):
                continue
            if "feature_alphanumid" not in ld:
                args.log.info("NO FEATUREID %s %s" % (len(ld), ld))
                continue
            if not ld["feature_alphanumid"].startswith("DRS") \
                    and ld["feature_alphanumid"].find(".") == -1:
                ldps.append(dp_dict(ld))
                ##print ld
                lgs[ld['language_id']] = unescape(ld['language_name'])
                if ld["value"] != "?":
                    nfeatures.update([ld['language_id']])
                    nlgs.update([ld['feature_alphanumid']])

    ldps = sorted(ldps, key=lambda d: d['feature_alphanumid'])

    #lgs["ygr"] = "Hua"

    for lgid, lgname in lgs.items():
        data.add(models.lsiLanguage,
                 lgid,
                 id=lgid,
                 name=lgname,
                 representation=nfeatures.get(lgid, 0))
    DBSession.flush()
    #print "I am here"
    #print data['ntsLanguage'].values()[1].id
    load_families(
        data,
        ##[(NOCODE_TO_GLOTTOCODE.get(l.id, l.id), l) for l in data['lsiLanguage'].values()],
        [(NOCODE_TO_GLOTTOCODE.get(l.id, l.id), l)
         for l in data['lsiLanguage'].values()
         if l.id != '---' and l.id != ''],
        isolates_icon='tcccccc')
    #print 'family'
    #print data['Family'].get('sino1245').jsondata
    #Domains
    for domain in set(ld['feature_domain'] for ld in ldps):
        data.add(models.FeatureDomain, domain, name=domain)
    DBSession.flush()

    #Designers
    #for i, info in enumerate(dtab("ntscontributions.tab") + dtab("ntscontacts.tab")):
    for i, info in enumerate([
            {'designer': 'shafqat', 'domain': '', 'pdflink': '', 'citation': ''},
            {'designer': '-', 'domain': '', 'pdflink': '', 'citation': ''},
    ]):
        designer_id = str(i + 1)
        data.add(models.Designer,
                 info['designer'],
                 id=designer_id,
                 name=designer_id,
                 domain=info["domain"],
                 contributor=info['designer'],
                 pdflink=info["pdflink"],
                 citation=info["citation"])
    DBSession.flush()

    #Sources
    '''for k, (typ, bibdata) in [
        ktfbib(bibsource) for ld in ldps
        if ld.get(u'bibsources') for bibsource in ld['bibsources'].split(",,,")
    ]:
        if k not in data["Source"]:
            data.add(common.Source, k, _obj=bibtex2source(Record(typ, k, **bibdata)))
    DBSession.flush()'''

    #Features
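    # Merge all datapoints of each feature into one description dict; fvmis
    # collects features whose possible-value descriptions differ across files.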
    fs = [(fid, mergeds(lds))
          for fid, lds in groupby(ldps, key=lambda d: d['feature_alphanumid'])]

    fvdesc = [(fid, [
        (ld.get("feature_possible_values"), ld.get("fromfile")) for ld in lds
        if ld.get("feature_possible_values")
    ]) for fid, lds in groupby(ldps, key=lambda d: d['feature_alphanumid'])]
    fvdt = [(fid, grp2(vdescs)) for (fid, vdescs) in fvdesc]
    fvmis = [(fid, vdescs) for (fid, vdescs) in fvdt if len(vdescs) > 1]

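    # Sanity check: each feature name must belong to exactly one feature id.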
    for _, dfsids in groupby(
            sorted((f.get('feature_name', fid), fid) for fid, f in fs),
            key=lambda t: t[0]):
        ##print [(k,v) for (k,v) in list(dfsids)],len(list(dfsids))
        assert len(list(dfsids)) == 1
    #print 'here is nlgs'

    for fid, f in fs:
        #print "lang name"
        #print ldps
        #print f.get('feature_possible_values', ""),
        if not fid.isdigit():
            args.log.info("NO INT FID %s" % f)
        feature = data.add(
            models.Feature,
            fid,
            id=fid,
            name=f.get('feature_name', f['feature_alphanumid']),
            doc=f.get('feature_information', ""),
            vdoc=f.get('feature_possible_values', ""),
            representation=nlgs.get(fid, 0),
            designer=data["Designer"][f['designer']],
            dependson=f.get("depends_on", ""),
            abbreviation=f.get("abbreviation", ""),
            featuredomain=data['FeatureDomain'][f["feature_domain"]],
            name_french=f.get('francais', ""),
            clarification=f.get(
                "draft of clarifying comments to outsiders (hedvig + dunn + harald + suzanne)",
                ""),
            alternative_id=f.get("old feature number", ""),
            jl_relevant_unit=f.get("relevant unit(s)", ""),
            jl_function=f.get("function", ""),
            jl_formal_means=f.get("formal means", ""),
            sortkey_str="",
            sortkey_int=int(fid))

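        # Parse the "value==description||value==description" spec and create
        # one DomainElement per legal value of this feature.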
        vdesclist = [veq.split("==") for veq in feature.vdoc.split("||")]
        vdesc = {v.replace(".", "-"): desc for [v, desc] in vdesclist}
        ##vdesc = {fmly+val2icon(v): desc for ((v,desc),fmly) in itertools.product([(vv,desc) for [vv, desc] in vdesclist],['c','d','f','t'])}

        vdesc.setdefault('?', 'Not known')
        if 'N/A' not in vdesc and feature.dependson:
            vdesc["N/A"] = "Not Applicable"
        vi = {v: i for (i, v) in enumerate(sorted(vdesc.keys()))}
        ##vicons = {f+val2icon(v):f+val2icon(v) for (v,f) in itertools.product(['0','1','2','3'],['c','d','f','t'])}
        ##vicons['?'] = 'c00ffff'
        ##vicons['N/A'] = 'c00ffff'
        ##vicons = icons.iconize(vi.keys())
        for (v, desc) in vdesc.items():
            #print v,vicons[v]
            data.add(common.DomainElement, (fid, v),
                     id='%s-%s' % (fid, v),
                     name=v,
                     description=desc,
                     jsondata={"icon": Colors[v]},
                     number=vi[v],
                     parameter=feature)
    DBSession.flush()

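    # Warn about (feature, language) pairs that received conflicting values
    # from different source files.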
    for ((f, lg), ixs) in grp2([((ld['feature_alphanumid'], ld['language_id']),
                                 i) for i, ld in enumerate(ldps)]):
        ixvs = set([ldps[ix]['value'] for ix in ixs])
        if len(ixvs) == 1:
            continue
        args.log.warn("Dup value %s %s %s" %
                      (f, lg, [(ldps[ix]['value'], ldps[ix]['fromfile'])
                               for ix in ixs]))
        ##print "Dup value %s %s %s" % (f, lg, [(ldps[ix]['value'], ldps[ix]['fromfile'], ldps[ix].get('provenance')) for ix in ixs])
    errors = {}
    done = set()
    glottolog = Glottolog()
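    # Create one ValueSet and one lsiValue per accepted (feature, language)
    # datapoint; rows with illegal values are logged and collected in `errors`.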
    for ld in ldps:

        # Look up the Glottolog family of this language so that different
        # families can get different map markers (used in the jsondata below).
        family = None  # reset per datapoint so a stale family is never reused
        language = data['lsiLanguage'][ld['language_id']]
        if isinstance(language, (tuple, list)) and len(language) == 2:
            code, language = language
        else:
            code = language.id
        if code != '-':
            gl_language = glottolog.languoid(code)
            if gl_language:
                gl_family = gl_language.family
                if gl_family:
                    family = data['Family'].get(gl_family.id)

        ##ld['value'] = ld['value']+'-'+str(family)
        ##ld['value'] = combineValueFamily(ld['value'],str(family))
        parameter = data['Feature'][ld['feature_alphanumid']]
        language = data['lsiLanguage'][ld['language_id']]

        id_ = '%s-%s' % (parameter.id, language.id)
        if id_ in done:
            continue

        if (ld['feature_alphanumid'],
                ld['value']) not in data['DomainElement']:
            if not ld["value"].strip():
                continue
            info = (ld['feature_alphanumid'],
                    ld.get('feature_name', "[Feature Name Lacking]"),
                    ld['language_id'], ld['value'], ld['fromfile'])
            msg = u"%s %s %s %s %s not in the set of legal values ({0})" % info
            args.log.error(
                msg.format(
                    sorted([
                        y for (x, y) in data['DomainElement'].keys()
                        if x == ld['feature_alphanumid']
                    ])))
            ##print msg.format(sorted(
            ##  [y for (x, y) in data['DomainElement'].keys()
            ## if x == ld['feature_alphanumid']]))
            errors[(ld['feature_alphanumid'], ld['language_id'])] = info
            continue

        vs = common.ValueSet(
            id=id_,
            language=language,
            parameter=parameter,
            source=ld["source"] or None,
            ##contribution=parameter.designer
        )
        #print
        #print "this one"
        #print ld['value'],family
        models.lsiValue(
            id=id_,
            domainelement=data['DomainElement'][
                (ld['feature_alphanumid'], ld['value'])],
            jsondata={
                "icon": data['DomainElement'][
                    (ld['feature_alphanumid'], ld['value'])].jsondata,
                "family": FamilyCodes[str(family)],
            },
            comment=ld["comment"],
            valueset=vs,
            contributed_datapoint=ld["contributor"])
        done.add(id_)
        '''if not ld.get('bibsources'):
            if 'bibsources' not in ld:
                args.log.warn("no bibsource %s" % ld)
            continue
        for k, _ in [ktfbib(bibsource) for bibsource in ld['bibsources'].split(",,,")]:
            common.ValueSetReference(valueset=vs, source=data['Source'][k])'''
    DBSession.flush()

    #To CLDF
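    # Export the accepted datapoints as a tab-separated table
    # (Language, iso-639-3, Feature, Value).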
    cldf = {}
    for ld in ldps:
        parameter = data['Feature'][ld['feature_alphanumid']]
        language = data['lsiLanguage'][ld['language_id']]
        id_ = '%s-%s' % (parameter.id, language.id)
        if id_ not in done:
            continue
        dt = (lgs[ld['language_id']], ld['language_id'],
              ld['feature_alphanumid'] + ". " + ld['feature_name'],
              ld["value"])
        cldf[dt] = None

    tab = lambda rows: u''.join([u'\t'.join(row) + u"\n" for row in rows])
    savu(tab([("Language", "iso-639-3", "Feature", "Value")] + cldf.keys()),
         "lsi.cldf")

    args.log.info('%s Errors' % len(errors))

    dataset = common.Dataset(
        id="LSI",
        name='Linguistic Survey of India',
        publisher_name="Sprakbanken",
        publisher_place="Gothenburg",
        publisher_url="to be given",
        description="this is to be followed",
        domain='http://lsi.clld.org',
        published=date(2016, 5, 16),
        contact='*****@*****.**',
        license='http://creativecommons.org/licenses/by-nc-nd/2.0/de/deed.en',
        jsondata={
            'license_icon':
            'http://wals.info/static/images/cc_by_nc_nd.png',
            'license_name':
            'Creative Commons Attribution-NonCommercial-NoDerivs 2.0 Germany'
        })

    # disabled for experimental purposes, names were appearing multiple times
    for i, contributor in enumerate([
            common.Contributor(id="Lars Borin",
                               name="Lars Borin",
                               email="*****@*****.**"),
            common.Contributor(id="Shafqat Mumtaz Virk",
                               name="Shafqat Mumtaz Virk",
                               email="*****@*****.**"),
            common.Contributor(id="Anju Saxena",
                               name="Anju Saxena",
                               email="*****@*****.**"),
            common.Contributor(id="Harald Hammarstrom",
                               name="Harald Hammarstrom",
                               email="*****@*****.**")
    ]):
        #print i
        common.Editor(dataset=dataset, contributor=contributor, ord=i)
    '''cont1 = common.Contributor(
            id="Harald Hammarstrom",
            name="Harald Hammarstrom",
            email="*****@*****.**")
    cont2= common.Contributor(
            id="Shafqat Mumtaz Virk",
            name="Shafqat Mumtaz Virk",
            email="*****@*****.**")
    cont3 = common.Contributor(
            id="Lars Borin",
            name="Lars Borin",
            email="*****@*****.**")
    for contributor in [cont1,cont2,cont3]:
        common.Editor(dataset=dataset, contributor=contributor,ord=1)'''

    DBSession.add(dataset)
    DBSession.flush()