Exemplo n.º 1
0
def justifications(args, languages):
    """
    Load family/subclassification justifications into the database.

    - text goes into ValueSet.description
    - refs go into ValueSetReference objects
    """

    def normalized_pages(s):
        # Return the pages string only when it matches the expected format;
        # otherwise fall through and return None (ref gets no page info).
        if PAGES_PATTERN.match(s or ""):
            return s or ""

    #
    # create mappings to look up glottolog languoids matching names in justification files
    #
    langs_by_hid = languages
    langs_by_hname = {}
    langs_by_name = {}

    # Query inactive languoids first, then active ones, so that active
    # languoids overwrite the mapping entries of obsolete ones on collision.
    for active in (False, True):
        for l in DBSession.query(Languoid).filter(Languoid.active == active):
            langs_by_hname[l.jsondatadict.get("hname")] = l
            langs_by_hid[l.hid] = l
            langs_by_name[l.name] = l

    for id_, type_ in [("fc", "family"), ("sc", "subclassification")]:
        # Guard: without this, an empty justification file would leave `i`
        # unbound and the summary log call below would raise NameError.
        i = -1
        for i, row in enumerate(dsv.reader(args.data_file("%s_justifications.tab" % type_))):
            name = row[0]
            # Underscores stand in for spaces, except in NOCODE placeholders.
            name = name.replace("_", " ") if not name.startswith("NOCODE") else name
            # Resolve by hname first, then hid, then plain name.
            l = langs_by_hname.get(name, langs_by_hid.get(name, langs_by_name.get(name)))
            if not l:
                args.log.warn("ignoring %s" % name)
                continue

            # Column holding the free-text comment differs per file type.
            _r = 3 if type_ == "family" else 2
            comment = (row[_r].strip() or None) if len(row) > _r else None
            # Discard comments that contain no word characters at all.
            if comment and not WORD_PATTERN.search(comment):
                comment = None

            #
            # TODO: look for [NOCODE_ppp] patterns as well!?
            #

            # (source id, normalized pages) pairs extracted from column 2.
            refs = [(int(m.group("id")), normalized_pages(m.group("comment"))) for m in REF_PATTERN.finditer(row[2])]

            # Find an existing ValueSet for this languoid/parameter, if any.
            vs = None
            for _vs in l.valuesets:
                if _vs.parameter.id == id_:
                    vs = _vs
                    break

            if not vs:
                args.log.info("%s %s ++" % (l.id, type_))
                vs = ValueSet(
                    id="%s%s" % (type_, l.id),
                    description=comment,
                    language=l,
                    parameter=Parameter.get(id_),
                    contribution=Contribution.first(),
                )
                DBSession.add(Value(id="%s%s" % (type_, l.id), name="%s - %s" % (l.level, l.status), valueset=vs))
                DBSession.flush()
            else:
                if vs.description != comment:
                    args.log.info("%s %s ~~ description" % (l.id, type_))
                    vs.description = comment

            # Replace all existing references with the freshly parsed ones.
            for r in vs.references:
                DBSession.delete(r)

            for r, pages in refs:
                vs.references.append(ValueSetReference(source=Source.get(str(r)), description=pages))

        args.log.info("%s %s" % (i, type_))
Exemplo n.º 2
0
def justifications(args, languages, stats):
    """
    Load family/subclassification justifications into the database.

    - text goes into ValueSet.description
    - refs go into ValueSetReference objects
    """
    # Map 'hh' provider bibkeys to glottolog reference ids, so that
    # **bibkey** markers in the source files can be rewritten to **ref_id**.
    hh_bibkey_to_glottolog_id = {}
    for rec in get_bib(args):
        for provider, bibkeys in get_bibkeys(rec).items():
            if provider == 'hh':
                for bibkey in bibkeys:
                    hh_bibkey_to_glottolog_id[bibkey] = rec['glottolog_ref_id']
                break

    # Compiled once and reused for every row; the raw string fixes the
    # invalid "\*" escape sequences of the original inline literals
    # (same pattern text, no Deprecation/SyntaxWarning).
    hh_bibkey_pattern = re.compile(r'\*\*(?P<bibkey>[^\*]+)\*\*')

    def substitute_hh_bibkeys(m):
        # NOTE(review): raises KeyError for unknown bibkeys — presumably
        # intentional so bad input fails loudly; confirm.
        return '**%s**' % hh_bibkey_to_glottolog_id[m.group('bibkey')]

    #
    # create mappings to look up glottolog languoids matching names in justification files
    #
    langs_by_hid = languages
    langs_by_hname = {}
    langs_by_name = {}

    # order by active to make sure, we active languoid overwrite the data of obsolete ones.
    for l in DBSession.query(Languoid).order_by(Languoid.active):
        langs_by_hname[l.jsondata.get('hname')] = l
        langs_by_hid[l.hid] = l
        langs_by_name[l.name] = l

    def normalize_pages(s):
        # Strip whitespace and a trailing comma; empty result becomes None.
        return (s or '').strip().rstrip(',') or None

    for id_, type_ in [('fc', 'family'), ('sc', 'subclassification')]:
        # Guard: without this, an empty justification file would leave `i`
        # unbound and the summary log call below would raise NameError.
        i = -1
        for i, row in enumerate(dsv.reader(
                args.data_dir.joinpath('languoids', 'forkel_%s_justifications-utf8.tab' % type_))):
            name = row[0]
            # Underscores stand in for spaces, except in NOCODE placeholders.
            name = name.replace('_', ' ') if not name.startswith('NOCODE') else name
            # Resolve by hname first, then hid, then plain name.
            l = langs_by_hname.get(name, langs_by_hid.get(name, langs_by_name.get(name)))
            if not l:
                args.log.warn('ignoring %s' % name)
                continue

            # Column holding the free-text comment differs per file type.
            _r = 3 if type_ == 'family' else 2
            comment = (row[_r].strip() or None) if len(row) > _r else None
            # Discard comments that contain no word characters at all.
            if comment and not WORD_PATTERN.search(comment):
                comment = None
            if comment:
                comment = hh_bibkey_pattern.sub(substitute_hh_bibkeys, comment)

            #
            # TODO: look for [NOCODE_ppp] patterns as well!?
            #

            # (source id, normalized pages) pairs extracted from column 2,
            # after rewriting hh bibkeys to glottolog reference ids.
            refs = [(int(m.group('id')), normalize_pages(m.group('pages')))
                    for m in REF_PATTERN.finditer(
                        hh_bibkey_pattern.sub(substitute_hh_bibkeys, row[2]))]

            # Find an existing ValueSet for this languoid/parameter, if any.
            vs = None
            for _vs in l.valuesets:
                if _vs.parameter.id == id_:
                    vs = _vs
                    break

            if not vs:
                args.log.info('%s %s ++' % (l.id, type_))
                vs = ValueSet(
                    id='%s%s' % (id_, l.pk),
                    description=comment,
                    language=l,
                    parameter=Parameter.get(id_),
                    contribution=Contribution.first())
                DBSession.add(Value(
                    id='%s%s' % (id_, l.pk),
                    name='%s - %s' % (l.level, l.status),
                    valueset=vs))
                DBSession.flush()
            else:
                if vs.description != comment:
                    args.log.info('%s %s ~~ description: %s ---> %s' % (l.id, type_, vs.description, comment))
                    vs.description = comment
                    stats.update(['justifications-%s' % type_])

            # Replace all existing references with the freshly parsed ones.
            for r in vs.references:
                DBSession.delete(r)

            for r, pages in refs:
                # FIXME: we must make sure not to link sources which will subsequently be
                # replaced!
                vs.references.append(ValueSetReference(
                    source=Source.get(str(r)),
                    description=pages))

        args.log.info('%s %s' % (i, type_))