Exemplo n.º 1
0
def main(args):
    active_only = not args.all
    coords = dict(
        (r[0], r[1:]) for r in dsv.rows(args.data_file('coordinates.tab')))
    codes = dict((row[0], row[1]) for row in DBSession.execute(
        "select ll.hid, l.pk from languoid as ll, language as l where ll.pk = l.pk and ll.hid is not null"
    ))

    maxid = DBSession.execute(
        "select pk from languoid order by pk desc limit 1").fetchone()[0]
    gcs = {}

    lnames = {}
    for row in DBSession.execute("select pk, name from language"):
        lnames[row[0]] = row[1]

    # dict mapping branches (i.e. tuples of sub-family names) to dicts of H-languages
    families = OrderedDict()

    # dict mapping identifiers of H-languages to branches
    languages = OrderedDict()

    parse_families(args.data_file('lff.txt'), families, languages)

    # handle isolates / collapse families with exactly one leaf:
    isolate_names = {}
    collapsed_names = {}
    for key in families.keys():
        if len(families[key]) == 1:
            if len(key) == 1:
                # isolate
                languages[families[key].keys()[0]][0] = None
                isolate_names[key[0]] = families[key].keys()[
                    0]  # map name to code
            else:
                languages[families[key].keys()[0]][0] = key[:-1]
                collapsed_names[key[-1]] = families[key].keys()[0]
            del families[key]

    # we also want to be able to lookup families by name
    names = {}
    for branch in families:
        name = branch[-1]
        if name in names:
            names[name].append(branch)
        else:
            names[name] = [branch]

    # now add the unclassifiabble, unattested, un-whatever
    parse_families(args.data_file('lof.txt'), families, languages)

    ncodes = {}
    languoids = []
    for code in languages:
        if code not in codes:
            maxid += 1
            ncodes[code] = maxid
            hnode, status, name, comment = languages[code]
            # we have to insert a new H-language!
            attrs = languoid(
                maxid,
                'language',
                hid=code,
                id=glottocode(unicode(name), DBSession, gcs),
                name=name,
                hname=name,
                status=status,
                globalclassificationcomment=comment or None,
            )
            print '++', attrs
            if coords.get(code):
                attrs['longitude'], attrs['latitude'] = map(
                    float, coords.get(code))
            languoids.append(attrs)

    urnodes = {}
    rnodes = {}
    for family in families:
        leafs = families[family]
        assert family[0] not in ['Speech Register', 'Spurious']
        leafs = tuple(
            sorted(code for code in families[family].keys() if code in codes))
        assert leafs
        if leafs in rnodes:
            # special case: there may be additional "Unclassified something" nodes in
            # branch without any changes in the set of leafs.
            assert [n for n in family if n.startswith('Unclassified')]
            fset, rset = set(family), set(rnodes[leafs])
            assert rset.issubset(fset)
            assert leafs not in urnodes
            urnodes[leafs] = family
            #if len(family) > rnodes[leafs]:
            #    rnodes[leafs] = family
        else:
            rnodes[leafs] = family

    #
    # at this point rnodes is a consolidated mapping of sets of H-Languages to branches in
    # the family tree.
    #

    # for set comparisons we compute a list of actual sets of leafs as well
    leafsets = [set(t) for t in sorted(rnodes.keys(), key=lambda s: len(s))]

    todo = []

    # dict mapping (id, name, level) tuples for gl languoids of level family to tuples of leafs
    glnodes = {}
    #
    # note: all languoids with level null have children, thus are not dialects!
    #
    sql = "select l.pk, l.name, ll.level, ll.father_pk from languoid as ll, language as l where ll.pk = l.pk and ll.level = 'family' or ll.level is null"
    if active_only:
        sql = "select l.pk, l.name, ll.level, ll.father_pk from languoid as ll, language as l where ll.pk = l.pk and ll.level = 'family' and l.active = true"

    for row in DBSession.execute(sql).fetchall():
        leafs = [
            r[0] for r in DBSession.execute(
                "select distinct l.hid from treeclosuretable as t, languoid as l where t.child_pk = l.pk and t.parent_pk = %s and l.hid is not null and l.status != 'provisional'"
                % row[0])
        ]
        if leafs:
            glnodes[(row[0], row[2], row[1], row[3])] = tuple(sorted(leafs))
        else:
            # families without leafs will be marked as retired
            if row[1] in names and len(names[row[1]]) == 1:
                # unique family name, good enough for a match!?
                todo.append(Migration(row[0], None, pointer=names[row[1]][0]))
            else:
                todo.append(Migration(row[0], None))

    # note: for legacy gl nodes, we map leaf-tuples to lists of matching nodes!
    rglnodes = {}
    for node, leafs in glnodes.items():
        if leafs in rglnodes:
            rglnodes[leafs].append(node)
        else:
            rglnodes[leafs] = [node]

    # now we look for matches between old and new classification:
    for leafs, nodes in rglnodes.items():
        assert leafs
        assert nodes
        todo.extend(match_nodes(leafs, nodes, rnodes, urnodes, leafsets,
                                names))

    # compile a mapping for exact matches:
    branch_to_pk = {}
    for m in todo:
        if m.hid:
            assert m.hid not in branch_to_pk
            branch_to_pk[m.hid] = m.pk

    new = 0
    for hnode in sorted(families.keys(), key=lambda b: (len(b), b)):
        # loop through branches breadth first to determine what's to be inserted
        if hnode not in branch_to_pk:
            t = tuple(sorted(families[hnode].keys()))
            if t in rglnodes:
                # the "Unclassified subfamily" special case from above:
                assert [n for n in hnode if n.startswith('Unclassified')]
                # make sure, the existing glottolog family for the set of leafs is mapped
                # to some other node in the new classification:
                assert rglnodes[t][0][0] in [m.pk for m in todo if m.hid]

            maxid += 1
            attrs = languoid(
                maxid,
                'family',
                id=glottocode(unicode(hnode[-1]), DBSession, gcs),
                name=hnode[-1],
                hname=hnode[-1],
            )
            branch_to_pk[hnode] = maxid
            lnames[maxid] = hnode[-1]
            if len(hnode) > 1:
                attrs['father_pk'] = branch_to_pk[tuple(list(hnode)[:-1])]
                assert attrs['father_pk']
            print '++', attrs
            new += 1
            languoids.append(attrs)

    # now on to the updates for families:
    matches, migrations, nomatches = 0, 0, 0
    for m in todo:
        attrs = languoid(m.pk, 'family', name=lnames[m.pk])
        if m.hid:
            #print '==', lnames[m.pk].encode('utf8'), '->', ', '.join(m.hid).encode('utf8')
            matches += 1

            if len(m.hid) > 1:
                attrs['father_pk'] = branch_to_pk[tuple(list(m.hid)[:-1])]
            if getattr(m, 'rename', False):
                attrs['name'] = m.hid[-1]
            attrs['hname'] = m.hid[-1]
        else:
            attrs['active'] = False
            if getattr(m, 'pointer', False):
                print '~~', lnames[m.pk].encode('utf8'), '->', ', '.join(
                    m.pointer).encode('utf8')
                migrations += 1

                attrs['replacement'] = branch_to_pk[m.pointer]
            else:
                print '--', lnames[m.pk].encode('utf8'), '->'
                nomatches += 1
        languoids.append(attrs)

    print matches, 'matches'
    print migrations, 'migrations'
    print nomatches, 'nomatches'
    print new, 'new nodes'

    risolate_names = dict(zip(isolate_names.values(), isolate_names.keys()))
    rcollapsed_names = dict(
        zip(collapsed_names.values(), collapsed_names.keys()))

    # and updates of father_pks for languages:
    for l in languages:
        hnode, status, name, comment = languages[l]
        id_ = codes.get(l, ncodes.get(l))
        attrs = languoid(id_, 'language', status=status)
        if hnode:
            attrs['father_pk'] = branch_to_pk[hnode]
        attrs['globalclassificationcomment'] = comment or None
        # look for hnames!
        if l in risolate_names:
            attrs['hname'] = risolate_names[l]
        if l in rcollapsed_names:
            attrs['hname'] = rcollapsed_names[l]
        languoids.append(attrs)

    for row in DBSession.execute(
            "select l.pk, ll.hid, l.name from languoid as ll, language as l where ll.pk = l.pk and ll.hid like '%NOCODE_%'"
    ).fetchall():
        if row[1] not in languages:
            # languoids with Harald's private code that are no longer in use
            attrs = languoid(row[0],
                             'language',
                             status='retired',
                             active=False,
                             father_pk=None)
            languoids.append(attrs)

    with open(args.data_file('languoids.json'), 'w') as fp:
        json.dump(languoids, fp)
def main(args):
    stats = Counter(new=0, matches=0, migrations=0, nomatches=0)
    l, ll = Language.__table__.alias("l"), Languoid.__table__.alias("ll")
    gl_languoids = list(DBSession.execute(select([l, ll], use_labels=True).where(l.c.pk == ll.c.pk)).fetchall())

    # we collect a list of changes which we will store in a JSON file.
    changes = []

    hid_to_pk = {row["ll_hid"]: row["l_pk"] for row in gl_languoids if row["ll_hid"]}
    max_languoid_pk = max(*[row["l_pk"] for row in gl_languoids])
    new_glottocodes = {}
    pk_to_name = {row["l_pk"]: row["l_name"] for row in gl_languoids}

    # dict mapping branches (i.e. tuples of sub-family names) to dicts of H-languages
    hh_families = OrderedDict()

    # dict mapping identifiers (i.e. hid) of H-languages to branches
    hh_languages = OrderedDict()

    parse_families(args.data_dir.joinpath("languoids", "lff.txt"), hh_families, hh_languages)

    # handle isolates / collapse families with exactly one leaf:
    isolate_names = {}
    collapsed_names = {}
    for key in hh_families.keys():
        if len(hh_families[key]) == 1:
            if len(key) == 1:
                # isolate
                hh_languages[hh_families[key].keys()[0]][0] = None
                isolate_names[key[0]] = hh_families[key].keys()[0]  # map name to code
            else:
                hh_languages[hh_families[key].keys()[0]][0] = key[:-1]
                collapsed_names[key[-1]] = hh_families[key].keys()[0]
            del hh_families[key]

    # now add the unclassifiabble, unattested, un-whatever
    parse_families(args.data_dir.joinpath("languoids", "lof.txt"), hh_families, hh_languages)

    # we also want to be able to lookup families by name
    fname_to_branches = defaultdict(list)
    for branch in hh_families:
        fname_to_branches[branch[-1]].append(branch)

    new_hid_to_pk = {}
    for code, (hnode, status, name) in hh_languages.items():
        if code not in hid_to_pk:
            # we have to insert a new H-language!
            max_languoid_pk += 1
            new_hid_to_pk[code] = max_languoid_pk

            if name in pk_to_name.values():
                args.log.warn("new code {1} for existing name {0}".format(name, code))
            changes.append(
                languoid(
                    max_languoid_pk,
                    "language",
                    hid=code,
                    id=glottocode(unicode(name), DBSession, new_glottocodes),
                    name=name,
                    hname=name,
                    status=status,
                )
            )
            stats.update(["new_languages"])

    duplicate_leafset_to_branch = {}
    leafset_to_branch = {}
    for family, langs in hh_families.items():
        leafs = get_leafset(hid for hid in langs.keys() if hid in hid_to_pk)
        if not leafs:
            args.log.info("Family with only new languages: %s, %s" % (family, langs))
            continue

        if leafs in leafset_to_branch:
            # so we have already seen this exact set of leaves.
            #
            # special case: there may be additional "Unclassified something" nodes in
            # branch without any changes in the set of leafs ...
            if not [n for n in family if n.startswith("Unclassified")]:
                # ... or the full leafset contains new languages
                assert [hid for hid in hh_families[family[:-1]].keys() if hid in new_hid_to_pk]
            fset, rset = set(family), set(leafset_to_branch[leafs])
            assert rset.issubset(fset)
            assert leafs not in duplicate_leafset_to_branch
            duplicate_leafset_to_branch[leafs] = family
        else:
            leafset_to_branch[leafs] = family

    #
    # at this point leafset_to_branch is a consolidated mapping of sets of H-Languages
    # to branches in the new family tree.
    #

    # for set comparisons we compute a list of actual sets (not tuples) of leafs
    # ordered by length.
    leafsets = [set(t) for t in sorted(leafset_to_branch.keys(), key=lambda s: len(s))]

    todo = []

    gl_family_to_leafset = {}

    def select_leafs(pk):
        l, tc = Languoid.__table__.alias("l"), TreeClosureTable.__table__.alias("tc")
        return [
            r["l_hid"]
            for r in DBSession.execute(
                select([l, tc], use_labels=True).where(
                    and_(
                        l.c.pk == tc.c.child_pk,
                        l.c.hid != None,
                        l.c.status != LanguoidStatus.provisional,
                        tc.c.parent_pk == pk,
                    )
                )
            )
        ]

    for row in gl_languoids:
        if row["ll_level"] == LanguoidLevel.family and row["l_active"]:
            leafs = get_leafset(select_leafs(row["l_pk"]))
            assert leafs
            glnode = GLNode(
                row["l_pk"], row["l_name"], row["ll_level"].name, row["ll_father_pk"], row["l_jsondata"].get("hname")
            )
            gl_family_to_leafset[glnode] = leafs

    # note: for legacy gl nodes, we map leaf-tuples to lists of matching nodes!
    leafset_to_gl_family = defaultdict(list)
    for node, leafs in gl_family_to_leafset.items():
        leafset_to_gl_family[leafs].append(node)

    # now we look for matches between old and new classification:
    for leafs, nodes in leafset_to_gl_family.items():
        todo.extend(
            match_nodes(args, leafs, nodes, leafset_to_branch, duplicate_leafset_to_branch, leafsets, fname_to_branches)
        )

    # compile a mapping for exact matches:
    branch_to_pk = {}
    for m in todo:
        if m.hid:
            if m.hid in branch_to_pk:
                if branch_to_pk[m.hid] != m.pk:
                    # compare names:
                    if pk_to_name[m.pk] == m.hid[-1]:
                        args.log.info("#### type1")
                        branch_to_pk[m.hid] = m.pk
                    elif pk_to_name[branch_to_pk[m.hid]] == m.hid[-1]:
                        args.log.info("#### type2")
                    else:
                        raise ValueError
            else:
                branch_to_pk[m.hid] = m.pk

    for hnode in sorted(hh_families.keys(), key=lambda b: (len(b), b)):
        # loop through branches breadth first to determine what's to be inserted
        if hnode not in branch_to_pk:
            t = get_leafset(hh_families[hnode].keys())
            if t in leafset_to_gl_family:
                # the "Unclassified subfamily" special case from above:
                if not [n for n in hnode if n.startswith("Unclassified")]:
                    assert [hid for hid in hh_families[hnode[:-1]].keys() if hid in new_hid_to_pk]
                # make sure, the existing glottolog family for the set of leafs is mapped
                # to some other node in the new classification:
                assert leafset_to_gl_family[t][0].pk in [m.pk for m in todo if m.hid]

            max_languoid_pk += 1
            branch_to_pk[hnode] = max_languoid_pk
            pk_to_name[max_languoid_pk] = hnode[-1]
            attrs = languoid(
                max_languoid_pk,
                "family",
                id=glottocode(unicode(hnode[-1]), DBSession, new_glottocodes),
                name=hnode[-1],
                hname=hnode[-1],
            )
            if len(hnode) > 1:
                attrs["father_pk"] = branch_to_pk[tuple(list(hnode)[:-1])]
                assert attrs["father_pk"]
            stats.update(["new"])
            changes.append(attrs)

    # now on to the updates for families:
    for m in todo:
        attrs = languoid(m.pk, "family", name=pk_to_name[m.pk])
        if m.hid:
            stats.update(["matches"])
            if len(m.hid) > 1:
                attrs["father_pk"] = branch_to_pk[tuple(list(m.hid)[:-1])]
            if getattr(m, "rename", False):
                attrs["name"] = m.hid[-1]
            attrs["hname"] = m.hid[-1]
        else:
            attrs["active"] = False  # mark the languoid as obsolete.
            if getattr(m, "pointer", False):
                print "~~", m.pk, pk_to_name[m.pk].encode("utf8"), "->", ", ".join(m.pointer).encode("utf8")
                stats.update(["migrations"])
                attrs["replacement"] = branch_to_pk[m.pointer]
            else:
                stats.update(["nomatches"])
        changes.append(attrs)

    args.log.info("%s" % stats)

    risolate_names = dict(zip(isolate_names.values(), isolate_names.keys()))
    rcollapsed_names = dict(zip(collapsed_names.values(), collapsed_names.keys()))

    # and updates of father_pks for languages:
    for l, (hnode, status, name) in hh_languages.items():
        id_ = hid_to_pk.get(l)
        if not id_:
            id_ = new_hid_to_pk.get(l)
            attrs = languoid(id_, "language", status=status)
        else:
            attrs = languoid(id_, "language", status=status)
            # In case of existing languoids, we don't change the active flag!
            del attrs["active"]
        if id_ in pk_to_name and name != pk_to_name[id_]:
            if slug(pk_to_name[id_]) == slug(name):
                attrs["name"] = name
        if hnode:
            attrs["father_pk"] = branch_to_pk[hnode]
        # look for hnames!
        if l in risolate_names:
            attrs["hname"] = risolate_names[l]
        if l in rcollapsed_names:
            attrs["hname"] = rcollapsed_names[l]
        changes.append(attrs)

    for row in gl_languoids:
        hid = row["ll_hid"]
        if hid and "NOCODE" in hid and hid not in hh_languages:
            # languoids with Harald's private code that are no longer in use
            changes.append(languoid(row["l_pk"], "language", status="retired", active=False, father_pk=None))

    jsondump(changes, args.data_dir.joinpath("languoids", "changes.json"), indent=4)
Exemplo n.º 3
0
def main(args):
    active_only = not args.all
    codes = dict((row[0], row[1]) for row in
                 DBSession.execute("select ll.hid, l.pk from languoid as ll, language as l where ll.pk = l.pk and ll.hid is not null"))

    maxid = DBSession.execute(
        "select pk from languoid order by pk desc limit 1").fetchone()[0]
    gcs = {}

    lnames = {}
    for row in DBSession.execute("select pk, name from language"):
        lnames[row[0]] = row[1]

    # dict mapping branches (i.e. tuples of sub-family names) to dicts of H-languages
    families = OrderedDict()

    # dict mapping identifiers of H-languages to branches
    languages = OrderedDict()

    parse_families(data_file(args, 'lff.txt'), families, languages)

    # handle isolates / collapse families with exactly one leaf:
    isolate_names = {}
    collapsed_names = {}
    for key in families.keys():
        if len(families[key]) == 1:
            if len(key) == 1:
                # isolate
                languages[families[key].keys()[0]][0] = None
                isolate_names[key[0]] = families[key].keys()[0]  # map name to code
            else:
                languages[families[key].keys()[0]][0] = key[:-1]
                collapsed_names[key[-1]] = families[key].keys()[0]
            del families[key]

    # we also want to be able to lookup families by name
    names = {}
    for branch in families:
        name = branch[-1]
        if name in names:
            names[name].append(branch)
        else:
            names[name] = [branch]

    # now add the unclassifiabble, unattested, un-whatever
    parse_families(data_file(args, 'lof.txt'), families, languages)

    ncodes = {}
    languoids = []
    for code in languages:
        if code not in codes:
            maxid += 1
            ncodes[code] = maxid
            hnode, status, name, comment = languages[code]
            # we have to insert a new H-language!
            attrs = languoid(
                maxid,
                'language',
                hid=code,
                id=glottocode(unicode(name), DBSession, gcs),
                name=name,
                hname=name,
                status=status,
                globalclassificationcomment=comment or None,
            )
            print '++', attrs
            languoids.append(attrs)

    urnodes = {}
    rnodes = {}
    for family in families:
        #leafs = families[family]
        assert family[0] not in ['Speech Register', 'Spurious']
        leafs = tuple(sorted(code for code in families[family].keys() if code in codes))
        try:
            assert leafs
        except:
            print 'Family with only new languages!!'
            print family
            continue
            #raise
        if leafs in rnodes:
            # so we have already seen this exact set of leaves.
            #
            # special case: there may be additional "Unclassified something" nodes in
            # branch without any changes in the set of leafs ...
            try:
                assert [n for n in family if n.startswith('Unclassified')]
            except:
                print family
                print leafs
                # ... or the full leafset contains new languages
                assert [code for code in families[family[:-1]].keys() if code in ncodes]
            fset, rset = set(family), set(rnodes[leafs])
            assert rset.issubset(fset)
            assert leafs not in urnodes
            urnodes[leafs] = family
            #if len(family) > rnodes[leafs]:
            #    rnodes[leafs] = family
        else:
            rnodes[leafs] = family

    #
    # at this point rnodes is a consolidated mapping of sets of H-Languages to branches in
    # the family tree.
    #

    # for set comparisons we compute a list of actual sets of leafs as well
    leafsets = [set(t) for t in sorted(rnodes.keys(), key=lambda s: len(s))]

    todo = []

    # dict mapping (id, name, level) tuples for gl languoids of level family to tuples of leafs
    glnodes = {}
    #
    # note: all languoids with level null have children, thus are not dialects!
    #
    sql = "select l.pk, l.name, ll.level, ll.father_pk from languoid as ll, language as l where ll.pk = l.pk and ll.level = 'family' or ll.level is null"
    if active_only:
        sql = "select l.pk, l.name, ll.level, ll.father_pk from languoid as ll, language as l where ll.pk = l.pk and ll.level = 'family' and l.active = true"

    for row in DBSession.execute(sql).fetchall():
        leafs = [r[0] for r in DBSession.execute(
            "select distinct l.hid from treeclosuretable as t, languoid as l where t.child_pk = l.pk and t.parent_pk = %s and l.hid is not null and l.status != 'provisional'"
            % row[0])]
        if leafs:
            glnodes[(row[0], row[2], row[1], row[3])] = tuple(sorted(leafs))
        else:
            # families without leafs will be marked as retired
            if row[1] in names and len(names[row[1]]) == 1:
                # unique family name, good enough for a match!?
                todo.append(Migration(row[0], None, pointer=names[row[1]][0]))
            else:
                todo.append(Migration(row[0], None))

    # note: for legacy gl nodes, we map leaf-tuples to lists of matching nodes!
    rglnodes = {}
    for node, leafs in glnodes.items():
        if leafs in rglnodes:
            rglnodes[leafs].append(node)
        else:
            rglnodes[leafs] = [node]

    # now we look for matches between old and new classification:
    for leafs, nodes in rglnodes.items():
        assert leafs
        assert nodes
        todo.extend(match_nodes(leafs, nodes, rnodes, urnodes, leafsets, names))

    # compile a mapping for exact matches:
    branch_to_pk = {}
    for m in todo:
        if m.hid:
            if m.hid in branch_to_pk:
                if branch_to_pk[m.hid] != m.pk:
                    # compare names:
                    if lnames[m.pk] == m.hid[-1]:
                        print '#### type1'
                        branch_to_pk[m.hid] = m.pk
                    elif lnames[branch_to_pk[m.hid]] == m.hid[-1]:
                        print '#### type2'
                        pass
                    else:
                        print m.hid
                        print m.hid[-1]
                        print lnames[m.pk]
                        print branch_to_pk[m.hid]
                        print m.pk
                        raise ValueError
            else:
                #assert m.hid not in branch_to_pk
                branch_to_pk[m.hid] = m.pk

    new = 0
    for hnode in sorted(families.keys(), key=lambda b: (len(b), b)):
        # loop through branches breadth first to determine what's to be inserted
        if hnode not in branch_to_pk:
            t = tuple(sorted(families[hnode].keys()))
            if t in rglnodes:
                # the "Unclassified subfamily" special case from above:
                try:
                    assert [n for n in hnode if n.startswith('Unclassified')]
                except:
                    # or the "new language inserted higher up" case!
                    assert [code for code in families[hnode[:-1]].keys() if code in ncodes]
                    #print hnode
                    #print t
                    #raise
                # make sure, the existing glottolog family for the set of leafs is mapped
                # to some other node in the new classification:
                assert rglnodes[t][0][0] in [m.pk for m in todo if m.hid]

            maxid += 1
            attrs = languoid(
                maxid,
                'family',
                id=glottocode(unicode(hnode[-1]), DBSession, gcs),
                name=hnode[-1],
                hname=hnode[-1],
            )
            branch_to_pk[hnode] = maxid
            lnames[maxid] = hnode[-1]
            if len(hnode) > 1:
                attrs['father_pk'] = branch_to_pk[tuple(list(hnode)[:-1])]
                assert attrs['father_pk']
            print '++', attrs
            new += 1
            languoids.append(attrs)

    # now on to the updates for families:
    matches, migrations, nomatches = 0, 0, 0
    for m in todo:
        attrs = languoid(m.pk, 'family', name=lnames[m.pk])
        if m.hid:
            #print '==', lnames[m.pk].encode('utf8'), '->', ', '.join(m.hid).encode('utf8')
            matches += 1

            if len(m.hid) > 1:
                attrs['father_pk'] = branch_to_pk[tuple(list(m.hid)[:-1])]
            if getattr(m, 'rename', False):
                attrs['name'] = m.hid[-1]
            attrs['hname'] = m.hid[-1]
        else:
            attrs['active'] = False
            if getattr(m, 'pointer', False):
                print '~~', lnames[m.pk].encode('utf8'), '->', ', '.join(m.pointer).encode('utf8')
                migrations += 1

                attrs['replacement'] = branch_to_pk[m.pointer]
            else:
                print '--', lnames[m.pk].encode('utf8'), '->'
                nomatches += 1
        languoids.append(attrs)

    print matches, 'matches'
    print migrations, 'migrations'
    print nomatches, 'nomatches'
    print new, 'new nodes'

    risolate_names = dict(zip(isolate_names.values(), isolate_names.keys()))
    rcollapsed_names = dict(zip(collapsed_names.values(), collapsed_names.keys()))

    # and updates of father_pks for languages:
    for l in languages:
        hnode, status, name, comment = languages[l]
        id_ = codes.get(l, ncodes.get(l))
        attrs = languoid(id_, 'language', status=status)
        if hnode:
            attrs['father_pk'] = branch_to_pk[hnode]
        attrs['globalclassificationcomment'] = comment or None
        # look for hnames!
        if l in risolate_names:
            attrs['hname'] = risolate_names[l]
        if l in rcollapsed_names:
            attrs['hname'] = rcollapsed_names[l]
        languoids.append(attrs)

    for row in DBSession.execute(
        "select l.pk, ll.hid, l.name from languoid as ll, language as l where ll.pk = l.pk and ll.hid like '%NOCODE_%'"
    ).fetchall():
        if row[1] not in languages:
            # languoids with Harald's private code that are no longer in use
            attrs = languoid(
                row[0], 'language', status='retired', active=False, father_pk=None)
            languoids.append(attrs)

    with open(data_file(args, 'languoids.json'), 'w') as fp:
        json.dump(languoids, fp)
def main(args):
    stats = Counter(new=0, matches=0, migrations=0, nomatches=0)
    l, ll = Language.__table__.alias('l'), Languoid.__table__.alias('ll')
    gl_languoids = list(
        DBSession.execute(
            select([l, ll],
                   use_labels=True).where(l.c.pk == ll.c.pk)).fetchall())

    # we collect a list of changes which we will store in a JSON file.
    changes = []

    hid_to_pk = {
        row['ll_hid']: row['l_pk']
        for row in gl_languoids if row['ll_hid']
    }
    max_languoid_pk = max(*[row['l_pk'] for row in gl_languoids])
    new_glottocodes = {}
    pk_to_name = {row['l_pk']: row['l_name'] for row in gl_languoids}

    # dict mapping branches (i.e. tuples of sub-family names) to dicts of H-languages
    hh_families = OrderedDict()

    # dict mapping identifiers (i.e. hid) of H-languages to branches
    hh_languages = OrderedDict()

    parse_families(args.data_dir.joinpath('languoids', 'lff.txt'), hh_families,
                   hh_languages)

    # handle isolates / collapse families with exactly one leaf:
    isolate_names = {}
    collapsed_names = {}
    for key in hh_families.keys():
        if len(hh_families[key]) == 1:
            if len(key) == 1:
                # isolate
                hh_languages[hh_families[key].keys()[0]][0] = None
                isolate_names[key[0]] = hh_families[key].keys()[
                    0]  # map name to code
            else:
                hh_languages[hh_families[key].keys()[0]][0] = key[:-1]
                collapsed_names[key[-1]] = hh_families[key].keys()[0]
            del hh_families[key]

    # now add the unclassifiabble, unattested, un-whatever
    parse_families(args.data_dir.joinpath('languoids', 'lof.txt'), hh_families,
                   hh_languages)

    # we also want to be able to lookup families by name
    fname_to_branches = defaultdict(list)
    for branch in hh_families:
        fname_to_branches[branch[-1]].append(branch)

    new_hid_to_pk = {}
    for code, (hnode, status, name) in hh_languages.items():
        if code not in hid_to_pk:
            # we have to insert a new H-language!
            max_languoid_pk += 1
            new_hid_to_pk[code] = max_languoid_pk

            if name in pk_to_name.values():
                args.log.warn('new code {1} for existing name {0}'.format(
                    name, code))
            changes.append(
                languoid(max_languoid_pk,
                         'language',
                         hid=code,
                         id=glottocode(unicode(name), DBSession,
                                       new_glottocodes),
                         name=name,
                         hname=name,
                         status=status))
            stats.update(['new_languages'])

    duplicate_leafset_to_branch = {}
    leafset_to_branch = {}
    for family, langs in hh_families.items():
        leafs = get_leafset(hid for hid in langs.keys() if hid in hid_to_pk)
        if not leafs:
            args.log.info('Family with only new languages: %s, %s' %
                          (family, langs))
            continue

        if leafs in leafset_to_branch:
            # so we have already seen this exact set of leaves.
            #
            # special case: there may be additional "Unclassified something" nodes in
            # branch without any changes in the set of leafs ...
            if not [n for n in family if n.startswith('Unclassified')]:
                # ... or the full leafset contains new languages
                assert [
                    hid for hid in hh_families[family[:-1]].keys()
                    if hid in new_hid_to_pk
                ]
            fset, rset = set(family), set(leafset_to_branch[leafs])
            assert rset.issubset(fset)
            assert leafs not in duplicate_leafset_to_branch
            duplicate_leafset_to_branch[leafs] = family
        else:
            leafset_to_branch[leafs] = family

    #
    # at this point leafset_to_branch is a consolidated mapping of sets of H-Languages
    # to branches in the new family tree.
    #

    # for set comparisons we compute a list of actual sets (not tuples) of leafs
    # ordered by length.
    leafsets = [
        set(t) for t in sorted(leafset_to_branch.keys(), key=lambda s: len(s))
    ]

    todo = []

    gl_family_to_leafset = {}

    def select_leafs(pk):
        l, tc = Languoid.__table__.alias(
            'l'), TreeClosureTable.__table__.alias('tc')
        return [
            r['l_hid'] for r in DBSession.execute(
                select([l, tc], use_labels=True).where(
                    and_(l.c.pk == tc.c.child_pk, l.c.hid != None, l.c.status
                         != LanguoidStatus.provisional, tc.c.parent_pk == pk)))
        ]

    for row in gl_languoids:
        if row['ll_level'] == LanguoidLevel.family and row['l_active']:
            leafs = get_leafset(select_leafs(row['l_pk']))
            assert leafs
            glnode = GLNode(row['l_pk'], row['l_name'], row['ll_level'].name,
                            row['ll_father_pk'],
                            row['l_jsondata'].get('hname'))
            gl_family_to_leafset[glnode] = leafs

    # note: for legacy gl nodes, we map leaf-tuples to lists of matching nodes!
    leafset_to_gl_family = defaultdict(list)
    for node, leafs in gl_family_to_leafset.items():
        leafset_to_gl_family[leafs].append(node)

    # now we look for matches between old and new classification:
    for leafs, nodes in leafset_to_gl_family.items():
        todo.extend(
            match_nodes(args, leafs, nodes, leafset_to_branch,
                        duplicate_leafset_to_branch, leafsets,
                        fname_to_branches))

    # compile a mapping for exact matches:
    branch_to_pk = {}
    for m in todo:
        if m.hid:
            if m.hid in branch_to_pk:
                if branch_to_pk[m.hid] != m.pk:
                    # compare names:
                    if pk_to_name[m.pk] == m.hid[-1]:
                        args.log.info('#### type1')
                        branch_to_pk[m.hid] = m.pk
                    elif pk_to_name[branch_to_pk[m.hid]] == m.hid[-1]:
                        args.log.info('#### type2')
                    else:
                        raise ValueError
            else:
                branch_to_pk[m.hid] = m.pk

    for hnode in sorted(hh_families.keys(), key=lambda b: (len(b), b)):
        # loop through branches breadth first to determine what's to be inserted
        if hnode not in branch_to_pk:
            t = get_leafset(hh_families[hnode].keys())
            if t in leafset_to_gl_family:
                # the "Unclassified subfamily" special case from above:
                if not [n for n in hnode if n.startswith('Unclassified')]:
                    assert [
                        hid for hid in hh_families[hnode[:-1]].keys()
                        if hid in new_hid_to_pk
                    ]
                # make sure, the existing glottolog family for the set of leafs is mapped
                # to some other node in the new classification:
                assert leafset_to_gl_family[t][0].pk in [
                    m.pk for m in todo if m.hid
                ]

            max_languoid_pk += 1
            branch_to_pk[hnode] = max_languoid_pk
            pk_to_name[max_languoid_pk] = hnode[-1]
            attrs = languoid(
                max_languoid_pk,
                'family',
                id=glottocode(unicode(hnode[-1]), DBSession, new_glottocodes),
                name=hnode[-1],
                hname=hnode[-1],
            )
            if len(hnode) > 1:
                attrs['father_pk'] = branch_to_pk[tuple(list(hnode)[:-1])]
                assert attrs['father_pk']
            stats.update(['new'])
            changes.append(attrs)

    # now on to the updates for families:
    for m in todo:
        attrs = languoid(m.pk, 'family', name=pk_to_name[m.pk])
        if m.hid:
            stats.update(['matches'])
            if len(m.hid) > 1:
                attrs['father_pk'] = branch_to_pk[tuple(list(m.hid)[:-1])]
            if getattr(m, 'rename', False):
                attrs['name'] = m.hid[-1]
            attrs['hname'] = m.hid[-1]
        else:
            attrs['active'] = False  # mark the languoid as obsolete.
            if getattr(m, 'pointer', False):
                print '~~', m.pk, pk_to_name[m.pk].encode('utf8'), '->', \
                    ', '.join(m.pointer).encode('utf8')
                stats.update(['migrations'])
                attrs['replacement'] = branch_to_pk[m.pointer]
            else:
                stats.update(['nomatches'])
        changes.append(attrs)

    args.log.info('%s' % stats)

    risolate_names = dict(zip(isolate_names.values(), isolate_names.keys()))
    rcollapsed_names = dict(
        zip(collapsed_names.values(), collapsed_names.keys()))

    # and updates of father_pks for languages:
    for l, (hnode, status, name) in hh_languages.items():
        id_ = hid_to_pk.get(l)
        if not id_:
            id_ = new_hid_to_pk.get(l)
            attrs = languoid(id_, 'language', status=status)
        else:
            attrs = languoid(id_, 'language', status=status)
            # In case of existing languoids, we don't change the active flag!
            del attrs['active']
        if id_ in pk_to_name and name != pk_to_name[id_]:
            if slug(pk_to_name[id_]) == slug(name):
                attrs['name'] = name
        if hnode:
            attrs['father_pk'] = branch_to_pk[hnode]
        # look for hnames!
        if l in risolate_names:
            attrs['hname'] = risolate_names[l]
        if l in rcollapsed_names:
            attrs['hname'] = rcollapsed_names[l]
        changes.append(attrs)

    for row in gl_languoids:
        hid = row['ll_hid']
        if hid and 'NOCODE' in hid and hid not in hh_languages:
            # languoids with Harald's private code that are no longer in use
            changes.append(
                languoid(row['l_pk'],
                         'language',
                         status='retired',
                         active=False,
                         father_pk=None))

    jsondump(changes,
             args.data_dir.joinpath('languoids', 'changes.json'),
             indent=4)