Example #1
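Builds Markdown index files for all languoids of a given level: a top page (languages.md, families.md or dialects.md) linking to per-letter pages, each listing languoids as "Name [glottocode][iso]" entries pointing at their INI files, with every path resolved via languoids_path.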
def make_index(level, repos=None):
    fname = dict(
        language='languages', family='families', dialect='dialects')[level.name]
    links = defaultdict(dict)
    for lang in walk_tree(tree=languoids_path('tree', repos=repos)):
        if lang.level == level:
            label = '{0.name} [{0.id}]'.format(lang)
            if lang.iso:
                label += '[%s]' % lang.iso
            links[slug(lang.name)[0]][label] = \
                lang.dir.joinpath(lang.fname('.ini'))\
                    .relative_to(languoids_path(repos=repos))

    res = [languoids_path(fname + '.md', repos=repos)]
    with res[0].open('w', encoding='utf8') as fp:
        fp.write('## %s\n\n' % fname.capitalize())
        fp.write(' '.join(
            '[-%s-](%s_%s.md)' % (i.upper(), fname, i) for i in sorted(links.keys())))
        fp.write('\n')

    for i, langs in links.items():
        res.append(languoids_path('%s_%s.md' % (fname, i), repos=repos))
        with res[-1].open('w', encoding='utf8') as fp:
            for label in sorted(langs.keys()):
                fp.write('- [%s](%s)\n' % (label, langs[label]))
    return res
Example #2
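Sanity-checks the languoid tree rooted at languoids_path('tree', ...): it tallies languoids by level and category, validates ISO 639-3 codes (including retirements outside Bookkeeping), and reports unregistered glottocodes, missing attributes, malformed directory names, invalid level nesting, and families without children.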
def check_tree(args):
    if args.args:
        iso = ISO(args.args[0] if Path(args.args[0]).exists() else None)
    else:
        iso = None

    tree = languoids_path('tree', repos=args.repos)
    glottocodes = Glottocodes()
    log.info('checking tree at %s' % tree)
    by_level = Counter()
    by_category = Counter()
    for lang in walk_tree(tree=tree):
        by_level.update([lang.level.name])
        if lang.level == Level.language:
            by_category.update([lang.category])

        if iso and lang.iso:
            if lang.iso not in iso:
                log.warn('invalid ISO-639-3 code: %s [%s]' % (lang.id, lang.iso))
            else:
                isocode = iso[lang.iso]
                if isocode.is_retired and lang.category != 'Bookkeeping':
                    msg = '%s %s' % (lang.id, repr(isocode))
                    if len(isocode.change_to) == 1:
                        msg += ' changed to %s' % repr(isocode.change_to[0])
                    log.warn(msg)

        if not lang.id.startswith('unun9') and lang.id not in glottocodes:
            log.error('unregistered glottocode %s' % lang.id)
        for attr in ['level', 'name', 'glottocode']:
            if not getattr(lang, attr):
                log.error('missing %s: %s' % (attr, lang.id))
        if not Glottocode.pattern.match(lang.dir.name):
            log.error('invalid directory name: %s' % lang.dir.name)
        if lang.level == Level.language:
            if lang.parent and lang.parent.level != Level.family:
                log.error('invalid nesting of language under {0}: {1}'.format(
                    lang.parent.level, lang.id))
            for child in lang.children:
                if child.level != Level.dialect:
                    log.error('invalid nesting of {0} under language: {1}'.format(
                        child.level, child.id))
        elif lang.level == Level.family:
            for d in lang.dir.iterdir():
                if d.is_dir():
                    break
            else:
                log.error('family without children: {0}'.format(lang.id))

    def log_counter(counter, name):
        msg = [name + ':']
        maxl = max([len(k) for k in counter.keys()]) + 1
        for k, l in counter.most_common():
            msg.append(('{0:<%s} {1:>8,}' % maxl).format(k + ':', l))
        msg.append(('{0:<%s} {1:>8,}' % maxl).format('', sum(list(counter.values()))))
        log.info('\n'.join(msg))

    log_counter(by_level, 'Languoids by level')
    log_counter(by_category, 'Languages by category')
    return by_level
Example #3
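Prints every ISO 639-3 code of type Individual/Living that neither occurs in the Glottolog tree nor is the change_to target of a retired code.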
def missing_iso(args):
    tree = languoids_path('tree', repos=args.repos)
    iso = ISO(args.args[0] if args.args else None)

    changed_to = []
    for code in iso.retirements:
        changed_to.extend(code.change_to)
    changed_to = set(changed_to)

    ingl = set()
    for lang in walk_tree(tree=tree):
        if lang.iso:
            ingl.add(lang.iso)
    for code in sorted(iso.languages):
        if code.type == 'Individual/Living':
            if code not in changed_to:
                if code.code not in ingl:
                    print(code, code.type)
Example #4
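Compiles the aggregated "monster" bibliography: merges all BibTeX files via an sqlite database, annotates entries with macro_area, hhtype, lgcode and inlg, prints per-field statistics, and writes the results back. The tree obtained from languoids_path('tree', repos=repos) drives the languoid-based annotations.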
def main(repos=DATA_DIR, rebuild=False):
    bibfiles = _bibfiles.Collection(references_path('bibtex', repos=repos))
    previous = references_path('monster.csv', repos=repos)
    replacements = build_path('monster-replacements.json', repos=repos)
    monster = _bibfiles.BibFile(
        build_path('monster-utf8.bib', repos=repos), encoding='utf-8', sortkey='bibkey')
    tree = languoids_path('tree', repos=repos)
    hht = HHTypes(repos=repos)

    print('%s open/rebuild bibfiles db' % time.ctime())
    db = bibfiles.to_sqlite(
        build_path('_bibfiles.sqlite3', repos=repos).as_posix(),
        rebuild=rebuild)

    print('%s compile_monster' % time.ctime())
    m = dict(db.merged())

    print('%s load hh.bib' % time.ctime())
    hhbib = bibfiles['hh.bib'].load()

    # Annotate with macro_area from lgcode when lgcode is assigned manually
    print('%s macro_area_from_lgcode' % time.ctime())
    m = macro_area_from_lgcode(m, tree)

    # Annotate with hhtype
    print('%s annotate hhtype' % time.ctime())
    m = markconservative(
        m,
        hht.triggers,
        hhbib,
        hht,
        build_path('monstermark-hht.txt', repos=repos),
        rank=lambda l: hht[l])

    ltriggers = languoids.load_triggers(tree=tree)

    # Annotate with lgcode
    print('%s annotate lgcode' % time.ctime())
    m = markconservative(
        m,
        ltriggers['lgcode'],
        hhbib,
        hht,
        build_path('monstermark-lgc.txt', repos=repos))

    # Annotate with inlg
    print('%s add_inlg_e' % time.ctime())
    m = add_inlg_e(m, ltriggers['inlg'])

    # Print some statistics
    stats = Counter()
    print(time.ctime())
    for t, f in m.values():
        stats.update(['entry'])
        for field in ['lgcode', 'hhtype', 'macro_area']:
            if field in f:
                stats.update([field])
    print("# entries", stats['entry'])
    for field in ['lgcode', 'hhtype', 'macro_area']:
        print("with " + field, stats[field])

    # Update the CSV with the previous mappings for later reference
    print('%s update_previous' % time.ctime())
    db.to_csvfile(previous)

    print('%s save_replacements' % time.ctime())
    db.to_replacements(replacements)

    # Trickling back
    print('%s trickle' % time.ctime())
    db.trickle(bibfiles)

    # Save
    print('%s save as utf8' % time.ctime())
    monster.save(m, verbose=False)

    print('%s done.' % time.ctime())
Example #5
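Constructor resolving both a repository clone and its languoid tree; note that this variant of util.languoids_path takes the repository location as data_dir rather than repos, apparently an older or diverging signature.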
def __init__(self, repos=None):
    self.repos = Path(repos) if repos else util.DATA_DIR
    self.tree = util.languoids_path('tree', data_dir=self.repos)
Example #6
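Loads the registry of allocated glottocodes from glottocodes.json inside the languoids directory.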
def __init__(self, **kw):
    self._fname = languoids_path('glottocodes.json', **kw)
    self._store = jsonlib.load(self._fname)
Example #7
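Module-level usage: TREE is resolved once at import time with the default repository location; the excerpt (truncated at the end) also shows the Level enum and the Glottocodes registry built on top of languoids_path.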
import os
import re
from itertools import takewhile
from collections import defaultdict, OrderedDict

from enum import IntEnum
from six import text_type
from clldutils.misc import slug
from clldutils import jsonlib
from clldutils.path import Path, walk
from clldutils.inifile import INI

from pyglottolog.util import languoids_path, Trigger


TREE = languoids_path('tree')


class Level(IntEnum):
    family = 1
    language = 2
    dialect = 3


class Glottocodes(object):
    def __init__(self, **kw):
        self._fname = languoids_path('glottocodes.json', **kw)
        self._store = jsonlib.load(self._fname)

    def __contains__(self, item):
        alpha, num = Glottocode(item).split()
Example #8
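Command-line handler printing an ASCII rendering of the tree under a given languoid, optionally capped at a level.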
def tree(args):
    ascii_tree(
        args.args[0],
        getattr(Level, args.args[1], None) if len(args.args) > 1 else None,
        tree=languoids_path('tree', repos=args.repos))
Example #9
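Command-line handler exporting the current tree to lff.txt and dff.txt.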
def tree2lff(args):
    """Create lff.txt and dff.txt from the current languoid tree.

    glottolog tree2lff
    """
    lff.tree2lff(tree=languoids_path('tree', repos=args.repos))
Example #10
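Import script building the IDS (Intercontinental Dictionary Series) database with clld; languoids_path appears about halfway through, where the Glottolog tree is walked to build an ISO-to-glottocode mapping for load_families.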
def main(args):
    Index('ducet', collkey(common.Value.name)).create(DBSession.bind)
    data = Data()
    concept_list = Concepticon(CONCEPTICON_REPOS).conceptlist('Key-2016-1310')

    def concepticon_id(ids_code):
        for item in concept_list:
            if item['IDS_ID'] == ids_code:
                return int(item['CONCEPTICON_ID']) if item['CONCEPTICON_ID'] else None

    def read(table):
        fname = args.data_file(table + '.all.csv')
        if not fname.exists():
            fname = args.data_file(table + '.csv')
        return list(dsv.reader(fname, namedtuples=True))

    dataset = common.Dataset(
        id=ids.__name__,
        name="IDS",
        description="The Intercontinental Dictionary Series",
        published=date(2015, 5, 25),
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        license='http://creativecommons.org/licenses/by/4.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name':
                'Creative Commons Attribution 4.0 International License',
        },
        domain='ids.clld.org')

    DBSession.add(dataset)

    for rec in Database.from_file(args.data_file('sources.bib'), lowercase=True):
        if rec.id not in data['Source']:
            data.add(common.Source, rec.id, _obj=bibtex2source(rec))
    DBSession.flush()

    data_desc = defaultdict(dict)
    for l in read('x_lg_data'):
        data_desc[l.lg_id][l.map_ids_data] = l.header

    # language lang
    iso_codes = {l.id: l.sil_code for l in read('sil_lang')}
    iso_codes = {l.lg_id: iso_codes[l.sil_id] for l in read('x_lg_sil')}
    languages = []

    exclude = []
    for l in read('lang'):
        if l.status == '1':
            exclude.append(l.lg_id)
            continue
        lang_changed = LANGS.get(int(l.lg_id), {})
        code = lang_changed.get('glotto') or lang_changed.get('iso') or iso_codes.get(l.lg_id)
        lang = data.add(models.IdsLanguage, l.lg_id, id=l.lg_id, name=lang_changed.get('name', l.lg_name))
        if code:
            languages.append((code, lang))
        data.add(
            models.Dictionary, l.lg_id,
            id=l.lg_id, name=l.lg_name,
            language=lang,
            default_representation=data_desc[l.lg_id].get('1'),
            alt_representation=data_desc[l.lg_id].get('2'),
            jsondata=dict(status=l.status, date=l.date))

    iso2glotto = {}
    for l in walk_tree(tree=languoids_path('tree', repos=GLOTTOLOG_REPOS)):
        if l.iso:
            iso2glotto[l.iso] = l.id

    load_families(
        Data(), [(iso2glotto.get(c, c), l) for c, l in languages], glottolog=Glottolog(GLOTTOLOG_REPOS), isolates_icon='tcccccc')

    contributors = defaultdict(list)
    sources = defaultdict(list)
    for l in read('lang_compilers'):
        if l.lg_id in exclude:
            continue
        if l.name == "BIBIKO":
            continue
        #name	lg_id	what_did_id
        if int(l.what_did_id) in models.ROLES:
            contributors[slug(l.name)].append((l.name, int(l.what_did_id), l.lg_id))
        else:
            assert int(l.what_did_id) in [4, 395]
            sources[l.name].append(l.lg_id)

    for s, roles in contributors.items():
        name = roles[0][0]
        c = data.add(common.Contributor, s, id=s, name=name)
        if name == 'Mary Ritchie Key':
            c.address = 'University of California, Irvine'
        for lg, specs in groupby(sorted(roles, key=lambda r: r[2]), key=lambda r: r[2]):
            sroles = sorted(
                [s[1] for s in specs],
                reverse=True,
                key=lambda what: what + 2 if what == 2 else what)
            what = sroles[0]
            DBSession.add(common.ContributionContributor(
                contribution=data['Dictionary'][lg],
                contributor=c,
                ord=what,
                primary=what == 2))

    data.add(
        common.Contributor, 'bernardcomrie',
        id='bernardcomrie',
        name="Bernard Comrie",
        address="University of California, Santa Barbara")

    for i, editor in enumerate(['maryritchiekey', 'bernardcomrie']):
        common.Editor(dataset=dataset, contributor=data['Contributor'][editor], ord=i + 1)

    #for i, name in enumerate(sorted(sources.keys())):
    #    c = data.add(common.Source, name, id=str(i + 1), name=name, description=name)

    DBSession.flush()
    for name, lgs in sources.items():
        for _src in name.split(';'):
            src = data['Source'].get(_src.strip())
            if not src:
                print('-- missing source --', _src)
                raise ValueError
            for lg in lgs:
                if lg in exclude:
                    continue
                assert lg in data['Dictionary']
                DBSession.add(common.ContributionReference(
                    contribution_pk=data['Dictionary'][lg].pk, source_pk=src.pk))

    altnames = {}
    for i, l in enumerate(read('alt_names')):
        if l.name in altnames:
            identifier = altnames[l.name]
        else:
            identifier = data.add(
                common.Identifier, l.name,
                id='name-%s' % i, type='name', name=l.name, description='IDS')
            altnames[l.name] = identifier
        if l.lg_id not in exclude and l.name != data['IdsLanguage'][l.lg_id].name:
            DBSession.add(common.LanguageIdentifier(
                identifier=identifier,
                language=data['IdsLanguage'][l.lg_id]))

    # parameter chapter/entry
    for l in read('chapter'):
        data.add(models.Chapter, l.chap_id, id=l.chap_id, name=l.chap_title)

    entries = {}
    for l in read('entry'):
        id_ = '%s-%s' % (l.chap_id, l.entry_id)
        name = l.trans_english
        if name in entries:
            entries[name] += 1
            name = name + ' (%s)' % entries[name]
        else:
            entries[name] = 1
        kw = {
            'id': id_,
            'name': name,
            'concepticon_id': concepticon_id(id_),
            'chapter': data['Chapter'][l.chap_id]}
        for ll in 'french russian spanish portugese'.split():
            kw[ll] = getattr(l, 'trans_' + ll)
        data.add(models.Entry, id_, sub_code=l.entry_id, **kw)

    misaligned = []

    DBSession.flush()
    for entity in 'IdsLanguage Entry Chapter Dictionary'.split():
        for k in list(data[entity].keys()):
            data[entity][k] = data[entity][k].pk

    synsets = set()
    counterparts = set()
    problems = defaultdict(list)

    for lg_id, entries in groupby(
            sorted(read('ids'), key=lambda t: t.lg_id), lambda k: k.lg_id):
        if lg_id in exclude or not lg_id:
            continue

        # keep the memory footprint reasonable
        transaction.commit()
        transaction.begin()

        language = common.Language.get(data['IdsLanguage'][lg_id])
        desc = data_desc.get(lg_id, {})
        words = defaultdict(list)
        for l in entries:
            if empty.match(l.data_1):
                continue

            entry_id = '%s-%s' % (l.chap_id, l.entry_id)
            if entry_id not in data['Entry']:
                continue
                #data.add(
                #    models.Entry, entry_id,
                #    id=entry_id,
                #    name=entry_id,
                #    concepticon_id=concepticon_id(entry_id),
                #    sub_code=l.entry_id,
                #    chapter_pk=data['Chapter'][l.chap_id])
                #DBSession.flush()
                #data['Entry'][entry_id] = data['Entry'][entry_id].pk

            id_ = '%s-%s' % (entry_id, l.lg_id)
            if id_ in synsets:
                vs = models.Synset.get(id_)
            else:
                vs = models.Synset(
                    id=id_,
                    comment=get_string(l.comment or ''),
                    alt_representation=get_string(l.data_2),
                    language=language,
                    contribution_pk=data['Dictionary'][l.lg_id],
                    parameter_pk=data['Entry'][entry_id])
                synsets.add(id_)

            trans1 = list(split_counterparts(l.data_1))
            trans2 = None if empty.match(l.data_2) else list(split_counterparts(l.data_2))

            if trans2:
                if len(trans2) != len(trans1):
                    if language.id != '238':
                        misaligned.append((l.chap_id, l.entry_id, l.lg_id))
                        #print('===', language.id, language.name)
                        #print(l.data_1)
                        #print(l.data_2)
                    # 83 cases of misaligned transcriptions
                    trans2 = None

            for i, word in enumerate(trans1):
                cid = id_ + '-' + str(i + 1 + len(vs.values))
                if cid not in counterparts:
                    v = models.Counterpart(
                        id=cid,
                        name=word,
                        description=desc.get('1'),
                        valueset=vs)
                    words[word].append((v, trans2[i] if trans2 else None))
                    counterparts.add(cid)
                else:
                    print(cid)
                    #12 - 420 - 811 - 3
                    #5 - 390 - 818 - 3
                    #2 - 930 - 819 - 3
                    #2 - 930 - 819 - 3
                    #3 - 120 - 819 - 3
                    #10 - 140 - 822 - 3
                    #9 - 160 - 825 - 3
                    #2 - 430 - 829 - 4

        for i, form in enumerate(words.keys()):
            # Since we identify words based on their string representation, we have to
            # make sure a word has the same alternative transcription for all meanings.
            if language.id == '238':
                alt_names = []
            else:
                alt_names = set(norm(w[1] or '', desc.get('2'), language.id)
                                for w in words[form])
            alt_names = nfilter(alt_names)
            try:
                assert len(alt_names) <= 1
            except AssertionError:
                problems[(language.id, language.name)].append(alt_names)
            word = models.Word(
                id='%s-%s' % (language.id, i + 1),
                name=form,
                description=desc.get('1'),
                language=language,
                alt_name=', '.join(alt_names) if alt_names else None,
                alt_description=desc.get('2')
            )
            for v, _ in words[form]:
                word.counterparts.append(v)
            DBSession.add(word)

        DBSession.flush()

    with dsv.UnicodeWriter(args.data_file('misaligned.csv')) as fp:
        fp.writerows(misaligned)

    # about 250 cases where alternative transcriptions do not covary across meanings.
    for k, v in problems.items():
        print(k, len(v))
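Pulling the recurring pattern together, here is a minimal usage sketch. It is an illustration under stated assumptions, not documented API: the path ./glottolog standing in for a local clone of the Glottolog data repository is hypothetical, and walk_tree and Level are assumed importable from pyglottolog.languoids, as the module excerpt in Example #7 suggests.

from pyglottolog.util import languoids_path
from pyglottolog.languoids import walk_tree, Level

# Hypothetical location of a local clone of the Glottolog data repository.
tree = languoids_path('tree', repos='./glottolog')

# Walk the languoid tree and print every language-level node,
# mirroring the filtering seen in Example #1.
for lang in walk_tree(tree=tree):
    if lang.level == Level.language:
        print(lang.id, lang.name)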