예제 #1
0
    def test_Reference(self):
        """Exercise string formatting and list parsing of ``Reference``."""
        from pyglottolog.objects import Reference

        ref = Reference('bib:key', '12-34', 'German')
        rendered = '{0}'.format(ref)
        self.assertEqual(rendered, '**bib:key**:12-34<trigger "German">')

        # The formatted form must be accepted back by the list parser.
        Reference.from_list([rendered])

        # A string that is not in reference format is rejected.
        self.assertRaises(ValueError, Reference.from_list, ['abc'])
예제 #2
0
def show(args):
    """Display details of a Glottolog object.

    glottolog show <GLOTTOCODE>|<ISO-CODE>|<BIBTEXKEY>
    """
    # NOTE(review): the docstring above looks like it doubles as CLI usage
    # text, so it is kept verbatim - confirm before rewording it.

    def heading(text):
        # Section titles are printed bold and underlined.
        sprint(text, attrs=['bold', 'underline'])

    # A first argument containing a colon is handled as a bibliographic
    # reference; anything else is resolved to a languoid below.
    if args.args and ':' in args.args[0]:
        spec = args.args[0]
        # A leading '**' marks the full reference notation, which is parsed;
        # otherwise the argument is used directly as the reference key.
        ref = (
            Reference.from_string(spec)
            if spec.startswith('**')
            else Reference(key=spec))
        heading('Glottolog reference {0}'.format(ref))
        print()
        source = ref.get_source(args.repos)
        sprint(source.text())
        print()
        sprint(source)
        return

    lang = existing_lang(args)
    print()
    heading('Glottolog languoid {0}'.format(lang.id))
    print()
    heading('Classification:')
    args.repos.ascii_tree(lang, maxlevel=1)
    print()
    heading('Info:')
    sprint('Path: {0}'.format(lang.fname), 'green', attrs=['bold'])

    # Sources are listed in their own section at the end, so they are taken
    # out of the config before the config is dumped.
    refs = lang.sources
    if refs:
        del lang.cfg['sources']['glottolog']
        del lang.cfg['sources']

    for line in lang.cfg.write_string().split('\n'):
        if line.startswith('#'):
            # Lines starting with '#' are not shown.
            continue
        # Lines starting with '[' (section headers) are emphasised.
        sprint(line, None, attrs=['bold'] if line.startswith('[') else [])

    heading('Sources:')
    for ref in refs:
        source = ref.get_source(args.repos)
        sprint(source.id, color='green')
        sprint(source.text())
        print()
예제 #3
0
 def sources(self):
     """Return the references listed under ``[sources] glottolog``.

     An empty list is returned when the config has no such option.
     """
     if not self.cfg.has_option('sources', 'glottolog'):
         return []
     raw_refs = self.cfg.getlist('sources', 'glottolog')
     return Reference.from_list(raw_refs)
예제 #4
0
def check(args):
    """Check the glottolog data for consistency.

    glottolog check [tree|refs]
    """
    # NOTE(review): the docstring above looks like it doubles as CLI usage
    # text, so it is kept verbatim - confirm before rewording it.
    #
    # Inputs read from ``args``:
    #   args.args  - positional arguments; the first one selects what to
    #                check ('refs', 'tree'), defaulting to 'all'.
    #   args.repos - repository object providing ``bibfiles``, ``iso``,
    #                ``languoids()``, ``glottocodes`` and ``ascii``-free data
    #                access used below.
    #   args.log   - logger exposing ``error``, ``warn`` and ``info``.
    #
    # Returns the Counter of languoids per level when the tree was checked,
    # and None when only the bibfiles were checked.

    def error(obj, msg):
        args.log.error(message(obj, msg))

    def warn(obj, msg):
        # NOTE(review): if args.log is a stdlib logging.Logger, ``warn`` is a
        # deprecated alias of ``warning`` - confirm the logger type before
        # migrating this call.
        args.log.warn(message(obj, msg))

    def info(obj, msg):
        args.log.info(message(obj, msg))

    def log_counter(counter, name):
        # Print a two-column table of the counts followed by a total line.
        # An empty counter falls back to a label width of 0 so that max()
        # is never called on an empty sequence (which raises ValueError).
        maxl = max([len(k) for k in counter] or [0]) + 1
        row = '{0:<%s} {1:>8,}' % maxl
        msg = [name + ':']
        for k, count in counter.most_common():
            msg.append(row.format(k + ':', count))
        msg.append(row.format('', sum(counter.values())))
        print('\n'.join(msg))

    what = args.args[0] if args.args else 'all'

    if what in ['all', 'refs']:
        for bibfile in args.repos.bibfiles:
            bibfile.check(args.log)

    if what not in ['all', 'tree']:
        return

    hhkeys = args.repos.bibfiles['hh.bib'].keys()
    iso = args.repos.iso
    args.log.info('checking ISO codes against %s' % iso)
    args.log.info('checking tree at %s' % args.repos)
    by_level = Counter()
    by_category = Counter()
    iso_in_gl, languoids, iso_splits = {}, {}, []
    names = defaultdict(set)

    # First pass: build the glottocode -> languoid map, reporting clashes.
    # On a clash the later languoid replaces the earlier one in the map.
    for lang in args.repos.languoids():
        # duplicate glottocodes:
        if lang.id in languoids:
            error(
                lang.id, 'duplicate glottocode\n{0}\n{1}'.format(
                    languoids[lang.id].dir, lang.dir))
        languoids[lang.id] = lang

    # Second pass: per-languoid checks, resolved against the complete map.
    for lang in languoids.values():
        ancestors = lang.ancestors_from_nodemap(languoids)
        children = lang.children_from_nodemap(languoids)

        # Falsy coordinates (missing or 0) are skipped; 0 is in range anyway.
        if lang.latitude and not (-90 <= lang.latitude <= 90):
            error(lang, 'invalid latitude: {0}'.format(lang.latitude))
        if lang.longitude and not (-180 <= lang.longitude <= 180):
            error(lang, 'invalid longitude: {0}'.format(lang.longitude))

        assert isinstance(lang.countries, list)
        assert isinstance(lang.macroareas, list)

        # Sources attributed to provider 'hh' must exist in hh.bib.
        if 'sources' in lang.cfg:
            for ref in Reference.from_list(
                    lang.cfg.getlist('sources', 'glottolog')):
                if ref.provider == 'hh' and ref.key not in hhkeys:
                    error(lang, 'missing source: {0}'.format(ref))

        for attr in ['classification_comment', 'ethnologue_comment']:
            obj = getattr(lang, attr)
            if obj:
                obj.check(lang, hhkeys, args.log)

        names[lang.name].add(lang)
        by_level.update([lang.level.name])
        if lang.level == Level.language:
            by_category.update([lang.category])

        if iso and lang.iso:
            if lang.iso not in iso:
                warn(lang, 'invalid ISO-639-3 code [%s]' % lang.iso)
            else:
                isocode = iso[lang.iso]
                if lang.iso in iso_in_gl:
                    error(
                        isocode,
                        'duplicate: {0}, {1}'.format(iso_in_gl[lang.iso].id,
                                                     lang.id))
                iso_in_gl[lang.iso] = lang
                if isocode.is_retired and lang.category != 'Bookkeeping':
                    if isocode.type == 'Retirement/split':
                        # Splits are reported after the loop, once every
                        # ISO code present in the tree is known.
                        iso_splits.append(lang)
                    else:
                        msg = repr(isocode)
                        level = info
                        # Exactly one replacement code: escalate to a
                        # warning and name the replacement.
                        if len(isocode.change_to) == 1:
                            level = warn
                            msg += ' changed to [%s]' % isocode.change_to[
                                0].code
                        level(lang, msg)

        # Glottocodes starting with 'unun9' are exempt from the
        # registration check.
        if not lang.id.startswith(
                'unun9') and lang.id not in args.repos.glottocodes:
            error(lang, 'unregistered glottocode')
        for attr in ['level', 'name']:
            if not getattr(lang, attr):
                error(lang, 'missing %s' % attr)
        if lang.level == Level.language:
            # The last ancestor is treated as the direct parent.
            parent = ancestors[-1] if ancestors else None
            if parent and parent.level != Level.family:
                error(
                    lang, 'invalid nesting of language under {0}'.format(
                        parent.level))
            for child in children:
                if child.level != Level.dialect:
                    error(
                        child, 'invalid nesting of {0} under language'.format(
                            child.level))
        elif lang.level == Level.family:
            # A family needs at least one subdirectory in its directory.
            if not any(d.is_dir() for d in lang.dir.iterdir()):
                error(lang, 'family without children')

    if iso:
        changed_to = set(chain.from_iterable(
            code.change_to for code in iso.retirements))
        # Living individual ISO languages that are neither the target of a
        # retirement nor present in the tree are reported as missing.
        for code in sorted(iso.languages):
            if (code.type == 'Individual/Living'
                    and code not in changed_to
                    and code.code not in iso_in_gl):
                info(repr(code), 'missing')
        for lang in iso_splits:
            isocode = iso[lang.iso]
            missing = [
                s.code for s in isocode.change_to if s.code not in iso_in_gl
            ]
            if missing:
                warn(
                    lang, '{0} missing new codes: {1}'.format(
                        repr(isocode), ', '.join(missing)))

    for name, gcs in sorted(names.items()):
        if len(gcs) > 1:
            # duplicate names:
            method = error
            if sum(1 for n in gcs if n.level != Level.dialect) <= 1:
                # at most one of the languoids is not a dialect, just warn
                method = warn
            if sum(
                    1 for n in gcs
                    if (not n.lineage) or (n.lineage[0][1] != 'book1242')
            ) <= 1:
                # at most one of the languoids is not in bookkeeping, just warn
                method = warn
            method(
                name, 'duplicate name: {0}'.format(', '.join(
                    sorted(
                        '{0} <{1}>'.format(n.id, n.level.name[0]) for n in gcs
                    ))))

    log_counter(by_level, 'Languoids by level')
    log_counter(by_category, 'Languages by category')
    return by_level