Example #1
    def test_add_inlg_e(self):
        from pyglottolog.monsterlib._libmonster import add_inlg_e, INLG
        from pyglottolog.languoids import load_triggers

        res = add_inlg_e(
            {1: ('article', {'title': 'Grammar of language'})},
            load_triggers(tree=self.tree)[INLG],
            verbose=False)
        assert_equal(res[1][1][INLG], 'language [abc]')
Example #2
    def test_markall(self):
        from pyglottolog.monsterlib._libmonster import markall
        from pyglottolog.languoids import load_triggers

        bib = {
            1: ('article', {'title': "other grammar of lang"}),
            2: ('article', {'title': "grammar of lang and dial"}),
            3: ('article', {'title': "other"}),
            4: ('article', {'title': "grammar and phonologie and morphologie"})
        }
        hht = HHTypes(repos=self.repos)
        # Tag each entry's 'hhtype' field from trigger words found in its title,
        # using the HHTypes ranking for entries that match more than one type.
        markall(bib, hht.triggers, verbose=False, rank=lambda l: hht[l].rank)
        self.assertIn('grammar', bib[1][1]['hhtype'])
        self.assertIn('morphologie and phonologie;grammar', bib[4][1]['hhtype'])

        # The same mechanism, with 'lgcode' triggers loaded from the languoid tree.
        markall(bib, load_triggers(tree=self.tree)['lgcode'], verbose=False)
        self.assertIn('language', bib[1][1]['lgcode'])
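
For orientation, here is a self-contained sketch of the keyword-trigger idea this test exercises: each label carries trigger words, an entry receives every label whose trigger occurs in its title, and multiple hits are ordered by rank. The trigger table, the rank values, and the ';'-joined output below are illustrative assumptions, not pyglottolog's actual markall implementation.

# Simplified illustration of title-keyword tagging (hypothetical data).
TRIGGERS = {
    # label -> (rank, trigger words)
    'grammar': (1, {'grammar'}),
    'phonologie': (2, {'phonologie'}),
    'morphologie': (3, {'morphologie'}),
}

def tag_entry(fields, triggers=TRIGGERS):
    """Return all matching labels, best rank first, joined with ';'."""
    words = set(fields.get('title', '').lower().split())
    hits = [(rank, label) for label, (rank, trigs) in triggers.items() if trigs & words]
    return ';'.join(label for rank, label in sorted(hits))

print(tag_entry({'title': 'grammar and phonologie and morphologie'}))
# -> grammar;phonologie;morphologie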
Example #3
def add_inlg_e(e):
    inlg = languoids.load_triggers(type_='inlg')
    # FIXME: does not honor 'NOT' for now

    # Flatten the trigger spec into a word -> language-label lookup.
    dh = {word: label for (cls, label), triggers in inlg.iteritems()
          for t in triggers for flag, word in t}

    # Entries that have a title but no 'inlg' yet, with their title
    # (and booktitle) tokenized into words.
    ts = [(k, wrds(fields['title']) + wrds(fields.get('booktitle', '')))
          for (k, (typ, fields)) in e.iteritems()
          if fields.has_key('title') and not fields.has_key('inlg')]
    print len(ts), "without", 'inlg'

    # Candidate labels per entry: every label whose trigger word occurs among the entry's words.
    ann = [(k, set(dh[w] for w in tit if dh.has_key(w))) for (k, tit) in ts]

    # Only entries with exactly one candidate get annotated.
    unique = [(k, lgs.pop()) for (k, lgs) in ann if len(lgs) == 1]
    print len(unique), "cases of unique hits"
    fnups = [(k, 'inlg', v) for (k, v) in unique]
    t2 = renfn(e, fnups)
    #print len(unique), "updates"

    # Per-language word frequencies from entries whose 'inlg' resolves to a
    # single code, used to eyeball candidate trigger words (see prints below).
    newtrain = grp2fd([
        (lgcodestr(fields['inlg'])[0], w)
        for (k, (typ, fields)) in t2.iteritems()
        if fields.has_key('title') and fields.has_key('inlg')
        if len(lgcodestr(fields['inlg'])) == 1
        for w in wrds(fields['title'])])
    #newtrain = grp2fd([(cname(lgc), w) for (lgcs, w) in alc if len(lgcs) == 1 for lgc in lgcs])
    for (lg, wf) in sorted(newtrain.iteritems(), key=lambda x: len(x[1])):
        # Score frequent words by in-language count vs. count in other languages.
        cm = [(1 + f, float(1 - f + sum(owf.get(w, 0) for owf in newtrain.itervalues())), w)
              for (w, f) in wf.iteritems() if f > 9]
        cms = [(f / fn, f, fn, w) for (f, fn, w) in cm]
        cms.sort(reverse=True)
        ##print lg, cms[:10]
        ##print ("h['%s'] = " % lg) + str([x[3] for x in cms[:10]])
    return t2
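
The comprehension that builds dh above implies the shape of the trigger data: each (cls, label) key maps to a list of clauses, and each clause is a list of (flag, word) pairs. The following standalone sketch (Python 3, with made-up trigger values shaped after the 'language [abc]' label seen in example #1, and a plain .split() standing in for wrds) replays just the flattening and the unique-hit filter:

# Hypothetical 'inlg' triggers with the structure implied by the dh comprehension:
# (cls, label) -> list of clauses, each clause a list of (flag, word) pairs.
inlg = {
    ('inlg', 'language [abc]'): [[(True, 'language')]],
    ('inlg', 'otherlang [xyz]'): [[(True, 'otherlang')]],
}

# word -> label lookup, as in dh above
dh = {word: label
      for (cls, label), triggers in inlg.items()
      for clause in triggers
      for flag, word in clause}

entries = {
    1: ('article', {'title': 'Grammar of language'}),
    2: ('article', {'title': 'language and otherlang'}),
}

for key, (typ, fields) in entries.items():
    words = fields['title'].lower().split()
    hits = {dh[w] for w in words if w in dh}
    if len(hits) == 1:                       # only unambiguous matches are kept
        print(key, '->', hits.pop())         # 1 -> language [abc]
    else:
        print(key, '-> ambiguous, skipped')  # 2 matches both labels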
Example #4
File: monster.py Project: clld/glottolog
def main(repos=DATA_DIR, rebuild=False):
    bibfiles = _bibfiles.Collection(references_path('bibtex', repos=repos))
    previous = references_path('monster.csv', repos=repos)
    replacements = build_path('monster-replacements.json', repos=repos)
    monster = _bibfiles.BibFile(
        build_path('monster-utf8.bib', repos=repos), encoding='utf-8', sortkey='bibkey')
    tree = languoids_path('tree', repos=repos)
    hht = HHTypes(repos=repos)

    print('%s open/rebuild bibfiles db' % time.ctime())
    db = bibfiles.to_sqlite(
        build_path('_bibfiles.sqlite3', repos=repos).as_posix(),
        rebuild=rebuild)

    print('%s compile_monster' % time.ctime())
    m = dict(db.merged())

    print('%s load hh.bib' % time.ctime())
    hhbib = bibfiles['hh.bib'].load()

    # Annotate with macro_area from lgcode when lgcode is assigned manually
    print('%s macro_area_from_lgcode' % time.ctime())
    m = macro_area_from_lgcode(m, tree)

    # Annotate with hhtype
    print('%s annotate hhtype' % time.ctime())
    m = markconservative(
        m,
        hht.triggers,
        hhbib,
        hht,
        build_path('monstermark-hht.txt', repos=repos),
        rank=lambda l: hht[l])

    ltriggers = languoids.load_triggers(tree=tree)

    # Annotate with lgcode
    print('%s annotate lgcode' % time.ctime())
    m = markconservative(
        m,
        ltriggers['lgcode'],
        hhbib,
        hht,
        build_path('monstermark-lgc.txt', repos=repos))

    # Annotate with inlg
    print('%s add_inlg_e' % time.ctime())
    m = add_inlg_e(m, ltriggers['inlg'])

    # Print some statistics
    stats = Counter()
    print(time.ctime())
    for t, f in m.values():
        stats.update(['entry'])
        for field in ['lgcode', 'hhtype', 'macro_area']:
            if field in f:
                stats.update([field])
    print("# entries", stats['entry'])
    for field in ['lgcode', 'hhtype', 'macro_area']:
        print("with " + field, stats[field])

    # Update the CSV with the previous mappings for later reference
    print('%s update_previous' % time.ctime())
    db.to_csvfile(previous)

    print('%s save_replacements' % time.ctime())
    db.to_replacements(replacements)

    # Trickling back
    print('%s trickle' % time.ctime())
    db.trickle(bibfiles)

    # Save
    print('%s save as utf8' % time.ctime())
    monster.save(m, verbose=False)

    print('%s done.' % time.ctime())
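
A minimal sketch of how this entry point might be invoked from the command line, assuming the default DATA_DIR; the --rebuild flag handling below is an assumption for illustration, not necessarily how monster.py is actually wired up.

import sys

if __name__ == '__main__':
    # Rebuild the sqlite cache only when explicitly requested (hypothetical flag).
    main(rebuild='--rebuild' in sys.argv)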
Example #5
    def test_load_triggers(self):
        from pyglottolog.languoids import load_triggers

        res = load_triggers(tree=self.tree)
        # The two sections loaded from the tree are 'inlg' and 'lgcode'.
        self.assertEqual(len(res), 2)
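
Given how the other examples index the result ([INLG] in example #1, ['lgcode'] in examples #2 and #4), the two sections asserted here are presumably 'inlg' and 'lgcode'. A small usage sketch, with a hypothetical tree path:

from pyglottolog.languoids import load_triggers

triggers = load_triggers(tree='path/to/languoids/tree')  # hypothetical path
for section in sorted(triggers):
    print(section, len(triggers[section]), 'trigger entries')  # expected: inlg, lgcode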