def test_add_inlg_e(self):
    # An entry whose only field is a title is expected to come back with an
    # INLG field holding the label 'language [abc]'.
    from pyglottolog.monsterlib._libmonster import add_inlg_e, INLG

    bib = {1: ('article', {'title': 'Grammar of language'})}
    inlg_triggers = load_triggers(tree=self.tree)[INLG]
    annotated = add_inlg_e(bib, inlg_triggers, verbose=False)

    _, fields = annotated[1][0], annotated[1][1]
    assert_equal(fields[INLG], 'language [abc]')
def test_markall(self):
    # markall is run twice over the same small bibliography: first with the
    # hhtype triggers, then with the 'lgcode' triggers. After each pass the
    # field written into the entries is checked.
    from pyglottolog.monsterlib._libmonster import markall

    titles = [
        "other grammar of lang",
        "grammar of lang and dial",
        "other",
        "grammar and phonologie and morphologie",
    ]
    # Keys are 1-based; every entry gets its own fields dict.
    bib = {
        key: ('article', {'title': title})
        for key, title in enumerate(titles, 1)
    }

    def fields_of(key):
        return bib[key][1]

    hht = HHTypes(repos=self.repos)

    def rank(hhtype):
        return hht[hhtype].rank

    markall(bib, hht.triggers, verbose=False, rank=rank)
    self.assertIn('grammar', fields_of(1)['hhtype'])
    self.assertIn(
        'morphologie and phonologie;grammar', fields_of(4)['hhtype'])

    lgcode_triggers = load_triggers(tree=self.tree)['lgcode']
    markall(bib, lgcode_triggers, verbose=False)
    self.assertIn('language', fields_of(1)['lgcode'])
def add_inlg_e(e):
    """Add an 'inlg' field to entries whose title words hit exactly one label.

    The 'inlg' triggers are loaded via ``languoids.load_triggers`` and
    flattened into a word -> label lookup. Every entry that has a 'title'
    but no 'inlg' field is scanned: the words of its title and booktitle
    are looked up, and when all hits agree on one single label that label
    is scheduled as the new 'inlg' value.

    :param e: mapping of key -> (entry type, fields dict).
    :return: whatever ``renfn(e, updates)`` returns, where ``updates`` is a
        list of ``(key, 'inlg', label)`` triples (presumably the entries
        with those fields set -- confirm against ``renfn``).

    Prints the number of candidate entries and the number of unique hits.
    """
    inlg = languoids.load_triggers(type_='inlg')

    # FIXME: does not honor 'NOT' for now
    # Later triggers overwrite earlier ones when they share a word.
    dh = {word: label
          for (cls, label), triggers in inlg.items()
          for t in triggers
          for flag, word in t}

    # Candidates: entries with a title that are not annotated yet.
    ts = [(k, wrds(fields['title']) + wrds(fields.get('booktitle', '')))
          for (k, (typ, fields)) in e.items()
          if 'title' in fields and 'inlg' not in fields]
    print('%d without inlg' % len(ts))

    # Labels hit per entry; only unambiguous entries (one label) are kept.
    ann = [(k, set(dh[w] for w in tit if w in dh)) for (k, tit) in ts]
    unique = [(k, lgs.pop()) for (k, lgs) in ann if len(lgs) == 1]
    print('%d cases of unique hits' % len(unique))

    fnups = [(k, 'inlg', v) for (k, v) in unique]

    # NOTE: an earlier revision went on to build per-label word-frequency
    # tables from the result here; nothing ever read them (their only
    # consumers were commented-out debug prints), so that work is dropped.
    return renfn(e, fnups)
def main(repos=DATA_DIR, rebuild=False):
    """Compile, annotate and save the merged bibliography.

    Steps, each announced with a timestamped progress line:

    1. open (or rebuild, if ``rebuild`` is true) the sqlite db of the
       bibfiles collection and load the merged entries from it;
    2. load ``hh.bib``;
    3. annotate the entries via ``macro_area_from_lgcode``, two
       ``markconservative`` passes (hhtype triggers, then 'lgcode'
       triggers) and ``add_inlg_e``;
    4. print how many entries carry each annotated field;
    5. write the csv and replacements files, trickle back into the
       bibfiles and save the result as ``monster-utf8.bib``.

    :param repos: repository location handed to all path helpers.
    :param rebuild: passed through to ``bibfiles.to_sqlite``.
    """
    def _progress(step):
        # One '<ctime> <step>' line per stage.
        print('%s %s' % (time.ctime(), step))

    counted_fields = ['lgcode', 'hhtype', 'macro_area']

    bibfiles = _bibfiles.Collection(references_path('bibtex', repos=repos))
    previous_csv = references_path('monster.csv', repos=repos)
    replacements_json = build_path('monster-replacements.json', repos=repos)
    monster = _bibfiles.BibFile(
        build_path('monster-utf8.bib', repos=repos),
        encoding='utf-8',
        sortkey='bibkey')
    tree = languoids_path('tree', repos=repos)
    hht = HHTypes(repos=repos)

    _progress('open/rebuild bibfiles db')
    sqlite_file = build_path('_bibfiles.sqlite3', repos=repos).as_posix()
    db = bibfiles.to_sqlite(sqlite_file, rebuild=rebuild)

    _progress('compile_monster')
    entries = dict(db.merged())

    _progress('load hh.bib')
    hhbib = bibfiles['hh.bib'].load()

    # Annotate with macro_area from lgcode when lgcode is assigned manually
    _progress('macro_area_from_lgcode')
    entries = macro_area_from_lgcode(entries, tree)

    # Annotate with hhtype
    _progress('annotate hhtype')
    entries = markconservative(
        entries,
        hht.triggers,
        hhbib,
        hht,
        build_path('monstermark-hht.txt', repos=repos),
        rank=lambda l: hht[l])

    ltriggers = languoids.load_triggers(tree=tree)

    # Annotate with lgcode
    _progress('annotate lgcode')
    entries = markconservative(
        entries,
        ltriggers['lgcode'],
        hhbib,
        hht,
        build_path('monstermark-lgc.txt', repos=repos))

    # Annotate with inlg
    _progress('add_inlg_e')
    entries = add_inlg_e(entries, ltriggers['inlg'])

    # Print some statistics
    stats = Counter()
    print(time.ctime())
    for _, fields in entries.values():
        stats['entry'] += 1
        stats.update(name for name in counted_fields if name in fields)
    print("# entries", stats['entry'])
    for name in counted_fields:
        print("with " + name, stats[name])

    # Update the CSV with the previous mappings for later reference
    _progress('update_previous')
    db.to_csvfile(previous_csv)

    _progress('save_replacements')
    db.to_replacements(replacements_json)

    # Trickling back
    _progress('trickle')
    db.trickle(bibfiles)

    # Save
    _progress('save as utf8')
    monster.save(entries, verbose=False)

    _progress('done.')
def test_load_triggers(self):
    # Loading the triggers from the fixture tree must yield a container
    # with exactly two items.
    from pyglottolog.languoids import load_triggers

    triggers = load_triggers(tree=self.tree)
    count = len(triggers)
    self.assertEqual(count, 2)