def test_markconcservative(self):
    """Exercise ``markconservative`` with single-entry bibliographies.

    Uses the hhtypes object exposed by ``self.api`` both as the source of
    triggers and as the hhtypes argument.  Two scenarios are checked:

    1. An untyped entry titled 'Grammar' ends up with an ``hhtype`` field
       whose first token is 'grammar'.
    2. An entry whose reference copy already carries ``hhtype: other`` ends
       up without any ``hhtype`` field.
    """
    from pyglottolog.monsterlib._libmonster import markconservative

    # Scenario 1: the entries to annotate and the reference bib are equal
    # (but distinct) single-entry mappings.
    entries = {1: ('article', {'title': 'Grammar'})}
    reference = {1: ('article', {'title': 'Grammar'})}
    res = markconservative(
        entries,
        self.api.hhtypes.triggers,
        reference,
        self.api.hhtypes,
        self.tmp_path('marks.txt'),
        verbose=False)
    fields = res[1][1]
    self.assertEqual(fields['hhtype'].split()[0], 'grammar')

    # Scenario 2: if a higher hhtype is computed, this cancels out previous
    # computations.
    entries = {1: ('article', {'title': 'grammar', 'lgcode': 'abc'})}
    reference = {
        1: ('article', {'title': 'other', 'hhtype': 'other', 'lgcode': 'abc'}),
    }
    res = markconservative(
        entries,
        self.api.hhtypes.triggers,
        reference,
        self.api.hhtypes,
        self.tmp_path('marks.txt'),
        verbose=False)
    fields = res[1][1]
    self.assertNotIn('hhtype', fields)
def test_markconcservative(self):
    """Exercise ``markconservative`` with single-entry bibliographies.

    An ``HHTypes`` instance built from ``self.repos`` supplies both the
    triggers and the hhtypes argument.  Two scenarios are checked:

    1. An untyped entry titled 'Grammar' ends up with an ``hhtype`` field
       whose first token is 'grammar'.
    2. An entry whose reference copy already carries ``hhtype: other`` ends
       up without any ``hhtype`` field.
    """
    from pyglottolog.monsterlib._libmonster import markconservative

    hht = HHTypes(repos=self.repos)

    # Scenario 1: the entries to annotate and the reference bib are equal
    # (but distinct) single-entry mappings.
    entries = {1: ('article', {'title': 'Grammar'})}
    reference = {1: ('article', {'title': 'Grammar'})}
    res = markconservative(
        entries,
        hht.triggers,
        reference,
        hht,
        self.tmp_path('marks.txt'),
        verbose=False)
    fields = res[1][1]
    self.assertEqual(fields['hhtype'].split()[0], 'grammar')

    # Scenario 2: if a higher hhtype is computed, this cancels out previous
    # computations.
    entries = {1: ('article', {'title': 'grammar', 'lgcode': 'abc'})}
    reference = {
        1: ('article', {'title': 'other', 'hhtype': 'other', 'lgcode': 'abc'}),
    }
    res = markconservative(
        entries,
        hht.triggers,
        reference,
        hht,
        self.tmp_path('marks.txt'),
        verbose=False)
    fields = res[1][1]
    self.assertNotIn('hhtype', fields)
def main(repos=DATA_DIR, rebuild=False):
    """Build the merged bibliography, annotate it and save it as UTF-8 BibTeX.

    Steps, each announced on stdout with a ``time.ctime()`` timestamp:
    open (or rebuild) the bibfiles sqlite db, merge its entries, load
    ``hh.bib``, annotate the entries with macro_area, hhtype, lgcode and
    inlg, print field statistics, write the previous-mappings CSV and the
    replacements file, trickle changes back into the bibfiles, and save
    the result.

    :param repos: repository location forwarded to every path helper.
    :param rebuild: passed through to ``to_sqlite`` as its ``rebuild`` flag.
    """
    collection = _bibfiles.Collection(references_path('bibtex', repos=repos))
    previous_csv = references_path('monster.csv', repos=repos)
    replacements_json = build_path('monster-replacements.json', repos=repos)
    monster_bib = _bibfiles.BibFile(
        build_path('monster-utf8.bib', repos=repos),
        encoding='utf-8',
        sortkey='bibkey')
    tree = languoids_path('tree', repos=repos)
    hht = HHTypes(repos=repos)

    print('%s open/rebuild bibfiles db' % time.ctime())
    sqlite_path = build_path('_bibfiles.sqlite3', repos=repos).as_posix()
    database = collection.to_sqlite(sqlite_path, rebuild=rebuild)

    print('%s compile_monster' % time.ctime())
    entries = dict(database.merged())

    print('%s load hh.bib' % time.ctime())
    hhbib = collection['hh.bib'].load()

    # Annotate with macro_area from lgcode when lgcode is assigned manually
    print('%s macro_area_from_lgcode' % time.ctime())
    entries = macro_area_from_lgcode(entries, tree)

    # Annotate with hhtype
    print('%s annotate hhtype' % time.ctime())
    entries = markconservative(
        entries,
        hht.triggers,
        hhbib,
        hht,
        build_path('monstermark-hht.txt', repos=repos),
        rank=lambda name: hht[name])

    language_triggers = languoids.load_triggers(tree=tree)

    # Annotate with lgcode
    print('%s annotate lgcode' % time.ctime())
    entries = markconservative(
        entries,
        language_triggers['lgcode'],
        hhbib,
        hht,
        build_path('monstermark-lgc.txt', repos=repos))

    # Annotate with inlg
    print('%s add_inlg_e' % time.ctime())
    entries = add_inlg_e(entries, language_triggers['inlg'])

    # Print some statistics: number of entries, and how many of them carry
    # each of the tracked fields.
    tracked_fields = ['lgcode', 'hhtype', 'macro_area']
    stats = Counter()
    print(time.ctime())
    for _, fields in entries.values():
        stats['entry'] += 1
        stats.update(name for name in tracked_fields if name in fields)
    print("# entries", stats['entry'])
    for field in tracked_fields:
        print("with " + field, stats[field])

    # Update the CSV with the previous mappings for later reference
    print('%s update_previous' % time.ctime())
    database.to_csvfile(previous_csv)

    print('%s save_replacements' % time.ctime())
    database.to_replacements(replacements_json)

    # Trickling back
    print('%s trickle' % time.ctime())
    database.trickle(collection)

    # Save
    print('%s save as utf8' % time.ctime())
    monster_bib.save(entries, verbose=False)

    print('%s done.' % time.ctime())
def compile(api, log=None, rebuild=False):
    """Build the merged bibliography, annotate it and save it as UTF-8 BibTeX.

    Steps, each reported through ``log.info`` with a ``time.ctime()``
    timestamp: open (or rebuild) the bibfiles sqlite db, merge its entries,
    load ``hh.bib``, annotate the entries with macro_area, hhtype, lgcode
    and inlg, log field statistics, write the previous-mappings CSV and the
    replacements file, trickle changes back, and save the result.

    :param api: object providing ``references_path``, ``build_path``, \
    ``bibfiles``, ``macroarea_map``, ``hhtypes`` and ``triggers``.
    :param log: logger to report progress to; when falsy, the 'pyglottolog' \
    logger is used.
    :param rebuild: passed through to ``to_sqlite`` as its ``rebuild`` flag.
    """
    if not log:
        log = logging.getLogger('pyglottolog')

    previous_csv = api.references_path('monster.csv')
    replacements_json = api.references_path('replacements.json')
    monster_bib = BibFile(
        fname=api.build_path('monster-utf8.bib'),
        encoding='utf-8',
        sortkey='bibkey')

    log.info('%s open/rebuild bibfiles db' % time.ctime())
    database = api.bibfiles.to_sqlite(
        api.build_path('_bibfiles.sqlite3'), rebuild=rebuild)

    log.info('%s compile_monster' % time.ctime())
    entries = dict(database.merged())

    log.info('%s load hh.bib' % time.ctime())
    hhbib = api.bibfiles['hh.bib'].load()

    # Annotate with macro_area from lgcode when lgcode is assigned manually
    log.info('%s macro_area_from_lgcode' % time.ctime())
    entries = macro_area_from_lgcode(entries, api.macroarea_map)

    # Annotate with hhtype
    log.info('%s annotate hhtype' % time.ctime())
    entries = markconservative(
        entries,
        api.hhtypes.triggers,
        hhbib,
        api.hhtypes,
        api.build_path('monstermark-hht.txt'),
        rank=lambda name: api.hhtypes[name])

    # Annotate with lgcode
    log.info('%s annotate lgcode' % time.ctime())
    entries = markconservative(
        entries,
        api.triggers['lgcode'],
        hhbib,
        api.hhtypes,
        api.build_path('monstermark-lgc.txt'))

    # Annotate with inlg
    log.info('%s add_inlg_e' % time.ctime())
    entries = add_inlg_e(entries, api.triggers['inlg'])

    # Log some statistics: number of entries, and how many of them carry
    # each of the tracked fields.
    tracked_fields = ['lgcode', 'hhtype', 'macro_area']
    stats = Counter()
    log.info(time.ctime())
    for _, fields in entries.values():
        stats['entry'] += 1
        stats.update(name for name in tracked_fields if name in fields)
    log.info("# entries {0}".format(stats['entry']))
    for field in tracked_fields:
        log.info("with {0}: {1}".format(field, stats[field]))

    # Update the CSV with the previous mappings for later reference
    log.info('%s update_previous' % time.ctime())
    database.to_csvfile(previous_csv)

    log.info('%s save_replacements' % time.ctime())
    database.to_replacements(replacements_json)

    # Trickling back
    log.info('%s trickle' % time.ctime())
    database.trickle()

    # Save
    log.info('%s save as utf8' % time.ctime())
    monster_bib.save(entries)

    log.info('%s done.' % time.ctime())