def read_lff(level, fp=None, dry_run=False):
    """Read languoids of one level from a language-family file ("lff"/"dff").

    :param level: `Level` instance; its name's first letter selects the \
    default build file (``lff.txt`` or ``dff.txt``).
    :param fp: optional open file object to read instead of the default path.
    :param dry_run: passed through to `Languoid.from_lff`.
    :return: generator of `Languoid` instances.
    :raises ValueError: if a line matches neither a language nor a \
    classification line.
    """
    assert isinstance(level, Level)
    # Raw strings avoid invalid string escapes ('\s', '\[', '\_') which are
    # deprecated outside raw literals; the patterns are unchanged otherwise.
    lang_line = re.compile(
        r"\s+" + NAME_AND_ID_REGEX + r"(\[([a-z]{3}|NOCODE_[^\]]+)?\])$")
    class_line = re.compile(
        NAME_AND_ID_REGEX + r"(,\s*" + NAME_AND_ID_REGEX + r")*$")
    isolate_line = re.compile(r"([^\[]+)(\[-isolate-\])$")
    path = None
    with fp or build_path("%sff.txt" % level.name[0]).open(encoding="utf8") as fp:
        for line in fp:
            line = line.rstrip()
            if line.startswith("#") or not line.strip():
                # ignore comments or empty lines
                continue
            match = lang_line.match(line)
            if match:
                # a language line must be preceded by a classification
                # (or isolate) line providing its lineage path
                assert path
                yield Languoid.from_lff(
                    None if path == "isolate" else path,
                    line.strip(), level, dry_run=dry_run)
            else:
                match = isolate_line.match(line)
                if match:
                    path = "isolate"
                else:
                    # assert it matches a classification line!
                    if not class_line.match(line):
                        raise ValueError(line)
                    path = line.strip()
def read_lff(level, fp=None, dry_run=False):
    """Read languoids of one level from a language-family file ("lff"/"dff").

    :param level: `Level` instance; its name's first letter selects the \
    default build file (``lff.txt`` or ``dff.txt``).
    :param fp: optional open file object to read instead of the default path.
    :param dry_run: passed through to `Languoid.from_lff`.
    :return: generator of `Languoid` instances.
    :raises ValueError: if a line matches neither a language nor a \
    classification line.
    """
    assert isinstance(level, Level)
    # Raw strings avoid invalid string escapes ('\s', '\[', '\_') which are
    # deprecated outside raw literals; the patterns are unchanged otherwise.
    lang_line = re.compile(
        r'\s+' + NAME_AND_ID_REGEX + r'(\[([a-z]{3}|NOCODE_[^\]]+)?\])$')
    class_line = re.compile(
        NAME_AND_ID_REGEX + r'(,\s*' + NAME_AND_ID_REGEX + r')*$')
    isolate_line = re.compile(r'([^\[]+)(\[-isolate-\])$')
    path = None
    with fp or build_path('%sff.txt' % level.name[0]).open(encoding='utf8') as fp:
        for line in fp:
            line = line.rstrip()
            if line.startswith('#') or not line.strip():
                # ignore comments or empty lines
                continue
            match = lang_line.match(line)
            if match:
                # a language line must be preceded by a classification
                # (or isolate) line providing its lineage path
                assert path
                yield Languoid.from_lff(
                    None if path == 'isolate' else path,
                    line.strip(), level, dry_run=dry_run)
            else:
                match = isolate_line.match(line)
                if match:
                    path = 'isolate'
                else:
                    # assert it matches a classification line!
                    if not class_line.match(line):
                        raise ValueError(line)
                    path = line.strip()
def read_lff(level, fp=None):
    """Generate `Languoid`s from a language-family file ("lff.txt"/"dff.txt").

    :param level: level name; its first letter selects the default build file.
    :param fp: optional open file object to read instead of the default path.
    :return: generator of `Languoid` instances.
    :raises ValueError: if a line matches neither a language nor a \
    classification line.
    """
    # Raw strings avoid invalid string escapes ('\s', '\[') which are
    # deprecated outside raw literals; the patterns are unchanged otherwise.
    lang_line = re.compile(r'\s+' + NAME_AND_ID_REGEX + r'(\[([a-z]{3})?\])$')
    class_line = re.compile(
        NAME_AND_ID_REGEX + r'(,\s*' + NAME_AND_ID_REGEX + r')*$')
    isolate_line = re.compile(r'([^\[]+)(\[-isolate-\])$')
    path = None
    with fp or build_path('%sff.txt' % level[0]).open(encoding='utf8') as fp:
        for line in fp:
            line = line.rstrip()
            if line.startswith('#') or not line.strip():
                # ignore comments or empty lines
                continue
            match = lang_line.match(line)
            if match:
                # a language line must be preceded by a classification
                # (or isolate) line providing its lineage path
                assert path
                yield Languoid.from_lff(
                    None if path == 'isolate' else path, line.strip(), level)
            else:
                match = isolate_line.match(line)
                if match:
                    path = 'isolate'
                else:
                    # assert it matches a classification line!
                    if not class_line.match(line):
                        raise ValueError(line)
                    path = line.strip()
def tree2lff(tree=TREE):
    """Serialize the languoid tree into the lff.txt/dff.txt build files."""
    # Group lff lines per level, keyed by classification path.
    grouped = dict(dialect=defaultdict(list), language=defaultdict(list))
    nodes = {}
    for languoid in walk_tree(tree=tree, nodes=nodes):
        if languoid.level in grouped:
            grouped[languoid.level][languoid.lff_group()].append(
                languoid.lff_language())
    for level, languages in grouped.items():
        target = build_path('%sff.txt' % level[0])
        with target.open('w', encoding='utf8') as fp:
            fp.write('# -*- coding: utf-8 -*-\n')
            # Emit each classification path followed by its sorted languoids.
            for group in sorted(languages):
                fp.write(group + '\n')
                for lff_line in sorted(languages[group]):
                    fp.write(lff_line + '\n')
def lff2tree(tree=TREE, outdir=None, builddir=None, lffs=None):
    """Recreate the languoid directory tree from lff/dff files.

    - get mapping glottocode -> Languoid from old tree
    - assemble new directory tree
      - for each path component in lff/dff:
        - create new dir
        - copy info file from old tree (possibly updating the name) or
        - create info file
      - for each language/dialect in lff/dff:
        - create new dir
        - copy info file from old tree (possibly updating the name) or
        - create info file
    - rm old tree
    - copy new tree

    :param tree: path of the current languoid tree.
    :param outdir: directory to assemble the new tree in (defaults to `tree`).
    :param builddir: directory the old tree is moved to.
    :param lffs: optional mapping of `Level` to open file objects.
    :raises ValueError: if `builddir` cannot be cleared, or a dialect is not \
    attached to a language.
    """
    # FIXME: instead of removing trees, we should just move the current one
    # from outdir to build, and then recreate in outdir.
    builddir = Path(builddir) if builddir else build_path('tree')
    old_tree = {l.id: l for l in walk_tree(tree)} if tree else {}
    out = Path(outdir or tree)
    if not out.parent.exists():
        out.parent.mkdir()
    if out.exists():
        if builddir.exists():
            try:
                rmtree(builddir)
            # Narrowed from a bare `except:` which also swallowed
            # KeyboardInterrupt/SystemExit; rmtree failures raise OSError.
            except OSError:  # pragma: no cover
                pass
            if builddir.exists():  # pragma: no cover
                raise ValueError('please remove %s before proceeding' % builddir)
        # move the old tree out of the way
        shutil.move(out.as_posix(), builddir.as_posix())
    out.mkdir()
    lffs = lffs or {}
    languages = {}
    for lang in read_lff(Level.language, fp=lffs.get(Level.language)):
        languages[lang.id] = lang
        lang2tree(lang, lang.lineage, out, old_tree)
    for lang in read_lff(Level.dialect, fp=lffs.get(Level.dialect)):
        # dialects must hang off a language read above
        if not lang.lineage or lang.lineage[0][1] not in languages:
            raise ValueError('unattached dialect')  # pragma: no cover
        lang2tree(
            lang, languages[lang.lineage[0][1]].lineage + lang.lineage,
            out, old_tree)
def lff2tree(tree=TREE, outdir=None, builddir=None, lffs=None):
    """Recreate the languoid directory tree from lff/dff files.

    - get mapping glottocode -> Languoid from old tree
    - assemble new directory tree
      - for each path component in lff/dff:
        - create new dir
        - copy info file from old tree (possibly updating the name) or
        - create info file
      - for each language/dialect in lff/dff:
        - create new dir
        - copy info file from old tree (possibly updating the name) or
        - create info file
    - rm old tree
    - copy new tree

    :param tree: path of the current languoid tree.
    :param outdir: directory to assemble the new tree in (defaults to `tree`).
    :param builddir: directory the old tree is moved to.
    :param lffs: optional mapping of `Level` to open file objects.
    :raises ValueError: if `builddir` cannot be cleared, or a dialect is not \
    attached to a language.
    """
    # FIXME: instead of removing trees, we should just move the current one
    # from outdir to build, and then recreate in outdir.
    builddir = Path(builddir) if builddir else build_path("tree")
    old_tree = {l.id: l for l in walk_tree(tree)} if tree else {}
    out = Path(outdir or tree)
    if not out.parent.exists():
        out.parent.mkdir()
    if out.exists():
        if builddir.exists():
            try:
                rmtree(builddir)
            # Narrowed from a bare `except:` which also swallowed
            # KeyboardInterrupt/SystemExit; rmtree failures raise OSError.
            except OSError:  # pragma: no cover
                pass
            if builddir.exists():  # pragma: no cover
                raise ValueError("please remove %s before proceeding" % builddir)
        # move the old tree out of the way
        shutil.move(out.as_posix(), builddir.as_posix())
    out.mkdir()
    lffs = lffs or {}
    languages = {}
    for lang in read_lff(Level.language, fp=lffs.get(Level.language)):
        languages[lang.id] = lang
        lang2tree(lang, lang.lineage, out, old_tree)
    for lang in read_lff(Level.dialect, fp=lffs.get(Level.dialect)):
        # dialects must hang off a language read above
        if not lang.lineage or lang.lineage[0][1] not in languages:
            raise ValueError("unattached dialect")  # pragma: no cover
        lang2tree(
            lang, languages[lang.lineage[0][1]].lineage + lang.lineage,
            out, old_tree)
def tree2lff(tree=TREE, out_paths=None):
    """Serialize the languoid tree into lff ("language") and dff ("dialect") files.

    :param tree: path of the languoid tree to walk.
    :param out_paths: optional mapping of `Level` to output paths, overriding \
    the default build locations.
    """
    out_paths = out_paths or {}
    by_level = {Level.dialect: defaultdict(list), Level.language: defaultdict(list)}
    nodes = {}
    for node in walk_tree(tree=tree, nodes=nodes):
        if node.level in by_level:
            by_level[node.level][node.lff_group()].append(node.lff_language())
    for level, languages in by_level.items():
        target = out_paths.get(level, build_path("%sff.txt" % level.name[0]))
        with target.open("w", encoding="utf8") as fp:
            fp.write("# -*- coding: utf-8 -*-\n")
            # Each classification path is followed by its sorted languoid lines.
            for group in sorted(languages):
                fp.write(group + "\n")
                for lff_line in sorted(languages[group]):
                    fp.write(lff_line + "\n")
def tree2lff(tree=TREE, out_paths=None):
    """Serialize the languoid tree into lff ("language") and dff ("dialect") files.

    :param tree: path of the languoid tree to walk.
    :param out_paths: optional mapping of `Level` to output paths, overriding \
    the default build locations.
    """
    out_paths = out_paths or {}
    by_level = {
        Level.dialect: defaultdict(list),
        Level.language: defaultdict(list),
    }
    nodes = {}
    for node in walk_tree(tree=tree, nodes=nodes):
        if node.level in by_level:
            by_level[node.level][node.lff_group()].append(node.lff_language())
    for level, languages in by_level.items():
        target = out_paths.get(level, build_path('%sff.txt' % level.name[0]))
        with target.open('w', encoding='utf8') as fp:
            fp.write('# -*- coding: utf-8 -*-\n')
            # Each classification path is followed by its sorted languoid lines.
            for group in sorted(languages):
                fp.write(group + '\n')
                for lff_line in sorted(languages[group]):
                    fp.write(lff_line + '\n')
def lff2tree(tree=TREE, outdir=None, test=False):
    """Rebuild the languoid directory tree from lff/dff files.

    - get mapping glottocode -> Languoid from old tree
    - assemble new directory tree
      - for each path component in lff/dff:
        - create new dir
        - copy info file from old tree (possibly updating the name) or
        - create info file
      - for each language/dialect in lff/dff:
        - create new dir
        - copy info file from old tree (possibly updating the name) or
        - create info file
    - rm old tree
    - copy new tree

    :param tree: path of the current languoid tree.
    :param outdir: directory to assemble the new tree in.
    :param test: if true, leave the original tree untouched.
    :raises ValueError: if a dialect is not attached to a language.
    """
    # Start from a fresh, empty output directory.
    out = Path(outdir or build_path('tree'))
    if not out.parent.exists():
        out.parent.mkdir()
    if out.exists():
        rmtree(out)
    out.mkdir()
    old_tree = {l.id: l for l in walk_tree(tree)} if tree else {}
    languages = {}
    for languoid in read_lff('language'):
        languages[languoid.id] = languoid
        lang2tree(languoid, languoid.lineage, out, old_tree)
    for dialect in read_lff('dialect'):
        lineage = dialect.lineage
        # a dialect's first lineage component must be a language read above
        if not lineage or lineage[0][1] not in languages:
            raise ValueError('unattached dialect')
        full_lineage = languages[lineage[0][1]].lineage + lineage
        lang2tree(dialect, full_lineage, out, old_tree)
    if not test:
        # replace the canonical tree with the freshly assembled one
        rmtree(TREE, ignore_errors=True)
        copytree(out, TREE)
import difflib
import operator
import itertools
import contextlib
import collections

from six import string_types, viewkeys
from clldutils.dsv import UnicodeWriter
from clldutils import jsonlib
from pyglottolog.util import build_path, unique, group_first
from pyglottolog.monsterlib import _bibtex

__all__ = ['Database']

# Default location of the sqlite database file in the build directory.
DBFILE = build_path('_bibfiles.sqlite3').as_posix()

# NOTE(review): presumably fields whose values are unioned across duplicate
# entries, and fields ignored for comparison/merging -- confirm against the
# merge logic elsewhere in this module.
UNION_FIELDS = {'fn', 'asjp_name', 'isbn'}
IGNORE_FIELDS = {'crossref', 'numnote', 'glotto_id'}


class Database(object):
    """Bibfile collection parsed into an sqlite3 file."""

    @staticmethod
    def _get_bibfiles(bibfiles):
        # Fall back to the default collection when none is supplied.
        if bibfiles is None:  # pragma: no cover
            from _bibfiles import Collection
            return Collection()
        return bibfiles
def main(repos=DATA_DIR, rebuild=False):
    """Compile the "monster" bibliography from the individual bibfiles.

    Merges all bib entries into one mapping, annotates them with
    macro_area/hhtype/lgcode/inlg, writes the previous-mapping CSV and the
    replacements JSON, trickles changes back to the bibfiles, and saves the
    merged bibliography as utf-8.

    :param repos: path to the data repository.
    :param rebuild: if true, rebuild the bibfiles sqlite database from scratch.
    """
    bibfiles = _bibfiles.Collection(references_path('bibtex', repos=repos))
    previous = references_path('monster.csv', repos=repos)
    replacements = build_path('monster-replacements.json', repos=repos)
    monster = _bibfiles.BibFile(
        build_path('monster-utf8.bib', repos=repos),
        encoding='utf-8', sortkey='bibkey')
    tree = languoids_path('tree', repos=repos)
    hht = HHTypes(repos=repos)
    # Timestamps are printed before each stage to track progress of this
    # long-running pipeline.
    print('%s open/rebuild bibfiles db' % time.ctime())
    db = bibfiles.to_sqlite(
        build_path('_bibfiles.sqlite3', repos=repos).as_posix(),
        rebuild=rebuild)
    print('%s compile_monster' % time.ctime())
    # m maps bibkey -> (entrytype, fields); the annotation steps below
    # rewrite it in place stage by stage.
    m = dict(db.merged())
    print('%s load hh.bib' % time.ctime())
    hhbib = bibfiles['hh.bib'].load()
    # Annotate with macro_area from lgcode when lgcode is assigned manually
    print('%s macro_area_from_lgcode' % time.ctime())
    m = macro_area_from_lgcode(m, tree)
    # Annotate with hhtype
    print('%s annotate hhtype' % time.ctime())
    m = markconservative(
        m, hht.triggers, hhbib, hht,
        build_path('monstermark-hht.txt', repos=repos),
        rank=lambda l: hht[l])
    ltriggers = languoids.load_triggers(tree=tree)
    # Annotate with lgcode
    print('%s annotate lgcode' % time.ctime())
    m = markconservative(
        m, ltriggers['lgcode'], hhbib, hht,
        build_path('monstermark-lgc.txt', repos=repos))
    # Annotate with inlg
    print('%s add_inlg_e' % time.ctime())
    m = add_inlg_e(m, ltriggers['inlg'])
    # Print some statistics
    stats = Counter()
    print(time.ctime())
    for t, f in m.values():
        stats.update(['entry'])
        for field in ['lgcode', 'hhtype', 'macro_area']:
            if field in f:
                stats.update([field])
    print("# entries", stats['entry'])
    for field in ['lgcode', 'hhtype', 'macro_area']:
        print("with " + field, stats[field])
    # Update the CSV with the previous mappings for later reference
    print('%s update_previous' % time.ctime())
    db.to_csvfile(previous)
    print('%s save_replacements' % time.ctime())
    db.to_replacements(replacements)
    # Trickling back
    print('%s trickle' % time.ctime())
    db.trickle(bibfiles)
    # Save
    print('%s save as utf8' % time.ctime())
    monster.save(m, verbose=False)
    print('%s done.' % time.ctime())
import difflib
import operator
import itertools
import contextlib
import collections

from six import string_types
from clldutils.dsv import UnicodeWriter
from clldutils import jsonlib
from pyglottolog.util import build_path, unique, group_first

import _bibtex

__all__ = ['Database']

# Default location of the sqlite database file in the build directory.
DBFILE = build_path('_bibfiles.sqlite3').as_posix()

# NOTE(review): presumably fields whose values are unioned across duplicate
# entries, and fields ignored for comparison/merging -- confirm against the
# merge logic elsewhere in this module.
UNION_FIELDS = {'fn', 'asjp_name', 'isbn'}
IGNORE_FIELDS = {'crossref', 'numnote', 'glotto_id'}


class Database(object):
    """Bibfile collection parsed into an sqlite3 file."""

    @staticmethod
    def _get_bibfiles(bibfiles):
        # Fall back to the default collection when none is supplied.
        if bibfiles is None:  # pragma: no cover
            from _bibfiles import Collection
            return Collection()
        return bibfiles

    # NOTE(review): this chunk is truncated here -- the method belonging to
    # the following decorator lies outside the visible source.
    @staticmethod
import os
import csv
import json
import sqlite3
import difflib
import operator
import itertools
import contextlib
import collections

from pyglottolog.util import references_path, build_path

import _bibtex

__all__ = ['Database']

# Default file locations for the sqlite db, the merged bibliography, the
# previous-mappings CSV, and the replacements JSON.
DBFILE = build_path('_bibfiles.sqlite3').as_posix()
BIBFILE = build_path('monster-utf8.bib').as_posix()
CSVFILE = references_path('monster.csv').as_posix()
REPLACEMENTSFILE = build_path('monster-replacements.json').as_posix()

# NOTE(review): presumably fields whose values are unioned across duplicate
# entries, and fields ignored for comparison/merging -- confirm against the
# merge logic elsewhere in this module.
UNION_FIELDS = {'fn', 'asjp_name', 'isbn'}
IGNORE_FIELDS = {'crossref', 'numnote', 'glotto_id'}


class Database(object):
    """Bibfile collection parsed into an sqlite3 file."""

    @staticmethod
    def _get_bibfiles(bibfiles):
        # NOTE(review): this chunk is truncated here -- the body of this
        # method continues outside the visible source.
        if bibfiles is None: