def main(args):
    client = Client(args.host, 'apics_data', args.user, args.password)
    for layout, table in [
        ('lect_description_references', 'Lect_description_references'),
        ('lect_descriptions', 'Lect_descriptions'),
        ('colours', 'Colours'),
        ('contributors', 'Contributors'),
        ("data (editors' layout)", 'Data'),
        ('data (apics-wals)', 'wals'),
        ('data references', 'Data_references'),
        ('editors', 'Editors'),
        ('examples', 'ExamplesB'),
        ("examples (editors' layout)", 'Examples'),
        ('feature references', 'Feature_references'),
        ('features (publication)', 'Featuresp'),
        ('features (value names)', 'Featuresv'),
        ('features', 'Features'),
        ('language references', 'Language_references'),
        ("languages (editors' layout)", 'Languages'),
        ('people', 'People'),
        ('references', 'References'),
        ("segment data (editors' layout)", 'Segment_data'),
        ('segment data', 'Segment_dataB'),
        ('segment features', 'Segment_features'),
        ('sociolinguistic data', 'Sociolinguistic_data'),
        ('sociolinguistic data references', 'Sociolinguistic_data_references'),
        ('sociolinguistic features', 'Sociolinguistic_features'),
        ('value examples', 'Value_examples'),
    ]:
        jsondump(client.get(layout), args.data_file('fm', '%s.json' % table))
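
# `jsondump` is used throughout these scripts. A minimal sketch of what it is
# assumed to do - serialize an object to a UTF-8 encoded JSON file - in case the
# helper is not importable in your environment; the real implementation
# (e.g. from a shared utilities module) may differ.
import codecs
import json

def jsondump(obj, path, **kw):
    # write `obj` as JSON to `path`; extra keyword arguments (such as indent)
    # are passed through to json.dump.
    with codecs.open(str(path), 'w', encoding='utf8') as fp:
        json.dump(obj, fp, **kw)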
def __call__(self, outdir):
    """
    Runs a parser workflow consisting of

    - preprocess
    - refactor
    - postprocess

    and writes the results - an HTML, a CSS and a JSON file - to disk.
    """
    cssutils_logger = logging.getLogger('CSSUTILS')
    cssutils_logger.setLevel(logging.ERROR)
    print(self.fname.namebase.encode('utf8'))

    with open(self.fname, encoding='utf8') as fp:
        c = fp.read()
    soup = BeautifulSoup(self.preprocess(self._preprocess(c)))

    # extract css from the head section of the HTML doc:
    css = cssutils.parseString('\n')
    for style in soup.find('head').find_all('style'):
        for rule in self.cssrules(style):
            css.add(rule)

    md = dict(outline=[], refs=[], authors=[])
    soup = self.refactor(soup, md)

    # enhance section headings:
    for section, t in tag_and_text(soup.find_all('h3')):
        t = t.split('[Note')[0]
        id_ = 'section-%s' % slug(t)
        md['outline'].append((t, id_))
        section.attrs['id'] = id_
        for s, attrs in [
            (u'\u21eb', {
                'href': '#top',
                'title': 'go to top of the page',
                'style': 'vertical-align: bottom'}),
            ('¶', {
                'class': 'headerlink',
                'href': '#' + id_,
                'title': 'Permalink to this section'}),
        ]:
            append(section, soup.new_string('\n'), new_tag(soup, 'a', s, **attrs))

    body = self.insert_links(unicode(soup.find('body')), md)

    # write output files:
    with open(outdir.joinpath('%s.html' % self.id), 'w', encoding='utf8') as fp:
        fp.write(self.wrap(self.postprocess(body)))
    with open(outdir.joinpath('%s.css' % self.id), 'wb') as fp:
        fp.write(self.csstext(css))
    md['authors'] = list(self.yield_valid_authors(md['authors']))
    jsondump(md, outdir.joinpath('%s.json' % self.id), indent=4)
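
# `tag_and_text`, `append` and `new_tag` are helpers used in the heading loop
# above. As an illustration, a minimal sketch of what `new_tag` is assumed to
# do; the actual helper may differ.
def new_tag(soup, name, text, **attrs):
    # create a tag with the given name and attributes and set its text content,
    # e.g. new_tag(soup, 'a', u'\u21eb', href='#top').
    tag = soup.new_tag(name, **attrs)
    tag.string = text
    return tag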
def main(args):  # pragma: no cover
    stats = Counter(new=0, updated=0, skipped=0)
    changes = {}

    with transaction.manager:
        update_providers(args)
        DBSession.flush()
        provider_map = get_map(Provider)
        macroarea_map = get_map(Macroarea)
        doctype_map = get_map(Doctype)

        languoid_map = {}
        for l in DBSession.query(Languoid):
            if l.hid:
                languoid_map[l.hid] = l
            languoid_map[l.id] = l

        for i, rec in enumerate(get_bib(args)):
            if i and i % 1000 == 0:
                print i, 'records done', stats['updated'] + stats['new'], 'changed'

            if len(rec.keys()) < 6:
                # not enough information!
                stats.update(['skipped'])
                continue

            changed = False
            assert rec.get('glottolog_ref_id')
            id_ = int(rec.get('glottolog_ref_id'))
            ref = DBSession.query(Source).get(id_)
            update = bool(ref)

            kw = {
                'pk': id_,
                'bibtex_type': rec.genre,
                'id': str(id_),
                'jsondata': {'bibtexkey': rec.id},
            }

            for source, target in FIELD_MAP.items():
                if target is None:
                    continue
                value = rec.get(source)
                if value:
                    value = unescape(value)
                    if target:
                        kw[target] = CONVERTER.get(source, lambda x: x)(value)
                    else:
                        kw['jsondata'][source] = value

            if kw['jsondata'].get('hhtype'):
                trigger = ca_trigger(kw['jsondata']['hhtype'])
                if trigger:
                    kw['ca_doctype_trigger'], kw['jsondata']['hhtype'] = trigger

            # try to extract numeric year, startpage, endpage, numberofpages, ...
            if kw.get('year'):
                # prefer years in brackets over the first 4-digit number.
                match = PREF_YEAR_PATTERN.search(kw.get('year'))
                if match:
                    kw['year_int'] = int(match.group('year'))
                else:
                    match = YEAR_PATTERN.search(kw.get('year'))
                    if match:
                        kw['year_int'] = int(match.group('year'))

            if kw.get('publisher'):
                p = kw.get('publisher')
                if ':' in p:
                    address, publisher = [
                        s.strip() for s in kw['publisher'].split(':', 1)]
                    if 'address' not in kw or kw['address'] == address:
                        kw['address'], kw['publisher'] = address, publisher

            if rec.get('numberofpages'):
                try:
                    kw['pages_int'] = int(rec.get('numberofpages').strip())
                except ValueError:
                    pass

            if kw.get('pages'):
                start, end, number = compute_pages(kw['pages'])
                if start is not None:
                    kw['startpage_int'] = start
                if end is not None:
                    kw['endpage_int'] = end
                if number is not None and 'pages_int' not in kw:
                    kw['pages_int'] = number

            for k in kw.keys():
                v = kw[k]
                if isinstance(v, basestring):
                    v = v.strip() or None
                kw[k] = v

            if update:
                for k in kw.keys():
                    if k == 'pk':
                        continue
                    v = getattr(ref, k)
                    if kw[k] != v:
                        if k == 'jsondata':
                            d = {k: v for k, v in ref.jsondata.items()
                                 if k in NONREF_JSONDATA}
                            d.update(**kw[k])
                            ref.jsondata = d
                        else:
                            #print k, '--', v
                            #print k, '++', kw[k]
                            setattr(ref, k, kw[k])
                            changed = True
                            if ref.id in changes:
                                changes[ref.id][k] = ('%s' % v, '%s' % kw[k])
                            else:
                                changes[ref.id] = {k: ('%s' % v, '%s' % kw[k])}
            else:
                changed = True
                ref = Ref(name='%s %s' % (kw.get('author', 'na'), kw.get('year', 'nd')), **kw)

            ref.description = ref.title or ref.booktitle
            originator = ref.author or ref.editor or 'Anonymous'
            ref.name = '%s %s' % (originator, ref.year or 'n.d.')

            a, r = update_relationship(
                ref.macroareas,
                [macroarea_map[name] for name in set(filter(None, [
                    s.strip() for s in kw['jsondata'].get('macro_area', '').split(',')]))])
            changed = changed or a or r

            src = [s.strip() for s in kw['jsondata'].get('src', '').split(',')]
            prv = {provider_map[slug(s)] for s in src if s}
            if set(ref.providers) != prv:
                ref.providers = list(prv)
                changed = True

            a, r = update_relationship(
                ref.doctypes,
                [doctype_map[m.group('name')]
                 for m in DOCTYPE_PATTERN.finditer(kw['jsondata'].get('hhtype', ''))])
            changed = changed or a or r

            if not update:
                stats.update(['new'])
                DBSession.add(ref)
            elif changed:
                stats.update(['updated'])

        args.log.info('%s' % stats)

        DBSession.execute(
            "update source set description = title "
            "where description is null and title is not null;")
        DBSession.execute(
            "update source set description = booktitle "
            "where description is null and booktitle is not null;")

        for row in list(DBSession.execute(
                "select pk, pages, pages_int, startpage_int from source where pages_int < 0")):
            pk, pages, number, start = row
            _start, _end, _number = compute_pages(pages)
            if _number > 0 and _number != number:
                DBSession.execute(
                    "update source set pages_int = %s, startpage_int = %s where pk = %s"
                    % (_number, _start, pk))
                DBSession.execute(
                    "update ref set endpage_int = %s where pk = %s" % (_end, pk))

    jsondump(changes, args.data_dir.joinpath('references', 'changes.json'))
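
# `compute_pages` is assumed to parse a BibTeX-style pages field into a
# (startpage, endpage, numberofpages) triple, returning None for components it
# cannot determine. A minimal sketch under that assumption; the real helper
# handles more page-specification variants.
import re

PAGES_PATTERN = re.compile(r'(?P<start>[0-9]+)\s*-+\s*(?P<end>[0-9]+)')

def compute_pages(pages):
    m = PAGES_PATTERN.search(pages)
    if m:
        start, end = int(m.group('start')), int(m.group('end'))
        return start, end, end - start + 1
    if pages.strip().isdigit():
        # a bare number is read as a single page.
        p = int(pages.strip())
        return p, p, 1
    return None, None, None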
def main(args):
    stats = Counter(new=0, matches=0, migrations=0, nomatches=0)
    l, ll = Language.__table__.alias('l'), Languoid.__table__.alias('ll')
    gl_languoids = list(DBSession.execute(
        select([l, ll], use_labels=True).where(l.c.pk == ll.c.pk)).fetchall())

    # we collect a list of changes which we will store in a JSON file.
    changes = []

    hid_to_pk = {row['ll_hid']: row['l_pk'] for row in gl_languoids if row['ll_hid']}
    max_languoid_pk = max(row['l_pk'] for row in gl_languoids)
    new_glottocodes = {}
    pk_to_name = {row['l_pk']: row['l_name'] for row in gl_languoids}

    # dict mapping branches (i.e. tuples of sub-family names) to dicts of H-languages
    hh_families = OrderedDict()

    # dict mapping identifiers (i.e. hid) of H-languages to branches
    hh_languages = OrderedDict()

    parse_families(
        args.data_dir.joinpath('languoids', 'lff.txt'), hh_families, hh_languages)

    # handle isolates / collapse families with exactly one leaf:
    isolate_names = {}
    collapsed_names = {}
    for key in hh_families.keys():
        if len(hh_families[key]) == 1:
            if len(key) == 1:
                # isolate
                hh_languages[hh_families[key].keys()[0]][0] = None
                isolate_names[key[0]] = hh_families[key].keys()[0]  # map name to code
            else:
                hh_languages[hh_families[key].keys()[0]][0] = key[:-1]
                collapsed_names[key[-1]] = hh_families[key].keys()[0]
            del hh_families[key]

    # now add the unclassifiable, unattested, un-whatever languages:
    parse_families(
        args.data_dir.joinpath('languoids', 'lof.txt'), hh_families, hh_languages)

    # we also want to be able to look up families by name
    fname_to_branches = defaultdict(list)
    for branch in hh_families:
        fname_to_branches[branch[-1]].append(branch)

    new_hid_to_pk = {}
    for code, (hnode, status, name) in hh_languages.items():
        if code not in hid_to_pk:
            # we have to insert a new H-language!
            max_languoid_pk += 1
            new_hid_to_pk[code] = max_languoid_pk

            if name in pk_to_name.values():
                args.log.warn('new code {0} for existing name {1}'.format(code, name))
            changes.append(languoid(
                max_languoid_pk,
                'language',
                hid=code,
                id=glottocode(unicode(name), DBSession, new_glottocodes),
                name=name,
                hname=name,
                status=status,
            ))
            stats.update(['new_languages'])

    duplicate_leafset_to_branch = {}
    leafset_to_branch = {}
    for family, langs in hh_families.items():
        leafs = get_leafset(hid for hid in langs.keys() if hid in hid_to_pk)
        if not leafs:
            args.log.info('Family with only new languages: %s, %s' % (family, langs))
            continue

        if leafs in leafset_to_branch:
            # so we have already seen this exact set of leaves.
            #
            # special case: there may be additional "Unclassified something" nodes in
            # branch without any changes in the set of leafs ...
            if not [n for n in family if n.startswith('Unclassified')]:
                # ... or the full leafset contains new languages
                assert [hid for hid in hh_families[family[:-1]].keys()
                        if hid in new_hid_to_pk]
            fset, rset = set(family), set(leafset_to_branch[leafs])
            assert rset.issubset(fset)
            assert leafs not in duplicate_leafset_to_branch
            duplicate_leafset_to_branch[leafs] = family
        else:
            leafset_to_branch[leafs] = family

    #
    # at this point leafset_to_branch is a consolidated mapping of sets of
    # H-languages to branches in the new family tree.
    #

    # for set comparisons we compute a list of actual sets (not tuples) of leafs,
    # ordered by length.
    leafsets = [set(t) for t in sorted(leafset_to_branch.keys(), key=len)]

    todo = []
    gl_family_to_leafset = {}

    def select_leafs(pk):
        l, tc = Languoid.__table__.alias('l'), TreeClosureTable.__table__.alias('tc')
        return [r['l_hid'] for r in DBSession.execute(
            select([l, tc], use_labels=True).where(and_(
                l.c.pk == tc.c.child_pk,
                l.c.hid != None,
                l.c.status != LanguoidStatus.provisional,
                tc.c.parent_pk == pk)))]

    for row in gl_languoids:
        if row['ll_level'] == LanguoidLevel.family and row['l_active']:
            leafs = get_leafset(select_leafs(row['l_pk']))
            assert leafs
            glnode = GLNode(
                row['l_pk'],
                row['l_name'],
                row['ll_level'].name,
                row['ll_father_pk'],
                row['l_jsondata'].get('hname'))
            gl_family_to_leafset[glnode] = leafs

    # note: for legacy gl nodes, we map leaf-tuples to lists of matching nodes!
    leafset_to_gl_family = defaultdict(list)
    for node, leafs in gl_family_to_leafset.items():
        leafset_to_gl_family[leafs].append(node)

    # now we look for matches between old and new classification:
    for leafs, nodes in leafset_to_gl_family.items():
        todo.extend(match_nodes(
            args, leafs, nodes, leafset_to_branch, duplicate_leafset_to_branch,
            leafsets, fname_to_branches))

    # compile a mapping for exact matches:
    branch_to_pk = {}
    for m in todo:
        if m.hid:
            if m.hid in branch_to_pk:
                if branch_to_pk[m.hid] != m.pk:
                    # compare names:
                    if pk_to_name[m.pk] == m.hid[-1]:
                        args.log.info('#### type1')
                        branch_to_pk[m.hid] = m.pk
                    elif pk_to_name[branch_to_pk[m.hid]] == m.hid[-1]:
                        args.log.info('#### type2')
                    else:
                        raise ValueError
            else:
                branch_to_pk[m.hid] = m.pk

    for hnode in sorted(hh_families.keys(), key=lambda b: (len(b), b)):
        # loop through branches breadth first to determine what's to be inserted
        if hnode not in branch_to_pk:
            t = get_leafset(hh_families[hnode].keys())
            if t in leafset_to_gl_family:
                # the "Unclassified subfamily" special case from above:
                if not [n for n in hnode if n.startswith('Unclassified')]:
                    assert [hid for hid in hh_families[hnode[:-1]].keys()
                            if hid in new_hid_to_pk]
                # make sure the existing glottolog family for the set of leafs is
                # mapped to some other node in the new classification:
                assert leafset_to_gl_family[t][0].pk in [m.pk for m in todo if m.hid]

            max_languoid_pk += 1
            branch_to_pk[hnode] = max_languoid_pk
            pk_to_name[max_languoid_pk] = hnode[-1]
            attrs = languoid(
                max_languoid_pk,
                'family',
                id=glottocode(unicode(hnode[-1]), DBSession, new_glottocodes),
                name=hnode[-1],
                hname=hnode[-1],
            )
            if len(hnode) > 1:
                attrs['father_pk'] = branch_to_pk[tuple(list(hnode)[:-1])]
                assert attrs['father_pk']
            stats.update(['new'])
            changes.append(attrs)

    # now on to the updates for families:
    for m in todo:
        attrs = languoid(m.pk, 'family', name=pk_to_name[m.pk])
        if m.hid:
            stats.update(['matches'])
            if len(m.hid) > 1:
                attrs['father_pk'] = branch_to_pk[tuple(list(m.hid)[:-1])]
            if getattr(m, 'rename', False):
                attrs['name'] = m.hid[-1]
            attrs['hname'] = m.hid[-1]
        else:
            attrs['active'] = False  # mark the languoid as obsolete.
if getattr(m, "pointer", False): print "~~", m.pk, pk_to_name[m.pk].encode("utf8"), "->", ", ".join(m.pointer).encode("utf8") stats.update(["migrations"]) attrs["replacement"] = branch_to_pk[m.pointer] else: stats.update(["nomatches"]) changes.append(attrs) args.log.info("%s" % stats) risolate_names = dict(zip(isolate_names.values(), isolate_names.keys())) rcollapsed_names = dict(zip(collapsed_names.values(), collapsed_names.keys())) # and updates of father_pks for languages: for l, (hnode, status, name) in hh_languages.items(): id_ = hid_to_pk.get(l) if not id_: id_ = new_hid_to_pk.get(l) attrs = languoid(id_, "language", status=status) else: attrs = languoid(id_, "language", status=status) # In case of existing languoids, we don't change the active flag! del attrs["active"] if id_ in pk_to_name and name != pk_to_name[id_]: if slug(pk_to_name[id_]) == slug(name): attrs["name"] = name if hnode: attrs["father_pk"] = branch_to_pk[hnode] # look for hnames! if l in risolate_names: attrs["hname"] = risolate_names[l] if l in rcollapsed_names: attrs["hname"] = rcollapsed_names[l] changes.append(attrs) for row in gl_languoids: hid = row["ll_hid"] if hid and "NOCODE" in hid and hid not in hh_languages: # languoids with Harald's private code that are no longer in use changes.append(languoid(row["l_pk"], "language", status="retired", active=False, father_pk=None)) jsondump(changes, args.data_dir.joinpath("languoids", "changes.json"), indent=4)