def main(bib, mode): # pragma: no cover count = 0 skipped = 0 with transaction.manager: provider_map = get_map(Provider) macroarea_map = get_map(Macroarea) doctype_map = get_map(Doctype) known_ids = set(r[0] for r in DBSession.query(Ref.pk)) languoid_map = {} for l in DBSession.query(Languoid): if l.hid: languoid_map[l.hid] = l languoid_map[l.id] = l for i, rec in enumerate(bib): if len(rec.keys()) < 6: skipped += 1 #print '---> skip', rec.id #print rec continue changed = False assert rec.get('glottolog_ref_id') id_ = int(rec.get('glottolog_ref_id')) if mode != 'update' and id_ in known_ids: continue ref = DBSession.query(Source).get(id_) update = True if ref else False kw = { 'pk': id_, 'bibtex_type': getattr(EntryType, rec.genre), 'id': str(id_), 'jsondata': { 'bibtexkey': rec.id }, } for source, target in FIELD_MAP.items(): value = rec.get(source) if value: value = unescape(value) if target: kw[target] = CONVERTER.get(source, lambda x: x)(value) else: kw['jsondata'][source] = value # try to extract numeric year, startpage, endpage, numberofpages, ... if rec.get('numberofpages'): try: kw['pages_int'] = int(rec.get('numberofpages').strip()) except ValueError: pass if kw.get('year'): match = YEAR_PATTERN.search(kw.get('year')) if match: kw['year_int'] = int(match.group('year')) if kw.get('publisher'): p = kw.get('publisher') if ':' in p: address, publisher = [ s.strip() for s in kw['publisher'].split(':', 1) ] if not 'address' in kw or kw['address'] == address: kw['address'], kw['publisher'] = address, publisher if kw.get('pages'): pages = kw.get('pages') match = ROMANPAGESPATTERNra.search(pages) if not match: match = ROMANPAGESPATTERNar.search(pages) if match: if 'pages_int' not in kw: kw['pages_int'] = roman_to_int(match.group('roman')) \ + int(match.group('arabic')) else: start = None number = None match = None for match in PAGES_PATTERN.finditer(pages): if start is None: start = int(match.group('start')) number = (number or 0) \ + (int(match.group('end')) - int(match.group('start')) + 1) if match: kw['endpage_int'] = int(match.group('end')) kw['startpage_int'] = start kw.setdefault('pages_int', number) else: try: kw['startpage_int'] = int(pages) except ValueError: pass if update: for k in kw.keys(): if k == 'pk': continue #if k == 'title': # v = ref.title or ref.description #else: if 1: v = getattr(ref, k) if kw[k] != v: # # TODO! # setattr(ref, k, kw[k]) #if k not in ['jsondata', 'publisher']: # print k, ref.pk # print kw[k] # print v # print '--------------' changed = True if ref.title: ref.description = ref.title else: changed = True ref = Ref(**kw) def append(attr, obj): if obj and obj not in attr: changed = True # # TODO! # attr.append(obj) for name in set( filter(None, [ s.strip() for s in kw['jsondata'].get( 'macro_area', '').split(',') ])): append(ref.macroareas, macroarea_map[name]) for name in set( filter(None, [ s.strip() for s in kw['jsondata'].get('src', '').split(',') ])): append(ref.providers, provider_map[slug(name)]) for m in DOCTYPE_PATTERN.finditer(kw['jsondata'].get('hhtype', '')): append(ref.doctypes, doctype_map[m.group('name')]) if len(kw['jsondata'].get('lgcode', '')) == 3: kw['jsondata']['lgcode'] = '[%s]' % kw['jsondata']['lgcode'] for m in CODE_PATTERN.finditer(kw['jsondata'].get('lgcode', '')): for code in set(m.group('code').split(',')): if code not in languoid_map: if code not in ['NOCODE_Payagua', 'emx']: print '--> unknown code:', code.encode('utf8') else: append(ref.languages, languoid_map[code]) for glottocode in filter( None, kw['jsondata'].get('alnumcodes', '').split(';')): if glottocode not in languoid_map: print '--> unknown glottocode:', glottocode.encode('utf8') else: append(ref.languages, languoid_map[glottocode]) if not update: #pass # # TODO! # DBSession.add(ref) if i % 100 == 0: print i, 'records done' if changed: count += 1 print count, 'records updated or imported' print skipped, 'records skipped because of lack of information'
def update(args): author = 'ISO 639-3 Registration Authority' pid = 'iso6393' dtid = 'overview' dt = Doctype.get(dtid) provider = Provider.get(pid, default=None) if provider is None: provider = Provider( id=pid, abbr=pid, name=author, description= "Change requests submitted to the ISO 639-3 registration authority." ) iid = max( int( DBSession.execute( "select max(cast(id as integer)) from source").fetchone()[0]), 500000) pk = int(DBSession.execute("select max(pk) from source").fetchone()[0]) for crno, affected in args.json['changerequests'].items(): year, serial = crno.split('-') title = 'Change Request Number %s' % crno ref = Ref.get(title, key='title', default=None) if not ref: iid += 1 pk += 1 ref = Ref(pk=pk, id=str(iid), name='%s %s' % (author, year), bibtex_type=EntryType.misc, number=crno, description=title, year=year, year_int=int(year), title=title, author=author, address='Dallas', publisher='SIL International', url='http://www.sil.org/iso639-3/cr_files/%s.pdf' % crno, language_note=', '.join( '%(Language Name)s [%(Affected Identifier)s]' % spec for spec in affected), jsondata=dict(hhtype=dtid, src=pid)) ref.doctypes.append(dt) ref.providers.append(provider) for spec in affected: lang = Languoid.get(spec['Affected Identifier'], key='hid', default=None) if lang and lang not in ref.languages: ref.languages.append(lang) DBSession.add(ref) transaction.commit() transaction.begin() matched = 0 near = 0 max_identifier_pk = DBSession.query(Identifier.pk).order_by( desc(Identifier.pk)).first()[0] families = [] for family in DBSession.query(Languoid)\ .filter(Languoid.level == LanguoidLevel.family)\ .filter(Language.active == True)\ .all(): isoleafs = set() for row in DBSession.query(TreeClosureTable.child_pk, Languoid.hid)\ .filter(family.pk == TreeClosureTable.parent_pk)\ .filter(Languoid.pk == TreeClosureTable.child_pk)\ .filter(Languoid.hid != None)\ .filter(Languoid.level == LanguoidLevel.language)\ .filter(Languoid.status == LanguoidStatus.established)\ .all(): if len(row[1]) == 3: isoleafs.add(row[1]) families.append((family, isoleafs)) families = sorted(families, key=lambda p: len(p[1])) for mid, leafs in args.json['macrolanguages'].items(): leafs = set(leafs) found = False for family, isoleafs in families: if leafs == isoleafs: if mid not in [ c.name for c in family.identifiers if c.type == IdentifierType.iso.value ]: family.codes.append( Identifier(id=str(max_identifier_pk + 1), name=mid, type=IdentifierType.iso.value)) max_identifier_pk += 1 matched += 1 found = True break elif leafs.issubset(isoleafs): print '~~~', family.name, '-->', mid, 'distance:', len( leafs), len(isoleafs) near += 1 found = True break if not found: print '---', mid, leafs print 'matched', matched, 'of', len( args.json['macrolanguages']), 'macrolangs' print near
def main(args): # pragma: no cover stats = Counter(new=0, updated=0, skipped=0) changes = {} with transaction.manager: update_providers(args) DBSession.flush() provider_map = get_map(Provider) macroarea_map = get_map(Macroarea) doctype_map = get_map(Doctype) languoid_map = {} for l in DBSession.query(Languoid): if l.hid: languoid_map[l.hid] = l languoid_map[l.id] = l for i, rec in enumerate(get_bib(args)): if i and i % 1000 == 0: print i, 'records done', stats['updated'] + stats['new'], 'changed' if len(rec.keys()) < 6: # not enough information! stats.update(['skipped']) continue changed = False assert rec.get('glottolog_ref_id') id_ = int(rec.get('glottolog_ref_id')) ref = DBSession.query(Source).get(id_) update = True if ref else False kw = { 'pk': id_, 'bibtex_type': rec.genre, 'id': str(id_), 'jsondata': {'bibtexkey': rec.id}, } for source, target in FIELD_MAP.items(): if target is None: continue value = rec.get(source) if value: value = unescape(value) if target: kw[target] = CONVERTER.get(source, lambda x: x)(value) else: kw['jsondata'][source] = value if kw['jsondata'].get('hhtype'): trigger = ca_trigger(kw['jsondata']['hhtype']) if trigger: kw['ca_doctype_trigger'], kw['jsondata']['hhtype'] = trigger # try to extract numeric year, startpage, endpage, numberofpages, ... if kw.get('year'): # prefer years in brackets over the first 4-digit number. match = PREF_YEAR_PATTERN.search(kw.get('year')) if match: kw['year_int'] = int(match.group('year')) else: match = YEAR_PATTERN.search(kw.get('year')) if match: kw['year_int'] = int(match.group('year')) if kw.get('publisher'): p = kw.get('publisher') if ':' in p: address, publisher = [s.strip() for s in kw['publisher'].split(':', 1)] if 'address' not in kw or kw['address'] == address: kw['address'], kw['publisher'] = address, publisher if rec.get('numberofpages'): try: kw['pages_int'] = int(rec.get('numberofpages').strip()) except ValueError: pass if kw.get('pages'): start, end, number = compute_pages(kw['pages']) if start is not None: kw['startpage_int'] = start if end is not None: kw['endpage_int'] = end if number is not None and 'pages_int' not in kw: kw['pages_int'] = number for k in kw.keys(): v = kw[k] if isinstance(v, basestring): v = v.strip() or None kw[k] = v if update: for k in kw.keys(): if k == 'pk': continue v = getattr(ref, k) if kw[k] != v: if k == 'jsondata': d = {k: v for k, v in ref.jsondata.items() if k in NONREF_JSONDATA} d.update(**kw[k]) ref.jsondata = d else: #print k, '--', v #print k, '++', kw[k] setattr(ref, k, kw[k]) changed = True if ref.id in changes: changes[ref.id][k] = ('%s' % v, '%s' % kw[k]) else: changes[ref.id] = {k: ('%s' % v, '%s' % kw[k])} else: changed = True ref = Ref(name='%s %s' % (kw.get('author', 'na'), kw.get('year', 'nd')), **kw) ref.description = ref.title or ref.booktitle originator = ref.author or ref.editor or 'Anonymous' ref.name = '%s %s' % (originator, ref.year or 'n.d.') a, r = update_relationship( ref.macroareas, [macroarea_map[name] for name in set(filter(None, [s.strip() for s in kw['jsondata'].get('macro_area', '').split(',')]))]) changed = changed or a or r src = [s.strip() for s in kw['jsondata'].get('src', '').split(',')] prv = {provider_map[slug(s)] for s in src if s} if set(ref.providers) != prv: ref.providers = list(prv) changed = True a, r = update_relationship( ref.doctypes, [doctype_map[m.group('name')] for m in DOCTYPE_PATTERN.finditer(kw['jsondata'].get('hhtype', ''))]) changed = changed or a or r if not update: stats.update(['new']) DBSession.add(ref) elif changed: stats.update(['updated']) args.log.info('%s' % stats) DBSession.execute("update source set description = title where description is null and title is not null;") DBSession.execute("update source set description = booktitle where description is null and booktitle is not null;") for row in list(DBSession.execute( "select pk, pages, pages_int, startpage_int from source where pages_int < 0")): pk, pages, number, start = row _start, _end, _number = compute_pages(pages) if _number > 0 and _number != number: DBSession.execute( "update source set pages_int = %s, startpage_int = %s where pk = %s" % (_number, _start, pk)) DBSession.execute( "update ref set endpage_int = %s where pk = %s" % (_end, pk)) jsondump(changes, args.data_dir.joinpath('references', 'changes.json'))