def macroareas(args, languages, stats):
    ma_map = get_map(Macroarea)

    # we store references to languages to make computation of cumulated
    # macroareas for families easier
    lang_map = {}
    for hid, info in get_lginfo(args, lambda x: x.macro_area):
        if hid not in languages:
            languages[hid] = Languoid.get(hid, key='hid', default=None)
        if not languages[hid]:
            continue
        lang_map[languages[hid].pk] = languages[hid]
        a, r = update_relationship(languages[hid].macroareas, [ma_map[info.macro_area]])
        if a or r:
            stats.update(['macroarea'])

    for family in DBSession.query(Languoid)\
            .filter(Languoid.level == LanguoidLevel.family)\
            .filter(Language.active == true()):
        mas = []
        for lang in DBSession.query(TreeClosureTable.child_pk)\
                .filter(TreeClosureTable.parent_pk == family.pk):
            if lang[0] in lang_map:
                mas.extend(lang_map[lang[0]].macroareas)
        a, r = update_relationship(family.macroareas, mas)
        if a or r:
            stats.update(['macroarea'])
    args.log.info('macroareas done')
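# update_relationship is used throughout this module but not defined in this
# excerpt. From the call sites it is assumed to sync a collection
# relationship to a target list and report what changed. A minimal sketch of
# that assumed contract follows; the name and signature are illustrative,
# not the actual helper.
def update_relationship_sketch(collection, target):
    """Make ``collection`` contain exactly the items in ``target``.

    Returns (added, removed), so callers can do ``a, r = ...`` and treat
    truthiness as "something changed".
    """
    added, removed = [], []
    for item in list(collection):
        if item not in target:
            collection.remove(item)
            removed.append(item)
    for item in target:
        if item not in collection:
            collection.append(item)
            added.append(item)
    return added, removed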
def update_providers(args, verbose=False):
    filepath = args.data_dir.joinpath('references', 'bibtex', 'BIBFILES.ini')
    p = RawConfigParser()
    with io.open(filepath, encoding='utf-8-sig') as fp:
        p.readfp(fp)

    provider_map = get_map(Provider)
    for section in p.sections():
        sectname = section[:-4] if section.endswith('.bib') else section
        id_ = slug(sectname)
        attrs = {
            'name': p.get(section, 'title'),
            'description': p.get(section, 'description'),
            'abbr': p.get(section, 'abbr'),
        }
        if id_ in provider_map:
            provider = provider_map[id_]
            for a in list(attrs):
                before, after = getattr(provider, a), attrs[a]
                if before == after:
                    del attrs[a]
                else:
                    setattr(provider, a, after)
                    attrs[a] = (before, after)
            if attrs:
                args.log.info('updating provider %s %s' % (slug(id_), sorted(attrs)))
                if verbose:
                    for a, (before, after) in attrs.items():
                        before, after = (' '.join(_.split()) for _ in (before, after))
                        if before != after:
                            args.log.info('%s\n%r\n%r' % (a, before, after))
        else:
            args.log.info('adding provider %s' % slug(id_))
            DBSession.add(Provider(id=id_, **attrs))
def update_providers(args):
    if not args.data_file(args.version, 'provider.txt').exists():
        return

    with open(args.data_file(args.version, 'provider.txt')) as fp:
        content = fp.read().decode('latin1')

    if '\r\n' in content:
        content = content.replace('\r\n', '\n')

    provider_map = get_map(Provider)
    for block in content.split('\n\n\n\n'):
        lines = block.split('\n')
        id_, abbr = lines[0].strip().split(':')
        id_ = id_.split('.')[0]
        description = unescape('\n'.join(lines[1:]))
        name = description.split('.')[0]
        if id_ == 'hedvig-tirailleur':
            id_ = u'skirgard'
        if slug(id_) not in provider_map:
            args.log.info('adding provider %s' % slug(id_))
            DBSession.add(
                Provider(id=slug(id_), name=name, description=description, abbr=abbr))
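# Both update_providers variants above rely on two helpers that are not
# shown in this excerpt. The sketches below illustrate the assumed contracts
# only; names suffixed with _sketch are hypothetical, and the real helpers
# may key or normalize differently.

def get_map_sketch(model):
    # map each row's id to the instance, so callers can test membership
    # and fetch existing records without further queries
    return dict((obj.id, obj) for obj in DBSession.query(model))


def slug_sketch(s):
    # lowercase ascii letters and digits only, clldutils-style
    return ''.join(c for c in s.lower() if c.isalnum() and ord(c) < 128)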
def macroareas(args, languages):
    ma_map = get_map(Macroarea)

    # we store references to languages to make computation of cumulated
    # macroareas for families easier
    lang_map = {}
    for hid, macroarea in dsv.reader(args.data_file("macroareas.tab")):
        if hid not in languages:
            languages[hid] = Languoid.get(hid, key="hid", default=None)
        if not languages[hid]:
            continue
        lang_map[languages[hid].pk] = languages[hid]
        update_relationship(languages[hid].macroareas, [ma_map[macroarea]], log=args.log)

    for family in (
        DBSession.query(Languoid)
        .filter(Languoid.level == LanguoidLevel.family)
        .filter(Language.active == True)
    ):
        mas = []
        for lang in DBSession.query(TreeClosureTable.child_pk).filter(
                TreeClosureTable.parent_pk == family.pk):
            if lang[0] in lang_map:
                mas.extend(lang_map[lang[0]].macroareas)
        update_relationship(family.macroareas, mas, log=args.log)

    print "macroareas done"
def main(args):  # pragma: no cover
    with transaction.manager:
        max_identifier_pk = DBSession.query(
            Identifier.pk).order_by(desc(Identifier.pk)).first()[0]

        ma_map = get_map(Macroarea)
        languoids = dict((l.pk, l) for l in DBSession.query(Languoid))
        with open(args.data_file('languoids.json')) as fp:
            for attrs in json.load(fp):
                ma = attrs.pop('macroarea', None)
                replacement = attrs.pop('replacement', None)
                hname = attrs.pop('hname', None)

                l = languoids.get(attrs['pk'])
                if l:
                    # verify that the stored languoid matches the dump
                    for k, v in attrs.items():
                        if k == 'globalclassificationcomment':
                            continue
                        cv = getattr(l, k)
                        if isinstance(cv, EnumSymbol):
                            cv = cv.value
                        assert v == cv
                        #setattr(l, k, v)
                    if len(l.hid or '') == 3:
                        assert l.iso_code
                        #if not l.iso_code:
                        #    l.identifiers.append(
                        #        Identifier(
                        #            id=str(max_identifier_pk + 1),
                        #            name=l.hid,
                        #            type=IdentifierType.iso.value))
                        #    max_identifier_pk += 1
                else:
                    # creation of new languoids is disabled in this variant;
                    # everything below the raise is unreachable.
                    raise ValueError()
                    try:
                        l = Languoid(**attrs)
                    except Exception:
                        print attrs
                        raise
                    DBSession.add(l)
                    languoids[l.pk] = l

                    if len(attrs.get('hid', '')) == 3:
                        l.identifiers.append(
                            Identifier(
                                id=str(max_identifier_pk + 1),
                                name=attrs['hid'],
                                type=IdentifierType.iso.value))
                        max_identifier_pk += 1

                    if ma:
                        l.macroareas.append(ma_map[ma])

                    l.identifiers.append(
                        Identifier(
                            id=str(max_identifier_pk + 1),
                            name=l.name,
                            description='Glottolog',
                            type='name'))
                    max_identifier_pk += 1

                if hname:
                    assert l.jsondata['hname'] == hname
                    #l.hname = hname

                if replacement:
                    raise ValueError()
                    DBSession.add(Superseded(
                        languoid_pk=l.pk,
                        replacement_pk=replacement,
                        relation='classification update'))
def main(args):  # pragma: no cover
    global MAX_IDENTIFIER_PK
    with transaction.manager:
        MAX_IDENTIFIER_PK = DBSession.query(
            Identifier.pk).order_by(desc(Identifier.pk)).first()[0]

        gc_names = {i.name: i for i in DBSession.query(Identifier).filter(
            Identifier.type == 'name').filter(Identifier.description == 'Glottolog')}

        ma_map = get_map(Macroarea)
        languoids = dict((l.pk, l) for l in DBSession.query(Languoid))
        with open(args.data_file(args.version, 'languoids.json')) as fp:
            for attrs in json.load(fp):
                ma = attrs.pop('macroarea', None)
                replacement = attrs.pop('replacement', None)
                hname = attrs.pop('hname', None)

                for name, enum in [('level', LanguoidLevel), ('status', LanguoidStatus)]:
                    if name in attrs:
                        attrs[name] = enum.from_string(attrs[name])

                l = languoids.get(attrs['pk'])
                if l:
                    for k, v in attrs.items():
                        if k == 'globalclassificationcomment':
                            continue
                        setattr(l, k, v)
                    if len(l.hid or '') == 3:
                        if not l.iso_code:
                            create_identifier(
                                None, l, name=l.hid, type=IdentifierType.iso.value)
                else:
                    l = Languoid(**attrs)
                    DBSession.add(l)
                    languoids[l.pk] = l

                    if len(attrs.get('hid', '')) == 3:
                        create_identifier(
                            None, l, name=attrs['hid'], type=IdentifierType.iso.value)

                if ma:
                    l.macroareas.append(ma_map[ma])

                create_identifier(
                    gc_names.get(l.name), l,
                    name=l.name, description='Glottolog', type='name')

                if hname:
                    l.update_jsondata(hname=hname)

                if replacement:
                    DBSession.add(Superseded(
                        languoid_pk=l.pk,
                        replacement_pk=replacement,
                        relation='classification update'))

        DBSession.flush()
        recreate_treeclosure()
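# create_identifier is not defined in this excerpt. Judging from its call
# sites (an existing Identifier or None, the languoid, plus keyword fields)
# and from the older variant of main() above, something like the following
# is assumed: reuse the passed identifier if one exists, otherwise mint a
# new one from the MAX_IDENTIFIER_PK counter. This is an illustrative
# sketch, not the actual helper.
def create_identifier_sketch(identifier, l, **kw):
    global MAX_IDENTIFIER_PK
    if identifier is None:
        MAX_IDENTIFIER_PK += 1
        identifier = Identifier(id=str(MAX_IDENTIFIER_PK), **kw)
    if identifier not in l.identifiers:
        l.identifiers.append(identifier)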
def main(args):  # pragma: no cover
    bib = Database.from_file(args.data_file(args.version, 'refs.bib'), encoding='utf8')
    count = 0
    skipped = 0
    changes = {}

    with transaction.manager:
        update_providers(args)
        DBSession.flush()
        provider_map = get_map(Provider)
        macroarea_map = get_map(Macroarea)
        doctype_map = get_map(Doctype)

        known_ids = set(r[0] for r in DBSession.query(Ref.pk))

        languoid_map = {}
        for l in DBSession.query(Languoid):
            if l.hid:
                languoid_map[l.hid] = l
            languoid_map[l.id] = l

        for i, rec in enumerate(bib):
            if i and i % 1000 == 0:
                print i, 'records done', count, 'changed'

            if len(rec.keys()) < 6:
                # not enough information!
                skipped += 1
                continue

            changed = False
            assert rec.get('glottolog_ref_id')
            id_ = int(rec.get('glottolog_ref_id'))
            if args.mode != 'update' and id_ in known_ids:
                continue

            ref = DBSession.query(Source).get(id_)
            update = True if ref else False

            kw = {
                'pk': id_,
                'bibtex_type': rec.genre,
                'id': str(id_),
                'jsondata': {'bibtexkey': rec.id},
            }

            for source, target in FIELD_MAP.items():
                if target is None:
                    continue
                value = rec.get(source)
                if value:
                    value = unescape(value)
                    if target:
                        kw[target] = CONVERTER.get(source, lambda x: x)(value)
                    else:
                        kw['jsondata'][source] = value

            if kw['jsondata'].get('hhtype'):
                trigger = ca_trigger(kw['jsondata']['hhtype'])
                if trigger:
                    kw['ca_doctype_trigger'], kw['jsondata']['hhtype'] = trigger

            # try to extract numeric year, startpage, endpage, numberofpages, ...
            if kw.get('year'):
                # prefer years in brackets over the first 4-digit number.
                match = PREF_YEAR_PATTERN.search(kw.get('year'))
                if match:
                    kw['year_int'] = int(match.group('year'))
                else:
                    match = YEAR_PATTERN.search(kw.get('year'))
                    if match:
                        kw['year_int'] = int(match.group('year'))

            if kw.get('publisher'):
                p = kw.get('publisher')
                if ':' in p:
                    address, publisher = [s.strip() for s in kw['publisher'].split(':', 1)]
                    if 'address' not in kw or kw['address'] == address:
                        kw['address'], kw['publisher'] = address, publisher

            if rec.get('numberofpages'):
                try:
                    kw['pages_int'] = int(rec.get('numberofpages').strip())
                except ValueError:
                    pass

            if kw.get('pages'):
                start, end, number = compute_pages(kw['pages'])
                if start is not None:
                    kw['startpage_int'] = start
                if end is not None:
                    kw['endpage_int'] = end
                if number is not None and 'pages_int' not in kw:
                    kw['pages_int'] = number

            if update:
                for k in kw.keys():
                    if k == 'pk':
                        continue
                    v = getattr(ref, k)
                    if kw[k] != v:
                        if k == 'jsondata':
                            d = ref.jsondata or {}
                            d.update(**kw[k])
                            for s, t in FIELD_MAP.items():
                                if t is None and s in d:
                                    del d[s]
                            ref.jsondata = d
                        else:
                            print k, '--', v
                            print k, '++', kw[k]
                            setattr(ref, k, kw[k])
                            changed = True
                            if ref.id in changes:
                                changes[ref.id][k] = ('%s' % v, '%s' % kw[k])
                            else:
                                changes[ref.id] = {k: ('%s' % v, '%s' % kw[k])}
            else:
                changed = True
                ref = Ref(name='%s %s' % (kw.get('author', 'na'), kw.get('year', 'nd')), **kw)

            ref.description = ref.title or ref.booktitle
            ref.name = '%s %s' % (ref.author or 'n.a.', ref.year or 'n.d.')

            def append(attr, obj):
                if obj and obj not in attr:
                    attr.append(obj)
                    return True

            a, r = update_relationship(
                ref.macroareas,
                [macroarea_map[name] for name in set(filter(None, [
                    s.strip()
                    for s in kw['jsondata'].get('macro_area', '').split(',')]))])
            changed = changed or a or r

            for name in set(filter(None, [
                    s.strip() for s in kw['jsondata'].get('src', '').split(',')])):
                result = append(ref.providers, provider_map[slug(name)])
                changed = changed or result

            a, r = update_relationship(
                ref.doctypes,
                [doctype_map[m.group('name')]
                 for m in DOCTYPE_PATTERN.finditer(kw['jsondata'].get('hhtype', ''))])
            changed = changed or a or r

            if not update:
                DBSession.add(ref)

            if changed:
                count += 1

            ref.doctypes_str = ', '.join(o.id for o in ref.doctypes)
            ref.providers_str = ', '.join(o.id for o in ref.providers)

        print count, 'records updated or imported'
        print skipped, 'records skipped because of lack of information'

        DBSession.execute("update source set description = title where description is null and title is not null;")
        DBSession.execute("update source set description = booktitle where description is null and booktitle is not null;")

        for row in list(DBSession.execute(
                "select pk, pages, pages_int, startpage_int from source where pages_int < 0")):
            pk, pages, number, start = row
            _start, _end, _number = compute_pages(pages)
            if _number > 0 and _number != number:
                DBSession.execute(
                    "update source set pages_int = %s, startpage_int = %s where pk = %s",
                    (_number, _start, pk))
                DBSession.execute(
                    "update ref set endpage_int = %s where pk = %s",
                    (_end, pk))

    return changes
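# compute_pages is used above to derive numeric page information from
# free-form BibTeX "pages" values such as '33-45' or 'i-xx, 125-130'. A
# simplified sketch of the assumed contract follows, returning the triple
# (startpage, endpage, numberofpages) with None for anything that cannot be
# derived; the real helper also handles roman numerals and messier input.
import re

PAGES_RANGE_SKETCH = re.compile(r'(?P<start>[0-9]+)\s*-+\s*(?P<end>[0-9]+)')


def compute_pages_sketch(pages):
    start, end, number = None, None, None
    for match in PAGES_RANGE_SKETCH.finditer(pages):
        s, e = int(match.group('start')), int(match.group('end'))
        if start is None:
            start = s
        end = e
        number = (number or 0) + (e - s + 1)
    if start is None:
        # a plain number is treated as a single start page
        try:
            start = int(pages.strip())
        except ValueError:
            pass
    return start, end, number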
def main(bib, mode):  # pragma: no cover
    count = 0
    skipped = 0

    with transaction.manager:
        provider_map = get_map(Provider)
        macroarea_map = get_map(Macroarea)
        doctype_map = get_map(Doctype)

        known_ids = set(r[0] for r in DBSession.query(Ref.pk))

        languoid_map = {}
        for l in DBSession.query(Languoid):
            if l.hid:
                languoid_map[l.hid] = l
            languoid_map[l.id] = l

        for i, rec in enumerate(bib):
            if len(rec.keys()) < 6:
                skipped += 1
                #print '---> skip', rec.id
                #print rec
                continue

            changed = False
            assert rec.get('glottolog_ref_id')
            id_ = int(rec.get('glottolog_ref_id'))
            if mode != 'update' and id_ in known_ids:
                continue

            ref = DBSession.query(Source).get(id_)
            update = True if ref else False

            kw = {
                'pk': id_,
                'bibtex_type': getattr(EntryType, rec.genre),
                'id': str(id_),
                'jsondata': {'bibtexkey': rec.id},
            }

            for source, target in FIELD_MAP.items():
                value = rec.get(source)
                if value:
                    value = unescape(value)
                    if target:
                        kw[target] = CONVERTER.get(source, lambda x: x)(value)
                    else:
                        kw['jsondata'][source] = value

            # try to extract numeric year, startpage, endpage, numberofpages, ...
            if rec.get('numberofpages'):
                try:
                    kw['pages_int'] = int(rec.get('numberofpages').strip())
                except ValueError:
                    pass

            if kw.get('year'):
                match = YEAR_PATTERN.search(kw.get('year'))
                if match:
                    kw['year_int'] = int(match.group('year'))

            if kw.get('publisher'):
                p = kw.get('publisher')
                if ':' in p:
                    address, publisher = [
                        s.strip() for s in kw['publisher'].split(':', 1)]
                    if 'address' not in kw or kw['address'] == address:
                        kw['address'], kw['publisher'] = address, publisher

            if kw.get('pages'):
                pages = kw.get('pages')
                match = ROMANPAGESPATTERNra.search(pages)
                if not match:
                    match = ROMANPAGESPATTERNar.search(pages)
                if match:
                    if 'pages_int' not in kw:
                        kw['pages_int'] = roman_to_int(match.group('roman')) \
                            + int(match.group('arabic'))
                else:
                    start = None
                    number = None
                    match = None
                    for match in PAGES_PATTERN.finditer(pages):
                        if start is None:
                            start = int(match.group('start'))
                        number = (number or 0) \
                            + (int(match.group('end')) - int(match.group('start')) + 1)
                    if match:
                        kw['endpage_int'] = int(match.group('end'))
                        kw['startpage_int'] = start
                        kw.setdefault('pages_int', number)
                    else:
                        try:
                            kw['startpage_int'] = int(pages)
                        except ValueError:
                            pass

            if update:
                for k in kw.keys():
                    if k == 'pk':
                        continue
                    #if k == 'title':
                    #    v = ref.title or ref.description
                    #else:
                    if 1:
                        v = getattr(ref, k)
                    if kw[k] != v:
                        #
                        # TODO!
                        #
                        # setattr(ref, k, kw[k])
                        #if k not in ['jsondata', 'publisher']:
                        #    print k, ref.pk
                        #    print kw[k]
                        #    print v
                        #    print '--------------'
                        changed = True
                if ref.title:
                    ref.description = ref.title
            else:
                changed = True
                ref = Ref(**kw)

            def append(attr, obj):
                if obj and obj not in attr:
                    # NB: this assignment only creates a local inside append;
                    # Python 2 has no nonlocal, so the enclosing 'changed'
                    # flag is not actually updated while the append below
                    # stays disabled.
                    changed = True
                    #
                    # TODO!
                    #
                    # attr.append(obj)

            for name in set(filter(None, [
                    s.strip()
                    for s in kw['jsondata'].get('macro_area', '').split(',')])):
                append(ref.macroareas, macroarea_map[name])

            for name in set(filter(None, [
                    s.strip() for s in kw['jsondata'].get('src', '').split(',')])):
                append(ref.providers, provider_map[slug(name)])

            for m in DOCTYPE_PATTERN.finditer(kw['jsondata'].get('hhtype', '')):
                append(ref.doctypes, doctype_map[m.group('name')])

            if len(kw['jsondata'].get('lgcode', '')) == 3:
                kw['jsondata']['lgcode'] = '[%s]' % kw['jsondata']['lgcode']

            for m in CODE_PATTERN.finditer(kw['jsondata'].get('lgcode', '')):
                for code in set(m.group('code').split(',')):
                    if code not in languoid_map:
                        if code not in ['NOCODE_Payagua', 'emx']:
                            print '--> unknown code:', code.encode('utf8')
                    else:
                        append(ref.languages, languoid_map[code])

            for glottocode in filter(
                    None, kw['jsondata'].get('alnumcodes', '').split(';')):
                if glottocode not in languoid_map:
                    print '--> unknown glottocode:', glottocode.encode('utf8')
                else:
                    append(ref.languages, languoid_map[glottocode])

            if not update:
                pass
                #
                # TODO!
                #
                # DBSession.add(ref)

            if i % 100 == 0:
                print i, 'records done'

            if changed:
                count += 1

        print count, 'records updated or imported'
        print skipped, 'records skipped because of lack of information'
def main(args):  # pragma: no cover
    stats = Counter(new=0, updated=0, skipped=0)
    changes = {}

    with transaction.manager:
        update_providers(args)
        DBSession.flush()
        provider_map = get_map(Provider)
        macroarea_map = get_map(Macroarea)
        doctype_map = get_map(Doctype)

        languoid_map = {}
        for l in DBSession.query(Languoid):
            if l.hid:
                languoid_map[l.hid] = l
            languoid_map[l.id] = l

        for i, rec in enumerate(get_bib(args)):
            if i and i % 1000 == 0:
                print i, 'records done', stats['updated'] + stats['new'], 'changed'

            if len(rec.keys()) < 6:
                # not enough information!
                stats.update(['skipped'])
                continue

            changed = False
            assert rec.get('glottolog_ref_id')
            id_ = int(rec.get('glottolog_ref_id'))

            ref = DBSession.query(Source).get(id_)
            update = True if ref else False

            kw = {
                'pk': id_,
                'bibtex_type': rec.genre,
                'id': str(id_),
                'jsondata': {'bibtexkey': rec.id},
            }

            for source, target in FIELD_MAP.items():
                if target is None:
                    continue
                value = rec.get(source)
                if value:
                    value = unescape(value)
                    if target:
                        kw[target] = CONVERTER.get(source, lambda x: x)(value)
                    else:
                        kw['jsondata'][source] = value

            if kw['jsondata'].get('hhtype'):
                trigger = ca_trigger(kw['jsondata']['hhtype'])
                if trigger:
                    kw['ca_doctype_trigger'], kw['jsondata']['hhtype'] = trigger

            # try to extract numeric year, startpage, endpage, numberofpages, ...
            if kw.get('year'):
                # prefer years in brackets over the first 4-digit number.
                match = PREF_YEAR_PATTERN.search(kw.get('year'))
                if match:
                    kw['year_int'] = int(match.group('year'))
                else:
                    match = YEAR_PATTERN.search(kw.get('year'))
                    if match:
                        kw['year_int'] = int(match.group('year'))

            if kw.get('publisher'):
                p = kw.get('publisher')
                if ':' in p:
                    address, publisher = [s.strip() for s in kw['publisher'].split(':', 1)]
                    if 'address' not in kw or kw['address'] == address:
                        kw['address'], kw['publisher'] = address, publisher

            if rec.get('numberofpages'):
                try:
                    kw['pages_int'] = int(rec.get('numberofpages').strip())
                except ValueError:
                    pass

            if kw.get('pages'):
                start, end, number = compute_pages(kw['pages'])
                if start is not None:
                    kw['startpage_int'] = start
                if end is not None:
                    kw['endpage_int'] = end
                if number is not None and 'pages_int' not in kw:
                    kw['pages_int'] = number

            for k in kw.keys():
                v = kw[k]
                if isinstance(v, basestring):
                    v = v.strip() or None
                kw[k] = v

            if update:
                for k in kw.keys():
                    if k == 'pk':
                        continue
                    v = getattr(ref, k)
                    if kw[k] != v:
                        if k == 'jsondata':
                            d = {k: v for k, v in ref.jsondata.items()
                                 if k in NONREF_JSONDATA}
                            d.update(**kw[k])
                            ref.jsondata = d
                        else:
                            #print k, '--', v
                            #print k, '++', kw[k]
                            setattr(ref, k, kw[k])
                            changed = True
                            if ref.id in changes:
                                changes[ref.id][k] = ('%s' % v, '%s' % kw[k])
                            else:
                                changes[ref.id] = {k: ('%s' % v, '%s' % kw[k])}
            else:
                changed = True
                ref = Ref(name='%s %s' % (kw.get('author', 'na'), kw.get('year', 'nd')), **kw)

            ref.description = ref.title or ref.booktitle
            originator = ref.author or ref.editor or 'Anonymous'
            ref.name = '%s %s' % (originator, ref.year or 'n.d.')

            a, r = update_relationship(
                ref.macroareas,
                [macroarea_map[name] for name in set(filter(None, [
                    s.strip()
                    for s in kw['jsondata'].get('macro_area', '').split(',')]))])
            changed = changed or a or r

            src = [s.strip() for s in kw['jsondata'].get('src', '').split(',')]
            prv = {provider_map[slug(s)] for s in src if s}
            if set(ref.providers) != prv:
                ref.providers = list(prv)
                changed = True

            a, r = update_relationship(
                ref.doctypes,
                [doctype_map[m.group('name')]
                 for m in DOCTYPE_PATTERN.finditer(kw['jsondata'].get('hhtype', ''))])
            changed = changed or a or r

            if not update:
                stats.update(['new'])
                DBSession.add(ref)
            elif changed:
                stats.update(['updated'])

        args.log.info('%s' % stats)

        DBSession.execute("update source set description = title where description is null and title is not null;")
        DBSession.execute("update source set description = booktitle where description is null and booktitle is not null;")

        for row in list(DBSession.execute(
                "select pk, pages, pages_int, startpage_int from source where pages_int < 0")):
            pk, pages, number, start = row
            _start, _end, _number = compute_pages(pages)
            if _number > 0 and _number != number:
                DBSession.execute(
                    "update source set pages_int = %s, startpage_int = %s where pk = %s"
                    % (_number, _start, pk))
                DBSession.execute(
                    "update ref set endpage_int = %s where pk = %s" % (_end, pk))

    jsondump(changes, args.data_dir.joinpath('references', 'changes.json'))
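# ca_trigger, used by the import variants above, is assumed to split a
# "computerized assignment" annotation out of an hhtype value, returning
# (trigger_word, cleaned_hhtype) or None. The marker format below is an
# assumption for illustration only; the real helper may expect a different
# notation.
import re

CA_PATTERN_SKETCH = re.compile(r'\(computerized assignment from "(?P<trigger>[^"]+)"\)')


def ca_trigger_sketch(hhtype):
    match = CA_PATTERN_SKETCH.search(hhtype)
    if match:
        return match.group('trigger'), CA_PATTERN_SKETCH.sub('', hhtype).strip()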