def update_providers(args):
    """Sync the ``Provider`` table with ``provider.txt`` of a data release.

    ``args`` is the script's CLI args object exposing ``data_file``,
    ``version`` and ``log`` (project convention).  If the file is absent
    this is a no-op; providers already present (keyed by slug) are left
    untouched, only new ones are added to the session.
    """
    if not args.data_file(args.version, 'provider.txt').exists():
        return

    # NOTE(review): ``fp.read().decode('latin1')`` implies Python 2 byte
    # strings; the file is assumed to be latin-1 encoded.
    with open(args.data_file(args.version, 'provider.txt')) as fp:
        content = fp.read().decode('latin1')

    # normalize Windows line endings
    if '\r\n' in content:
        content = content.replace('\r\n', '\n')

    provider_map = get_map(Provider)  # existing providers keyed by id
    # provider records are separated by three blank lines, i.e. four '\n'
    for block in content.split('\n\n\n\n'):
        lines = block.split('\n')
        # first line has the form "<id>:<abbreviation>"
        id_, abbr = lines[0].strip().split(':')
        # drop any dotted suffix from the id (e.g. a file extension)
        id_ = id_.split('.')[0]
        description = unescape('\n'.join(lines[1:]))
        # the display name is the first sentence of the description
        name = description.split('.')[0]
        if id_ == 'hedvig-tirailleur':
            # historical rename of this one provider id
            id_ = u'skirgard'
        if slug(id_) not in provider_map:
            args.log.info('adding provider %s' % slug(id_))
            DBSession.add(
                Provider(id=slug(id_), name=name, description=description, abbr=abbr))
def split_families(fp):
    """generator for (node, leafs) pairs parsed from Harald's classification format.

    ``fp`` is an open file-like object.  Family header lines start at column
    0; leaf (language) lines are indented and look like ``Name [code]``.
    Each yielded item is ``[normalized_branch(header), {code: name, ...}]``.
    """
    def normalized_branch(line):
        """parse a line specifying a language family as comma separated list of ancestors.

        Returns a (branch, status, comment) triple.
        """
        branch = [unescape(n.strip().replace('_', ' ')) for n in line.split(',')]
        # map of top-level pseudo-family names to their canonical form
        name_map = {
            'Deaf Sign Language': 'Sign Languages',
            'Unclassifiable': 'Unclassified',
            'Artificial Language': 'Artificial Language',
            'Mixed Language': 'Mixed Language',
            'Pidgin': 'Pidgin',
            #'Unattested': 'Unattested',
        }
        if branch[0] in name_map:
            # collapse the ancestry to the canonical top-level name; the
            # remaining ancestors are preserved as a comment string
            return (
                [name_map[branch[0]]],
                'established' if branch[0] != 'Unattested' else 'unattested',
                ', '.join(branch[1:]))
        if branch[0] in ['Spurious', 'Speech Register', 'Unattested']:
            comment = ''
            if branch[0] == 'Speech Register':
                status = 'established'
                # NOTE(review): this comment is computed but never returned
                # (both returns below hardcode '') — looks like a latent bug.
                comment = 'speech register'
            else:
                status = branch[0].lower()
            if branch[0] == 'Unattested' and len(branch) == 1:
                # unattested languages without classification should not be treated as
                # isolates!
                branch[0] = 'Unclassified'
            else:
                # drop the pseudo-family marker and a possible 'Retired' level
                branch = branch[1:]
                if branch and branch[0] in ['Retired']:
                    status += ' retired'
                    branch = branch[1:]
            return branch, status, ''
        return branch, 'established', ''

    family = None
    for line in fp.read().split('\n'):
        if not line.strip():
            continue
        if line.startswith(' '):
            # leaf line: "Name [code]" — attach to the current family.
            # NOTE(review): raises TypeError if a leaf precedes any header.
            name, code = line.strip().split('[')
            code = code.split(']')[0].replace('\\', '').replace('"', '').replace("'", '')
            code = code.replace('NOCODE-', 'NOCODE_')
            assert code
            # codes are either ISO 639-3 (3 chars) or NOCODE_* placeholders
            assert len(code) == 3 or NOCODE_PATTERN.match(code)
            family[1][code] = unescape(name.strip().replace('_', ' '))
        else:
            # header line starts a new family; flush the previous one
            if family:
                yield family
            family = [normalized_branch(line), {}]
    # flush the last family (NOTE(review): yields None on empty input)
    yield family
def normalized_branch(line):
    """parse a line specifying a language family as comma separated list of ancestors.

    :param line: a classification header line; ancestors are comma separated,
        underscores stand for spaces.
    :return: a ``(branch, status, comment)`` triple where ``branch`` is the
        normalized ancestor list, ``status`` one of 'established',
        'unattested', 'spurious'[' retired'] and ``comment`` free text.
    """
    branch = [unescape(n.strip().replace('_', ' ')) for n in line.split(',')]
    # canonical names for top-level pseudo-families
    name_map = {
        'Deaf Sign Language': 'Sign Languages',
        'Unclassifiable': 'Unclassified',
        'Artificial Language': 'Artificial Language',
        'Mixed Language': 'Mixed Language',
        'Pidgin': 'Pidgin',
        #'Unattested': 'Unattested',
    }
    if branch[0] in name_map:
        # collapse to the canonical top-level name; remaining ancestors
        # are preserved in the comment slot
        return (
            [name_map[branch[0]]],
            'established' if branch[0] != 'Unattested' else 'unattested',
            ', '.join(branch[1:]))
    if branch[0] in ['Spurious', 'Speech Register', 'Unattested']:
        comment = ''
        if branch[0] == 'Speech Register':
            status = 'established'
            comment = 'speech register'
        else:
            status = branch[0].lower()
        if branch[0] == 'Unattested' and len(branch) == 1:
            # unattested languages without classification should not be treated as
            # isolates!
            branch[0] = 'Unclassified'
        else:
            # drop the pseudo-family marker and a possible 'Retired' level
            branch = branch[1:]
            if branch and branch[0] in ['Retired']:
                status += ' retired'
                branch = branch[1:]
        # FIX: return the computed comment instead of hardcoding '' —
        # previously the 'speech register' comment was silently dropped.
        return branch, status, comment
    return branch, 'established', ''
def normalized_branch(line):
    """parse a line specifying a language family as comma separated list of ancestors.

    :param line: a classification header line; ancestors are comma separated,
        underscores stand for spaces.
    :return: a ``(branch, status, comment)`` triple.
    """
    branch = [unescape(n.strip().replace('_', ' ')) for n in line.split(',')]
    # canonical names for top-level pseudo-families
    name_map = {
        'Deaf Sign Language': 'Sign Languages',
        'Unclassifiable': 'Unclassified',
        'Artificial Language': 'Artificial Language',
        'Mixed Language': 'Mixed Language',
        'Pidgin': 'Pidgin',
        #'Unattested': 'Unattested',
    }
    if branch[0] in name_map:
        # collapse to the canonical top-level name; remaining ancestors
        # are preserved in the comment slot
        return (
            [name_map[branch[0]]],
            'established' if branch[0] != 'Unattested' else 'unattested',
            ', '.join(branch[1:]))
    comment = ''
    if branch[0] in ['Spurious', 'Speech Register', 'Unattested']:
        if branch[0] == 'Speech Register':
            status = 'established'
            comment = 'speech register'
        else:
            status = branch[0].lower()
        if branch[0] == 'Unattested' and len(branch) == 1:
            # unattested languages without classification should not be treated as
            # isolates!
            branch[0] = 'Unclassified'
        else:
            # drop the pseudo-family marker and a possible 'Retired' level
            branch = branch[1:]
            if branch and branch[0] in ['Retired']:
                status += ' retired'
                branch = branch[1:]
        # FIX: return the computed comment instead of hardcoding '' — the
        # hoisted ``comment`` was previously only returned on the fall-through
        # path where it is always '', so 'speech register' was dropped.
        return branch, status, comment
    return branch, 'established', comment
def normalized_branch(line):
    """parse a line specifying a language family as comma separated list of ancestors."""
    # Top-level pseudo-family markers that get special handling.
    # NOTE: this is a set, despite the historical name used elsewhere.
    special = {
        'Unattested',  # keep top-level family as subfamily
        'Unclassifiable',  # keep top-level family as subfamily
        'Pidgin',  # keep top-level family as subfamily
        'Mixed Language',  # keep top-level family as subfamily
        'Artificial Language',  # keep top-level family as subfamily
        'Speech Register',  # keep top-level family as subfamily
        # FIXME: also 'Sign Language'?
        'Spurious',  # bookkeeping
        'Preliminary',
    }
    ancestors = [unescape(chunk.strip().replace('_', ' ')) for chunk in line.split(',')]
    head = ancestors[0]
    if head not in special:
        # ordinary classification line: keep the full ancestor chain
        return ancestors, 'established'

    rest = ancestors[1:]
    second = rest[0] if rest else None
    retired = head == 'Spurious' and second == 'Retired'
    # For non-bookkeeping pseudo-families, fold the second level into a
    # "<subfamily> (<family>)" label.
    subfamily = None
    if second is not None and head != 'Spurious':
        subfamily = '%s (%s)' % (second, head)

    if head in ('Spurious', 'Unattested'):
        status = head.lower()
        if retired:
            status += ' retired'
    else:
        status = 'established'

    top = BOOKKEEPING if head == 'Spurious' else head
    return nfilter([top, subfamily]), status
def normalized_branch(line):
    """parse a line specifying a language family as comma separated list of ancestors."""
    # markers recognized as top-level pseudo-families
    pseudo_families = {
        'Unattested',  # keep top-level family as subfamily
        'Unclassifiable',  # keep top-level family as subfamily
        'Pidgin',  # keep top-level family as subfamily
        'Mixed Language',  # keep top-level family as subfamily
        'Artificial Language',  # keep top-level family as subfamily
        'Speech Register',  # keep top-level family as subfamily
        # FIXME: also 'Sign Language'?
        'Spurious',  # bookkeeping
        'Preliminary',
    }
    parts = [unescape(part.strip().replace('_', ' ')) for part in line.split(',')]

    if parts[0] not in pseudo_families:
        # regular family line: return the ancestor chain unchanged
        return parts, 'established'

    family = parts[0]
    remainder = parts[1:]
    subfamily, is_retired = None, False
    if remainder:
        # there's a second level!
        if family != 'Spurious':
            subfamily = '%s (%s)' % (remainder[0], family)
        elif remainder[0] == 'Retired':
            is_retired = True

    status = 'established'
    if family in ('Spurious', 'Unattested'):
        status = family.lower()
        if is_retired:
            status += ' retired'

    if family == 'Spurious':
        # spurious languages live under the bookkeeping pseudo-family
        family = BOOKKEEPING
    return nfilter([family, subfamily]), status
def main(args):  # pragma: no cover
    """Import/update bibliographical records from ``refs.bib`` into the DB.

    Reads the release's BibTeX database, creates new ``Ref`` rows or updates
    existing ``Source`` rows in place, syncs the macroarea/provider/doctype
    relations and finally backfills description/page columns via raw SQL.

    :param args: CLI args object with ``data_file``, ``version``, ``mode``
        and ``log`` (project convention).
    :return: dict mapping ref id -> {field: (old, new)} of applied changes.
    """
    bib = Database.from_file(args.data_file(args.version, 'refs.bib'), encoding='utf8')
    count = 0
    skipped = 0
    changes = {}

    with transaction.manager:
        update_providers(args)
        DBSession.flush()
        provider_map = get_map(Provider)
        macroarea_map = get_map(Macroarea)
        doctype_map = get_map(Doctype)

        known_ids = set(r[0] for r in DBSession.query(Ref.pk))

        # languoids are looked up by harmonized id (hid) as well as by id
        languoid_map = {}
        for l in DBSession.query(Languoid):
            if l.hid:
                languoid_map[l.hid] = l
            languoid_map[l.id] = l

        for i, rec in enumerate(bib):
            if i and i % 1000 == 0:
                print i, 'records done', count, 'changed'

            if len(rec.keys()) < 6:
                # not enough information!
                skipped += 1
                continue

            changed = False
            assert rec.get('glottolog_ref_id')
            id_ = int(rec.get('glottolog_ref_id'))
            # outside of update mode, records already in the DB are skipped
            if args.mode != 'update' and id_ in known_ids:
                continue

            ref = DBSession.query(Source).get(id_)
            update = True if ref else False

            kw = {
                'pk': id_,
                'bibtex_type': rec.genre,
                'id': str(id_),
                'jsondata': {'bibtexkey': rec.id},
            }

            # map bibtex fields to model attributes (target truthy) or stash
            # them in jsondata (target falsy); target None means "drop".
            for source, target in FIELD_MAP.items():
                if target is None:
                    continue
                value = rec.get(source)
                if value:
                    value = unescape(value)
                    if target:
                        kw[target] = CONVERTER.get(source, lambda x: x)(value)
                    else:
                        kw['jsondata'][source] = value

            # a computerized-assignment trigger may be embedded in hhtype
            if kw['jsondata'].get('hhtype'):
                trigger = ca_trigger(kw['jsondata']['hhtype'])
                if trigger:
                    kw['ca_doctype_trigger'], kw['jsondata']['hhtype'] = trigger

            # try to extract numeric year, startpage, endpage, numberofpages, ...
            if kw.get('year'):
                # prefer years in brackets over the first 4-digit number.
                match = PREF_YEAR_PATTERN.search(kw.get('year'))
                if match:
                    kw['year_int'] = int(match.group('year'))
                else:
                    match = YEAR_PATTERN.search(kw.get('year'))
                    if match:
                        kw['year_int'] = int(match.group('year'))

            # split "Address: Publisher" into the two separate fields
            if kw.get('publisher'):
                p = kw.get('publisher')
                if ':' in p:
                    address, publisher = [s.strip() for s in kw['publisher'].split(':', 1)]
                    if not 'address' in kw or kw['address'] == address:
                        kw['address'], kw['publisher'] = address, publisher

            if rec.get('numberofpages'):
                try:
                    kw['pages_int'] = int(rec.get('numberofpages').strip())
                except ValueError:
                    pass

            if kw.get('pages'):
                start, end, number = compute_pages(kw['pages'])
                if start is not None:
                    kw['startpage_int'] = start
                if end is not None:
                    kw['endpage_int'] = end
                if number is not None and 'pages_int' not in kw:
                    kw['pages_int'] = number

            if update:
                # apply field-wise diff against the existing row
                for k in kw.keys():
                    if k == 'pk':
                        continue
                    v = getattr(ref, k)
                    if kw[k] != v:
                        if k == 'jsondata':
                            # merge into existing jsondata, then purge keys
                            # explicitly dropped by FIELD_MAP.
                            # NOTE(review): this branch does not set
                            # ``changed`` — jsondata-only diffs are not
                            # counted; confirm this is intended.
                            d = ref.jsondata or {}
                            d.update(**kw[k])
                            for s, t in FIELD_MAP.items():
                                if t is None and s in d:
                                    del d[s]
                            ref.jsondata = d
                        else:
                            print k, '--', v
                            print k, '++', kw[k]
                            setattr(ref, k, kw[k])
                            changed = True
                            # record old/new values for the changes report
                            if ref.id in changes:
                                changes[ref.id][k] = ('%s' % v, '%s' % kw[k])
                            else:
                                changes[ref.id] = {k: ('%s' % v, '%s' % kw[k])}
            else:
                changed = True
                ref = Ref(name='%s %s' % (kw.get('author', 'na'), kw.get('year', 'nd')), **kw)

            # derived display fields
            ref.description = ref.title or ref.booktitle
            ref.name = '%s %s' % (ref.author or 'n.a.', ref.year or 'n.d.')

            def append(attr, obj):
                # append obj to the relation if missing; True signals a change
                if obj and obj not in attr:
                    attr.append(obj)
                    return True

            # sync many-to-many relations from the jsondata fields
            a, r = update_relationship(
                ref.macroareas,
                [macroarea_map[name] for name in
                 set(filter(None, [s.strip() for s in kw['jsondata'].get('macro_area', '').split(',')]))])
            changed = changed or a or r

            for name in set(filter(None, [s.strip() for s in kw['jsondata'].get('src', '').split(',')])):
                result = append(ref.providers, provider_map[slug(name)])
                changed = changed or result

            a, r = update_relationship(
                ref.doctypes,
                [doctype_map[m.group('name')] for m in
                 DOCTYPE_PATTERN.finditer(kw['jsondata'].get('hhtype', ''))])
            changed = changed or a or r

            if not update:
                DBSession.add(ref)

            if changed:
                count += 1
                # denormalized string versions of the relations
                ref.doctypes_str = ', '.join(o.id for o in ref.doctypes)
                ref.providers_str = ', '.join(o.id for o in ref.providers)

        print count, 'records updated or imported'
        print skipped, 'records skipped because of lack of information'

        # backfill description from title/booktitle where still missing
        DBSession.execute("update source set description = title where description is null and title is not null;")
        DBSession.execute("update source set description = booktitle where description is null and booktitle is not null;")

        # repair rows where the computed page count came out negative
        for row in list(DBSession.execute(
                "select pk, pages, pages_int, startpage_int from source where pages_int < 0")):
            pk, pages, number, start = row
            _start, _end, _number = compute_pages(pages)
            if _number > 0 and _number != number:
                DBSession.execute(
                    "update source set pages_int = %s, startpage_int = %s where pk = %s",
                    (_number, _start, pk))
                DBSession.execute(
                    "update ref set endpage_int = %s where pk = %s",
                    (_end, pk))

    return changes
def split_families(fp):
    """generator for (node, leafs) pairs parsed from Harald's classification format.

    ``fp`` is an open file-like object: family headers start at column 0,
    leaf (language) lines are indented as ``Name [code]``.  Yields
    ``[normalized_branch(header), {code: name, ...}]`` per family.
    """
    def normalized_branch(line):
        """parse a line specifying a language family as comma separated list of ancestors.

        Returns a (branch, status, comment) triple.
        """
        branch = [
            unescape(n.strip().replace('_', ' ')) for n in line.split(',')
        ]
        # canonical names for top-level pseudo-families
        name_map = {
            'Deaf Sign Language': 'Sign Languages',
            'Unclassifiable': 'Unclassified',
            'Artificial Language': 'Artificial Language',
            'Mixed Language': 'Mixed Language',
            'Pidgin': 'Pidgin',
            #'Unattested': 'Unattested',
        }
        if branch[0] in name_map:
            # collapse to the canonical name; remaining ancestors become
            # the comment string
            return ([
                name_map[branch[0]]
            ], 'established' if branch[0] != 'Unattested' else 'unattested',
                ', '.join(branch[1:]))
        if branch[0] in ['Spurious', 'Speech Register', 'Unattested']:
            comment = ''
            if branch[0] == 'Speech Register':
                status = 'established'
                # NOTE(review): comment is never returned (both returns
                # below hardcode '') — likely a latent bug.
                comment = 'speech register'
            else:
                status = branch[0].lower()
            if branch[0] == 'Unattested' and len(branch) == 1:
                # unattested languages without classification should not be treated as
                # isolates!
                branch[0] = 'Unclassified'
            else:
                # drop the pseudo-family marker and a possible 'Retired' level
                branch = branch[1:]
                if branch and branch[0] in ['Retired']:
                    status += ' retired'
                    branch = branch[1:]
            return branch, status, ''
        return branch, 'established', ''

    family = None
    for line in fp.read().split('\n'):
        if not line.strip():
            continue
        if line.startswith(' '):
            # leaf line "Name [code]" — attach to the current family.
            # NOTE(review): fails with TypeError if a leaf precedes a header.
            name, code = line.strip().split('[')
            code = code.split(']')[0].replace('\\', '').replace('"', '').replace("'", '')
            code = code.replace('NOCODE-', 'NOCODE_')
            assert code
            # either a 3-letter ISO 639-3 code or a NOCODE_* placeholder
            assert len(code) == 3 or NOCODE_PATTERN.match(code)
            family[1][code] = unescape(name.strip().replace('_', ' '))
        else:
            # new family header; flush the previous family first
            if family:
                yield family
            family = [normalized_branch(line), {}]
    # flush the final family (NOTE(review): yields None on empty input)
    yield family
def main(bib, mode):  # pragma: no cover
    """Import/update bibliographical records from a BibTeX database.

    :param bib: iterable of BibTeX records (each with ``.keys()``, ``.get()``,
        ``.genre`` and ``.id``).
    :param mode: 'update' re-examines known records; any other value skips
        records whose ids already exist in the DB.
    """
    count = 0
    skipped = 0

    with transaction.manager:
        provider_map = get_map(Provider)
        macroarea_map = get_map(Macroarea)
        doctype_map = get_map(Doctype)

        known_ids = set(r[0] for r in DBSession.query(Ref.pk))

        # languoids looked up by harmonized id (hid) as well as by id
        languoid_map = {}
        for l in DBSession.query(Languoid):
            if l.hid:
                languoid_map[l.hid] = l
            languoid_map[l.id] = l

        for i, rec in enumerate(bib):
            if len(rec.keys()) < 6:
                # not enough information to be worth importing
                skipped += 1
                #print '---> skip', rec.id
                #print rec
                continue

            changed = False
            assert rec.get('glottolog_ref_id')
            id_ = int(rec.get('glottolog_ref_id'))
            if mode != 'update' and id_ in known_ids:
                continue

            ref = DBSession.query(Source).get(id_)
            update = True if ref else False

            kw = {
                'pk': id_,
                'bibtex_type': getattr(EntryType, rec.genre),
                'id': str(id_),
                'jsondata': {'bibtexkey': rec.id},
            }

            # map bibtex fields to model attributes (target truthy) or
            # stash them in jsondata (target falsy)
            for source, target in FIELD_MAP.items():
                value = rec.get(source)
                if value:
                    value = unescape(value)
                    if target:
                        kw[target] = CONVERTER.get(source, lambda x: x)(value)
                    else:
                        kw['jsondata'][source] = value

            # try to extract numeric year, startpage, endpage, numberofpages, ...
            if rec.get('numberofpages'):
                try:
                    kw['pages_int'] = int(rec.get('numberofpages').strip())
                except ValueError:
                    pass

            if kw.get('year'):
                match = YEAR_PATTERN.search(kw.get('year'))
                if match:
                    kw['year_int'] = int(match.group('year'))

            # split "Address: Publisher" into the two separate fields
            if kw.get('publisher'):
                p = kw.get('publisher')
                if ':' in p:
                    address, publisher = [
                        s.strip() for s in kw['publisher'].split(':', 1)
                    ]
                    if not 'address' in kw or kw['address'] == address:
                        kw['address'], kw['publisher'] = address, publisher

            if kw.get('pages'):
                pages = kw.get('pages')
                # "xii+345"-style page specs: roman prefix/suffix plus arabic
                match = ROMANPAGESPATTERNra.search(pages)
                if not match:
                    match = ROMANPAGESPATTERNar.search(pages)
                if match:
                    if 'pages_int' not in kw:
                        kw['pages_int'] = roman_to_int(match.group('roman')) \
                            + int(match.group('arabic'))
                else:
                    # sum up all "start-end" ranges found in the page spec
                    start = None
                    number = None
                    match = None
                    for match in PAGES_PATTERN.finditer(pages):
                        if start is None:
                            start = int(match.group('start'))
                        number = (number or 0) \
                            + (int(match.group('end')) - int(match.group('start')) + 1)
                    if match:
                        # last range determines the end page
                        kw['endpage_int'] = int(match.group('end'))
                        kw['startpage_int'] = start
                        kw.setdefault('pages_int', number)
                    else:
                        # no range: maybe a single plain page number
                        try:
                            kw['startpage_int'] = int(pages)
                        except ValueError:
                            pass

            if update:
                for k in kw.keys():
                    if k == 'pk':
                        continue
                    #if k == 'title':
                    #    v = ref.title or ref.description
                    #else:
                    if 1:
                        v = getattr(ref, k)
                    if kw[k] != v:
                        #
                        # TODO!
                        #
                        setattr(ref, k, kw[k])
                        #if k not in ['jsondata', 'publisher']:
                        #    print k, ref.pk
                        #    print kw[k]
                        #    print v
                        #    print '--------------'
                        changed = True
                if ref.title:
                    ref.description = ref.title
            else:
                changed = True
                ref = Ref(**kw)

            def append(attr, obj):
                # NOTE(review): ``changed = True`` here rebinds a variable
                # local to this closure (Python 2 has no nonlocal) — the
                # enclosing ``changed`` flag is NOT updated by append().
                if obj and obj not in attr:
                    changed = True
                    #
                    # TODO!
                    #
                    attr.append(obj)

            # sync relations derived from jsondata fields
            for name in set(
                    filter(None, [
                        s.strip() for s in kw['jsondata'].get(
                            'macro_area', '').split(',')
                    ])):
                append(ref.macroareas, macroarea_map[name])

            for name in set(
                    filter(None, [
                        s.strip()
                        for s in kw['jsondata'].get('src', '').split(',')
                    ])):
                append(ref.providers, provider_map[slug(name)])

            for m in DOCTYPE_PATTERN.finditer(kw['jsondata'].get('hhtype', '')):
                append(ref.doctypes, doctype_map[m.group('name')])

            # a bare 3-letter lgcode is wrapped in brackets so the regex
            # below can find it
            if len(kw['jsondata'].get('lgcode', '')) == 3:
                kw['jsondata']['lgcode'] = '[%s]' % kw['jsondata']['lgcode']

            for m in CODE_PATTERN.finditer(kw['jsondata'].get('lgcode', '')):
                for code in set(m.group('code').split(',')):
                    if code not in languoid_map:
                        # two codes are known-unresolvable and not reported
                        if code not in ['NOCODE_Payagua', 'emx']:
                            print '--> unknown code:', code.encode('utf8')
                    else:
                        append(ref.languages, languoid_map[code])

            for glottocode in filter(
                    None, kw['jsondata'].get('alnumcodes', '').split(';')):
                if glottocode not in languoid_map:
                    print '--> unknown glottocode:', glottocode.encode('utf8')
                else:
                    append(ref.languages, languoid_map[glottocode])

            if not update:
                #pass
                #
                # TODO!
                #
                DBSession.add(ref)

            if i % 100 == 0:
                print i, 'records done'

            if changed:
                count += 1

    print count, 'records updated or imported'
    print skipped, 'records skipped because of lack of information'
def main(args):  # pragma: no cover
    """Import/update bibliographical records and dump a changes report.

    Reads BibTeX records via ``get_bib(args)``, creates new ``Ref`` rows or
    updates existing ``Source`` rows, syncs the macroarea/provider/doctype
    relations, backfills description/page columns via raw SQL and writes
    the per-field changes to ``references/changes.json``.

    :param args: CLI args object with ``data_dir``, ``log`` etc.
        (project convention).
    """
    stats = Counter(new=0, updated=0, skipped=0)
    changes = {}

    with transaction.manager:
        update_providers(args)
        DBSession.flush()
        provider_map = get_map(Provider)
        macroarea_map = get_map(Macroarea)
        doctype_map = get_map(Doctype)

        # languoids looked up by harmonized id (hid) as well as by id
        languoid_map = {}
        for l in DBSession.query(Languoid):
            if l.hid:
                languoid_map[l.hid] = l
            languoid_map[l.id] = l

        for i, rec in enumerate(get_bib(args)):
            if i and i % 1000 == 0:
                print i, 'records done', stats['updated'] + stats['new'], 'changed'

            if len(rec.keys()) < 6:
                # not enough information!
                stats.update(['skipped'])
                continue

            changed = False
            assert rec.get('glottolog_ref_id')
            id_ = int(rec.get('glottolog_ref_id'))

            ref = DBSession.query(Source).get(id_)
            update = True if ref else False

            kw = {
                'pk': id_,
                'bibtex_type': rec.genre,
                'id': str(id_),
                'jsondata': {'bibtexkey': rec.id},
            }

            # map bibtex fields to model attributes (target truthy) or
            # stash them in jsondata (target falsy); None means "drop"
            for source, target in FIELD_MAP.items():
                if target is None:
                    continue
                value = rec.get(source)
                if value:
                    value = unescape(value)
                    if target:
                        kw[target] = CONVERTER.get(source, lambda x: x)(value)
                    else:
                        kw['jsondata'][source] = value

            # a computerized-assignment trigger may be embedded in hhtype
            if kw['jsondata'].get('hhtype'):
                trigger = ca_trigger(kw['jsondata']['hhtype'])
                if trigger:
                    kw['ca_doctype_trigger'], kw['jsondata']['hhtype'] = trigger

            # try to extract numeric year, startpage, endpage, numberofpages, ...
            if kw.get('year'):
                # prefer years in brackets over the first 4-digit number.
                match = PREF_YEAR_PATTERN.search(kw.get('year'))
                if match:
                    kw['year_int'] = int(match.group('year'))
                else:
                    match = YEAR_PATTERN.search(kw.get('year'))
                    if match:
                        kw['year_int'] = int(match.group('year'))

            # split "Address: Publisher" into the two separate fields
            if kw.get('publisher'):
                p = kw.get('publisher')
                if ':' in p:
                    address, publisher = [s.strip() for s in kw['publisher'].split(':', 1)]
                    if 'address' not in kw or kw['address'] == address:
                        kw['address'], kw['publisher'] = address, publisher

            if rec.get('numberofpages'):
                try:
                    kw['pages_int'] = int(rec.get('numberofpages').strip())
                except ValueError:
                    pass

            if kw.get('pages'):
                start, end, number = compute_pages(kw['pages'])
                if start is not None:
                    kw['startpage_int'] = start
                if end is not None:
                    kw['endpage_int'] = end
                if number is not None and 'pages_int' not in kw:
                    kw['pages_int'] = number

            # normalize: strip string values, mapping empty strings to None
            for k in kw.keys():
                v = kw[k]
                if isinstance(v, basestring):
                    v = v.strip() or None
                kw[k] = v

            if update:
                # apply field-wise diff against the existing row
                for k in kw.keys():
                    if k == 'pk':
                        continue
                    v = getattr(ref, k)
                    if kw[k] != v:
                        if k == 'jsondata':
                            # keep only non-ref jsondata keys, then merge.
                            # NOTE(review): this branch does not set
                            # ``changed``; jsondata-only diffs are not
                            # counted as updates — confirm intended.
                            d = {k: v for k, v in ref.jsondata.items()
                                 if k in NONREF_JSONDATA}
                            d.update(**kw[k])
                            ref.jsondata = d
                        else:
                            #print k, '--', v
                            #print k, '++', kw[k]
                            setattr(ref, k, kw[k])
                            changed = True
                            # record old/new for the changes report
                            if ref.id in changes:
                                changes[ref.id][k] = ('%s' % v, '%s' % kw[k])
                            else:
                                changes[ref.id] = {k: ('%s' % v, '%s' % kw[k])}
            else:
                changed = True
                ref = Ref(name='%s %s' % (kw.get('author', 'na'), kw.get('year', 'nd')), **kw)

            # derived display fields
            ref.description = ref.title or ref.booktitle
            originator = ref.author or ref.editor or 'Anonymous'
            ref.name = '%s %s' % (originator, ref.year or 'n.d.')

            # sync many-to-many relations from the jsondata fields
            a, r = update_relationship(
                ref.macroareas,
                [macroarea_map[name] for name in
                 set(filter(None, [s.strip() for s in kw['jsondata'].get('macro_area', '').split(',')]))])
            changed = changed or a or r

            # providers are replaced wholesale when the set differs
            src = [s.strip() for s in kw['jsondata'].get('src', '').split(',')]
            prv = {provider_map[slug(s)] for s in src if s}
            if set(ref.providers) != prv:
                ref.providers = list(prv)
                changed = True

            a, r = update_relationship(
                ref.doctypes,
                [doctype_map[m.group('name')] for m in
                 DOCTYPE_PATTERN.finditer(kw['jsondata'].get('hhtype', ''))])
            changed = changed or a or r

            if not update:
                stats.update(['new'])
                DBSession.add(ref)
            elif changed:
                stats.update(['updated'])

        args.log.info('%s' % stats)

        # backfill description from title/booktitle where still missing
        DBSession.execute("update source set description = title where description is null and title is not null;")
        DBSession.execute("update source set description = booktitle where description is null and booktitle is not null;")

        # repair rows where the computed page count came out negative.
        # NOTE(review): these statements interpolate values with %, unlike
        # the parameterized execute() used elsewhere; values are ints from
        # the DB, but parameterization would be safer.
        for row in list(DBSession.execute(
                "select pk, pages, pages_int, startpage_int from source where pages_int < 0")):
            pk, pages, number, start = row
            _start, _end, _number = compute_pages(pages)
            if _number > 0 and _number != number:
                DBSession.execute(
                    "update source set pages_int = %s, startpage_int = %s where pk = %s" % (_number, _start, pk))
                DBSession.execute(
                    "update ref set endpage_int = %s where pk = %s" % (_end, pk))

    # persist the per-field change log for later inspection
    jsondump(changes, args.data_dir.joinpath('references', 'changes.json'))
def split_families(fp): """generator for (node, leafs) pairs parsed from Harald's classification format. """ def normalized_branch(line): """parse a line specifying a language family as comma separated list of ancestors. """ name_map = { "Unattested", # keep top-level family as subfamily "Unclassifiable", # keep top-level family as subfamily "Pidgin", # keep top-level family as subfamily "Mixed Language", # keep top-level family as subfamily "Artificial Language", # keep top-level family as subfamily "Speech Register", # keep top-level family as subfamily # FIXME: also 'Sign Language'? "Spurious", # bookkeeping 'Preliminary' } branch = [unescape(n.strip().replace("_", " ")) for n in line.split(",")] if branch[0] not in name_map: return branch, "established" family = branch.pop(0) subfamily = None retired = False if branch: # there's a second level! if family == "Spurious": if branch[0] == "Retired": retired = True branch.pop(0) else: subfamily = "%s (%s)" % (branch.pop(0), family) status = "established" if family in ["Spurious", "Unattested"]: status = family.lower() if retired: status += " retired" if family == "Spurious": family = BOOKKEEPING return nfilter([family, subfamily]), status family = None for line in fp.read().split("\n"): if not line.strip(): continue if line.strip().endswith("TODO"): print "ignoring:", line continue if line.startswith(" "): name, code = line.strip().split("[") code = code.split("]")[0].replace("\\", "").replace('"', "").replace("'", "") code = code.replace("NOCODE-", "NOCODE_") try: assert len(code) == 3 or NOCODE_PATTERN.match(code) except: raise ValueError(code) family[1][code] = unescape(name.strip().replace("_", " ")) else: if family: yield family family = [normalized_branch(line), {}] yield family
def split_families(fp): """generator for (node, leafs) pairs parsed from Harald's classification format. """ def normalized_branch(line): """parse a line specifying a language family as comma separated list of ancestors. """ name_map = { 'Unattested', # keep top-level family as subfamily 'Unclassifiable', # keep top-level family as subfamily 'Pidgin', # keep top-level family as subfamily 'Mixed Language', # keep top-level family as subfamily 'Artificial Language', # keep top-level family as subfamily 'Speech Register', # keep top-level family as subfamily # FIXME: also 'Sign Language'? 'Spurious', # bookkeeping 'Preliminary' } branch = [ unescape(n.strip().replace('_', ' ')) for n in line.split(',') ] if branch[0] not in name_map: return branch, 'established' family = branch.pop(0) subfamily = None retired = False if branch: # there's a second level! if family == 'Spurious': if branch[0] == 'Retired': retired = True branch.pop(0) else: subfamily = '%s (%s)' % (branch.pop(0), family) status = 'established' if family in ['Spurious', 'Unattested']: status = family.lower() if retired: status += ' retired' if family == 'Spurious': family = BOOKKEEPING return nfilter([family, subfamily]), status family = None for line in fp.read().split('\n'): if not line.strip(): continue if line.strip().endswith('TODO'): print 'ignoring:', line continue if line.startswith(' '): name, code = line.strip().split('[') code = code.split(']')[0].replace('\\', '').replace('"', '').replace("'", '') code = code.replace('NOCODE-', 'NOCODE_') try: assert len(code) == 3 or NOCODE_PATTERN.match(code) except: raise ValueError(code) family[1][code] = unescape(name.strip().replace('_', ' ')) else: if family: yield family family = [normalized_branch(line), {}] yield family
def main(bib, mode):  # pragma: no cover
    """Import/update bibliographical records from a BibTeX database.

    :param bib: iterable of BibTeX records (each with ``.keys()``, ``.get()``,
        ``.genre`` and ``.id``).
    :param mode: 'update' re-examines known records; any other value skips
        records whose ids already exist in the DB.
    """
    count = 0
    skipped = 0

    with transaction.manager:
        provider_map = get_map(Provider)
        macroarea_map = get_map(Macroarea)
        doctype_map = get_map(Doctype)

        known_ids = set(r[0] for r in DBSession.query(Ref.pk))

        # languoids looked up by harmonized id (hid) as well as by id
        languoid_map = {}
        for l in DBSession.query(Languoid):
            if l.hid:
                languoid_map[l.hid] = l
            languoid_map[l.id] = l

        for i, rec in enumerate(bib):
            if len(rec.keys()) < 6:
                # not enough information to be worth importing
                skipped += 1
                #print '---> skip', rec.id
                #print rec
                continue

            changed = False
            assert rec.get('glottolog_ref_id')
            id_ = int(rec.get('glottolog_ref_id'))
            if mode != 'update' and id_ in known_ids:
                continue

            ref = DBSession.query(Source).get(id_)
            update = True if ref else False

            kw = {
                'pk': id_,
                'bibtex_type': getattr(EntryType, rec.genre),
                'id': str(id_),
                'jsondata': {'bibtexkey': rec.id},
            }

            # map bibtex fields to model attributes (target truthy) or
            # stash them in jsondata (target falsy)
            for source, target in FIELD_MAP.items():
                value = rec.get(source)
                if value:
                    value = unescape(value)
                    if target:
                        kw[target] = CONVERTER.get(source, lambda x: x)(value)
                    else:
                        kw['jsondata'][source] = value

            # try to extract numeric year, startpage, endpage, numberofpages, ...
            if rec.get('numberofpages'):
                try:
                    kw['pages_int'] = int(rec.get('numberofpages').strip())
                except ValueError:
                    pass

            if kw.get('year'):
                match = YEAR_PATTERN.search(kw.get('year'))
                if match:
                    kw['year_int'] = int(match.group('year'))

            # split "Address: Publisher" into the two separate fields
            if kw.get('publisher'):
                p = kw.get('publisher')
                if ':' in p:
                    address, publisher = [s.strip() for s in kw['publisher'].split(':', 1)]
                    if not 'address' in kw or kw['address'] == address:
                        kw['address'], kw['publisher'] = address, publisher

            if kw.get('pages'):
                pages = kw.get('pages')
                # "xii+345"-style page specs: roman plus arabic components
                match = ROMANPAGESPATTERNra.search(pages)
                if not match:
                    match = ROMANPAGESPATTERNar.search(pages)
                if match:
                    if 'pages_int' not in kw:
                        kw['pages_int'] = roman_to_int(match.group('roman')) \
                            + int(match.group('arabic'))
                else:
                    # sum up all "start-end" ranges found in the page spec
                    start = None
                    number = None
                    match = None
                    for match in PAGES_PATTERN.finditer(pages):
                        if start is None:
                            start = int(match.group('start'))
                        number = (number or 0) \
                            + (int(match.group('end')) - int(match.group('start')) + 1)
                    if match:
                        # last range determines the end page
                        kw['endpage_int'] = int(match.group('end'))
                        kw['startpage_int'] = start
                        kw.setdefault('pages_int', number)
                    else:
                        # no range: maybe a single plain page number
                        try:
                            kw['startpage_int'] = int(pages)
                        except ValueError:
                            pass

            if update:
                for k in kw.keys():
                    if k == 'pk':
                        continue
                    #if k == 'title':
                    #    v = ref.title or ref.description
                    #else:
                    if 1:
                        v = getattr(ref, k)
                    if kw[k] != v:
                        #
                        # TODO!
                        #
                        setattr(ref, k, kw[k])
                        #if k not in ['jsondata', 'publisher']:
                        #    print k, ref.pk
                        #    print kw[k]
                        #    print v
                        #    print '--------------'
                        changed = True
                if ref.title:
                    ref.description = ref.title
            else:
                changed = True
                ref = Ref(**kw)

            def append(attr, obj):
                # NOTE(review): ``changed = True`` here rebinds a variable
                # local to this closure (Python 2 has no nonlocal) — the
                # enclosing ``changed`` flag is NOT updated by append().
                if obj and obj not in attr:
                    changed = True
                    #
                    # TODO!
                    #
                    attr.append(obj)

            # sync relations derived from jsondata fields
            for name in set(filter(None, [s.strip() for s in kw['jsondata'].get('macro_area', '').split(',')])):
                append(ref.macroareas, macroarea_map[name])

            for name in set(filter(None, [s.strip() for s in kw['jsondata'].get('src', '').split(',')])):
                append(ref.providers, provider_map[slug(name)])

            for m in DOCTYPE_PATTERN.finditer(kw['jsondata'].get('hhtype', '')):
                append(ref.doctypes, doctype_map[m.group('name')])

            # a bare 3-letter lgcode is wrapped in brackets so the regex
            # below can find it
            if len(kw['jsondata'].get('lgcode', '')) == 3:
                kw['jsondata']['lgcode'] = '[%s]' % kw['jsondata']['lgcode']

            for m in CODE_PATTERN.finditer(kw['jsondata'].get('lgcode', '')):
                for code in set(m.group('code').split(',')):
                    if code not in languoid_map:
                        # two codes are known-unresolvable and not reported
                        if code not in ['NOCODE_Payagua', 'emx']:
                            print '--> unknown code:', code.encode('utf8')
                    else:
                        append(ref.languages, languoid_map[code])

            for glottocode in filter(None, kw['jsondata'].get('alnumcodes', '').split(';')):
                if glottocode not in languoid_map:
                    print '--> unknown glottocode:', glottocode.encode('utf8')
                else:
                    append(ref.languages, languoid_map[glottocode])

            if not update:
                #pass
                #
                # TODO!
                #
                DBSession.add(ref)

            if i % 100 == 0:
                print i, 'records done'

            if changed:
                count += 1

    print count, 'records updated or imported'
    print skipped, 'records skipped because of lack of information'