def main(args):
    """Replace matched obsolete refs by their successors and delete them.

    Reads the mapping stored in ``obsolete_refs_matched.json`` in the data
    directory of ``args.version``; a missing file is treated as an empty
    mapping.  For every entry with a non-empty replacement whose ref can still
    be looked up, the replacement is registered through
    ``Config.add_replacement`` and the ref is deleted from ``DBSession``.
    All of this happens inside ``transaction.manager``.
    """
    # Disabled step, kept for reference:
    #get_obsolete_refs(args)
    with transaction.manager:
        # Disabled step, kept for reference:
        #match_obsolete_refs(args)

        #
        # TODO:
        # - create bibtex file containing all refs to be removed!
        # -
        #
        mapping_path = args.data_file(args.version, 'obsolete_refs_matched.json')
        replacements = {}
        if mapping_path.exists():
            with open(mapping_path) as fp:
                replacements = json.load(fp)

        for ref_id, replacement in replacements.items():
            # Entries without a replacement record refs for which no match was found.
            if replacement:
                ref = Ref.get(ref_id, default=None)
                if ref is not None:
                    Config.add_replacement(
                        ref, replacement, session=DBSession, model=Source)
                    DBSession.delete(ref)
def match_obsolete_refs(args): with open(args.data_file(args.version, 'obsolete_refs.json')) as fp: refs = json.load(fp) matched = args.data_file(args.version, 'obsolete_refs_matched.json') if matched.exists(): with open(matched) as fp: matched = json.load(fp) else: matched = {} # # TODO: optionally re-evaluate known-unmatched refs! # count = 0 f, m = 0, 0 for id_ in refs: if id_ in matched: continue count += 1 if count > 1000: print '1000 obsolete refs processed!' break ref = Ref.get(id_) found = False if ref.description and len(ref.description) > 5: for match in DBSession.query(Ref)\ .filter(not_(Source.id.in_(refs)))\ .filter(Source.description.contains(ref.description))\ .filter(or_(Source.author == ref.author, Source.year == ref.year))\ .limit(10): print '++', ref.id, '->', match.id, '++', ref.author, '->', match.author, '++', ref.year, '->', match.year matched[ref.id] = match.id found = True break if not found and ref.name and len(ref.name) > 5: for match in DBSession.query(Ref)\ .filter(not_(Source.id.in_(refs)))\ .filter(Source.name == ref.name)\ .limit(10): try: if match.description and ref.description and slug(match.description) == slug(ref.description): print '++', ref.id, '->', match.id, '++', ref.description, '->', match.description matched[ref.id] = match.id found = True break except AssertionError: continue if not found: m += 1 print '--', ref.id, ref.name, ref.description matched[ref.id] = None else: f += 1 print f, 'found' print m, 'missed' with open(args.data_file(args.version, 'obsolete_refs_matched.json'), 'w') as fp: json.dump(matched, fp)
def update(args): author = 'ISO 639-3 Registration Authority' pid = 'iso6393' dtid = 'overview' dt = Doctype.get(dtid) provider = Provider.get(pid, default=None) if provider is None: provider = Provider( id=pid, abbr=pid, name=author, description="Change requests submitted to the ISO 639-3 registration authority.") iid = max(int(DBSession.execute( "select max(cast(id as integer)) from source").fetchone()[0]), 500000) pk = int(DBSession.execute("select max(pk) from source").fetchone()[0]) for crno, affected in args.json['changerequests'].items(): year, serial = crno.split('-') title = 'Change Request Number %s' % crno ref = Ref.get(title, key='title', default=None) if not ref: iid += 1 pk += 1 ref = Ref( pk=pk, id=str(iid), name='%s %s' % (author, year), bibtex_type=EntryType.misc, number=crno, description=title, year=year, year_int=int(year), title=title, author=author, address='Dallas', publisher='SIL International', url='http://www.sil.org/iso639-3/cr_files/%s.pdf' % crno, doctypes_str=dtid, providers_str=pid, language_note=', '.join('%(Language Name)s [%(Affected Identifier)s]' % spec for spec in affected), jsondata=dict(hhtype=dtid, src=pid)) ref.doctypes.append(dt) ref.providers.append(provider) for spec in affected: lang = Languoid.get(spec['Affected Identifier'], key='hid', default=None) if lang and lang not in ref.languages: ref.languages.append(lang) DBSession.add(ref) transaction.commit() transaction.begin() matched = 0 near = 0 max_identifier_pk = DBSession.query( Identifier.pk).order_by(desc(Identifier.pk)).first()[0] families = [] for family in DBSession.query(Languoid)\ .filter(Languoid.level == LanguoidLevel.family)\ .filter(Language.active == True)\ .all(): isoleafs = set() for row in DBSession.query(TreeClosureTable.child_pk, Languoid.hid)\ .filter(family.pk == TreeClosureTable.parent_pk)\ .filter(Languoid.pk == TreeClosureTable.child_pk)\ .filter(Languoid.hid != None)\ .filter(Languoid.level == LanguoidLevel.language)\ .filter(Languoid.status == 
LanguoidStatus.established)\ .all(): if len(row[1]) == 3: isoleafs.add(row[1]) families.append((family, isoleafs)) families = sorted(families, key=lambda p: len(p[1])) for mid, leafs in args.json['macrolanguages'].items(): leafs = set(leafs) found = False for family, isoleafs in families: if leafs == isoleafs: if mid not in [c.name for c in family.identifiers if c.type == IdentifierType.iso.value]: family.codes.append(Identifier( id=str(max_identifier_pk + 1), name=mid, type=IdentifierType.iso.value)) max_identifier_pk += 1 matched += 1 found = True break elif leafs.issubset(isoleafs): print '~~~', family.name, '-->', mid, 'distance:', len(leafs), len(isoleafs) near += 1 found = True break if not found: print '---', mid, leafs print 'matched', matched, 'of', len(args.json['macrolanguages']), 'macrolangs' print near
def update(args): author = 'ISO 639-3 Registration Authority' pid = 'iso6393' dtid = 'overview' dt = Doctype.get(dtid) provider = Provider.get(pid, default=None) if provider is None: provider = Provider( id=pid, abbr=pid, name=author, description= "Change requests submitted to the ISO 639-3 registration authority." ) iid = max( int( DBSession.execute( "select max(cast(id as integer)) from source").fetchone()[0]), 500000) pk = int(DBSession.execute("select max(pk) from source").fetchone()[0]) for crno, affected in args.json['changerequests'].items(): year, serial = crno.split('-') title = 'Change Request Number %s' % crno ref = Ref.get(title, key='title', default=None) if not ref: iid += 1 pk += 1 ref = Ref(pk=pk, id=str(iid), name='%s %s' % (author, year), bibtex_type=EntryType.misc, number=crno, description=title, year=year, year_int=int(year), title=title, author=author, address='Dallas', publisher='SIL International', url='http://www.sil.org/iso639-3/cr_files/%s.pdf' % crno, language_note=', '.join( '%(Language Name)s [%(Affected Identifier)s]' % spec for spec in affected), jsondata=dict(hhtype=dtid, src=pid)) ref.doctypes.append(dt) ref.providers.append(provider) for spec in affected: lang = Languoid.get(spec['Affected Identifier'], key='hid', default=None) if lang and lang not in ref.languages: ref.languages.append(lang) DBSession.add(ref) transaction.commit() transaction.begin() matched = 0 near = 0 max_identifier_pk = DBSession.query(Identifier.pk).order_by( desc(Identifier.pk)).first()[0] families = [] for family in DBSession.query(Languoid)\ .filter(Languoid.level == LanguoidLevel.family)\ .filter(Language.active == True)\ .all(): isoleafs = set() for row in DBSession.query(TreeClosureTable.child_pk, Languoid.hid)\ .filter(family.pk == TreeClosureTable.parent_pk)\ .filter(Languoid.pk == TreeClosureTable.child_pk)\ .filter(Languoid.hid != None)\ .filter(Languoid.level == LanguoidLevel.language)\ .filter(Languoid.status == LanguoidStatus.established)\ 
.all(): if len(row[1]) == 3: isoleafs.add(row[1]) families.append((family, isoleafs)) families = sorted(families, key=lambda p: len(p[1])) for mid, leafs in args.json['macrolanguages'].items(): leafs = set(leafs) found = False for family, isoleafs in families: if leafs == isoleafs: if mid not in [ c.name for c in family.identifiers if c.type == IdentifierType.iso.value ]: family.codes.append( Identifier(id=str(max_identifier_pk + 1), name=mid, type=IdentifierType.iso.value)) max_identifier_pk += 1 matched += 1 found = True break elif leafs.issubset(isoleafs): print '~~~', family.name, '-->', mid, 'distance:', len( leafs), len(isoleafs) near += 1 found = True break if not found: print '---', mid, leafs print 'matched', matched, 'of', len( args.json['macrolanguages']), 'macrolangs' print near