def patch_wikip_country():
    """Align WIKIP_COUNTRY names with WIKIP_ISO3166 and check their prefixes.

    For every (country, prefix list) entry of WIKIP_COUNTRY:
      * the country name is replaced through COUNTRY_RENAME, accepted as-is
        when it is an ISO3166 country name or a SUBTERR_TO_COUNTRY key, or
        otherwise matched against each ISO3166 record's 'nameset';
      * every prefix missing from WIKIP_MSISDN is reported, together with
        any shorter leading prefix that is present in WIKIP_MSISDN.

    WIKIP_COUNTRY is modified in place; diagnostics go to stdout.
    Reads r['nameset'] on ISO3166 records, so those must already carry it.
    """
    print('[+] patch Wikipedia list of country prefixes: WIKIP_COUNTRY')
    #
    isonameset = set([r['country_name'] for r in WIKIP_ISO3166.values()])
    # WIKIP_ISO3166 is not modified here: sort its records only once
    isorecords = sorted(WIKIP_ISO3166.values(), key=lambda r: r['country_name'])
    #
    # iterate over a sorted snapshot, as WIKIP_COUNTRY keys are renamed in the loop
    for country, preflist in sorted(WIKIP_COUNTRY.items()):
        assert (all([pref.isdigit() for pref in preflist]))
        #
        found = False
        #
        if country in COUNTRY_RENAME:
            newname = COUNTRY_RENAME[country]
            WIKIP_COUNTRY[newname] = preflist
            del WIKIP_COUNTRY[country]
            print('> country name changed from %s to %s' % (country, newname))
            country = newname
            found = True
        #
        elif country in isonameset:
            found = True
        #
        elif country in SUBTERR_TO_COUNTRY:
            found = True
        #
        else:
            for r in isorecords:
                if country_present(country, r['nameset']):
                    newname = r['country_name']
                    WIKIP_COUNTRY[newname] = preflist
                    del WIKIP_COUNTRY[country]
                    print('> country name changed from %s to %s' % (country, newname))
                    country = newname
                    found = True
                    break
        #
        if not found:
            print('> country name %s, prefix %s, not found in WIKIP_ISO3166'\
                  % (country, ', '.join(['+%s' % pref for pref in preflist])))
        #
        for pref in preflist:
            if pref in WIKIP_MSISDN:
                continue
            # The flag must start False: it used to start True, which made
            # the final "not in WIKIP_MSISDN" report unreachable.
            pref_found = False
            # try every shorter leading part of the prefix, longest first
            for i in range(len(pref) - 1, 0, -1):
                if pref[:i] in WIKIP_MSISDN:
                    if i == 1:
                        print('> country %s, prefix +%s not in WIKIP_MSISDN, corresponds to +%s'\
                              % (country, pref, pref[:i]))
                    else:
                        print('> country %s, prefix +%s not in WIKIP_MSISDN, but +%s corresponds to %s'\
                              % (country, pref, pref[:i],
                                 ', '.join(['%s (%s)' % (r[1], r[0]) for r in WIKIP_MSISDN[pref[:i]]])))
                    pref_found = True
            if not pref_found:
                print('> country %s, prefix +%s not in WIKIP_MSISDN' % (country, pref))
def patch_txtn_mnc():
    """Clean CSV_TXTN_MCCMNC and align its country names with WIKIP_ISO3166.

    Entries whose MCC-MNC key is not purely numeric are deleted. For the
    remaining ones, the value is either a single record or a list of
    records; the first item of each record is a country name and is
    replaced by the result of _patch_country_name() when it is not an
    ISO3166 country name and a replacement is found.

    CSV_TXTN_MCCMNC is modified in place; diagnostics go to stdout.
    """
    print('[+] patch txtNation list of MCC-MNC: CSV_TXTN_MCCMNC')
    #
    isonameset = set([r['country_name'] for r in WIKIP_ISO3166.values()])
    #
    # iterate over a sorted snapshot, as entries are deleted / replaced in the loop
    for mccmnc, inf in sorted(CSV_TXTN_MCCMNC.items()):
        if not mccmnc.isdigit():
            print('> deleting entry for %s' % mccmnc)
            del CSV_TXTN_MCCMNC[mccmnc]
            continue
        #
        if isinstance(inf, list):
            # Replace by position: looking the record up with list.index()
            # could hit an earlier, already patched, equal record.
            for i, rec in enumerate(inf):
                if rec[0] not in isonameset:
                    newname = _patch_country_name(rec[0])
                    if newname:
                        inf[i] = (newname, rec[1])
        elif inf[0] not in isonameset:
            newname = _patch_country_name(inf[0])
            if newname:
                CSV_TXTN_MCCMNC[mccmnc] = (newname, inf[1])
def patch_egal_min_dist():
    """Rename countries in CSV_EGAL_MIN_DIST and report inconsistencies.

    Destination names, then source names, are replaced according to
    COUNTRY_RENAME. Afterwards every destination missing from the source
    keys is reported, as is every source that is not an ISO3166 country name.
    """
    print('[+] patch Egallic country distance dataset: CSV_EGAL_MIN_DIST')
    #
    known_names = set([rec['country_name'] for rec in WIKIP_ISO3166.values()])
    #
    # 1) rename some countries
    # destinations first...
    for _, distances in sorted(CSV_EGAL_MIN_DIST.items()):
        for dest in sorted(distances):
            if dest not in COUNTRY_RENAME:
                continue
            distances[COUNTRY_RENAME[dest]] = distances[dest]
            del distances[dest]
    #
    # ... then sources (sorted snapshot, as keys are replaced in the loop)
    for origin, _ in sorted(CSV_EGAL_MIN_DIST.items()):
        if origin not in COUNTRY_RENAME:
            continue
        CSV_EGAL_MIN_DIST[COUNTRY_RENAME[origin]] = CSV_EGAL_MIN_DIST[origin]
        del CSV_EGAL_MIN_DIST[origin]
    #
    # 2) do some verifications
    for origin, distances in sorted(CSV_EGAL_MIN_DIST.items()):
        for dest in distances:
            if dest not in CSV_EGAL_MIN_DIST:
                print('>>> dst country %s in %s, not in src' % (dest, origin))
        if origin in known_names:
            continue
        if origin in SUBTERR_TO_COUNTRY:
            print('> country %s, matching only a sub-territory name' % origin)
        else:
            print('>>> country %s, not matching any territory name' % origin)
def patch_wfb():
    """Normalise the WORLD_FB dataset in place.

    Three passes over sorted snapshots of WORLD_FB:
      1. replace '-' placeholders of the cc2 / cc3 / ccn / gec fields by ''
         and drop the countries listed in WFB_COUNTRY_DEL;
      2. rename countries, and the countries in their border table,
         according to COUNTRY_RENAME;
      3. for names that are not ISO3166 country names, compare CC2 against
         the parent country for sub-territories, or rename the entry after
         the ISO3166 record found through its CC2.
    """
    print('[+] patch the World Factbook dataset: WORLD_FB')
    #
    known_names = set([rec['country_name'] for rec in WIKIP_ISO3166.values()])
    #
    # pass 1: empty placeholder codes, delete unwanted countries
    for name, infos in sorted(WORLD_FB.items()):
        for field in ('cc2', 'cc3', 'ccn', 'gec'):
            if infos[field] == '-':
                infos[field] = ''
        if name in WFB_COUNTRY_DEL:
            del WORLD_FB[name]
            print('> country %s deleted' % name)
    #
    # pass 2: rename countries and their borders
    for name, infos in sorted(WORLD_FB.items()):
        if name in COUNTRY_RENAME:
            renamed = COUNTRY_RENAME[name]
            assert (renamed not in WORLD_FB)
            WORLD_FB[renamed] = infos
            del WORLD_FB[name]
            print('> country name changed from %s to %s' % (name, renamed))
        # patch borders' names too
        try:
            borders = infos['infos']['boundaries']['bord']
        except Exception:
            # no border table available for this entry
            continue
        for neighbour, length in list(borders.items()):
            if neighbour in COUNTRY_RENAME:
                borders[COUNTRY_RENAME[neighbour]] = length
                del borders[neighbour]
    #
    # pass 3: handle names unknown to the ISO3166 dict
    for name, infos in sorted(WORLD_FB.items()):
        if name in known_names:
            continue
        if name in SUBTERR_TO_COUNTRY:
            parent = SUBTERR_TO_COUNTRY[name]
            if infos['cc2'] and infos['cc2'] != COUNTRY_SPEC[parent]['cc2']:
                print('> country %s, exists as sub-territory, CC2 mismatch %s / %s'\
                      % (name, infos['cc2'], COUNTRY_SPEC[parent]['cc2']))
            continue
        # CC2 lookup
        if not infos['cc2']:
            if name not in WFB_UNINHABITED:
                print('> country %s, no CC2, not referenced, unknown' % name)
        elif infos['cc2'] not in WIKIP_ISO3166:
            print('> country %s, CC2 %s, not in WIKIP_ISO3166' % (name, infos['cc2']))
        else:
            renamed = WIKIP_ISO3166[infos['cc2']]['country_name']
            assert (renamed not in WORLD_FB)
            WORLD_FB[renamed] = infos
            del WORLD_FB[name]
            print('> country name changed from %s to %s' % (name, renamed))
def patch_itut_sanc(sanclist):
    """Align the country names of an ITU-T SANC dict with WIKIP_ISO3166.

    sanclist maps a SANC to a country name. Each name that is not an
    ISO3166 country name is replaced, in place, by the non-empty result of
    _patch_country_name().
    """
    print('[+] patch ITU-T list of SANC: %r' % id(sanclist))
    #
    known_names = set([rec['country_name'] for rec in WIKIP_ISO3166.values()])
    # work on a snapshot, as values are replaced during the loop
    snapshot = list(sanclist.items())
    for code, country in snapshot:
        if country in known_names:
            continue
        replacement = _patch_country_name(country)
        if replacement:
            sanclist[code] = replacement
def patch_itut_spc(spclist):
    """Align the country names of an ITU-T SPC dict with WIKIP_ISO3166.

    spclist is keyed by country name. Each key that is not an ISO3166
    country name is replaced, in place, by the non-empty result of
    _patch_country_name(); the associated value is kept.
    """
    print('[+] patch ITU-T list of SPC: %r' % id(spclist))
    #
    known_names = set([rec['country_name'] for rec in WIKIP_ISO3166.values()])
    # work on a snapshot, as keys are replaced during the loop
    snapshot = list(spclist.items())
    for country, codes in snapshot:
        if country in known_names:
            continue
        replacement = _patch_country_name(country)
        if not replacement:
            continue
        del spclist[country]
        spclist[replacement] = codes
def patch_wikip_mnc():
    """Patch the WIKIP_MNC records in place and add MCC-MNC aliases.

    For every record:
      * the country name is replaced according to COUNTRY_RENAME;
      * 'codes_alpha_2' is upper-cased and sorted;
      * when a single CC2 is set and the country name is unknown to the
        ISO3166 name variants, the name is taken from the ISO3166 record;
      * unknown CC2, and missing CC2 for an MCC outside MCC_INTL, are reported.
    Then a copy of each record listed in MNC_ALIAS is appended under its
    alias MCC / MNC. Diagnostics go to stdout.
    """
    print('[+] patch Wikipedia list of MCC-MNC: WIKIP_MNC')
    #
    nameset = set()
    for r in WIKIP_ISO3166.values():
        nameset.update(country_name_canon(r['country_name']))
    # COUNTRY_RENAME is not modified here: sort it only once
    renames = sorted(COUNTRY_RENAME.items())
    #
    # patch country names and ensure country name exists in iso3166 db (or align them)
    # ensure CC2 is upper case and in WIKIP_ISO3166
    # ensure CC2 is available when network is not intl
    logs = set()
    for mcc0 in sorted(WIKIP_MNC):
        for r in WIKIP_MNC[mcc0]:
            for oldname, newname in renames:
                if country_match(r['country_name'], oldname):
                    r['country_name'] = newname
                    print('> MCC %s MNC %s, country name changed from %s to %s'\
                          % (r['mcc'], r['mnc'], oldname, newname))
            #
            cc2s = sorted(map(str.upper, r['codes_alpha_2']))
            r['codes_alpha_2'] = cc2s
            #
            if cc2s:
                # Only look the name up for a known CC2: an unknown one
                # used to raise KeyError here, before the "CC2 unknown"
                # report below could be printed.
                if len(cc2s) == 1 and cc2s[0] in WIKIP_ISO3166 \
                and not country_present(r['country_name'], nameset):
                    newname = WIKIP_ISO3166[cc2s[0]]['country_name']
                    # collected in a set, so identical messages print once
                    logs.add('> MCC %s, all MNC, country name updated from %s to %s, CC2 %s'\
                             % (r['mcc'], r['country_name'], newname, cc2s[0]))
                    r['country_name'] = newname
                for cc2 in cc2s:
                    if cc2 not in WIKIP_ISO3166:
                        print('> MCC %s MNC %s, CC2 %s unknown' % (r['mcc'], r['mnc'], cc2))
            elif r['mcc'] not in MCC_INTL:
                print('> MCC %s MNC %s, no CC2 but not intl network' % (r['mcc'], r['mnc']))
    #
    # collect aliases first, then append them, to not grow the lists being iterated
    aliases = []
    for mcc0 in sorted(WIKIP_MNC):
        for r in WIKIP_MNC[mcc0]:
            if (r['mcc'], r['mnc']) in MNC_ALIAS:
                # add an alias
                alias = dict(r)
                alias['mcc'], alias['mnc'] = MNC_ALIAS[(r['mcc'], r['mnc'])]
                aliases.append(alias)
                print('> added MNC alias %s.%s -> %s.%s'\
                      % (r['mcc'], r['mnc'], alias['mcc'], alias['mnc']))
    for alias in aliases:
        # WIKIP_MNC is indexed by the first digit of the MCC
        WIKIP_MNC[alias['mcc'][0:1]].append(alias)
    #
    for log in sorted(logs):
        print(log)
def patch_itut_mnc(mncs):
    """Align the country names of an ITU-T MCC-MNC dict with WIKIP_ISO3166.

    mncs is keyed by country name. Each key that is not an ISO3166 country
    name is replaced, in place, by the non-empty result of
    _patch_country_name(); the associated value is kept.
    """
    print('[+] patch ITU-T list of MCC-MNC: %r' % id(mncs))
    #
    isonameset = set([r['country_name'] for r in WIKIP_ISO3166.values()])
    # (the set of MCC-MNC from WIKIP_MNC was built here but never used: removed)
    #
    # iterate over a snapshot, as keys are replaced during the loop
    for cntr, mnos in list(mncs.items()):
        if cntr not in isonameset:
            newname = _patch_country_name(cntr)
            if newname:
                del mncs[cntr]
                mncs[newname] = mnos
def _patch_country_name(name):
    """Return the reference country name for name, or '' when none is found.

    COUNTRY_RENAME is consulted first. Otherwise each canonical variant of
    name is compared with the 'nameset' of the ISO3166 records, and the
    'country_name' of the first matching record is returned.
    The outcome is printed in every case.
    """
    #
    if name in COUNTRY_RENAME:
        target = COUNTRY_RENAME[name]
        print('> country name changed from %s to %s' % (name, target))
        return target
    #
    variants = country_name_canon(name)
    for record in WIKIP_ISO3166.values():
        if any(country_match_set(variant, record['nameset']) for variant in variants):
            target = record['country_name']
            print('> country name changed from %s to %s' % (name, target))
            return target
    #
    print('>>> country name %s not found' % name)
    return ''
def extend_country_spec():
    """Register dependent territories under their sovereign in COUNTRY_SPEC.

    For every ISO3166 record whose 'sovereignity' is itself a key of
    WIKIP_ISO3166, the record's country name and CC2 are appended to the
    'sub' and 'sub_cc2' lists of the sovereign country's COUNTRY_SPEC entry;
    the entry and the lists are created when missing.
    """
    print('[+] patch COUNTRY_SPEC dict')
    for cc2, record in sorted(WIKIP_ISO3166.items()):
        if record['sovereignity'] not in WIKIP_ISO3166:
            continue
        # country dependent from another one
        sovereign = WIKIP_ISO3166[record['sovereignity']]
        spec = COUNTRY_SPEC.setdefault(sovereign['country_name'], {})
        sub_names = spec.setdefault('sub', [])
        sub_codes = spec.setdefault('sub_cc2', [])
        if record['country_name'] not in sub_names:
            sub_names.append(record['country_name'])
            print('> country %s (%s) added under %s' % (record['country_name'], cc2, sovereign['country_name']))
        if cc2 not in sub_codes:
            sub_codes.append(cc2)
def patch_wikip_borders():
    """Patch and cross-check the WIKIP_BORDERS list in place.

    Steps (numbered as in the original code):
      1) rename countries, neighbours and sub-territories according to
         COUNTRY_RENAME, and drop a sub-territory matching its own country;
      2) delete the entries listed in BORD_COUNTRY_DEL, the entries of
         BORD_DUP_DEL which have sub-territories, and the neighbours
         referring to deleted countries;
      4) for the countries in BORD_DUP_DEL, remove a neighbour relation
         which the other side does not list;
      5) report countries, neighbours and COUNTRY_SPEC borders that cannot
         be found in the ISO3166 names.
    Diagnostics go to stdout.
    """
    print('[+] patch Wikipedia territory borders list: WIKIP_BORDERS')
    # COUNTRY_RENAME is not modified here: sort it only once
    renames = sorted(COUNTRY_RENAME.items())
    #
    # 1) patch names
    for r in WIKIP_BORDERS:
        for oldname, newname in renames:
            if country_match(r['country_name'], oldname):
                r['country_name'] = newname
                if newname in COUNTRY_SPEC and 'url' in COUNTRY_SPEC[newname]:
                    r['country_url'] = COUNTRY_SPEC[newname]['url']
                print('> country name changed from %s to %s' % (oldname, newname))
        # iterate over copies, as the lists are modified in the loops
        for n in r['neigh'][:]:
            for oldname, newname in renames:
                if country_match(n, oldname):
                    r['neigh'].remove(n)
                    r['neigh'].append(newname)
                    r['neigh'].sort()
                    print('> border changed from %s to %s, country %s'\
                          % (oldname, newname, r['country_name']))
        for s in r['country_sub'][:]:
            if country_match(s[0], r['country_name']):
                # simply remove territory
                r['country_sub'].remove(s)
            for oldname, newname in renames:
                if country_match(s[0], oldname):
                    if newname in COUNTRY_SPEC and 'url' in COUNTRY_SPEC[newname]:
                        # Sub-territories are (name, url) pairs: store the
                        # url itself; the whole COUNTRY_SPEC entry used to
                        # be stored here.
                        new_s = (newname, COUNTRY_SPEC[newname]['url'])
                    else:
                        new_s = (newname, s[1])
                    r['country_sub'].remove(s)
                    r['country_sub'].append( new_s )
                    r['country_sub'].sort(key=lambda t: t[0])
                    print('> sub changed from %s to %s, country %s'\
                          % (oldname, newname, r['country_name']))
    #
    # 2) delete entries
    for r in WIKIP_BORDERS[:]:
        if r['country_name'] in BORD_COUNTRY_DEL:
            WIKIP_BORDERS.remove(r)
            print('> country %s deleted' % r['country_name'])
        elif r['country_name'] in BORD_DUP_DEL and r['country_sub']:
            WIKIP_BORDERS.remove(r)
            print('> duplicated country %s deleted' % r['country_name'])
            # ensure all sub-territories are referenced within COUNTRY_SPEC
            for name, url in r['country_sub']:
                if not country_present(name, COUNTRY_SPEC[r['country_name']]['sub']):
                    print('>>> country %s, sub-territory %s not present in COUNTRY_SPEC'\
                          % (r['country_name'], name))
        else:
            for n in r['neigh'][:]:
                # NOTE(review): n[0] assumes neighbours are (name, ...) sequences,
                # whereas steps 1, 4 and 5 use them as plain names -- confirm
                if country_present(n[0], BORD_COUNTRY_DEL):
                    r['neigh'].remove(n)
                    print('> country %s, border %s deleted' % (r['country_name'], n[0]))
    #
    # 4) remove borders to FR, NL, UK when actually against an oversea territory
    # to enable conversion of WIKIP_BORDERS into a dict
    BD = {r['country_name']: r for r in WIKIP_BORDERS}
    assert( len(BD) == len(WIKIP_BORDERS) )
    for name in BORD_DUP_DEL:
        for r in WIKIP_BORDERS:
            for r_neigh in r['neigh'][:]:
                if country_match(r_neigh, name) \
                and not country_present(r['country_name'], BD[name]['neigh']):
                    # delete entry
                    r['neigh'].remove(r_neigh)
                    print('> country %s, neighbour %s deleted' % (r['country_name'], r_neigh))
    #
    # 5) ensure all country / borders are in the ISO3166 dict
    isonames = set()
    for iso in WIKIP_ISO3166.values():
        isonames.update(country_name_canon(iso['country_name']))
    for r in WIKIP_BORDERS:
        #
        if not any(name in isonames for name in country_name_canon(r['country_name'])):
            if country_present(r['country_name'], SUBTERR_TO_COUNTRY):
                # warning: SUBTERR_TO_COUNTRY lookup could fail here
                print('> not present in ISO3166 dict but referenced as sub of %s, country %s'\
                      % (SUBTERR_TO_COUNTRY[r['country_name']], r['country_name']))
            else:
                print('>>> not present in ISO3166 dict, country %s' % r['country_name'])
        #
        if r['country_name'] in COUNTRY_SPEC and 'bord' in COUNTRY_SPEC[r['country_name']]:
            # ensure those specific borders are correctly referenced
            for b in COUNTRY_SPEC[r['country_name']]['bord']:
                # NOTE(review): same n[0] assumption as in step 2 -- confirm
                if b not in [n[0] for n in r['neigh']]:
                    print('>>> missing border %s, country %s' % (b, r['country_name']))
        #
        for n in r['neigh']:
            if not any(name in isonames for name in country_name_canon(n)):
                if country_present(n, SUBTERR_TO_COUNTRY):
                    # warning: SUBTERR_TO_COUNTRY lookup could fail here
                    print('> border %s not present in ISO3166 dict but referenced as sub of %s, country %s'\
                          % (n, SUBTERR_TO_COUNTRY[n], r['country_name']))
                else:
                    print('>>> border %s not present in ISO3166 dict, country %s'\
                          % (n, r['country_name']))
def patch_wikip_iso3166():
    """Complete, rename and cross-check the WIKIP_ISO3166 dict in place.

    Steps:
      1) add the COUNTRY_SPEC countries with an unknown CC2, the CC2_ALIAS
         aliases, and the unknown CC2 listed in WIKIP_MSISDN;
      2) rename countries according to COUNTRY_RENAME;
      3) lower-case the TLD fields and upper-case the alpha-2 / alpha-3 codes;
      4) report collisions between canonical country names;
      5) enforce the COUNTRY_SPEC name / url and check the sovereignity of
         the declared sub-territories;
      6) store the canonical name variants of each record under 'nameset'.
    Diagnostics go to stdout.
    """
    print('[+] patch Wikipedia ISO3166 dict: WIKIP_ISO3166')
    #
    # 1) add entries
    for country, spec in sorted(COUNTRY_SPEC.items()):
        if 'cc2' not in spec or spec['cc2'] in WIKIP_ISO3166:
            continue
        rec = dict(REC_ISO3166)
        # take over every template field provided by the spec
        for key in list(rec):
            if key in spec:
                rec[key] = spec[key]
        rec['country_name'] = country
        rec['code_alpha_2'] = spec['cc2']
        if 'url' in spec:
            rec['country_url'] = spec['url']
        WIKIP_ISO3166[rec['code_alpha_2']] = rec
        print('> CC2 %s, %s added' % (rec['code_alpha_2'], rec['country_name']))
    #
    for alias, target in sorted(CC2_ALIAS.items()):
        if alias in WIKIP_ISO3166:
            continue
        # the alias refers to the very same record object as its target
        WIKIP_ISO3166[alias] = WIKIP_ISO3166[target]
        print('> CC2 %s, alias to %s' % (alias, target))
    #
    # 1bis) add more entries, extracted from the international telephone numbering listing
    for _, entries in sorted(WIKIP_MSISDN.items()):
        for cc2, name, url in sorted(entries):
            if cc2 in WIKIP_ISO3166:
                continue
            rec = dict(REC_ISO3166)
            rec['code_alpha_2'] = cc2
            rec['country_name'] = name
            rec['country_url'] = url
            WIKIP_ISO3166[cc2] = rec
            print('> CC2 %s, %s added from WIKIP_MSISDN' % (cc2, name))
    #
    # 2) patch country names
    for cc2, rec in sorted(WIKIP_ISO3166.items()):
        for oldname, newname in sorted(COUNTRY_RENAME.items()):
            if not country_match(rec['country_name'], oldname):
                continue
            rec['country_name'] = newname
            if newname in COUNTRY_SPEC_CC2:
                assert( COUNTRY_SPEC[newname]['cc2'] == cc2 )
                if 'url' in COUNTRY_SPEC[newname]:
                    rec['country_url'] = COUNTRY_SPEC[newname]['url']
            print('> country name changed from %s to %s, CC2 %s' % (oldname, newname, cc2))
    #
    # 3) ensure all tld are lower case, and CC codes are upper case
    for rec in WIKIP_ISO3166.values():
        for key in ('cc_tld', 'cc_tld_url'):
            rec[key] = rec[key].lower()
        for key in ('code_alpha_2', 'code_alpha_3'):
            rec[key] = rec[key].upper()
    #
    # 4) ensure all canon names do not collide
    ordered_cc2 = sorted(WIKIP_ISO3166)
    canon = [country_name_canon(WIKIP_ISO3166[cc2]['country_name']) for cc2 in ordered_cc2]
    total = len(canon)
    for first in range(total - 1):
        for name in canon[first]:
            # compare against every later record only
            for second in range(first + 1, total):
                if name in canon[second]:
                    print('>>> country name collision %s / %s'\
                          % (ordered_cc2[first], ordered_cc2[second]))
    #
    # 5) ensure all overseas, sub-territories and other geographic specificities
    # are referenced correctly, and verify sovereignity
    for country in COUNTRY_SPEC_CC2:
        spec = COUNTRY_SPEC[country]
        cc2 = spec['cc2']
        main = WIKIP_ISO3166[cc2]
        if country != main['country_name']:
            print('> country name changed from %s to %s, CC2 %s' % (main['country_name'], country, cc2))
            main['country_name'] = country
        if 'url' in spec:
            main['country_url'] = spec['url']
        if 'sub_cc2' not in spec:
            continue
        for sub_cc2 in spec['sub_cc2']:
            if sub_cc2 not in WIKIP_ISO3166:
                print('>>> missing CC2 %s, part of %s' % (sub_cc2, country))
                continue
            sub = WIKIP_ISO3166[sub_cc2]
            if sub['sovereignity'] == '':
                sub['sovereignity'] = cc2
            elif sub['sovereignity'] != cc2:
                print('>>> CC2 %s, %s, sovereignity mismatch %s / %s'\
                      % (sub_cc2, sub['country_name'], sub['sovereignity'], cc2))
    #
    # 6) keep track of country name variants
    for _, rec in sorted(WIKIP_ISO3166.items()):
        rec['nameset'] = country_name_canon(rec['country_name'])
        if rec['state_name']:
            rec['nameset'].update( country_name_canon(rec['state_name']) )