def __init__(self):
    """Load the country lookup tables from ``countries.csv``.

    Every name is normalized with ``unidecode(...).lower().strip()``;
    codes are lower-cased and stripped. After construction:

    * ``namesSet`` holds every canonical name and alternative spelling.
    * ``tldsSet`` holds every 2-letter code (TLD).
    * ``alternative2name`` maps any spelling (canonical included) to the
      canonical name.
    * ``tld2name`` maps a code to the canonical name.
    * ``name2alternatives`` maps any spelling to the set of all spellings
      of the same country (one shared set object per country).
    """
    self.namesSet = set()
    self.tldsSet = set()
    self.alternative2name = {}
    self.tld2name = {}
    self.name2alternatives = {}

    # The list of country names, alternative spellings, and 2-letter
    # codes (TLDs).
    csv_path = os.path.join(DATA_PATH, 'countries.csv')
    # The context manager closes the file even when a row is malformed
    # and raises part-way through loading.
    with open(csv_path, 'rb') as f:
        reader = UnicodeReader(f)
        reader.next()  # discard the first row (presumably the header)

        for row in reader:
            # row[0] is not used (presumably a numeric country id).

            # The country name
            name = unidecode(row[1]).lower().strip()
            self.namesSet.add(name)
            self.alternative2name[name] = name

            # Different alternative names, separated by comma. Filter
            # each element (not the whole column) so that stray commas
            # such as "a,,b" never register '' as a spelling.
            alternatives = []
            for raw in row[2].split(','):
                alt = unidecode(raw).lower().strip()
                if alt:
                    alternatives.append(alt)

            for alt in alternatives:
                self.alternative2name[alt] = name
                self.namesSet.add(alt)

            allVariants = set(alternatives)
            allVariants.add(name)
            for variant in allVariants:
                self.name2alternatives[variant] = allVariants

            # The 2-letter codes (TLDs)
            for raw_code in row[4].split(','):
                code = raw_code.lower().strip()
                if not code:
                    continue
                self.tld2name[code] = name
                self.tldsSet.add(code)
def __init__(self):
    """Build the country name / TLD lookup tables from ``countries.csv``.

    Names are normalized with ``unidecode(...).lower().strip()`` and codes
    with ``.lower().strip()``. The following attributes are populated:

    * ``namesSet`` -- all canonical names and alternative spellings.
    * ``tldsSet`` -- all 2-letter codes (TLDs).
    * ``alternative2name`` -- spelling (canonical included) -> canonical
      name.
    * ``tld2name`` -- code -> canonical name.
    * ``name2alternatives`` -- spelling -> set of every spelling of that
      country (the same set object is shared by all of its spellings).
    """
    self.namesSet = set()
    self.tldsSet = set()
    self.alternative2name = {}
    self.tld2name = {}
    self.name2alternatives = {}

    # The list of country names, alternative spellings, and 2-letter
    # codes (TLDs).
    # 'with' guarantees the handle is released if parsing a row fails.
    with open(os.path.join(DATA_PATH, 'countries.csv'), 'rb') as f:
        reader = UnicodeReader(f)
        reader.next()  # skip the first row (presumably the header)

        for row in reader:
            # row[0] is not used (presumably a numeric country id).

            # The country name
            name = unidecode(row[1]).lower().strip()
            self.namesSet.add(name)
            self.alternative2name[name] = name

            # Different alternative names, separated by comma. Empty
            # entries are dropped per element, so an empty column and
            # stray commas ("a,,b") are both handled.
            normalized = [
                unidecode(a).lower().strip() for a in row[2].split(',')
            ]
            alternatives = [a for a in normalized if a]
            for a in alternatives:
                self.alternative2name[a] = name
                self.namesSet.add(a)

            allVariants = set(alternatives)
            allVariants.add(name)
            for variant in allVariants:
                self.name2alternatives[variant] = allVariants

            # The 2-letter codes (TLDs)
            codes = [t.lower().strip() for t in row[4].split(',')]
            for c in codes:
                if c:
                    self.tld2name[c] = name
                    self.tldsSet.add(c)
def __init__(self):
    """Load state names and abbreviations from ``brazilStates.csv``.

    After construction:

    * ``namesSet`` holds every state name, normalized with
      ``unidecode(...).lower().strip()``.
    * ``abbrevsSet`` holds every abbreviation, lower-cased and stripped.
    * ``abbrev2name`` maps an abbreviation to its normalized state name.
    """
    self.abbrev2name = {}
    self.namesSet = set()
    self.abbrevsSet = set()

    # Load data. The context manager closes the file even if a row is
    # malformed and raises while loading.
    with open(os.path.join(DATA_PATH, 'brazilStates.csv'), 'rb') as f:
        reader = UnicodeReader(f)
        reader.next()  # discard the header row

        for row in reader:
            name = unidecode(row[0]).lower().strip()
            abbrev = row[1].lower().strip()
            self.abbrevsSet.add(abbrev)
            self.abbrev2name[abbrev] = name
            self.namesSet.add(name)
# Output CSV writers; all three files are created under <dataPath>/idm.
_idm_dir = os.path.join(dataPath, 'idm')
w_log = UnicodeWriter(open(os.path.join(_idm_dir, 'idm_log.csv'), 'wb'))
writer = UnicodeWriter(open(os.path.join(_idm_dir, 'idm_map.csv'), 'wb'))
w_maybe = UnicodeWriter(open(os.path.join(_idm_dir, 'idm_maybe.csv'), 'wb'))

# Counters (presumably for progress reporting every `step` rows -- confirm
# against the loop that consumes them).
idx, step = 0, 100000
curidx = step

aliases = {}

# Alternative input, kept for reference:
# reader = UnicodeReader(open(os.path.join(dataPath, 'users_clean_emails_sample.csv'), 'rb'))
_input_path = os.path.join(dataPath, 'active_prolific_users.csv')
reader = UnicodeReader(open(_input_path, 'rb'))
_header = reader.next()  # consume the first row so iteration starts at the data

# Helper structures (each name is bound to its own, initially empty dict)
d_email_uid, d_uid_email = {}, {}
d_prefix_uid, d_uid_prefix = {}, {}
d_comp_prefix_uid, d_uid_comp_prefix = {}, {}
d_uid_domain, d_domain_uid = {}, {}
d_name_uid = {}
# Root of the data set; the relative path is resolved against the current
# working directory at the time this line runs.
dataPath = os.path.abspath('../../data/2014-01')

# Output CSV writers; all three files are created under <dataPath>/idm.
_idm_dir = os.path.join(dataPath, 'idm')
w_log = UnicodeWriter(open(os.path.join(_idm_dir, 'idm_log.csv'), 'wb'))
writer = UnicodeWriter(open(os.path.join(_idm_dir, 'idm_map.csv'), 'wb'))
w_maybe = UnicodeWriter(open(os.path.join(_idm_dir, 'idm_maybe.csv'), 'wb'))

# Counters (presumably for progress reporting every `step` rows -- confirm
# against the loop that consumes them).
idx, step = 0, 100000
curidx = step

aliases = {}

# Alternative input, kept for reference:
# reader = UnicodeReader(open(os.path.join(dataPath, 'users_clean_emails_sample.csv'), 'rb'))
_input_path = os.path.join(dataPath, 'clean', 'users_clean_emails.csv')
reader = UnicodeReader(open(_input_path, 'rb'))
_header = reader.next()  # consume the first row so iteration starts at the data

# Helper structures (each name is bound to its own, initially empty dict)
d_email_uid, d_uid_email = {}, {}
d_prefix_uid, d_uid_prefix = {}, {}
d_comp_prefix_uid, d_uid_comp_prefix = {}, {}
d_uid_domain, d_domain_uid = {}, {}
d_name_uid = {}