def __init__(self):
        """Build the country lookup tables from ``countries.csv``.

        The file is read from ``DATA_PATH``; its first row is read and
        discarded. For every remaining row the following attributes are
        filled in:

        * ``namesSet`` -- all country names and alternative spellings.
        * ``tldsSet`` -- all non-empty codes found in column 4.
        * ``alternative2name`` -- name or alternative spelling -> name.
        * ``tld2name`` -- code -> name.
        * ``name2alternatives`` -- any variant -> set of all variants of
          that row (a single set object shared by those variants).

        Names and alternatives are normalised with
        ``unidecode(...).lower().strip()``; codes are lower-cased and
        stripped only.
        """
        self.namesSet = set()
        self.tldsSet = set()
        self.alternative2name = {}
        self.tld2name = {}
        self.name2alternatives = {}

        # The list of country names, alternative spellings, and 2-letter
        # codes (TLDs).
        path = os.path.join(DATA_PATH, 'countries.csv')
        # The context manager closes the file even when a row fails to
        # parse (the previous explicit close() was skipped on errors).
        with open(path, 'rb') as f:
            reader = UnicodeReader(f)
            reader.next()  # read and discard the first row
            for row in reader:
                # Column 1: the country name (column 0 is not used here).
                name = unidecode(row[1]).lower().strip()
                self.namesSet.add(name)
                self.alternative2name[name] = name

                # Column 2: alternative names, separated by commas. Each
                # piece is checked individually so that a blank field, a
                # doubled comma or a trailing comma never registers ''
                # as a country name.
                alternatives = []
                for raw in row[2].split(','):
                    alt = unidecode(raw).lower().strip()
                    if alt:
                        alternatives.append(alt)
                for alt in alternatives:
                    self.alternative2name[alt] = name
                    self.namesSet.add(alt)

                # Every variant of this row points at the same set object.
                allVariants = set(alternatives)
                allVariants.add(name)
                for variant in allVariants:
                    self.name2alternatives[variant] = allVariants

                # Column 4: the codes (TLDs), comma-separated; empty
                # entries are skipped.
                for code in row[4].split(','):
                    code = code.lower().strip()
                    if code:
                        self.tld2name[code] = name
                        self.tldsSet.add(code)
# Example #2
# 0
    def __init__(self):
        """Build the country lookup tables from ``countries.csv``.

        The file is read from ``DATA_PATH``; its first row is read and
        discarded. For every remaining row the following attributes are
        filled in:

        * ``namesSet`` -- all country names and alternative spellings.
        * ``tldsSet`` -- all non-empty codes found in column 4.
        * ``alternative2name`` -- name or alternative spelling -> name.
        * ``tld2name`` -- code -> name.
        * ``name2alternatives`` -- any variant -> set of all variants of
          that row (a single set object shared by those variants).

        Names and alternatives are normalised with
        ``unidecode(...).lower().strip()``; codes are lower-cased and
        stripped only.
        """
        self.namesSet = set()
        self.tldsSet = set()
        self.alternative2name = {}
        self.tld2name = {}
        self.name2alternatives = {}

        # The list of country names, alternative spellings, and 2-letter
        # codes (TLDs).
        path = os.path.join(DATA_PATH, 'countries.csv')
        # The context manager closes the file even when a row fails to
        # parse (the previous explicit close() was skipped on errors).
        with open(path, 'rb') as f:
            reader = UnicodeReader(f)
            reader.next()  # read and discard the first row
            for row in reader:
                # Column 1: the country name (column 0 is not used here).
                name = unidecode(row[1]).lower().strip()
                self.namesSet.add(name)
                self.alternative2name[name] = name

                # Column 2: alternative names, separated by commas. Each
                # piece is checked individually so that a blank field, a
                # doubled comma or a trailing comma never registers ''
                # as a country name.
                alternatives = []
                for raw in row[2].split(','):
                    alt = unidecode(raw).lower().strip()
                    if alt:
                        alternatives.append(alt)
                for alt in alternatives:
                    self.alternative2name[alt] = name
                    self.namesSet.add(alt)

                # Every variant of this row points at the same set object.
                allVariants = set(alternatives)
                allVariants.add(name)
                for variant in allVariants:
                    self.name2alternatives[variant] = allVariants

                # Column 4: the codes (TLDs), comma-separated; empty
                # entries are skipped.
                for code in row[4].split(','):
                    code = code.lower().strip()
                    if code:
                        self.tld2name[code] = name
                        self.tldsSet.add(code)
 def __init__(self):
     """Build the state lookup tables from ``brazilStates.csv``.

     The file is read from ``DATA_PATH``; its first row is read and
     discarded. For every remaining row, column 0 is taken as the name
     (normalised with ``unidecode(...).lower().strip()``) and column 1
     as the abbreviation (lower-cased and stripped). The results go to:

     * ``namesSet`` -- all names.
     * ``abbrevsSet`` -- all abbreviations.
     * ``abbrev2name`` -- abbreviation -> name.
     """
     self.abbrev2name = {}
     self.namesSet = set()
     self.abbrevsSet = set()

     # Load data. The context manager closes the file even when a row
     # fails to parse (the previous explicit close() was skipped then).
     with open(os.path.join(DATA_PATH, 'brazilStates.csv'), 'rb') as f:
         reader = UnicodeReader(f)
         reader.next()  # read and discard the first row
         for row in reader:
             name = unidecode(row[0]).lower().strip()
             abbrev = row[1].lower().strip()
             self.abbrevsSet.add(abbrev)
             self.abbrev2name[abbrev] = name
             self.namesSet.add(name)
# Output writers: three CSV files under <dataPath>/idm/, each opened in
# binary write mode and wrapped in a UnicodeWriter.
w_log = UnicodeWriter(
    open(os.path.join(dataPath, 'idm', 'idm_log.csv'), 'wb'))
writer = UnicodeWriter(
    open(os.path.join(dataPath, 'idm', 'idm_map.csv'), 'wb'))
w_maybe = UnicodeWriter(
    open(os.path.join(dataPath, 'idm', 'idm_maybe.csv'), 'wb'))

# Counters; curidx starts out equal to step.
step = 100000
idx, curidx = 0, step

aliases = {}

# Input rows come from active_prolific_users.csv; the first row is read
# into _header before any further processing.
# Disabled alternative input:
# reader = UnicodeReader(open(os.path.join(dataPath, 'users_clean_emails_sample.csv'), 'rb'))
reader = UnicodeReader(
    open(os.path.join(dataPath, 'active_prolific_users.csv'), 'rb'))
_header = reader.next()

# Helper structures, all starting out empty.
# NOTE(review): the names suggest paired forward/reverse lookups between
# uids and emails / prefixes / composite prefixes / domains / names --
# confirm against the code that fills them.
d_email_uid, d_uid_email = {}, {}
d_prefix_uid, d_uid_prefix = {}, {}
d_comp_prefix_uid, d_uid_comp_prefix = {}, {}
d_uid_domain, d_domain_uid = {}, {}
d_name_uid = {}
# Root directory of the data set, resolved relative to the current
# working directory.
dataPath = os.path.abspath('../../data/2014-01')

# Output writers: three CSV files under <dataPath>/idm/, each opened in
# binary write mode and wrapped in a UnicodeWriter.
w_log = UnicodeWriter(
    open(os.path.join(dataPath, 'idm', 'idm_log.csv'), 'wb'))
writer = UnicodeWriter(
    open(os.path.join(dataPath, 'idm', 'idm_map.csv'), 'wb'))
w_maybe = UnicodeWriter(
    open(os.path.join(dataPath, 'idm', 'idm_maybe.csv'), 'wb'))

# Counters; curidx starts out equal to step.
step = 100000
idx, curidx = 0, step

aliases = {}

# Input rows come from clean/users_clean_emails.csv; the first row is
# read into _header before any further processing.
# Disabled alternative input:
# reader = UnicodeReader(open(os.path.join(dataPath, 'users_clean_emails_sample.csv'), 'rb'))
reader = UnicodeReader(
    open(os.path.join(dataPath, 'clean', 'users_clean_emails.csv'), 'rb'))
_header = reader.next()

# Helper structures, all starting out empty.
# NOTE(review): the names suggest paired forward/reverse lookups between
# uids and emails / prefixes / composite prefixes / domains / names --
# confirm against the code that fills them.
d_email_uid, d_uid_email = {}, {}
d_prefix_uid, d_uid_prefix = {}, {}
d_comp_prefix_uid, d_uid_comp_prefix = {}, {}
d_uid_domain, d_domain_uid = {}, {}
d_name_uid = {}