def __init__(self):
        """Load country names, alternative spellings and 2-letter codes.

        Reads ``countries.csv`` from ``DATA_PATH`` (the first row is
        discarded) and fills:

        * ``namesSet``          -- every main name and alternative spelling
        * ``tldsSet``           -- every non-empty code found in column 4
        * ``alternative2name``  -- any spelling -> main name of its row
        * ``tld2name``          -- code -> main name of its row
        * ``name2alternatives`` -- any spelling -> set of all spellings of
          its row (one set object shared by all spellings of that row)

        Names are passed through ``unidecode``, lower-cased and stripped;
        codes are lower-cased and stripped.
        """
        self.namesSet = set()
        self.tldsSet = set()
        self.alternative2name = {}
        self.tld2name = {}
        self.name2alternatives = {}

        # ``with`` closes the file even when a malformed row raises.
        with open(os.path.join(DATA_PATH, 'countries.csv'), 'rb') as f:
            reader = UnicodeReader(f)
            reader.next()  # discard the first row (presumably the header)
            for row in reader:
                # Column 1: the main country name.
                name = unidecode(row[1]).lower().strip()
                self.namesSet.add(name)
                self.alternative2name[name] = name

                # Column 2: comma-separated alternative spellings.  Blank
                # items (empty cell, doubled or trailing comma) are dropped
                # so that '' is never registered as a country alias.
                alternatives = [
                    a for a in
                    (unidecode(raw).lower().strip()
                     for raw in row[2].split(','))
                    if a
                ]
                for a in alternatives:
                    self.alternative2name[a] = name
                    self.namesSet.add(a)

                allVariants = set(alternatives).union(set([name]))
                for variant in allVariants:
                    self.name2alternatives[variant] = allVariants

                # Column 4: comma-separated 2-letter codes (TLDs).
                for c in (t.lower().strip() for t in row[4].split(',')):
                    if c:
                        self.tld2name[c] = name
                        self.tldsSet.add(c)
 def __init__(self):
     """Load Brazilian state names and abbreviations.

     Reads ``brazilStates.csv`` from ``DATA_PATH`` (the first row is
     discarded) and fills ``namesSet``, ``abbrevsSet`` and the
     ``abbrev2name`` mapping.  Names (column 0) are passed through
     ``unidecode``, lower-cased and stripped; abbreviations (column 1)
     are lower-cased and stripped.
     """
     self.abbrev2name = {}
     self.namesSet = set()
     self.abbrevsSet = set()

     # ``with`` closes the file even when a malformed row raises.
     with open(os.path.join(DATA_PATH, 'brazilStates.csv'), 'rb') as f:
         reader = UnicodeReader(f)
         reader.next()  # discard the header row
         for row in reader:
             name = unidecode(row[0]).lower().strip()
             abbrev = row[1].lower().strip()
             self.abbrevsSet.add(abbrev)
             self.abbrev2name[abbrev] = name
             self.namesSet.add(name)
예제 #3
0
    def __init__(self):
        """Load the blacklist into ``self.dict``.

        Every row of ``blackList.csv`` (no row is skipped) contributes its
        first column, lower-cased and stripped, as a key mapped to ``1``.
        """
        self.dict = {}

        # ``with`` closes the file even when a malformed row raises.
        with open(os.path.join(DATA_PATH, 'blackList.csv'), 'rb') as f:
            for row in UnicodeReader(f):
                self.dict[row[0].lower().strip()] = 1
예제 #4
0
    def __init__(self):
        """Load country names, alternative spellings and 2-letter codes.

        Reads ``countries.csv`` from ``DATA_PATH`` (the first row is
        discarded) and fills:

        * ``namesSet``          -- every main name and alternative spelling
        * ``tldsSet``           -- every non-empty code found in column 4
        * ``alternative2name``  -- any spelling -> main name of its row
        * ``tld2name``          -- code -> main name of its row
        * ``name2alternatives`` -- any spelling -> set of all spellings of
          its row (one set object shared by all spellings of that row)

        Names are passed through ``unidecode``, lower-cased and stripped;
        codes are lower-cased and stripped.
        """
        self.namesSet = set()
        self.tldsSet = set()
        self.alternative2name = {}
        self.tld2name = {}
        self.name2alternatives = {}

        # ``with`` closes the file even when a malformed row raises.
        with open(os.path.join(DATA_PATH, 'countries.csv'), 'rb') as f:
            reader = UnicodeReader(f)
            reader.next()  # discard the first row (presumably the header)
            for row in reader:
                # Column 1: the main country name.
                name = unidecode(row[1]).lower().strip()
                self.namesSet.add(name)
                self.alternative2name[name] = name

                # Column 2: comma-separated alternative spellings.  Blank
                # items (empty cell, doubled or trailing comma) are dropped
                # so that '' is never registered as a country alias.
                alternatives = [
                    a for a in
                    (unidecode(raw).lower().strip()
                     for raw in row[2].split(','))
                    if a
                ]
                for a in alternatives:
                    self.alternative2name[a] = name
                    self.namesSet.add(a)

                allVariants = set(alternatives).union(set([name]))
                for variant in allVariants:
                    self.name2alternatives[variant] = allVariants

                # Column 4: comma-separated 2-letter codes (TLDs).
                for c in (t.lower().strip() for t in row[4].split(',')):
                    if c:
                        self.tld2name[c] = name
                        self.tldsSet.add(c)
    def __init__(self):
        """Load Canadian province names and abbreviations.

        Reads ``canadaProvinces.csv`` from ``DATA_PATH``; every row,
        including the first, is treated as data.  Column 0 (name) and
        column 1 (abbreviation) are lower-cased and stripped, then stored
        in ``namesSet``, ``abbrevsSet`` and ``abbrev2name``.
        """
        self.abbrev2name = {}
        self.namesSet = set()
        self.abbrevsSet = set()

        # ``with`` closes the file even when a malformed row raises.
        with open(os.path.join(DATA_PATH, 'canadaProvinces.csv'), 'rb') as f:
            for row in UnicodeReader(f):
                name = row[0].lower().strip()
                abbrev = row[1].lower().strip()
                self.namesSet.add(name)
                self.abbrevsSet.add(abbrev)
                self.abbrev2name[abbrev] = name
예제 #6
0
# Command line: <csv> <results> <sanity log> <num_non_trivial> <num_threads>
csv_path = os.path.abspath(sys.argv[1])
results_path = os.path.abspath(sys.argv[2])
sanity_path = os.path.abspath(sys.argv[3])
num_non_trivial = int(sys.argv[4])
num_threads = int(sys.argv[5])

# num_trivial = 5
# num_non_trivial = 7

data = {}
coverage = {}

strategies = set([])

# sanity[<column 0>] is True exactly when column 1 of that row is 'OK'.
# The log is read inside ``with`` so its handle is released right away.
sanity = {}
with open(sanity_path) as sanity_file:
    reader = UnicodeReader(sanity_file)
    for row in reader:
        sanity[row[0]] = (row[1] == 'OK')

# This handle is deliberately left open: the reader is consumed by the
# loop that follows.
reader = UnicodeReader(open(csv_path))

ignored = set([])

for row in reader:
    # 1436583.js;hash_def_one_renaming.freqlen;$[body][0][definitions][0][value][body][2][body][right][variables][_values][$n][scope];9;8;False;config;config
    # Update 1/6/17
    # 4664436.js;basic_renaming;lm;$[body][0][definitions][0][name][thedef][references][2][scope][variables][_values][$T][scope];3;6;False;frame;frame
    file_name = row[0]
예제 #7
0
output_path = Folder(sys.argv[3]).create()
num_threads = int(sys.argv[4])

flog = 'log_' + os.path.basename(training_sample_path)

# Start from a fresh log.  A missing file is the normal case on a first
# run, so only the OS-level failure is ignored; anything else (including
# Ctrl-C) now propagates instead of being silently swallowed.
try:
    os.remove(os.path.join(output_path, flog))
except OSError:
    pass


with open(training_sample_path, 'r') as f, \
        open(os.path.join(output_path, flog), 'w') as g:

    reader = UnicodeReader(f)
    writer = UnicodeWriter(g)

    # Each input row is handed to processFile in a worker process; results
    # arrive in completion order, not input order.
    pool = multiprocessing.Pool(processes=num_threads)

    for result in pool.imap_unordered(processFile, reader):

        # A result is expected to be a (js_file_path, ok, msg) triple.
        # Successes and failures are both logged as [path, message].
        if result[1]:
            (js_file_path, ok, msg) = result

            writer.writerow([js_file_path, msg])

        else:
            writer.writerow([result[0], result[2]])
예제 #8
0
            #                 normalized,
            hash_def_one_renaming,
            hash_def_two_renaming)

    except Exception, e:
        return (js_file_path, None, str(e))

files_root = os.path.abspath(sys.argv[1])
output_path = Folder(sys.argv[2]).create()
sample_size = int(sys.argv[3])
num_threads = int(sys.argv[4])

flog = 'log_renameAndUglify'

in_log = set([])
reader = UnicodeReader(open(os.path.join(files_root, flog), 'r'))
try:
    for row in reader:
        if row[1] == 'OK':
            in_log.add(row[0])
except:
    pass
print len(in_log), 'in log'

on_disk = set(Folder(os.path.join(files_root, 'orig')).baseFileNames('*.js')).\
intersection(Folder(os.path.join(files_root, 'no_renaming')).baseFileNames('*.js')).\
intersection(Folder(os.path.join(files_root, 'hash_def_one_renaming')).baseFileNames('*.js')).\
intersection(Folder(os.path.join(files_root, 'hash_def_two_renaming')).baseFileNames('*.js'))
# intersection(Folder(os.path.join(files_root, 'basic_renaming')).baseFileNames('*.js')).\
# intersection(Folder(os.path.join(files_root, 'normalized')).baseFileNames('*.js')).\
예제 #9
0
Folder(os.path.join(output_path, 'hash_def_two_renaming')).create()


# seen = set(Folder(os.path.join(output_path, 'orig')).baseFileNames('*.js')).\
# intersection(Folder(os.path.join(output_path, 'no_renaming')).baseFileNames('*.js')).\
# intersection(Folder(os.path.join(output_path, 'basic_renaming')).baseFileNames('*.js')).\
# intersection(Folder(os.path.join(output_path, 'normalized')).baseFileNames('*.js')).\
# intersection(Folder(os.path.join(output_path, 'hash_def_one_renaming')).baseFileNames('*.js')).\
# intersection(Folder(os.path.join(output_path, 'hash_def_two_renaming')).baseFileNames('*.js'))

flog = 'log_' + os.path.basename(training_sample_path)


seen = set([])
try:
    reader = UnicodeReader(open(os.path.join(output_path, flog), 'r'))
    for row in reader:
        seen.add(row[0])
except:
    pass

print len(seen), 'already processed'
# print seen.pop()
# exit()

# try:
#     for f in [flog]: #f3, f4, f6]:
#         os.remove(os.path.join(output_path, f))
# except:
#     pass
예제 #10
0
from unicodeManager import UnicodeReader, UnicodeWriter
import multiprocessing
from tools import ScopeAnalyst, Lexer, IndexBuilder
from evalRenamingHelper import *


# Three positional arguments are required.  Only a missing argument
# triggers the usage message; any other error is no longer masked by a
# bare ``except``.
try:
    csv_path = os.path.abspath(sys.argv[1])
    orig_dir = os.path.abspath(sys.argv[2])
    output_file = os.path.abspath(sys.argv[3])
except IndexError:
    print("usage: python evalRenamings.py csvpath originalFileDir output_file")
    # Non-zero status so calling scripts can detect the usage error;
    # ``quit()`` is an interactive helper and exits with status 0.
    sys.exit(1)


# This handle is deliberately left open: the reader is consumed by the
# loop that follows.
reader = UnicodeReader(open(csv_path))

ignored = set([])

#Key: file, line, token_id -> row
renameMap = {}
fileKeys = {}
jsnice_rows = []

for row in reader:
    #filename,renaming_strat,consistency_strat,scope_id,line_index,token_id_per_line,isGlobal,Choosen_Renaming,list_of_renamings
    file_name = row[0]
    rename_strat = row[1]
    consistency_strat = row[2]
    #if(rename_strat == "n2p"): #skip jsnice lines
    #    jsnice_rows.append(row)
예제 #11
0
dataPath = os.path.abspath('../data')

# The three output CSVs live in <dataPath>/idm and are opened, in this
# order, for binary writing.
w_log, writer, w_maybe = [
    UnicodeWriter(open(os.path.join(dataPath, 'idm', csv_name), 'wb'))
    for csv_name in ('idm_log.csv', 'idm_map.csv', 'idm_maybe.csv')
]

# Counters used further down -- presumably for progress reporting every
# `step` records; confirm against the loop that updates them.
step = 100000
idx, curidx = 0, step

aliases = {}

# Input table (a sample file, users_clean_emails_sample.csv, was used here
# previously).  The first row is read and kept aside as the header.
reader = UnicodeReader(
    open(os.path.join(dataPath, 'active_prolific_users.csv'), 'rb'))
_header = reader.next()

# Helper structures: paired lookup tables, one per direction.
d_email_uid, d_uid_email = {}, {}
d_prefix_uid, d_uid_prefix = {}, {}
d_comp_prefix_uid, d_uid_comp_prefix = {}, {}
d_uid_domain, d_domain_uid = {}, {}
예제 #12
0
파일: cmp.py 프로젝트: renesugar/jsNaughty
@author: Bogdan Vasilescu
'''

import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), 
                                             os.path.pardir)))

from unicodeManager import UnicodeReader, UnicodeWriter

pth_f1 = os.path.abspath(sys.argv[1])
pth_f2 = os.path.abspath(sys.argv[2])

d1 = {}
with open(pth_f1, 'r') as f1:
    reader = UnicodeReader(f1)
    for row in reader:
        d1[tuple(row[:7])] = row[8]
    

d2 = {}
with open(pth_f2, 'r') as f2:
    reader = UnicodeReader(f2)
    for row in reader:
        d2[tuple(row[:7])] = row[8]
    
for k, v in sorted(d2.items(), key=lambda e:(e[0][0],e[0][1],e[0][2])):
    if len(set(v.split(',')).symmetric_difference(set(d1[k].split(',')))):
        (f, rs, cs, s, l, c, g) = k
        print f, rs, cs, l, c, g
        print '\t', sorted(v.split(','))
unmask = {}

dataPath = os.path.abspath('../../data/2014-01')

# The three output CSVs live in <dataPath>/idm and are opened, in this
# order, for binary writing.
w_log, writer, w_maybe = [
    UnicodeWriter(open(os.path.join(dataPath, 'idm', csv_name), 'wb'))
    for csv_name in ('idm_log.csv', 'idm_map.csv', 'idm_maybe.csv')
]

# Counters used further down -- presumably for progress reporting every
# `step` records; confirm against the loop that updates them.
step = 100000
idx, curidx = 0, step

aliases = {}

# Input table (a sample file, users_clean_emails_sample.csv, was used here
# previously).  The first row is read and kept aside as the header.
reader = UnicodeReader(
    open(os.path.join(dataPath, 'clean', 'users_clean_emails.csv'), 'rb'))
_header = reader.next()

# Helper structures: paired lookup tables, one per direction.
d_email_uid, d_uid_email = {}, {}
d_prefix_uid, d_uid_prefix = {}, {}
d_comp_prefix_uid, d_uid_comp_prefix = {}, {}
d_uid_domain, d_domain_uid = {}, {}
예제 #14
0
import os
import sys
sys.path.append(
    os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)))
import random
from unicodeManager import UnicodeReader, UnicodeWriter

# Command line: <input csv> <output csv> <sample size>
file_in = os.path.abspath(sys.argv[1])
file_out = os.path.abspath(sys.argv[2])
size = int(sys.argv[3])

# First column of every input row; ``with`` releases the input handle as
# soon as the rows have been read.
with open(file_in, 'r') as in_file:
    data = [row[0] for row in UnicodeReader(in_file)]

# Sampling without replacement: random.sample raises ValueError when
# `size` exceeds the number of rows read.
data_sample = random.sample(data, size)

# One sampled value per output row.
with open(file_out, 'w') as of:
    writer = UnicodeWriter(of)
    for f in data_sample:
        writer.writerow([f])
예제 #15
0
    os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)))

from folderManager import Folder
from unicodeManager import UnicodeReader, UnicodeWriter

# Corpus folder, taken from the first command-line argument.
corpus_dir = Folder(sys.argv[1])


def str_to_bool(s):
    """Return True only for the exact string 'True'; False otherwise."""
    return s == 'True'


isMini = {}
reader = UnicodeReader(open('isMinified.csv', 'r'))
for row in reader:
    isMini[row[0]] = str_to_bool(row[1])

eligible = [
    os.path.basename(f) for f in corpus_dir.fullFileNames("*.js")
    if not isMini.get(os.path.basename(f), False)
]

size = len(eligible)
tt = int(0.8 * size)
training_size = int(0.9 * tt)
tuning_size = int(tt - training_size)
testing_size = size - tt

print 'Total:', size
예제 #16
0
    def __init__(self, MIN_CITY_LENGTH=5, MIN_POPULATION=50000):
        self.MIN_CITY_LENGTH = MIN_CITY_LENGTH
        self.MIN_POPULATION = MIN_POPULATION

        # Most likely, these do not refer to actual city names
        self.blackList = BlackList().dict
        #        print self.blackList.keys()

        self.city2countryPopulation = {}
        self.largeCity2countryPopulation = {}

        countries = WorldCountries()

        # Load data
        # GeoNames list of cities: http://download.geonames.org/export/dump/
        f = open(os.path.join(DATA_PATH, 'cities1000.csv'), 'rb')
        reader = UnicodeReader(f)

        for row in reader:
            city = unidecode(row[2]).lower().strip()
            # Alternative names/spellings for the same city
            alternatives = [
                a for a in
                [unidecode(a).lower().strip() for a in row[3].split(',')]
                if len(a) >= self.MIN_CITY_LENGTH
                and not self.blackList.has_key(a)
            ]
            population = int(row[14])
            # Country 2-letter code
            code = row[8].lower()

            if len(city
                   ) >= self.MIN_CITY_LENGTH and not self.blackList.has_key(
                       city):
                try:
                    country = countries.tld2name[code]
                except:
                    # Not all possible 2-letter country codes are known in countries.csv
                    # If necessary, add manually and rerun
                    print 'UNKNOWN CODE:', city, population, code
                    exit()

                self.city2countryPopulation.setdefault(
                    city, set([(country, population)]))
                self.city2countryPopulation[city].add((country, population))

                # Record same country for all alternative names of this city
                for a in alternatives:
                    self.city2countryPopulation.setdefault(
                        a, set([(country, population)]))
                    self.city2countryPopulation[a].add((country, population))

                # Also keep a shorter list with large cities only
                if population >= self.MIN_POPULATION:
                    # Record country for this city
                    # Note: Two cities with the same name in different countries
                    # or even two cities with the same name in the same country
                    # are recorded separately
                    self.largeCity2countryPopulation.setdefault(
                        city, set([(country, population)]))
                    self.largeCity2countryPopulation[city].add(
                        (country, population))

                    # Record same country for all alternative names of this city
                    for a in alternatives:
                        self.largeCity2countryPopulation.setdefault(
                            a, set([(country, population)]))
                        self.largeCity2countryPopulation[a].add(
                            (country, population))
        f.close()
예제 #17
0
unmask = {}

dataPath = os.path.abspath('../')

# The three output CSVs are created directly under dataPath and opened,
# in this order, for binary writing.
w_log, writer, w_maybe = [
    UnicodeWriter(open(os.path.join(dataPath, csv_name), 'wb'))
    for csv_name in ('idm_log.csv', 'idm_map.csv', 'idm_maybe.csv')
]

# Counters used further down -- presumably for progress reporting every
# `step` records; confirm against the loop that updates them.
step = 100000
idx, curidx = 0, step

aliases = {}

# Input table.  No row is skipped here: the header-skipping call is
# disabled.
reader = UnicodeReader(
    open(os.path.join(dataPath, 'users_emails_sample.csv'), 'rb'))
# _header = reader.next()

# Helper structures: paired lookup tables, one per direction.
d_email_uid, d_uid_email = {}, {}
d_prefix_uid, d_uid_prefix = {}, {}
d_comp_prefix_uid, d_uid_comp_prefix = {}, {}
d_uid_domain, d_domain_uid = {}, {}
예제 #18
0
            orig.append(' '.join([t for (_tt,t) in line]) + "\n")
        
        return (js_file_path, orig)
        
    except Exception, e:
        return (js_file_path, None, str(e))
    
    

files_root = os.path.abspath(sys.argv[1])
output_path = Folder(sys.argv[2]).create()
ok_files_path = os.path.abspath(sys.argv[3])
num_threads = int(sys.argv[4])

is_not_minified = set([])
reader = UnicodeReader(open('isMinified.csv', 'r'))
for row in reader:
    if row[1] == 'False':
        is_not_minified.add(row[0])

ok_files = set([])
reader = UnicodeReader(open(ok_files_path, 'r'))
for row in reader:
    fname = row[0]
    if row[1] == 'OK' and fname in is_not_minified:
        ok_files.add(fname)
print len(ok_files), 'files'


f1 = 'corpus.orig.js'
log = 'log_lm.csv'
unmask = {}

dataPath = os.path.abspath('./')

# The three output CSVs are created directly under dataPath and opened,
# in this order, for binary writing.
w_log, writer, w_maybe = [
    UnicodeWriter(open(os.path.join(dataPath, csv_name), 'wb'))
    for csv_name in ('idm_log.csv', 'idm_map.csv', 'idm_maybe.csv')
]

# Counters used further down -- presumably for progress reporting every
# `step` records; confirm against the loop that updates them.
step = 100000
idx, curidx = 0, step

aliases = {}

# Input table.  No row is skipped here: the header-skipping call is
# disabled.
reader = UnicodeReader(
    open(os.path.join(dataPath, 'authors-no-bots-emails.csv'), 'rb'))
# _header = reader.next()

# Helper structures: paired lookup tables, one per direction.
d_email_uid, d_uid_email = {}, {}
d_prefix_uid, d_uid_prefix = {}, {}
d_comp_prefix_uid, d_uid_comp_prefix = {}, {}
d_uid_domain, d_domain_uid = {}, {}