def __init__(self):
    """Load country names, alternative spellings and 2-letter codes.

    Reads ``countries.csv`` from ``DATA_PATH`` (the first row is skipped as
    a header) and fills:

    * ``namesSet``          -- every known spelling (canonical or alternative)
    * ``tldsSet``           -- every known 2-letter code (TLD)
    * ``alternative2name``  -- any spelling -> canonical country name
    * ``tld2name``          -- 2-letter code -> canonical country name
    * ``name2alternatives`` -- any spelling -> set of all spellings of that
      country (one shared set object per country)

    All names are ASCII-folded with ``unidecode``, lower-cased and stripped.
    """
    self.namesSet = set()
    self.tldsSet = set()
    self.alternative2name = {}
    self.tld2name = {}
    self.name2alternatives = {}

    # The list of country names, alternative spellings, and 2-letter codes (TLDs).
    # The context manager closes the file even if a malformed row raises.
    with open(os.path.join(DATA_PATH, 'countries.csv'), 'rb') as f:
        reader = UnicodeReader(f)
        next(reader)  # skip the header row
        for row in reader:
            # row[0] holds a numeric country id that is not needed here.

            # The country name
            name = unidecode(row[1]).lower().strip()
            self.namesSet.add(name)
            self.alternative2name[name] = name

            # Different alternative names, separated by comma. Empty pieces
            # (blank cell, doubled or trailing comma) are dropped so that ''
            # is never registered as a country name.
            alternatives = [
                a for a in
                (unidecode(piece).lower().strip() for piece in row[2].split(','))
                if a
            ]
            for a in alternatives:
                self.alternative2name[a] = name
                self.namesSet.add(a)

            allVariants = set(alternatives)
            allVariants.add(name)
            for variant in allVariants:
                self.name2alternatives[variant] = allVariants

            # The 2-letter codes (TLDs)
            for c in (t.lower().strip() for t in row[4].split(',')):
                if c:
                    self.tld2name[c] = name
                    self.tldsSet.add(c)
def __init__(self):
    """Load Brazilian state names and abbreviations.

    Reads ``brazilStates.csv`` from ``DATA_PATH`` (the first row is skipped
    as a header) and fills:

    * ``namesSet``    -- state names (ASCII-folded, lower-cased, stripped)
    * ``abbrevsSet``  -- state abbreviations (lower-cased, stripped)
    * ``abbrev2name`` -- abbreviation -> state name
    """
    self.abbrev2name = {}
    self.namesSet = set()
    self.abbrevsSet = set()

    # Load data; the context manager closes the file even if a row is malformed.
    with open(os.path.join(DATA_PATH, 'brazilStates.csv'), 'rb') as f:
        reader = UnicodeReader(f)
        next(reader)  # skip the header row
        for row in reader:
            name = unidecode(row[0]).lower().strip()
            abbrev = row[1].lower().strip()
            self.abbrevsSet.add(abbrev)
            self.abbrev2name[abbrev] = name
            self.namesSet.add(name)
def __init__(self):
    """Load the blacklist of names from ``blackList.csv``.

    Every first-column value (lower-cased, stripped) becomes a key of
    ``self.dict`` mapped to ``1``; only key membership carries information.
    The file has no header row: every row is treated as data.
    """
    self.dict = {}

    # Load data; the context manager closes the file even if a row is malformed.
    with open(os.path.join(DATA_PATH, 'blackList.csv'), 'rb') as f:
        for row in UnicodeReader(f):
            name = row[0].lower().strip()
            self.dict[name] = 1
def __init__(self):
    """Load country names, alternative spellings and 2-letter codes.

    Reads ``countries.csv`` from ``DATA_PATH`` (the first row is skipped as
    a header) and fills:

    * ``namesSet``          -- every known spelling (canonical or alternative)
    * ``tldsSet``           -- every known 2-letter code (TLD)
    * ``alternative2name``  -- any spelling -> canonical country name
    * ``tld2name``          -- 2-letter code -> canonical country name
    * ``name2alternatives`` -- any spelling -> set of all spellings of that
      country (one shared set object per country)

    All names are ASCII-folded with ``unidecode``, lower-cased and stripped.
    """
    self.namesSet = set()
    self.tldsSet = set()
    self.alternative2name = {}
    self.tld2name = {}
    self.name2alternatives = {}

    # The list of country names, alternative spellings, and 2-letter codes (TLDs).
    # The context manager closes the file even if a malformed row raises.
    with open(os.path.join(DATA_PATH, 'countries.csv'), 'rb') as f:
        reader = UnicodeReader(f)
        next(reader)  # skip the header row
        for row in reader:
            # row[0] holds a numeric country id that is not needed here.

            # The country name
            name = unidecode(row[1]).lower().strip()
            self.namesSet.add(name)
            self.alternative2name[name] = name

            # Different alternative names, separated by comma. Empty pieces
            # (blank cell, doubled or trailing comma) are dropped so that ''
            # is never registered as a country name.
            alternatives = [
                a for a in
                (unidecode(piece).lower().strip() for piece in row[2].split(','))
                if a
            ]
            for a in alternatives:
                self.alternative2name[a] = name
                self.namesSet.add(a)

            allVariants = set(alternatives)
            allVariants.add(name)
            for variant in allVariants:
                self.name2alternatives[variant] = allVariants

            # The 2-letter codes (TLDs)
            for c in (t.lower().strip() for t in row[4].split(',')):
                if c:
                    self.tld2name[c] = name
                    self.tldsSet.add(c)
def __init__(self):
    """Load Canadian province names and abbreviations.

    Reads ``canadaProvinces.csv`` from ``DATA_PATH`` (no header row is
    skipped) and fills:

    * ``namesSet``    -- province names (lower-cased, stripped)
    * ``abbrevsSet``  -- province abbreviations (lower-cased, stripped)
    * ``abbrev2name`` -- abbreviation -> province name
    """
    self.abbrev2name = {}
    self.namesSet = set()
    self.abbrevsSet = set()

    # The context manager closes the file even if a row is malformed.
    with open(os.path.join(DATA_PATH, 'canadaProvinces.csv'), 'rb') as f:
        for row in UnicodeReader(f):
            name = row[0].lower().strip()
            self.namesSet.add(name)

            abbrev = row[1].lower().strip()
            self.abbrevsSet.add(abbrev)
            self.abbrev2name[abbrev] = name
# Command line: <csv> <results> <sanity log> <num_non_trivial> <num_threads>
csv_path = os.path.abspath(sys.argv[1])
results_path = os.path.abspath(sys.argv[2])
sanity_path = os.path.abspath(sys.argv[3])
num_non_trivial = int(sys.argv[4])
num_threads = int(sys.argv[5])

# num_trivial = 5
# num_non_trivial = 7

data = {}
coverage = {}
strategies = set([])

# First column of the sanity log -> True iff its second column is 'OK'.
# The file is closed as soon as it has been read.
sanity = {}
with open(sanity_path) as sanity_file:
    for row in UnicodeReader(sanity_file):
        sanity[row[0]] = (row[1] == 'OK')

reader = UnicodeReader(open(csv_path))

ignored = set([])

for row in reader:
    # Sample rows:
    # 1436583.js;hash_def_one_renaming.freqlen;$[body][0][definitions][0][value][body][2][body][right][variables][_values][$n][scope];9;8;False;config;config
    # Update 1/6/17
    # 4664436.js;basic_renaming;lm;$[body][0][definitions][0][name][thedef][references][2][scope][variables][_values][$T][scope];3;6;False;frame;frame
    file_name = row[0]
output_path = Folder(sys.argv[3]).create()
num_threads = int(sys.argv[4])

flog = 'log_' + os.path.basename(training_sample_path)

# Start from a fresh log. A failure to remove it (typically because it does
# not exist yet) is not an error; anything else is allowed to surface.
try:
    os.remove(os.path.join(output_path, flog))
except OSError:
    pass

with open(training_sample_path, 'r') as f, \
        open(os.path.join(output_path, flog), 'w') as g:

    reader = UnicodeReader(f)
    writer = UnicodeWriter(g)

    pool = multiprocessing.Pool(processes=num_threads)
    try:
        # Each input row is handed to processFile in a worker process; the
        # results are 3-item sequences of which the first (file path) and
        # the last (message) are logged, whether or not the middle item
        # signals success.
        for result in pool.imap_unordered(processFile, reader):
            writer.writerow([result[0], result[2]])
        pool.close()
    except Exception:
        # Do not leave workers running if logging a result failed.
        pool.terminate()
        raise
    finally:
        pool.join()
# normalized, hash_def_one_renaming, hash_def_two_renaming) except Exception, e: return (js_file_path, None, str(e)) files_root = os.path.abspath(sys.argv[1]) output_path = Folder(sys.argv[2]).create() sample_size = int(sys.argv[3]) num_threads = int(sys.argv[4]) flog = 'log_renameAndUglify' in_log = set([]) reader = UnicodeReader(open(os.path.join(files_root, flog), 'r')) try: for row in reader: if row[1] == 'OK': in_log.add(row[0]) except: pass print len(in_log), 'in log' on_disk = set(Folder(os.path.join(files_root, 'orig')).baseFileNames('*.js')).\ intersection(Folder(os.path.join(files_root, 'no_renaming')).baseFileNames('*.js')).\ intersection(Folder(os.path.join(files_root, 'hash_def_one_renaming')).baseFileNames('*.js')).\ intersection(Folder(os.path.join(files_root, 'hash_def_two_renaming')).baseFileNames('*.js')) # intersection(Folder(os.path.join(files_root, 'basic_renaming')).baseFileNames('*.js')).\ # intersection(Folder(os.path.join(files_root, 'normalized')).baseFileNames('*.js')).\
Folder(os.path.join(output_path, 'hash_def_two_renaming')).create()

# An earlier approach derived `seen` from the '*.js' files already present in
# every output sub-folder; the log file is used instead.

flog = 'log_' + os.path.basename(training_sample_path)

# First-column values of all rows already present in the log of a previous run.
seen = set([])
try:
    with open(os.path.join(output_path, flog), 'r') as log_file:
        reader = UnicodeReader(log_file)
        for row in reader:
            if row:  # tolerate blank lines instead of aborting the scan
                seen.add(row[0])
except (IOError, UnicodeError):
    # Best effort: an unreadable or missing log means nothing (more) is
    # known to be processed; whatever was collected so far is kept.
    pass

print('%d already processed' % len(seen))
from unicodeManager import UnicodeReader, UnicodeWriter
import multiprocessing
from tools import ScopeAnalyst, Lexer, IndexBuilder
from evalRenamingHelper import *

# Three positional arguments are required. Only a missing argument triggers
# the usage message; Ctrl-C and other errors are no longer swallowed.
try:
    csv_path = os.path.abspath(sys.argv[1])
    orig_dir = os.path.abspath(sys.argv[2])
    output_file = os.path.abspath(sys.argv[3])
except IndexError:
    print("usage: python evalRenamings.py csvpath originalFileDir output_file")
    quit()

reader = UnicodeReader(open(csv_path))

ignored = set([])

#Key: file, line, token_id -> row
renameMap = {}
fileKeys = {}
jsnice_rows = []

for row in reader:
    #filename,renaming_strat,consistency_strat,scope_id,line_index,token_id_per_line,isGlobal,Choosen_Renaming,list_of_renamings
    file_name = row[0]
    rename_strat = row[1]
    consistency_strat = row[2]
    #if(rename_strat == "n2p"): #skip jsnice lines
    #    jsnice_rows.append(row)
# Folder that holds the input CSV and the 'idm' output sub-folder.
dataPath = os.path.abspath('../data')

# Three CSV writers over files opened for binary writing under
# <dataPath>/idm, created in this order: log, map, maybe.
w_log, writer, w_maybe = (
    UnicodeWriter(open(os.path.join(dataPath, 'idm', csv_name), 'wb'))
    for csv_name in ('idm_log.csv', 'idm_map.csv', 'idm_maybe.csv')
)

# Counters; curidx starts out equal to step.
idx, step = 0, 100000
curidx = step

aliases = {}

# Alternative input that was used before: 'users_clean_emails_sample.csv'.
reader = UnicodeReader(
    open(os.path.join(dataPath, 'active_prolific_users.csv'), 'rb'))
# Consume the first row so that iteration starts at the second one.
_header = reader.next()

# Helper structures (each name is bound to its own empty dict)
d_email_uid, d_uid_email = {}, {}
d_prefix_uid, d_uid_prefix = {}, {}
d_comp_prefix_uid, d_uid_comp_prefix = {}, {}
d_uid_domain, d_domain_uid = {}, {}
@author: Bogdan Vasilescu
'''

import os
import sys
# Make the parent directory importable (unicodeManager is imported from there).
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)))

from unicodeManager import UnicodeReader, UnicodeWriter

# Command line: <first csv> <second csv>
pth_f1 = os.path.abspath(sys.argv[1])
pth_f2 = os.path.abspath(sys.argv[2])

# First file: tuple of the first 7 columns -> column at index 8
# (the column at index 7 is not used).
d1 = {}
with open(pth_f1, 'r') as f1:
    reader = UnicodeReader(f1)
    for row in reader:
        d1[tuple(row[:7])] = row[8]

# Second file, indexed the same way.
d2 = {}
with open(pth_f2, 'r') as f2:
    reader = UnicodeReader(f2)
    for row in reader:
        d2[tuple(row[:7])] = row[8]

# Walk the second file's keys ordered by their first three components and
# report those whose comma-separated value differs, as a set, from the first
# file's value for the same key.
# NOTE(review): d1[k] raises KeyError when a key of the second file is absent
# from the first one.
for k, v in sorted(d2.items(), key=lambda e:(e[0][0],e[0][1],e[0][2])):
    if len(set(v.split(',')).symmetric_difference(set(d1[k].split(',')))):
        (f, rs, cs, s, l, c, g) = k
        # The fourth key component (s) is unpacked but not printed.
        print f, rs, cs, l, c, g
        print '\t', sorted(v.split(','))
unmask = {}

# Folder that holds the 'clean' input sub-folder and the 'idm' output one.
dataPath = os.path.abspath('../../data/2014-01')

# Three CSV writers over files opened for binary writing under
# <dataPath>/idm, created in this order: log, map, maybe.
w_log, writer, w_maybe = (
    UnicodeWriter(open(os.path.join(dataPath, 'idm', csv_name), 'wb'))
    for csv_name in ('idm_log.csv', 'idm_map.csv', 'idm_maybe.csv')
)

# Counters; curidx starts out equal to step.
idx, step = 0, 100000
curidx = step

aliases = {}

# Alternative input that was used before: 'users_clean_emails_sample.csv'
# (directly under dataPath).
reader = UnicodeReader(
    open(os.path.join(dataPath, 'clean', 'users_clean_emails.csv'), 'rb'))
# Consume the first row so that iteration starts at the second one.
_header = reader.next()

# Helper structures (each name is bound to its own empty dict)
d_email_uid, d_uid_email = {}, {}
d_prefix_uid, d_uid_prefix = {}, {}
d_comp_prefix_uid, d_uid_comp_prefix = {}, {}
d_uid_domain, d_domain_uid = {}, {}
import os
import sys
# Make the parent directory importable (unicodeManager is imported from there).
sys.path.append(
    os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)))

import random

from unicodeManager import UnicodeReader, UnicodeWriter

# Command line: <input csv> <output csv> <sample size>
file_in = os.path.abspath(sys.argv[1])
file_out = os.path.abspath(sys.argv[2])
size = int(sys.argv[3])

# Collect the first column of every input row; the input file is closed as
# soon as it has been read.
with open(file_in, 'r') as f_in:
    data = [row[0] for row in UnicodeReader(f_in)]

# random.sample raises ValueError when size exceeds the number of rows.
data_sample = random.sample(data, size)

# Write the sampled values, one per row.
with open(file_out, 'w') as of:
    writer = UnicodeWriter(of)
    for f in data_sample:
        writer.writerow([f])
    os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)))

from folderManager import Folder
from unicodeManager import UnicodeReader, UnicodeWriter

# Command line: <corpus folder>
corpus_dir = Folder(sys.argv[1])


def str_to_bool(s):
    """Return True only for the exact string 'True'; False for anything else."""
    if s == 'True':
        return True
    return False


# First column of 'isMinified.csv' (read from the current working directory)
# -> boolean parsed from its second column.
isMini = {}
reader = UnicodeReader(open('isMinified.csv', 'r'))
for row in reader:
    isMini[row[0]] = str_to_bool(row[1])

# Base names of the corpus '*.js' files that are not flagged in isMini;
# files missing from isMini count as not flagged.
eligible = [
    os.path.basename(f) for f in corpus_dir.fullFileNames("*.js")
    if not isMini.get(os.path.basename(f), False)
]

size = len(eligible)
# 80% of the files form the training+tuning pool; 90% of that pool is
# training, the rest of the pool is tuning, and the remaining files are testing.
tt = int(0.8 * size)
training_size = int(0.9 * tt)
tuning_size = int(tt - training_size)
testing_size = size - tt

print 'Total:', size
def __init__(self, MIN_CITY_LENGTH=5, MIN_POPULATION=50000):
    """Build city-name -> {(country, population)} lookup tables.

    Reads ``cities1000.csv`` from ``DATA_PATH`` (GeoNames list of cities:
    http://download.geonames.org/export/dump/) and fills:

    * ``city2countryPopulation``      -- every accepted city name and
      accepted alternative spelling -> set of (country, population) pairs
    * ``largeCity2countryPopulation`` -- the same, restricted to rows whose
      population is at least ``MIN_POPULATION``

    A name is accepted when it has at least ``MIN_CITY_LENGTH`` characters
    and is not in the blacklist. Alternative spellings are recorded only
    for rows whose main city name is itself accepted. Two cities with the
    same name (in different countries or in the same one) are kept as
    separate pairs in the set.

    Args:
        MIN_CITY_LENGTH: minimum length of a usable name.
        MIN_POPULATION: population threshold for the large-city table.

    If a row's 2-letter country code is unknown to ``WorldCountries``, the
    row is printed and the process exits.
    """
    self.MIN_CITY_LENGTH = MIN_CITY_LENGTH
    self.MIN_POPULATION = MIN_POPULATION

    # Most likely, these do not refer to actual city names
    self.blackList = BlackList().dict

    self.city2countryPopulation = {}
    self.largeCity2countryPopulation = {}

    countries = WorldCountries()

    def usable(candidate):
        # Long enough and not blacklisted.
        return (len(candidate) >= self.MIN_CITY_LENGTH
                and candidate not in self.blackList)

    # Load data; the context manager closes the file even if a row is malformed.
    with open(os.path.join(DATA_PATH, 'cities1000.csv'), 'rb') as f:
        for row in UnicodeReader(f):
            city = unidecode(row[2]).lower().strip()

            # Alternative names/spellings for the same city
            alternatives = [
                a for a in
                (unidecode(piece).lower().strip() for piece in row[3].split(','))
                if usable(a)
            ]

            population = int(row[14])

            # Country 2-letter code
            code = row[8].lower()

            if not usable(city):
                continue

            try:
                country = countries.tld2name[code]
            except KeyError:
                # Not all possible 2-letter country codes are known in countries.csv
                # If necessary, add manually and rerun
                print('UNKNOWN CODE: %s %s %s' % (city, population, code))
                exit()

            entry = (country, population)
            # The city itself plus all of its accepted alternative spellings
            # are recorded with the same (country, population) pair.
            spellings = [city] + alternatives

            for spelling in spellings:
                self.city2countryPopulation.setdefault(spelling, set()).add(entry)

            # Also keep a shorter list with large cities only
            if population >= self.MIN_POPULATION:
                for spelling in spellings:
                    self.largeCity2countryPopulation.setdefault(
                        spelling, set()).add(entry)
unmask = {}

# Folder that holds both the input CSV and the three output CSVs.
dataPath = os.path.abspath('../')

# Three CSV writers over files opened for binary writing directly under
# dataPath, created in this order: log, map, maybe.
w_log, writer, w_maybe = (
    UnicodeWriter(open(os.path.join(dataPath, csv_name), 'wb'))
    for csv_name in ('idm_log.csv', 'idm_map.csv', 'idm_maybe.csv')
)

# Counters; curidx starts out equal to step.
idx, step = 0, 100000
curidx = step

aliases = {}

reader = UnicodeReader(
    open(os.path.join(dataPath, 'users_emails_sample.csv'), 'rb'))
# No row is skipped here; the header-skipping call is disabled:
# _header = reader.next()

# Helper structures (each name is bound to its own empty dict)
d_email_uid, d_uid_email = {}, {}
d_prefix_uid, d_uid_prefix = {}, {}
d_comp_prefix_uid, d_uid_comp_prefix = {}, {}
d_uid_domain, d_domain_uid = {}, {}
# NOTE(review): the next four lines are the tail of a function whose header,
# `try:` and enclosing loop lie outside this excerpt; indentation is
# reconstructed and should be checked against the full file.
            orig.append(' '.join([t for (_tt,t) in line]) + "\n")
        # Success is reported as a 2-tuple ...
        return (js_file_path, orig)
    except Exception, e:
        # ... while failure is a 3-tuple (path, None, error message).
        return (js_file_path, None, str(e))


# Command line: <files root> <output folder> <ok-files csv> <num threads>
files_root = os.path.abspath(sys.argv[1])
output_path = Folder(sys.argv[2]).create()
ok_files_path = os.path.abspath(sys.argv[3])
num_threads = int(sys.argv[4])

# First-column values of 'isMinified.csv' (read from the current working
# directory) whose second column is the string 'False'.
is_not_minified = set([])
reader = UnicodeReader(open('isMinified.csv', 'r'))
for row in reader:
    if row[1] == 'False':
        is_not_minified.add(row[0])

# Files marked 'OK' in the ok-files csv that are also in is_not_minified.
ok_files = set([])
reader = UnicodeReader(open(ok_files_path, 'r'))
for row in reader:
    fname = row[0]
    if row[1] == 'OK' and fname in is_not_minified:
        ok_files.add(fname)
print len(ok_files), 'files'

# File names used further down (outside this excerpt).
f1 = 'corpus.orig.js'
log = 'log_lm.csv'
unmask = {}

# Folder that holds both the input CSV and the three output CSVs.
dataPath = os.path.abspath('./')

# Three CSV writers over files opened for binary writing directly under
# dataPath, created in this order: log, map, maybe.
w_log, writer, w_maybe = (
    UnicodeWriter(open(os.path.join(dataPath, csv_name), 'wb'))
    for csv_name in ('idm_log.csv', 'idm_map.csv', 'idm_maybe.csv')
)

# Counters; curidx starts out equal to step.
idx, step = 0, 100000
curidx = step

aliases = {}

reader = UnicodeReader(
    open(os.path.join(dataPath, 'authors-no-bots-emails.csv'), 'rb'))
# No row is skipped here; the header-skipping call is disabled:
# _header = reader.next()

# Helper structures (each name is bound to its own empty dict)
d_email_uid, d_uid_email = {}, {}
d_prefix_uid, d_uid_prefix = {}, {}
d_comp_prefix_uid, d_uid_comp_prefix = {}, {}
d_uid_domain, d_domain_uid = {}, {}