def approx_guesses(fname, q):
    """Greedily collect guesses, using only real passwords as candidates.

    The passwords of ``Passwords(fname)`` are visited in the model's
    iteration order (presumably most frequent first -- the acceptance
    threshold below relies on that; confirm against ``Passwords``).  Each
    visited password of length >= 6 is pushed onto a priority heap keyed by
    the negated total frequency of its ball, ``getball(pw)``.  Before each
    push the heap is re-validated lazily: a candidate whose cached weight is
    still current and at least ``f * ballsize`` is accepted as a guess, and
    the frequencies of its ball are zeroed so they are not counted twice.

    Side effects: rebinds the module-level ``pwm``, prints progress, and
    writes the guesses to ``approx_guess_<q>.json`` in the working directory.

    Args:
        fname: password file handed to ``Passwords``.
        q: number of guesses after which the search stops.

    Returns:
        The list of accepted guesses, in the order they were accepted.
    """
    # NOTE(review): another definition of approx_guesses appears in this
    # file; the one executed last wins -- consider consolidating them.
    # NOTE(review): pwm is published as a module global, presumably because
    # getball() reads it -- confirm before turning it into a local.
    global pwm
    pwm = Passwords(fname)
    # float() so the ratios below are true divisions under Python 2 as well.
    total_f = float(pwm.totalf())
    subset_heap = priority_dict()
    guess_list = []
    ballsize = 1000  # I don't care any bigger ball
    pwfreq = np.copy(pwm.values())  # deep copy of the frequencies
    report_at = 1  # heap size that triggers the next progress line
    st = time.time()
    for i, (pwid, f) in enumerate(pwm):
        rpw = pwm.id2pw(pwid)
        if len(rpw) < 6:
            continue
        neighbors = [rpw]
        # NOTE(review): sorted_iter() presumably hands out (and removes)
        # entries best-first; stale entries are re-inserted below with
        # their refreshed weight -- confirm against priority_dict.
        for tpw, w in subset_heap.sorted_iter():
            w = -w  # weights are stored negated
            ball = getball(tpw)
            nw = pwfreq[ball].sum()
            if w != nw:
                # Cached weight is stale: store the current one and go on.
                subset_heap[tpw] = -nw
                continue
            if w < f * ballsize:
                # The ball weight is still small; put it back and stop.
                subset_heap[tpw] = -nw
                break
            # correct value
            print("Guess({}/{}): {} weight: {}".format(
                len(guess_list), q, tpw, w / total_f))
            guess_list.append(tpw)
            pwfreq[ball] = 0
            if len(guess_list) >= q:
                break
        for tpw in neighbors:
            ball = getball(tpw)
            # Exponential moving average of the observed ball sizes.
            ballsize = ballsize * 0.9 + ball.shape[0] * 0.1
            subset_heap[tpw] = -pwfreq[ball].sum()
        if len(subset_heap) > report_at:
            print(">> ({}) : Heap size: {} ballsize: {}".format(
                time.time() - st, len(subset_heap), ballsize))
            report_at = len(subset_heap) * 2
        if i % 30 == 0:
            print(">> ({}) : {}: {!r} ({})".format(
                time.time() - st, i, rpw, f))
        if len(guess_list) >= q:
            break

    normal_succ = pwm.sumvalues(q=q) / total_f
    if guess_list:
        pool = multiprocessing.Pool(7)
        try:
            balls = pool.map(getball, guess_list)
        finally:
            # Always release the worker processes.
            pool.terminate()
            pool.join()
        guessed_pws = np.unique(np.concatenate(balls))
        fuzzy_succ = pwm.values()[guessed_pws].sum() / total_f
    else:
        # np.concatenate() rejects an empty sequence.
        fuzzy_succ = 0.0
    print("normal succ: {}, fuzzy succ: {}".format(normal_succ, fuzzy_succ))
    with open('approx_guess_{}.json'.format(q), 'w') as out_f:
        json.dump(guess_list, out_f)
    return guess_list
def greedy_maxcoverage_heap(fname, q, **kwargs):
    """Greedy max-coverage guess search with a lazily re-validated heap.

    The passwords of ``Passwords(fname)`` are visited in the model's
    iteration order (presumably most frequent first -- the acceptance
    threshold below relies on that; confirm against ``Passwords``).  For
    each password of length >= 6 the strings produced by ``apply_edits``
    (minus already accepted guesses) become candidates; every candidate is
    pushed onto a priority heap keyed by the negated total frequency of its
    ball, ``getball(candidate)``.  Before the push the heap is re-validated:
    a candidate whose cached weight is still current and at least
    ``f * ballsize`` is accepted as a guess, and the frequencies of its ball
    are zeroed so they are not counted twice.

    Side effects: rebinds the module-level ``pwm``, prints progress, and
    writes the guesses to ``guess_<q>.json`` in the working directory.

    Args:
        fname: password file handed to ``Passwords``.
        q: number of guesses after which the search stops.
        **kwargs: accepted for call compatibility; not used.

    Returns:
        The list of accepted guesses, in the order they were accepted.
    """
    # NOTE(review): another definition of greedy_maxcoverage_heap appears in
    # this file; the one executed last wins -- consider consolidating them.
    # NOTE(review): pwm is published as a module global before the pool is
    # created, presumably so the workers running getball() can see it --
    # confirm before reordering these statements.
    global pwm
    pwm = Passwords(fname)
    # float() so the ratios below are true divisions under Python 2 as well.
    total_f = float(pwm.totalf())
    subset_heap = priority_dict()
    guess_list = []
    ballsize = 2000  # I don't care any bigger ball
    done = set()
    pwfreq = np.copy(pwm.values())  # deep copy of the frequencies
    report_at = 1  # heap size that triggers the next progress line
    st = time.time()
    pool = multiprocessing.Pool(5)
    try:
        for i, (pwid, f) in enumerate(pwm):
            rpw = pwm.id2pw(pwid)
            if len(rpw) < 6:
                continue
            edits = apply_edits(rpw.encode('ascii', errors='ignore'))
            neighbors = list(set(edits) - done)
            # NOTE(review): sorted_iter() presumably hands out (and removes)
            # entries best-first; stale entries are re-inserted below with
            # their refreshed weight -- confirm against priority_dict.
            for tpw, w in subset_heap.sorted_iter():
                w = -w  # weights are stored negated
                ball = getball(tpw)
                nw = pwfreq[ball].sum()
                if w != nw:
                    # Cached weight is stale: store the current one, go on.
                    subset_heap[tpw] = -nw
                    continue
                if w < f * ballsize:
                    # The ball weight is still small; put it back and stop.
                    subset_heap[tpw] = -nw
                    break
                # correct value
                print("Guess({}/{}): {} weight: {}".format(
                    len(guess_list), q, tpw, w / total_f))
                done.add(tpw)
                guess_list.append(tpw)
                pwfreq[ball] = 0
                if len(guess_list) >= q:
                    break
            b_max = 0
            for tpw, ball in zip(neighbors, pool.map(getball, neighbors)):
                subset_heap[tpw] = -pwfreq[ball].sum()
                b_max = max(b_max, ball.shape[0])
            # Moving average over the largest ball seen for this password.
            ballsize = ballsize * 0.9 + b_max * 0.1
            if len(subset_heap) > report_at:
                print(">< ({}) : Heap size: {} ballsize: {}".format(
                    time.time() - st, len(subset_heap), ballsize))
                report_at = len(subset_heap) * 2
            if i % 10 == 0:
                print("({}) : {}: {} ({})".format(
                    time.time() - st, i, rpw, f))
            if len(guess_list) >= q:
                break

        normal_succ = pwm.sumvalues(q=q) / total_f
        if guess_list:
            balls = pool.map(getball, guess_list)
            guessed_pws = np.unique(np.concatenate(balls))
            fuzzy_succ = pwm.values()[guessed_pws].sum() / total_f
        else:
            # np.concatenate() rejects an empty sequence.
            fuzzy_succ = 0.0
    finally:
        # Always release the worker processes, even when the search fails.
        pool.terminate()
        pool.join()
    print("normal succ: {}, fuzzy succ: {}".format(normal_succ, fuzzy_succ))
    with open('guess_{}.json'.format(q), 'w') as out_f:
        json.dump(guess_list, out_f)
    return guess_list
def greedy_maxcoverage_heap(fname, q, **kwargs):
    """Greedy max-coverage guess search with a lazily re-validated heap.

    The passwords of ``Passwords(fname)`` are visited in the model's
    iteration order (presumably most frequent first -- the acceptance
    threshold below relies on that; confirm against ``Passwords``).  For
    each password of length >= 6 the strings produced by ``apply_edits``
    (minus already accepted guesses) become candidates; every candidate is
    pushed onto a priority heap keyed by the negated total frequency of its
    ball, ``getball(candidate)``.  Before the push the heap is re-validated:
    a candidate whose cached weight is still current and at least
    ``f * ballsize`` is accepted as a guess, and the frequencies of its ball
    are zeroed so they are not counted twice.

    Side effects: rebinds the module-level ``pwm``, prints progress, and
    writes the guesses to ``guess_<q>.json`` in the working directory.

    Args:
        fname: password file handed to ``Passwords``.
        q: number of guesses after which the search stops.
        **kwargs: accepted for call compatibility; not used.

    Returns:
        The list of accepted guesses, in the order they were accepted.
    """
    # NOTE(review): another definition of greedy_maxcoverage_heap appears in
    # this file; the one executed last wins -- consider consolidating them.
    # NOTE(review): pwm is published as a module global before the pool is
    # created, presumably so the workers running getball() can see it --
    # confirm before reordering these statements.
    global pwm
    pwm = Passwords(fname)
    # float() so the ratios below are true divisions under Python 2 as well.
    total_f = float(pwm.totalf())
    subset_heap = priority_dict()
    guess_list = []
    ballsize = 2000  # I don't care any bigger ball
    done = set()
    pwfreq = np.copy(pwm.values())  # deep copy of the frequencies
    report_at = 1  # heap size that triggers the next progress line
    st = time.time()
    pool = multiprocessing.Pool(5)
    try:
        for i, (pwid, f) in enumerate(pwm):
            rpw = pwm.id2pw(pwid)
            if len(rpw) < 6:
                continue
            edits = apply_edits(rpw.encode('ascii', errors='ignore'))
            neighbors = list(set(edits) - done)
            # NOTE(review): sorted_iter() presumably hands out (and removes)
            # entries best-first; stale entries are re-inserted below with
            # their refreshed weight -- confirm against priority_dict.
            for tpw, w in subset_heap.sorted_iter():
                w = -w  # weights are stored negated
                ball = getball(tpw)
                nw = pwfreq[ball].sum()
                if w != nw:
                    # Cached weight is stale: store the current one, go on.
                    subset_heap[tpw] = -nw
                    continue
                if w < f * ballsize:
                    # The ball weight is still small; put it back and stop.
                    subset_heap[tpw] = -nw
                    break
                # correct value
                print("Guess({}/{}): {} weight: {}".format(
                    len(guess_list), q, tpw, w / total_f))
                done.add(tpw)
                guess_list.append(tpw)
                pwfreq[ball] = 0
                if len(guess_list) >= q:
                    break
            b_max = 0
            for tpw, ball in zip(neighbors, pool.map(getball, neighbors)):
                subset_heap[tpw] = -pwfreq[ball].sum()
                b_max = max(b_max, ball.shape[0])
            # Moving average over the largest ball seen for this password.
            ballsize = ballsize * 0.9 + b_max * 0.1
            if len(subset_heap) > report_at:
                print(">< ({}) : Heap size: {} ballsize: {}".format(
                    time.time() - st, len(subset_heap), ballsize))
                report_at = len(subset_heap) * 2
            if i % 10 == 0:
                print("({}) : {}: {} ({})".format(
                    time.time() - st, i, rpw, f))
            if len(guess_list) >= q:
                break

        normal_succ = pwm.sumvalues(q=q) / total_f
        if guess_list:
            balls = pool.map(getball, guess_list)
            guessed_pws = np.unique(np.concatenate(balls))
            fuzzy_succ = pwm.values()[guessed_pws].sum() / total_f
        else:
            # np.concatenate() rejects an empty sequence.
            fuzzy_succ = 0.0
    finally:
        # Always release the worker processes, even when the search fails.
        pool.terminate()
        pool.join()
    print("normal succ: {}, fuzzy succ: {}".format(normal_succ, fuzzy_succ))
    with open('guess_{}.json'.format(q), 'w') as out_f:
        json.dump(guess_list, out_f)
    return guess_list
def compute_secloss(guess_file, attpwf, chlpwf, q=100):
    """Print the security loss of a saved guess list against a challenge set.

    ``guess_file`` is a JSON list of pairs whose first element is the
    guessed string.  Three fractions of the challenge model's total
    frequency are printed:

    * the "normal" success: challenge frequency of the first ``len(guesses)``
      passwords yielded by the attacker model's ``iterpws``;
    * the worst-case success: frequency of every challenge password that is
      a guess or appears in ``KB.word_to_typos`` of a guess;
    * ``fuzzlambda_q``: frequency of those passwords that have at least one
      of their ``get_topk_typos(pw, NH_SIZE)`` entries in the guess set.

    The last line printed is ``fuzzlambda_q`` minus the normal success.

    Args:
        guess_file: path of the JSON guess list.
        attpwf: attacker password file handed to ``Passwords``.
        chlpwf: challenge password file handed to ``Passwords``.
        q: ignored -- the number of guesses read from ``guess_file`` is
            used instead; the parameter is kept so existing calls work.

    Returns:
        None; results are only printed.
    """
    chlpwm = Passwords(chlpwf, max_pass_len=25, min_pass_len=5)
    attpwm = Passwords(attpwf, max_pass_len=25, min_pass_len=5)
    with open(guess_file) as guess_fh:
        guesses = [w for w, _ in json.load(guess_fh)]
    guess_set = set(guesses)
    q = len(guesses)
    chl_total = float(chlpwm.totalf())
    print("Found {} guesses".format(q))

    lambda_q = sum(
        chlpwm.pw2freq(pw) for _id, pw, _f in attpwm.iterpws(q)
    ) / chl_total
    print("Normal succces: {}".format(lambda_q))

    # Challenge passwords reachable from any guess, plus the guesses.
    union_ball = set(
        rpw
        for w in guesses
        for rpw in KB.word_to_typos(str(w))
        if chlpwm.pw2id(rpw) >= 0
    ) | guess_set
    worst_case = sum(chlpwm.pw2freq(w) for w in union_ball) / chl_total
    print("Worst case success rate = {}".format(worst_case))

    # A password counts as cracked when one of its top typos was guessed.
    fuzzlambda_q = sum(
        chlpwm.pw2freq(rpw)
        for rpw in union_ball
        if not guess_set.isdisjoint(get_topk_typos(rpw, NH_SIZE))
    ) / chl_total
    # Single-argument print() so Python 2 does not print a tuple.
    print("fuzzlambda_q: {}".format(fuzzlambda_q))
    print("Secloss: {}".format(fuzzlambda_q - lambda_q))
def compute_secloss_with_varying_q(guess_file, attpwf, chlpwf, q=100):
    """Write the security loss for guess budgets 10, 100, 1000, ... to CSV.

    ``guess_file`` is a JSON list of pairs whose first element is the
    guessed string; guesses are consumed in file order.  A table ``M`` maps
    every challenge password reachable through ``KB.word_to_typos`` of some
    guess to the guess indices of its ``get_topk_typos(pw, NH_SIZE)``
    entries (``-1`` where the typo is not a guess).  Guesses are then
    applied in batches ending at 10, 100, ...; after each batch the
    cumulative fraction of challenge frequency covered is recorded.

    The result is written to ``guess_file.csv`` in the working directory
    with columns ``q, lambda_q, secloss``, where ``lambda_q`` is
    ``chlpwm.sumvalues(q) / total`` and ``secloss`` is the covered fraction
    minus ``lambda_q``.

    Args:
        guess_file: path of the JSON guess list.
        attpwf: attacker password file handed to ``Passwords``.
        chlpwf: challenge password file handed to ``Passwords``.
        q: ignored -- the number of guesses read from ``guess_file`` is
            used instead; the parameter is kept so existing calls work.

    Returns:
        None.
    """
    chlpwm = Passwords(chlpwf, max_pass_len=25, min_pass_len=5)
    # NOTE(review): attpwm is never read below; the construction is kept in
    # case Passwords() has side effects callers rely on -- confirm and drop.
    attpwm = Passwords(attpwf, max_pass_len=25, min_pass_len=5)
    with open(guess_file) as guess_fh:
        guesses = [w for w, _ in json.load(guess_fh)]
    guess_idx = dict((g, i) for i, g in enumerate(guesses))
    q = len(guesses)
    chl_total = float(chlpwm.totalf())

    union_ball = list(set(
        rpw
        for w in guesses
        for rpw in KB.word_to_typos(str(w))
        if chlpwm.pw2id(rpw) >= 0
    ))
    freqs = np.array([chlpwm.pw2freq(w) for w in union_ball])
    M = np.full((len(union_ball), NH_SIZE + 1), -1, dtype=np.int32)
    for i, rpw in enumerate(union_ball):
        for j, tpw in enumerate(get_topk_typos(rpw, NH_SIZE)):
            M[i, j] = guess_idx.get(tpw, -1)
    # Index 0 is a valid guess id, so test >= 0 (only -1 marks "no guess").
    print("Useful typos: {}".format((M >= 0).sum()))

    lambda_topk_q = []
    last_suc = 0
    start, tq = 0, 1
    while tq < q:
        end = tq * 10
        # Start the first batch at index 0 so the top guess is counted.
        for g in guesses[start:end]:
            hit_rows = (M == guess_idx[g]).any(axis=1)
            last_suc += freqs[hit_rows].sum() / chl_total
            # A cracked password must not be counted for later guesses.
            freqs[hit_rows] = 0
        lambda_topk_q.append((end, last_suc))
        print(lambda_topk_q[-1])
        start = end
        tq = end

    # Binary mode is what the Python 2 csv module expects.
    with open('guess_file.csv', 'wb') as out_f:
        csvf = csv.writer(out_f)
        csvf.writerow(['q', 'lambda_q', 'secloss'])
        for tq, succ in lambda_topk_q:
            lambda_q = chlpwm.sumvalues(tq) / chl_total
            csvf.writerow([tq, lambda_q, succ - lambda_q])
def compute_black_list_succ(fname, b, q, sketch_size):
    """Compute the offline success rate of an attacker who holds the sketch.

    The attacker wants to make ``q`` (int) queries per password; with
    ``2**sketch_size`` possible sketch values this gives a budget of
    ``n = q * 2**sketch_size`` entries.  The first ``n`` passwords yielded
    by the attacker's model are expanded into a new weight vector: a
    black-listed password is split evenly over ``2**sketch_size`` slots,
    any other password keeps its full frequency in a single slot.  The
    result is the sum of the ``n`` largest slots divided by the model's
    total frequency.

    ``b`` is either a number or a collection of passwords.  If ``b`` is a
    number, the black list is the top ``b`` passwords of the attacker's
    model, which sounds iffy, but it implies that the attacker has complete
    knowledge of the real password distribution.

    Args:
        fname: the attacker's password model file, handed to ``Passwords``.
        b: black list size (int) or an iterable/set of passwords.
        q: number of queries per password.
        sketch_size: sketch length in bits.

    Returns:
        The success rate as a float fraction of the total frequency.
    """
    pwf = Passwords(fname)
    n_sketches = 2 ** sketch_size
    n = q * n_sketches

    # Take at most n passwords; a smaller model simply yields fewer.
    # The last two fields are used so that both (pw, freq) and
    # (id, pw, freq) items are accepted -- other code in this file unpacks
    # iterpws() as triples.
    pwarr, farr = [], []
    for entry in pwf.iterpws():
        if len(pwarr) >= n:
            break
        pwarr.append(entry[-2])
        farr.append(entry[-1])

    if isinstance(b, int):
        b = pwarr[:b]
    if not isinstance(b, set):
        b = set(b)

    # Worst case: every password is black-listed and takes n_sketches slots.
    nfarr = np.zeros(n * n_sketches)
    j = 0
    for pw, freq in zip(pwarr, farr):
        if pw in b:
            nfarr[j:j + n_sketches] = float(freq) / n_sketches
            j += n_sketches
        else:
            nfarr[j] = freq
            j += 1
    print("{} {}".format(nfarr.shape, n))

    total_f = float(pwf.totalf())
    # The original test was `shape[0] < n`, which can never hold because
    # shape[0] == n * n_sketches; the top-n selection was therefore dead
    # code and the result did not depend on the black list.
    if nfarr.shape[0] > n:
        # Negate twice to obtain the sum of the n largest slots.
        return -np.partition(-nfarr, n)[:n].sum() / total_f
    return nfarr.sum() / total_f
def compute_guesses_using_typodist(fname, q, nh_size=5, topk=False, offline=False):
    """Pick ``q`` guesses from typo neighborhoods and save them as JSON.

    Computes the neighborhood based on sampling from the typo distribution
    (or on the top-k typos when ``topk`` is true).  The neighborhood data
    ``M, B, A, typo_trie`` is loaded with ``_read_typos`` when both cache
    files exist under ``<pwd>/typodir``; otherwise it is built with
    ``_get_typos_for_typodist`` and ``M`` and the trie are saved there.

    As used below: ``A`` holds one weight per typo id, ``B[gi]`` lists the
    rows of ``M`` that contain typo ``gi``, and ``M[ri, 0]`` is the id of
    the real password of row ``ri``.  Each round takes the typo with the
    largest weight as the next guess and then lowers the weights in ``A``
    for the rows it covers.

    Side effects: rebinds the module globals ``proc_name`` and ``N``, prints
    progress, and writes ``guesses/<model>_guesses_<q>_typodist_<nh_size>_
    <proc_name>.json`` (the ``guesses`` directory is created if missing).

    Args:
        fname: password file handed to ``Passwords``.
        q: number of guesses to produce.
        nh_size: neighborhood size per password.
        topk: use the top-k typo neighborhood instead of sampling.
        offline: use the offline update rule for the weights.

    Returns:
        None; the guesses are written to disk as ``[guess, weight]`` pairs.
    """
    # NOTE(review): another definition of compute_guesses_using_typodist
    # appears in this file; the one executed last wins -- consider
    # consolidating them.
    # Re-create the neighborhood, it should be small
    global proc_name, N
    print("{} {} {} {}".format(
        N, MIN_ENTROPY_CUTOFF, REL_ENT_CUTOFF, nh_size))
    name_tmpl = "TOPKTypo-{}-{}-{}" if topk else "TYPODIST-{}-{}-{}"
    proc_name = name_tmpl.format(
        MIN_ENTROPY_CUTOFF, REL_ENT_CUTOFF, ('off' if offline else 'on'))

    # The model was loaded twice in a row before; once is enough.
    pwm = Passwords(fname, max_pass_len=25, min_pass_len=5)
    total_f = float(pwm.totalf())
    typodir = '{}/typodir'.format(pwd)
    N = min(N, len(pwm))
    tpw_trie_fname = '{}/{}__{}_{}_typo.trie'.format(
        typodir, pwm.fbasename, N, proc_name)
    rpw_nh_graph = '{}/{}__{}_{}_rpw_nh_graph.npz'.format(
        typodir, pwm.fbasename, N, proc_name)
    if os.path.exists(tpw_trie_fname) and os.path.exists(rpw_nh_graph):
        M, B, A, typo_trie = _read_typos(pwm, N, proc_name)
    else:
        M, B, A, typo_trie = _get_typos_for_typodist(pwm, q, nh_size, topk)
        np.savez_compressed(rpw_nh_graph, M=M)
        typo_trie.save(tpw_trie_fname)

    guesses = []
    killed = np.ones(M.shape[0], dtype=bool)
    while len(guesses) < q:
        gi = A.argmax()  # tpwid of the next guess
        # Set of rows where gi exists
        killed_gi = B[gi]
        killed[killed_gi] = bool(offline)
        e = (typo_trie.restore_key(gi), A[gi] / total_f)
        assert offline or (e not in guesses), \
            "Guesses={}, e={}, killed_gi={}, M[killed_gi]={}".format(
                guesses, e, gi, M[killed_gi])
        if not guesses:
            print("gi={}, {} -> {} ({}), ".format(
                gi, e[0], len(B[gi]),
                [typo_trie.restore_key(c) for c in M[killed_gi, 0]]))
        guesses.append(e)
        for ri in killed_gi:
            row = M[ri]
            f = pwm.pw2freq(typo_trie.restore_key(row[0]))
            if f <= 0:
                print("RPW freq is zero! rpw={}, f={}, guess={}".format(
                    typo_trie.restore_key(row[0]), f,
                    typo_trie.restore_key(gi)))
                continue
            if not offline:
                # Take the password's weight off every typo in its row.
                A[row] -= f
            elif gi == row[0]:
                # The guess is the real password itself.
                killed[ri] = False
                A[gi] = 0
            else:
                A[gi] -= f / float(nh_size)
        print("({}): {}> {:30s}: {:.3e} (killed={}/{})".format(
            proc_name, len(guesses), guesses[-1][0], guesses[-1][1] * 100,
            len(killed_gi), M.shape[0] - killed.sum()))

    # Sanity check
    # NOTE(review): fuzzlambda_q is a fraction of totalf() while
    # killed_pws_weight is a raw frequency sum, and the comparison is
    # one-sided -- this assert looks weaker than intended; confirm.
    killed_ids = set(itertools.chain.from_iterable(
        B[typo_trie.key_id(t)] for t, _ in guesses))
    killed_pws_weight = sum(
        pwm.pw2freq(typo_trie.restore_key(M[ki, 0])) for ki in killed_ids)
    fuzzlambda_q = sum(g[1] for g in guesses)
    assert (fuzzlambda_q - killed_pws_weight) < 1e-10, \
        "{} -- {}".format(fuzzlambda_q, killed_pws_weight)
    print("({}): Total fuzzy success: {}".format(
        proc_name, 100 * fuzzlambda_q))
    print("({}): Total normal success: {}".format(
        proc_name, 100 * pwm.sumvalues(q) / total_f))

    guess_dir = 'guesses'
    if not os.path.isdir(guess_dir):
        # Do not lose the computed guesses to a missing output directory.
        os.makedirs(guess_dir)
    guess_f = '{}/{}_guesses_{}_typodist_{}_{}.json'.format(
        guess_dir, pwm.fbasename, q, nh_size, proc_name)
    print("Saving the guesses: {}".format(guess_f))
    with open(guess_f, 'w') as out_f:
        json.dump(guesses, out_f, indent=4)
def approx_guesses(fname, q):
    """Greedily collect guesses, using only real passwords as candidates.

    The passwords of ``Passwords(fname)`` are visited in the model's
    iteration order (presumably most frequent first -- the acceptance
    threshold below relies on that; confirm against ``Passwords``).  Each
    visited password of length >= 6 is pushed onto a priority heap keyed by
    the negated total frequency of its ball, ``getball(pw)``.  Before each
    push the heap is re-validated lazily: a candidate whose cached weight is
    still current and at least ``f * ballsize`` is accepted as a guess, and
    the frequencies of its ball are zeroed so they are not counted twice.

    Side effects: rebinds the module-level ``pwm``, prints progress, and
    writes the guesses to ``approx_guess_<q>.json`` in the working directory.

    Args:
        fname: password file handed to ``Passwords``.
        q: number of guesses after which the search stops.

    Returns:
        The list of accepted guesses, in the order they were accepted.
    """
    # NOTE(review): another definition of approx_guesses appears in this
    # file; the one executed last wins -- consider consolidating them.
    # NOTE(review): pwm is published as a module global, presumably because
    # getball() reads it -- confirm before turning it into a local.
    global pwm
    pwm = Passwords(fname)
    # float() so the ratios below are true divisions under Python 2 as well.
    total_f = float(pwm.totalf())
    subset_heap = priority_dict()
    guess_list = []
    ballsize = 1000  # I don't care any bigger ball
    pwfreq = np.copy(pwm.values())  # deep copy of the frequencies
    report_at = 1  # heap size that triggers the next progress line
    st = time.time()
    for i, (pwid, f) in enumerate(pwm):
        rpw = pwm.id2pw(pwid)
        if len(rpw) < 6:
            continue
        neighbors = [rpw]
        # NOTE(review): sorted_iter() presumably hands out (and removes)
        # entries best-first; stale entries are re-inserted below with
        # their refreshed weight -- confirm against priority_dict.
        for tpw, w in subset_heap.sorted_iter():
            w = -w  # weights are stored negated
            ball = getball(tpw)
            nw = pwfreq[ball].sum()
            if w != nw:
                # Cached weight is stale: store the current one and go on.
                subset_heap[tpw] = -nw
                continue
            if w < f * ballsize:
                # The ball weight is still small; put it back and stop.
                subset_heap[tpw] = -nw
                break
            # correct value
            print("Guess({}/{}): {} weight: {}".format(
                len(guess_list), q, tpw, w / total_f))
            guess_list.append(tpw)
            pwfreq[ball] = 0
            if len(guess_list) >= q:
                break
        for tpw in neighbors:
            ball = getball(tpw)
            # Exponential moving average of the observed ball sizes.
            ballsize = ballsize * 0.9 + ball.shape[0] * 0.1
            subset_heap[tpw] = -pwfreq[ball].sum()
        if len(subset_heap) > report_at:
            print(">> ({}) : Heap size: {} ballsize: {}".format(
                time.time() - st, len(subset_heap), ballsize))
            report_at = len(subset_heap) * 2
        if i % 30 == 0:
            print(">> ({}) : {}: {!r} ({})".format(
                time.time() - st, i, rpw, f))
        if len(guess_list) >= q:
            break

    normal_succ = pwm.sumvalues(q=q) / total_f
    if guess_list:
        pool = multiprocessing.Pool(7)
        try:
            balls = pool.map(getball, guess_list)
        finally:
            # Always release the worker processes.
            pool.terminate()
            pool.join()
        guessed_pws = np.unique(np.concatenate(balls))
        fuzzy_succ = pwm.values()[guessed_pws].sum() / total_f
    else:
        # np.concatenate() rejects an empty sequence.
        fuzzy_succ = 0.0
    print("normal succ: {}, fuzzy succ: {}".format(normal_succ, fuzzy_succ))
    with open('approx_guess_{}.json'.format(q), 'w') as out_f:
        json.dump(guess_list, out_f)
    return guess_list
def compute_guesses_using_typodist(fname, q, nh_size=5, topk=False, offline=False):
    """Pick ``q`` guesses from typo neighborhoods and save them as JSON.

    Computes the neighborhood based on sampling from the typo distribution
    (or on the top-k typos when ``topk`` is true).  The neighborhood data
    ``M, B, A, typo_trie`` is loaded with ``_read_typos`` when both cache
    files exist under ``<pwd>/typodir``; otherwise it is built with
    ``_get_typos_for_typodist`` and ``M`` and the trie are saved there.

    As used below: ``A`` holds one weight per typo id, ``B[gi]`` lists the
    rows of ``M`` that contain typo ``gi``, and ``M[ri, 0]`` is the id of
    the real password of row ``ri``.  Each round takes the typo with the
    largest weight as the next guess and then lowers the weights in ``A``
    for the rows it covers.

    Side effects: rebinds the module globals ``proc_name`` and ``N``, prints
    progress, and writes ``guesses/<model>_guesses_<q>_typodist_<nh_size>_
    <proc_name>.json`` (the ``guesses`` directory is created if missing).

    Args:
        fname: password file handed to ``Passwords``.
        q: number of guesses to produce.
        nh_size: neighborhood size per password.
        topk: use the top-k typo neighborhood instead of sampling.
        offline: use the offline update rule for the weights.

    Returns:
        None; the guesses are written to disk as ``[guess, weight]`` pairs.
    """
    # NOTE(review): another definition of compute_guesses_using_typodist
    # appears in this file; the one executed last wins -- consider
    # consolidating them.
    # Re-create the neighborhood, it should be small
    global proc_name, N
    print("{} {} {} {}".format(
        N, MIN_ENTROPY_CUTOFF, REL_ENT_CUTOFF, nh_size))
    name_tmpl = "TOPKTypo-{}-{}-{}" if topk else "TYPODIST-{}-{}-{}"
    proc_name = name_tmpl.format(
        MIN_ENTROPY_CUTOFF, REL_ENT_CUTOFF, ('off' if offline else 'on'))

    # The model was loaded twice in a row before; once is enough.
    pwm = Passwords(fname, max_pass_len=25, min_pass_len=5)
    total_f = float(pwm.totalf())
    typodir = '{}/typodir'.format(pwd)
    N = min(N, len(pwm))
    tpw_trie_fname = '{}/{}__{}_{}_typo.trie'.format(
        typodir, pwm.fbasename, N, proc_name)
    rpw_nh_graph = '{}/{}__{}_{}_rpw_nh_graph.npz'.format(
        typodir, pwm.fbasename, N, proc_name)
    if os.path.exists(tpw_trie_fname) and os.path.exists(rpw_nh_graph):
        M, B, A, typo_trie = _read_typos(pwm, N, proc_name)
    else:
        M, B, A, typo_trie = _get_typos_for_typodist(pwm, q, nh_size, topk)
        np.savez_compressed(rpw_nh_graph, M=M)
        typo_trie.save(tpw_trie_fname)

    guesses = []
    killed = np.ones(M.shape[0], dtype=bool)
    while len(guesses) < q:
        gi = A.argmax()  # tpwid of the next guess
        # Set of rows where gi exists
        killed_gi = B[gi]
        killed[killed_gi] = bool(offline)
        e = (typo_trie.restore_key(gi), A[gi] / total_f)
        assert offline or (e not in guesses), \
            "Guesses={}, e={}, killed_gi={}, M[killed_gi]={}".format(
                guesses, e, gi, M[killed_gi])
        if not guesses:
            print("gi={}, {} -> {} ({}), ".format(
                gi, e[0], len(B[gi]),
                [typo_trie.restore_key(c) for c in M[killed_gi, 0]]))
        guesses.append(e)
        for ri in killed_gi:
            row = M[ri]
            f = pwm.pw2freq(typo_trie.restore_key(row[0]))
            if f <= 0:
                print("RPW freq is zero! rpw={}, f={}, guess={}".format(
                    typo_trie.restore_key(row[0]), f,
                    typo_trie.restore_key(gi)))
                continue
            if not offline:
                # Take the password's weight off every typo in its row.
                A[row] -= f
            elif gi == row[0]:
                # The guess is the real password itself.
                killed[ri] = False
                A[gi] = 0
            else:
                A[gi] -= f / float(nh_size)
        print("({}): {}> {:30s}: {:.3e} (killed={}/{})".format(
            proc_name, len(guesses), guesses[-1][0], guesses[-1][1] * 100,
            len(killed_gi), M.shape[0] - killed.sum()))

    # Sanity check
    # NOTE(review): fuzzlambda_q is a fraction of totalf() while
    # killed_pws_weight is a raw frequency sum, and the comparison is
    # one-sided -- this assert looks weaker than intended; confirm.
    killed_ids = set(itertools.chain.from_iterable(
        B[typo_trie.key_id(t)] for t, _ in guesses))
    killed_pws_weight = sum(
        pwm.pw2freq(typo_trie.restore_key(M[ki, 0])) for ki in killed_ids)
    fuzzlambda_q = sum(g[1] for g in guesses)
    assert (fuzzlambda_q - killed_pws_weight) < 1e-10, \
        "{} -- {}".format(fuzzlambda_q, killed_pws_weight)
    print("({}): Total fuzzy success: {}".format(
        proc_name, 100 * fuzzlambda_q))
    print("({}): Total normal success: {}".format(
        proc_name, 100 * pwm.sumvalues(q) / total_f))

    guess_dir = 'guesses'
    if not os.path.isdir(guess_dir):
        # Do not lose the computed guesses to a missing output directory.
        os.makedirs(guess_dir)
    guess_f = '{}/{}_guesses_{}_typodist_{}_{}.json'.format(
        guess_dir, pwm.fbasename, q, nh_size, proc_name)
    print("Saving the guesses: {}".format(guess_f))
    with open(guess_f, 'w') as out_f:
        json.dump(guesses, out_f, indent=4)