def create_pcfg(vault_leak, password_leak=None):
    """Train the PCFG and the vault-size distribution from a vault leak.

    vault_leak: path to a JSON file mapping vault-id -> list of passwords.
    password_leak: optional path to a "freq<TAB>password" file; when
        omitted, one is synthesized from the vault leak itself and written
        to PW_TMP_FILE.
    Side effects: trains the grammar via parallel_buildpcfg, runs
    cal_stat on a temp dump of the size sub-grammar, and writes
    GRAMMAR_DIR + 'vault_dist.cfg'.
    """
    # learn the grammar
    vault_d = json.load(open(vault_leak))
    print "# of vaults: ", len(vault_d)
    print "max size of vault:", max(len(x) for x in vault_d.values())
    # NOTE(review): label says "max" but this computes the minimum —
    # the message likely should read "min size of vault".
    print "max size of vault:", min(len(x) for x in vault_d.values())
    if not password_leak:
        # Synthesize a password-frequency file from the vaults themselves,
        # skipping unusually large vaults (> 40 entries).
        D = defaultdict(int)
        for k, v in vault_d.items():
            if len(v) > 40: continue
            for x in v:
                D[x] += 1
        password_leak = PW_TMP_FILE
        with open(password_leak, 'w') as f:
            # One "freq<TAB>password" line per password, most frequent
            # first. (The `f` bound inside the generator expression lives
            # in the generator's own scope, so it does not clobber the
            # file handle `f` — confusing, but correct.)
            f.write('\n'.join('%d\t%s' % (f, p)
                              for p, f in sorted(
                                  D.items(), key=lambda x: x[1],
                                  reverse=True)))
        print "Password file created"
    parallel_buildpcfg(password_leak)
    # learn the vault distribution
    tg = TrainedGrammar()
    G = cal_size_subG(tg, vault_leak)
    # cal_stat consumes an open file object, so round-trip G through a
    # temp file.
    f = os.tmpfile()
    json.dump(G, f)
    f.seek(0)
    cal_stat(fds=[f])
    f.close()
    # s = [sum(R[i])/float(len(R[i])) for i in range(len(NT))]
    print G.keys()
    # For sparsely observed rules (< 30 size entries), extend the size
    # range and inflate counts (5x existing count, default 1) —
    # presumably a smoothing step; verify against the training pipeline.
    for k, v in G.items():
        if len(v) < 30:
            for i in range(1, len(v) + 30):
                v[i] = 5 * v.get(i, 1)
    json.dump(G, open(GRAMMAR_DIR + 'vault_dist.cfg', 'wb'),
              indent=2, separators=(',', ':'), sort_keys=True)


if __name__ == "__main__":
    if sys.argv[1] == '-process':
        # Print the size sub-grammar computed from one vault file.
        tg = TrainedGrammar()
        print json.dumps(cal_size_subG(tg, sys.argv[2]), indent=2)
    elif sys.argv[1] == '-stat':
        # give the vaultcleaned files,
        cal_stat(fnames=sys.argv[2:])
    elif sys.argv[1] == '-default':
        # Merge the size sub-grammars of the default vault files.
        tg = TrainedGrammar()
        files = [
            "data_vault/%s_vaultcleaned.json" % x
            for x in ['joe', 'weir'][:1]
        ]
        G = {}
        for f in files:
            G.update(cal_size_subG(tg, f))
        f = os.tmpfile()
        json.dump(G, f)
        # NOTE(review): this branch appears truncated here — presumably
        # the seeked temp file is then fed to cal_stat as done in
        # create_pcfg above; confirm against the full file.
        f.seek(0)
def __init__(self, grammar=None, cal_cdf=False):
    """Wrap the given grammar; build a TrainedGrammar when none is given.

    grammar: an already-constructed grammar object, or a falsy value to
        request the default trained grammar.
    cal_cdf: forwarded to TrainedGrammar when one must be built.
    """
    if grammar:
        self.G = grammar
    else:
        self.G = TrainedGrammar(cal_cdf=cal_cdf)
class DTE_large(DTE): """ encodes a rule """ def __init__(self, grammar=None, cal_cdf=False): self.G = grammar if not self.G: self.G = TrainedGrammar(cal_cdf=cal_cdf) # self.G.load(hny_config.GRAMMAR_DIR+'/grammar.cfg') def encode(self, lhs, rhs): return self.G.encode_rule(lhs,rhs) def decode(self, lhs, pt): return self.G.decode_rule(lhs, pt) def get_freq(self, lhs, rhs): return self.G.get_freq(lhs, rhs) try: s, e = self.G.get_freq_range(lhs, rhs) return e-s except ValueError: print "ValueError in get_freq -- %s is not in %s:" % \ (rhs,self.G[lhs][0]) return -1 def encode_grammar(self, G): # Encode sub-grammar vd = VaultDistribution() stack = ['G'] code_g = [] done = [] while stack: head = stack.pop() assert head not in done done.append(head) rule_dict = G[head] t_set = [] for rhs, f in rule_dict.items(): if rhs != '__total__': r = filter(lambda x: x not in done+stack, self.G.get_actual_NonTlist(head, rhs)) if r: for x in r: if (x not in t_set): t_set.append(x) t_set.reverse() stack.extend(t_set) n = len(rule_dict.keys())-1 code_g.append(vd.encode_vault_size(head, n)) if n<0: print "Sorry I cannot encode your password! 
Please choose" print "something different, password12" exit(0) assert n == vd.decode_vault_size(head, code_g[-1]) code_g.extend([self.encode(head, r) for r in rule_dict.keys() if r != '__total__']) extra = hny_config.HONEY_VAULT_GRAMMAR_SIZE - len(code_g); code_g.extend([convert2group(0,1) for x in range(extra)]) return code_g def decode_grammar(self, P): g=SubGrammar(self.G) vd = VaultDistribution() iterp = iter(P) stack = ['G'] done = [] while stack: head = stack.pop() assert head not in done done.append(head) p = iterp.next() n = vd.decode_vault_size(head, p) #print "RuleSizeDecoding:", head, n t_set = [] for x in range(n): rhs = self.decode(head, iterp.next()) #print "Decoding:", stack, head, '==>', rhs if rhs != '__totoal__': r = filter(lambda x: x not in done+stack, self.G.get_actual_NonTlist(head, rhs)) if r: for x in r: if (x not in t_set): t_set.append(x) g.add_rule(head, rhs) t_set.reverse() stack.extend(t_set) g.finalize() # fixes the freq and some other book keepings return g