def load(self, filename):
    self.G = json.load(open_(filename), object_pairs_hook=OrderedDict)
    # Walk over every rule dictionary of the grammar.
    for k, v in self.G.items():
        if self.cal_cdf:
            # print_err("Calculating CDF!")
            # lf is the running total of the frequencies seen so far; adding
            # it to each entry turns raw counts into a cumulative distribution
            # (like recovering the counts for 1..5 from the difference (5)-(0)).
            lf = 0
            for l, f in v.items():
                # v[l] += lf
                lf += f
            v['__total__'] = lf
        else:
            v['__total__'] = sum(v.values())
    # Collect every W (word) terminal; these are used later during
    # string generation.
    self.Wlist = []
    for k, D in self.G.items():
        if k.startswith('W'):
            self.Wlist.extend([x for x in D])
    # A Date helper manages the date (T) rules.
    self.date = Date()
    # Build a DAWG over the word list for fast lookup of W rules.
    self.Wlist = IntDAWG(self.Wlist)
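The commented-out `v[l] += lf` is the heart of the CDF pass: it rewrites each raw count as a running total so a sampled integer can later be resolved against the cumulative bands. A minimal standalone sketch of that transformation, with hypothetical counts:

from collections import OrderedDict

# Hypothetical rule counts for one nonterminal.
rule = OrderedDict([('password', 5), ('123456', 3), ('qwerty', 2)])

running = 0
for rhs, freq in list(rule.items()):
    rule[rhs] = running + freq  # effect of `v[l] += lf` followed by `lf += f`
    running += freq
rule['__total__'] = running     # 10; 'password'->5, '123456'->8, 'qwerty'->10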
class NonT_W(NonT):
    sym, prod, prob = 'W', '', 0.0
    thisdir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    # os.path.join avoids the missing-separator bug of '{}dictionary1.1.dawg'.format(thisdir)
    word_dawg = IntDAWG().load(os.path.join(thisdir, 'dictionary1.1.dawg'))
    fname_dawg = IntDAWG().load(os.path.join(thisdir, 'eng_dict.dawg'))
    lname_dawg = IntDAWG().load(os.path.join(thisdir, 'eng_dict.dawg'))
    total_f = word_dawg[u'__total__'] + \
        fname_dawg[u'__total__'] + \
        lname_dawg[u'__total__']
    l33t_replaces = DAWG.compile_replaces({
        '3': 'e', '4': 'a', '@': 'a',
        '$': 's', '0': 'o', '1': 'i', 'z': 's'
    })

    def __init__(self, word):
        # super(NonT_W, self).__init__()
        w = unicode(word.lower())
        dawg = []
        for d in [
                self.word_dawg,
                # self.fname_dawg,
                # self.lname_dawg
        ]:
            k = d.similar_keys(w, self.l33t_replaces)
            if k:
                dawg.append((d, k[0]))
        if dawg:
            v = list(set([d[1] for d in dawg]))
            if len(v) > 1 or not v[0].isalpha():
                return
            v = v[0]  # was commented out, but v is used as a key below
            f = sum([d[0][v] for d in dawg])
            self.prod = v
            self.sym = 'W%s' % get_nont_class('W', v)
            self.L = NonT_L(v, word)  # was commented out, but self.L is used below
            self.prob = self.L.prob * float(f) / self.total_f

    def parse_tree(self):
        pt = ParseTree()
        pt.add_rule((self.sym, self.prod))
        pt.extend_rules(self.L.parse_tree())
        return pt

    def rule_set(self):
        rs = RuleSet()
        rs.add_rule(self.sym, self.prod)
        rs.update_set(self.L.rule_set())
        return rs

    def __str__(self):
        return '%s: %s<%s> (%g)' % (self.sym, self.prod, self.L, self.prob)
class NonT_W(NonT):
    sym, prod, prob = 'W', '', 0.0
    english_dawg = IntDAWG().load(GRAMMAR_PATH + 'words.dawg')
    chinese_dawg = IntDAWG().load(GRAMMAR_PATH + 'pinyin.dawg')
    total_f = english_dawg[u"__total__"] + chinese_dawg[u'__total__']
    l33t_replaces = DAWG.compile_replaces({
        '3': 'e', '4': 'a', '@': 'a',
        '$': 's', '0': 'o', '1': 'i', 'z': 's'
    })

    def __init__(self, word):
        # The argument is the password (fragment) to analyse.
        # super(NonT_W, self).__init__()
        w = word.lower()
        dawg = []
        for d in [self.english_dawg, self.chinese_dawg]:
            # Apply the l33t replacements and collect the keys similar to w;
            # k[0] is the closest match.
            k = d.similar_keys(w, self.l33t_replaces)
            if k:
                dawg.append((d, k[0]))
        # dawg now holds, per dictionary, the entry most similar to the password.
        if dawg:
            # The matched strings may repeat across dictionaries; dedupe them.
            v = list(set([d[1] for d in dawg]))
            # Bail out on more than one distinct match, or a match that is
            # not purely alphabetic.
            if len(v) > 1 or not v[0].isalpha():
                return
            # The string occurs at least once; sum its frequency over every
            # dictionary that contains it.
            v = v[0]
            f = sum([d[0][v] for d in dawg])
            self.prod = v
            self.sym = 'W%s' % get_nont_class('W', v)
            self.L = NonT_L(v, word)  # NonT_L captures the capitalization pattern
            # print(self.L)
            # Fold the capitalization/l33t probability into the word probability.
            self.prob = self.L.prob * float(f) / self.total_f

    def parse_tree(self):
        pt = ParseTree()
        pt.add_rule((self.sym, self.prod))
        pt.extend_rules(self.L.parse_tree())
        return pt

    def rule_set(self):
        rs = RuleSet()
        rs.add_rule(self.sym, self.prod)
        # rs.update_set(self.L.rule_set())
        return rs

    def __str__(self):
        return '%s: %s<%s> (%g)' % (self.sym, self.prod, self.L, self.prob)
class NonT_W(NonT):
    sym, prod, prob = 'W', '', 0.0
    word_dawg = IntDAWG().load('data/English_30000.dawg')
    fname_dawg = IntDAWG().load('data/facebook-firstnames-withcount.dawg')
    lname_dawg = IntDAWG().load('data/facebook-lastnames-withcount.dawg')
    total_f = word_dawg[u'__total__'] + \
        fname_dawg[u'__total__'] + \
        lname_dawg[u'__total__']
    l33t_replaces = DAWG.compile_replaces({
        '3': 'e', '4': 'a', '@': 'a',
        '$': 's', '0': 'o', '1': 'i', 'z': 's'
    })

    def __init__(self, word):
        # super(NonT_W, self).__init__()
        w = unicode(word.lower())
        dawg = []
        for d in [self.word_dawg, self.fname_dawg, self.lname_dawg]:
            k = d.similar_keys(w, self.l33t_replaces)
            if k:
                dawg.append((d, k[0]))
        if dawg:
            v = list(set([d[1] for d in dawg]))
            if len(v) > 1 or not v[0].isalpha():
                return
            v = v[0]
            f = sum([d[0][v] for d in dawg])
            self.prod = v
            self.sym = 'W%s' % get_nont_class('W', v)
            self.L = NonT_L(v, word)
            self.prob = self.L.prob * float(f) / self.total_f

    def parse_tree(self):
        pt = ParseTree()
        pt.add_rule((self.sym, self.prod))
        pt.extend_rules(self.L.parse_tree())  # fixed: was extend_rule
        return pt

    def rule_set(self):
        rs = RuleSet()
        rs.add_rule(self.sym, self.prod)
        rs.update_set(self.L.rule_set())
        return rs

    def __str__(self):
        return '%s: %s<%s> (%g)' % (self.sym, self.prod, self.L, self.prob)
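All of these NonT_W variants lean on two calls from the `dawg` package: `DAWG.compile_replaces`, which precompiles the l33t substitution table, and `similar_keys`, which maps a mangled password back onto dictionary words. A minimal sketch with a hypothetical two-word dictionary:

from dawg import DAWG, IntDAWG

d = IntDAWG([(u'password', 100), (u'hello', 42)])
replaces = DAWG.compile_replaces({'3': 'e', '4': 'a', '@': 'a',
                                  '$': 's', '0': 'o', '1': 'i', 'z': 's'})

# 'p@$$w0rd' maps back onto 'password' through the replace table.
print(d.similar_keys(u'p@$$w0rd', replaces))  # [u'password']
print(d[u'password'])                         # 100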
def load(self, filename):
    self.G = json.load(open_(filename), object_pairs_hook=OrderedDict)
    for k, v in self.G.items():
        if self.cal_cdf:
            print_err("Calculating CDF!")
            lf = 0
            for l, f in v.items():
                v[l] += lf
                lf += f
            v['__total__'] = lf
        else:
            v['__total__'] = sum(v.values())
    Wlist = [x for k, v in self.G.items()
             for x in v if k.startswith('W')]
    self.date = Date()
    self.Wdawg = IntDAWG(Wlist)
def finalize(self):
    self.fix_freq()
    self.NonT_set = [x for x in list(self.G.keys())
                     if x.find('_') < 0]  # + list('Yymd')
    self.G = self.R.G
    Wlist = [
        x for k, v in list(self.G.items())
        for x in v if k.startswith('W')
    ]
    self.Wdawg = IntDAWG(Wlist)
    for k, v in self.G.items():
        for rhs, f in v.items():
            if f <= 0:
                print("Zero-frequency RHS found; setting its frequency to 1")
                v[rhs] = 1
                if '__total__' in v:
                    v['__total__'] += 1
        if '__total__' not in v:
            print("'__total__' should be among the keys; adding it.")
            v['__total__'] = sum(v.values())
    if 'T' in self.G:
        self.date = Date(T_rules=[
            x for x in list(self.G['T'].keys()) if x != '__total__'
        ])
    self.freeze = True
    self.R.G.default_factory = None
def build_int_dawg(filename):
    with open_(filename) as inpf:
        freq_style = get_f_w_freq
        f_line = inpf.readline()
        w = []
        if f_line.startswith('#'):
            words = f_line.strip().split()
            freq_style = get_file_data_format(words[1:])
        else:
            w = [freq_style(f_line)]
        w.extend([freq_style(line) for line in inpf])
        # sum_freq was undefined in the original; total the parsed frequencies.
        sum_freq = sum(f for _, f in w)
        w.append(('__total__', sum_freq))
        int_dawg = IntDAWG(w)
        of = filename.split('.')[0] + '.dawg'
        with open(of, 'wb') as o:
            int_dawg.write(o)
        test_dawg(of, w[:10] + w[-10:])
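For reference, the write/load pair used by `build_int_dawg` and `test_dawg` round-trips an IntDAWG through a file. A small self-contained sketch (hypothetical pairs and filename):

from dawg import IntDAWG

pairs = [(u'alpha', 3), (u'beta', 7)]
pairs.append((u'__total__', sum(f for _, f in pairs)))

d = IntDAWG(pairs)
with open('tiny.dawg', 'wb') as f:
    d.write(f)                       # same call as build_int_dawg above

d2 = IntDAWG().load('tiny.dawg')     # same call as test_dawg below
assert d2[u'alpha'] == 3 and d2[u'__total__'] == 10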
def finalize(self):
    self.fix_freq()
    self.NonT_set = filter(lambda x: x.find('_') < 0,
                           self.G.keys())  # + list('Yymd')
    self.G = self.R.G
    Wlist = [x for k, v in self.G.items()
             for x in v if k.startswith('W')]
    self.Wdawg = IntDAWG(Wlist)
    if 'T' in self.G:
        self.date = Date(
            T_rules=[x for x in self.G['T'].keys() if x != '__total__'])
    self.freeze = True
def finalize(self):
    self.fix_freq()
    self.NonT_set = filter(lambda x: x.find('_') < 0,
                           self.G.keys())  # + list('Yymd')
    self.G = self.R.G
    Wlist = [x for k, v in self.G.items()
             for x in v if k.startswith('W')]
    self.Wdawg = IntDAWG(Wlist)
    for k, v in self.G.items():
        if '__total__' not in v:
            print "'__total__' should be among the keys; adding it."
            v['__total__'] = sum(v.values())
    if 'T' in self.G:
        self.date = Date(
            T_rules=[x for x in self.G['T'].keys() if x != '__total__'])
    self.freeze = True
class TrainedGrammar(object):
    l33t_replaces = DAWG.compile_replaces(hny_config.L33T)

    def __init__(self, g_file=hny_config.TRAINED_GRAMMAR_FILE, cal_cdf=False):
        self.cal_cdf = cal_cdf
        self.load(g_file)
        self.NonT_set = [x for x in list(self.G.keys()) if x.find('_') < 0]

    def load(self, filename):
        self.G = json.load(open_(filename), object_pairs_hook=OrderedDict)
        for k, v in list(self.G.items()):
            if self.cal_cdf:
                print_err("Calculating CDF!")
                lf = 0
                for l, f in list(v.items()):
                    v[l] += lf
                    lf += f
                v['__total__'] = lf
            else:
                v['__total__'] = sum(v.values())
        # Create dawg/trie of the Wlist items for fast retrieval
        Wlist = [x for k, v in list(self.G.items())
                 for x in v if k.startswith('W')]
        self.date = Date()
        self.Wdawg = IntDAWG(Wlist)

    def get_prob(self, l, r):
        f = self.G.get(l, {}).get(r, 0)
        tot = self.G.get(l, {}).get('__total__', 1e-3)
        return max(float(f) / tot, 0.0)

    def isNonTerm(self, lhs):  # this means given lhs, rhs will be in NonT
        return lhs in self.NonT_set

    def get_actual_NonTlist(self, lhs, rhs):
        if lhs == 'G':
            # Don't include "W1,G", "D1,G" etc.
            if rhs.endswith(',G'):
                return []
            return rhs.split(',')
        elif lhs == 'T':
            return ['%s_%s' % (lhs, c)
                    for c in (rhs.split(',') if ',' in rhs else rhs)]
        elif lhs == 'L':
            return ['%s_%s' % (lhs, c) for c in rhs]
        elif lhs in ['W', 'D', 'Y', 'R', 'K']:
            return []
        else:
            return []

    def get_freq(self, l, r):
        return self.G.get(l, {}).get(r, 0)

    def get_W_rule(self, word):
        w = str(word.lower())
        k = self.Wdawg.similar_keys(w, self.l33t_replaces)
        if k:
            k = k[0]
            L = NonT_L(k, word)
            sym = 'W%s' % get_nont_class('W', k)
            try:
                p = self.get_prob(sym, k)
            except KeyError as ex:
                print(k, sym, ex)
                raise KeyError(ex)
            return sym, [(k, L)], p

    def get_T_rule(self, word):
        T = self.date.IsDate(word)
        if T:
            p = 10 ** (len(word) - 8)  # matches the other variants; the bare
                                       # 10**len(word) would exceed 1
            for r in T.tree:
                p *= self.get_prob(*r)
            p *= self.get_prob(*(T.get_rule()))
            return 'T', [(word, T)], p

    def get_all_matches(self, word):
        rules = []
        for nt in self.NonT_set:
            if nt.startswith('G'):  # 'G' should not be considered here
                continue
            if nt.startswith('W'):
                l = self.get_W_rule(word)
                if l:
                    rules.append(l)
            elif nt == 'T':
                l = self.get_T_rule(word)
                if l:
                    rules.append(l)
            else:
                f = self.G[nt].get(word, 0)
                if f > 0:
                    rules.append((nt, [word],
                                  float(f) / self.G[nt]['__total__']))
        rules = [x for x in rules if x and x[-1]]
        if rules:
            return max(rules, key=lambda x: x[-1])

    def join(self, r, s):
        def not_startswith_L_T(x):
            return (x and
                    not (x[0].startswith('L_') or x[0].startswith('T_')))

        if not_startswith_L_T(s) and not_startswith_L_T(r):
            k = ','.join([r[0], s[0]])
            p = r[-1] * s[-1]
            a = r[1] + s[1]
            return (k, a, p)

    def random_parse(self, word, try_num=3):
        """
        Returns a random parse of the word following the grammar.
        """
        # First: rejection sampling, the most inefficient version.
        # Break the word into random parts and see if that parse exists.
        print("\n^^^^^^^^^^^_______________^^^^^^^^^^^^^^")
        if try_num < 0:
            print("I am very sorry. I could not parse this :(!!")
            return None
        # NO IDEA HOW TO randomly pick a parse tree!! @@TODO
        raise ValueError("Not implemented")

    def parse(self, word):
        A = {}
        if not word:
            return ()
        for j in range(len(word)):
            for i in range(len(word) - j):
                A[(i, i + j)] = self.get_all_matches(word[i:j + i + 1])
                t = [A[(i, i + j)]]
                t.extend([self.join(A[(i, k)], A[(k + 1, i + j)])
                          for k in range(i, i + j)])
                if t:
                    A[(i, i + j)] = max(t, key=lambda x: x[-1] if x else 0)
                else:
                    A[(i, i + j)] = ()
                    # print "Not sure why it reached here. But it did!"
                    # print i, j, word[i: i+j+1]
        return A[(0, len(word) - 1)]

    @staticmethod
    def default_parse_tree(word):
        """Returns the default parse of a word.

        The default parse is G -> W1,G | D1,G | Y1,G | W1 | D1 | Y1.
        This parses any string over the allowed alphabet and returns a
        left-to-right traversed parse tree.
        """
        pt = ParseTree()
        n = len(word)
        for i, c in enumerate(word):
            r = whatchar(c) + '1'
            if i < n - 1:
                r = r + ',G'
            pt.add_rule(('G', r))
            pt.add_rule((r[:2], c.lower()))
            if r.startswith('W'):
                nont_l = NonT_L(c, c)
                pt.extend_rules(nont_l.parse_tree())
        return pt

    def l_parse_tree(self, word):  # leftmost parse-tree
        pt = ParseTree()
        p = self.parse(word)
        print(p)
        if not p:
            print("Failing at {!r}".format(word))
            return pt
        # assert p[0] in self.G['G'], "Wrong rule: {} --> {}".format('G', p[0])
        if p[0] not in self.G['G']:
            return self.default_parse_tree(word)
        pt.add_rule(('G', p[0]))
        for l, each_r in zip(p[0].split(','), p[1]):
            if isinstance(each_r, str):
                pt.add_rule((l, each_r))
            elif l.startswith('W'):
                pt.add_rule((l, each_r[0]))
                L_parse_tree = each_r[1].parse_tree()
                pt.add_rule(L_parse_tree[0])
                if len(L_parse_tree.tree) > 1:
                    pt.tree.extend(L_parse_tree[1][1])
            elif l == 'T':
                p = each_r[1]
                rule_name = ','.join([r[0].replace('T_', '') for r in p])
                pt.add_rule((l, rule_name))
                pt.extend_rules(p)
            else:
                print("Something is severely wrong")
        return pt

    def rule_set(self, word):
        rs = RuleSet()
        pt = self.l_parse_tree(word)
        for p in pt.tree:
            rs.add_rule(*p)
        return rs

    def encode_rule(self, l, r):
        rhs_dict = self.G[l]
        try:
            i = list(rhs_dict.keys()).index(r)
            if DEBUG:
                c = list(rhs_dict.keys())[i]
                assert c == r, "The index is wrong"
        except ValueError:
            raise ValueError("'{}' not in the rhs_dict (l: '{}')"
                             .format(r, l))
        l_pt = sum(list(rhs_dict.values())[:i])
        r_pt = l_pt + rhs_dict[r] - 1
        assert l_pt <= r_pt, \
            "Rule with zero freq! rhs_dict[{}] = {} (l={})\n{}"\
            .format(r, rhs_dict, l, self.G)
        return convert2group(random.randint(l_pt, r_pt),
                             rhs_dict['__total__'])

    def encode_pw(self, pw):
        pt = self.l_parse_tree(pw)
        try:
            code_g = [self.encode_rule(*p) for p in pt]
        except ValueError as ex:
            print("Error in encoding: {!r}".format(pw))
            # raise ValueError(ex)
            return []  # code_g would otherwise be undefined below
        extra = hny_config.PASSWORD_LENGTH - len(code_g)
        code_g.extend(convert2group(0, 1, extra))
        return code_g

    def decode_rule(self, l, p):
        rhs_dict = self.G[l]
        if not rhs_dict:
            return ''
        assert '__total__' in rhs_dict, \
            "__total__ not in {!r}, l={!r}".format(rhs_dict, l)
        p %= rhs_dict['__total__']
        if self.cal_cdf:
            if len(rhs_dict) > 1000:
                print_once(l, len(rhs_dict))
            return bin_search(list(rhs_dict.items()), p, 0, len(rhs_dict))
        for k, v in list(rhs_dict.items()):
            if p < v:
                return k
            else:
                p -= v
        print("Alas, could not find.", l, p)

    def decode_l33t(self, w, iterp):
        # print("L33t:::", w, iterp)
        l = self.decode_rule('L', next(iterp))
        if l == 'Caps':
            return w.capitalize()
        elif l == 'lower':
            return w.lower()
        elif l == 'UPPER':
            return w.upper()
        else:
            return ''.join([self.decode_rule('L_%s' % c, next(iterp))
                            for c in w])

    def decode_pw(self, P):
        assert len(P) == hny_config.PASSWORD_LENGTH, \
            "Not correct length to decode, Expecting {}, got {}"\
            .format(hny_config.PASSWORD_LENGTH, len(P))
        iterp = iter(P)
        plaintext = ''
        stack = ['G']
        while stack:
            lhs = stack.pop()
            rhs = self.decode_rule(lhs, next(iterp))
            if lhs in ['G', 'T', 'W', 'Y', 'D']:
                arr = rhs.split(',') if lhs != 'T' \
                    else ['T_%s' % c for c in rhs.split(',')]
                arr.reverse()
                stack.extend(arr)
            elif lhs.startswith('W'):
                rhs = self.decode_l33t(rhs, iterp)
                plaintext += rhs
            else:
                plaintext += rhs
        return plaintext

    def encode_grammar(self, G):
        """Encodes a sub-grammar @G under the current grammar.

        Note: does not record the frequencies.
        """
        vd = VaultDistPCFG()
        stack = ['G']
        code_g = []
        done = list(G.default_keys())
        while stack:
            head = stack.pop()
            assert head not in done, \
                "head={} already in done={}".format(head, done)
            done.append(head)
            rule_dict = G[head]
            t_set = []
            for rhs in rule_dict.keys():
                if rhs != '__total__':
                    r = [x for x in self.get_actual_NonTlist(head, rhs)
                         if x not in done + stack]
                    for x in r:
                        if x not in t_set:
                            t_set.append(x)
            t_set.reverse()
            stack.extend(t_set)
            n = len(list(rule_dict.keys())) - 1
            code_g.append(vd.encode_vault_size(head, n))
            if n < 0:
                print("Sorry I cannot encode your password ({!r})!\n"
                      "Please choose something different, like password12!! "
                      "(Just kidding.)".format((head, list(rule_dict.keys()))))
                exit(0)
            assert n == vd.decode_vault_size(head, code_g[-1]), \
                "Vault size encoding mismatch.\nhead: {!r}, code_g: {}, "\
                "n: {}, decoded_vault_size: {}"\
                .format(head, code_g[-1], n,
                        vd.decode_vault_size(head, code_g[-1]))
            code_g.extend([self.encode_rule(head, r)
                           for r in rule_dict.keys() if r != '__total__'])
            # i = 0
            # for r in rule_dict.keys():
            #     if r == '__total__': continue
            #     nr = self.decode_rule(head, code_g[-n+i])
            #     print("Decoding:", code_g[-n+i], head, nr)
            #     i += 1
            #     if nr != r:
            #         print(">>> Mismatch: nr={}, r={}, code_g={}".format(nr, r, code_g[-n+i]))
        extra = hny_config.HONEY_VAULT_GRAMMAR_SIZE - len(code_g)
        code_g.extend(convert2group(0, 1, extra))
        return code_g

    def decode_grammar(self, P):
        """Decodes a sub-grammar under self.G using the random numbers from P."""
        g = SubGrammar(self)
        vd = VaultDistPCFG()
        iterp = iter(P)
        stack = ['G']
        done = list(g.default_keys())
        while stack:
            head = stack.pop()
            assert head not in done, \
                "@Head ({}) in @done. It should not!".format(head)
            done.append(head)
            p = next(iterp)
            # print "RuleSizeDecoding:", head, done
            n = vd.decode_vault_size(head, p)
            t_set = []
            for _ in range(n):
                p = next(iterp)
                rhs = self.decode_rule(head, p)
                if rhs != '__total__':  # fixed typo: was '__totoal__'
                    r = [y for y in self.get_actual_NonTlist(head, rhs)
                         if y not in done + stack]
                    for y in r:
                        if y not in t_set:
                            t_set.append(y)
                else:
                    print(">>>>> __total__ should not be in the encoded "
                          "grammar. Something is wrong!")
                g.add_rule(head, rhs)
            t_set.reverse()
            stack.extend(t_set)
        g.finalize()  # fixes the freq and some other bookkeeping
        return g

    def __getitem__(self, l):
        return self.G[l]

    def __contains__(self, k):
        return k in self.G

    def is_grammar(self):
        return bool(self.G['G'])

    def __str__(self):
        return json.dumps(self.G['G'], indent=2)

    def nonterminals(self):
        return list(self.G.keys())
def test_dawg(filename, wlist):
    d = IntDAWG()
    d = d.load(filename)
    for w in wlist:
        assert w[1] == d[str(w[0])]
def test_dawg(filename, wlist):
    d = IntDAWG()
    d = d.load(filename)
    for w in wlist:
        assert w[1] == d[unicode(w[0])]
class TrainedGrammar(object):
    l33t_replaces = DAWG.compile_replaces({
        '3': 'e', '4': 'a', '@': 'a',
        '$': 's', '0': 'o', '1': 'i', 'z': 's'
    })

    def __init__(self, g_file=grammar_file, cal_cdf=False):
        self.cal_cdf = cal_cdf
        self.load(g_file)
        self.NonT_set = filter(lambda x: x.find('_') < 0, self.G.keys())

    def load(self, filename):
        self.G = json.load(open_(filename), object_pairs_hook=OrderedDict)
        for k, v in self.G.items():
            if self.cal_cdf:
                print_err("Calculating CDF!")
                lf = 0
                for l, f in v.items():
                    v[l] += lf
                    lf += f
                v['__total__'] = lf
            else:
                v['__total__'] = sum(v.values())
        Wlist = [x for k, v in self.G.items()
                 for x in v if k.startswith('W')]
        self.date = Date()
        self.Wdawg = IntDAWG(Wlist)

    def get_prob(self, l, r):
        f = self.G.get(l, {}).get(r, 0)
        if f > 0:
            return float(f) / self.G[l]['__total__']

    def isNonTerm(self, lhs):  # this means given lhs, rhs will be in NonT
        return lhs in self.NonT_set

    def get_actual_NonTlist(self, lhs, rhs):
        if lhs == 'G':
            return rhs.split(',')
        elif lhs == 'T':
            return ['%s_%s' % (lhs, c) for c in rhs.split(',')]
        elif lhs == 'L':
            return ['%s_%s' % (lhs, c) for c in rhs]
        else:
            return []

    def get_freq(self, l, r):
        return self.G.get(l, {}).get(r, 0)

    def get_W_rule(self, word):
        w = unicode(word.lower())
        k = self.Wdawg.similar_keys(w, self.l33t_replaces)
        if k:
            k = k[0]
            L = NonT_L(k, word)
            sym = 'W%s' % get_nont_class('W', k)
            return (sym, [(k, L)], self.get_prob(sym, k))

    def get_T_rule(self, word):
        T = self.date.IsDate(word)
        if T:
            p = 10**(len(word) - 8)
            # for r in T.tree:
            #     p *= self.get_prob(*r)
            # p *= self.get_prob(*(T.get_rule()))
            return ('T', [(word, T)], p)

    def get_all_matches(self, word):
        rules = []
        for nt in self.NonT_set:
            if nt.startswith('W'):
                l = self.get_W_rule(word)
                if l:
                    rules.append(l)
            elif nt == 'T':
                l = self.get_T_rule(word)
                if l:
                    rules.append(l)
            else:
                f = self.G[nt].get(word, 0)
                if f > 0:
                    rules.append((nt, [(word)],
                                  float(f) / self.G[nt]['__total__']))
        rules = filter(lambda x: x and x[-1], rules)
        if rules:
            return max(rules, key=lambda x: x[-1])

    def join(self, r, s):
        not_startswith_L_T = lambda x: x and \
            not (x[0].startswith('L_') or x[0].startswith('T_'))
        if not_startswith_L_T(s) and not_startswith_L_T(r):
            k = ','.join([r[0], s[0]])
            p = r[-1] * s[-1]
            a = r[1] + s[1]
            return (k, a, p)

    def parse(self, word):
        A = {}
        for j in range(len(word)):
            for i in range(len(word) - j):
                A[(i, i + j)] = self.get_all_matches(word[i:j + i + 1])
                t = [A[(i, i + j)]]
                t.extend([self.join(A[(i, k)], A[(k + 1, i + j)])
                          for k in range(i, i + j)])
                if t:
                    A[(i, i + j)] = max(t, key=lambda x: x[-1] if x else 0)
                else:
                    A[(i, i + j)] = ()
                    # print "Not sure why it reached here. But it did!"
                    # print i, j, word[i: i+j+1]
        return A[(0, len(word) - 1)]

    def l_parse_tree(self, word):  # leftmost parse-tree
        pt = ParseTree()
        p = self.parse(word)
        if not p:
            print "Failing at ", word.encode('utf-8')
            return pt
        pt.add_rule(('G', p[0]))
        for l, each_r in zip(p[0].split(','), p[1]):
            if isinstance(each_r, basestring):
                pt.add_rule((l, each_r))
            elif l.startswith('W'):
                pt.add_rule((l, each_r[0]))
                L_parse_tree = each_r[1].parse_tree()
                pt.add_rule(L_parse_tree[0])
                if len(L_parse_tree.tree) > 1:
                    pt.tree.extend(L_parse_tree[1][1])
            elif l == 'T':
                p = each_r[1]
                rule_name = ','.join([r[0].replace('T_', '') for r in p])
                pt.add_rule((l, rule_name))
                pt.extend_rules(p)  # fixed: was extend_rule
            else:
                print "Something is severely wrong"
        return pt

    def rule_set(self, word):
        rs = RuleSet()
        pt = self.l_parse_tree(word)
        for p in pt.tree:
            rs.add_rule(*p)
        return rs

    def encode_rule(self, l, r):
        rhs_dict = self.G[l]
        i = rhs_dict.keys().index(r)
        assert i >= 0
        l_pt = sum(rhs_dict.values()[:i])
        r_pt = l_pt + rhs_dict[r] - 1  # -1 keeps the draw inside this rule's band
        return convert2group(random.randint(l_pt, r_pt),
                             rhs_dict['__total__'])

    def encode_pw(self, pw):
        pt = self.l_parse_tree(pw)
        code_g = [self.encode_rule(*p) for p in pt]
        extra = hny_config.PASSWORD_LENGTH - len(code_g)
        code_g.extend([convert2group(0, 1) for x in range(extra)])
        return code_g

    def decode_rule(self, l, p):
        rhs_dict = self.G[l]
        p %= rhs_dict['__total__']
        if self.cal_cdf:
            if len(rhs_dict) > 1000:
                print_once(l, len(rhs_dict))
            return bin_search(rhs_dict.items(), p, 0, len(rhs_dict))
        for k, v in rhs_dict.items():
            if p < v:
                return k
            else:
                p -= v
        print "Alas, could not find.", l, p

    def decode_l33t(self, w, iterp):
        l = self.decode_rule('L', iterp.next())
        if l == 'Caps':
            return w.capitalize()
        elif l == 'lower':
            return w.lower()
        elif l == 'UPPER':
            return w.upper()
        else:
            return ''.join([self.decode_rule('L_%s' % c, iterp.next())
                            for c in w])

    def decode_pw(self, P):
        assert len(P) == hny_config.PASSWORD_LENGTH
        iterp = iter(P)
        plaintext = ''
        stack = ['G']
        while stack:
            lhs = stack.pop()
            rhs = self.decode_rule(lhs, iterp.next())
            if lhs in ['G', 'T']:
                arr = rhs.split(',') if lhs == 'G' \
                    else ['T_%s' % c for c in rhs.split(',')]
                arr.reverse()
                stack.extend(arr)
            elif lhs.startswith('W'):
                rhs = self.decode_l33t(rhs, iterp)
                plaintext += rhs
            else:
                plaintext += rhs
        return plaintext

    def __getitem__(self, l):
        return self.G[l]

    def __contains__(self, k):
        return k in self.G

    def is_grammar(self):
        return bool(self.G['G'])
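decode_rule only ever looks at `p % __total__`, so encode_rule is free to hide the chosen offset in any residue class; that is the job of convert2group. A self-contained model of the scheme (my own stand-in, not the codebase's exact function):

import random

MAX_INT = 2 ** 32  # stand-in for the codebase's group size

def convert2group_sketch(v, total):
    # A uniform x in [0, MAX_INT) with x % total == v.
    return v + random.randint(0, (MAX_INT - 1 - v) // total) * total

t = 10
for v in range(t):
    assert convert2group_sketch(v, t) % t == v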
class TrainedGrammar(object):
    l33t_replaces = DAWG.compile_replaces({
        '3': 'e', '4': 'a', '@': 'a',
        '$': 's', '0': 'o', '1': 'i', 'z': 's'
    })

    def __init__(self, g_file=GRAMMAR_FILE, cal_cdf=False):
        self.cal_cdf = cal_cdf
        self.load(g_file)
        self.NonT_set = filter(lambda x: x.find('_') < 0, self.G.keys())

    def load(self, filename):
        self.G = json.load(open_(filename), object_pairs_hook=OrderedDict)
        for k, v in self.G.items():
            if self.cal_cdf:
                print_err("Calculating CDF!")
                lf = 0
                for l, f in v.items():
                    v[l] += lf
                    lf += f
                v['__total__'] = lf
            else:
                v['__total__'] = sum(v.values())
        # Create dawg/trie of the Wlist items for fast retrieval
        Wlist = [x for k, v in self.G.items()
                 for x in v if k.startswith('W')]
        self.date = Date()
        self.Wdawg = IntDAWG(Wlist)

    def get_prob(self, l, r):
        f = self.G.get(l, {}).get(r, 0)
        return max(float(f) / self.G[l]['__total__'], 0.0)

    def isNonTerm(self, lhs):  # this means given lhs, rhs will be in NonT
        return lhs in self.NonT_set

    def get_actual_NonTlist(self, lhs, rhs):
        if lhs == 'G':
            # Don't include "W1,G", "D1,G" etc.
            if rhs.endswith(',G'):
                return []
            return rhs.split(',')
        elif lhs == 'T':
            return ['%s_%s' % (lhs, c)
                    for c in (rhs.split(',') if ',' in rhs else rhs)]
        elif lhs == 'L':
            return ['%s_%s' % (lhs, c) for c in rhs]
        elif lhs in ['W', 'D', 'Y', 'R', 'K']:
            return []
        else:
            return []

    def get_freq(self, l, r):
        return self.G.get(l, {}).get(r, 0)

    def get_W_rule(self, word):
        w = unicode(word.lower())
        k = self.Wdawg.similar_keys(w, self.l33t_replaces)
        if k:
            k = k[0]
            L = NonT_L(k, word)
            sym = 'W%s' % get_nont_class('W', k)
            return (sym, [(k, L)], self.get_prob(sym, k))

    def get_T_rule(self, word):
        T = self.date.IsDate(word)
        if T:
            p = 10**(len(word) - 8)
            for r in T.tree:
                p *= self.get_prob(*r)
            p *= self.get_prob(*(T.get_rule()))
            return ('T', [(word, T)], p)

    def get_all_matches(self, word):
        rules = []
        for nt in self.NonT_set:
            if nt.startswith('W'):
                l = self.get_W_rule(word)
                if l:
                    rules.append(l)
            elif nt == 'T':
                l = self.get_T_rule(word)
                if l:
                    rules.append(l)
            else:
                f = self.G[nt].get(word, 0)
                if f > 0:
                    rules.append((nt, [(word)],
                                  float(f) / self.G[nt]['__total__']))
        rules = filter(lambda x: x and x[-1], rules)
        if rules:
            return max(rules, key=lambda x: x[-1])

    def join(self, r, s):
        not_startswith_L_T = lambda x: x and \
            not (x[0].startswith('L_') or x[0].startswith('T_'))
        if not_startswith_L_T(s) and not_startswith_L_T(r):
            k = ','.join([r[0], s[0]])
            p = r[-1] * s[-1]
            a = r[1] + s[1]
            return (k, a, p)

    def random_parse(self, word, try_num=3):
        """
        Returns a random parse of the word following the grammar.
        """
        # First: rejection sampling, the most inefficient version.
        # Break the word into random parts and see if that parse exists.
        print "\n^^^^^^^^^^^_______________^^^^^^^^^^^^^^"
        if try_num < 0:
            print "I am very sorry. I could not parse this :(!!"
            return None
        # NO IDEA HOW TO randomly pick a parse tree!! @@TODO

    def parse(self, word):
        A = {}
        if not word:
            return ()
        for j in range(len(word)):
            for i in range(len(word) - j):
                A[(i, i + j)] = self.get_all_matches(word[i:j + i + 1])
                t = [A[(i, i + j)]]
                t.extend([self.join(A[(i, k)], A[(k + 1, i + j)])
                          for k in range(i, i + j)])
                if t:
                    A[(i, i + j)] = max(t, key=lambda x: x[-1] if x else 0)
                else:
                    A[(i, i + j)] = ()
                    # print "Not sure why it reached here. But it did!"
                    # print i, j, word[i: i+j+1]
        return A[(0, len(word) - 1)]

    def default_parse_tree(self, word):
        """Returns the default parse of a word.

        The default parse is G -> W1,G | D1,G | Y1,G | W1 | D1 | Y1.
        This parses any string over the allowed alphabet and returns a
        left-to-right traversed parse tree.
        """
        pt = ParseTree()
        n = len(word)
        for i, c in enumerate(word):
            r = whatchar(c) + '1'
            if i < n - 1:
                r = r + ',G'
            pt.add_rule(('G', r))
            pt.add_rule((r[:2], c.lower()))
            if r.startswith('W'):
                nont_l = NonT_L(c, c)
                pt.extend_rules(nont_l.parse_tree())
        return pt

    def l_parse_tree(self, word):  # leftmost parse-tree
        pt = ParseTree()
        p = self.parse(word)
        if not p:
            print "Failing at ", word.encode('utf-8')
            return pt
        # assert p[0] in self.G['G'], "Wrong rule: {} --> {}".format('G', p[0])
        if p[0] not in self.G['G']:
            return self.default_parse_tree(word)
        pt.add_rule(('G', p[0]))
        for l, each_r in zip(p[0].split(','), p[1]):
            if isinstance(each_r, basestring):
                pt.add_rule((l, each_r))
            elif l.startswith('W'):
                pt.add_rule((l, each_r[0]))
                L_parse_tree = each_r[1].parse_tree()
                pt.add_rule(L_parse_tree[0])
                if len(L_parse_tree.tree) > 1:
                    pt.tree.extend(L_parse_tree[1][1])
            elif l == 'T':
                p = each_r[1]
                rule_name = ','.join([r[0].replace('T_', '') for r in p])
                pt.add_rule((l, rule_name))
                pt.extend_rules(p)
            else:
                print "Something is severely wrong"
        return pt

    def rule_set(self, word):
        rs = RuleSet()
        pt = self.l_parse_tree(word)
        for p in pt.tree:
            rs.add_rule(*p)
        return rs

    def encode_rule(self, l, r):
        rhs_dict = self.G[l]
        try:
            i = rhs_dict.keys().index(r)
            if DEBUG:
                c = rhs_dict.keys()[i]
                assert c == r, "The index is wrong"
        except ValueError:
            print "'{}' not in the rhs_dict (l: '{}', rhs_dict: {})"\
                .format(r, l, self.G[l])
            raise ValueError
        l_pt = sum(rhs_dict.values()[:i])
        r_pt = l_pt + rhs_dict[r] - 1
        return convert2group(random.randint(l_pt, r_pt),
                             rhs_dict['__total__'])

    def encode_pw(self, pw):
        pt = self.l_parse_tree(pw)
        try:
            code_g = [self.encode_rule(*p) for p in pt]
        except ValueError:
            print "Error in encoding: \"{}\"".format(pw)
            return []  # original raised and then had an unreachable `return []`
        extra = hny_config.PASSWORD_LENGTH - len(code_g)
        code_g.extend([convert2group(0, 1) for x in range(extra)])
        return code_g

    def decode_rule(self, l, p):
        rhs_dict = self.G[l]
        if not rhs_dict:
            return ''
        assert '__total__' in rhs_dict, "__total__ not in {!r}, l={!r}"\
            .format(rhs_dict, l)
        p %= rhs_dict['__total__']
        if self.cal_cdf:
            if len(rhs_dict) > 1000:
                print_once(l, len(rhs_dict))
            return bin_search(rhs_dict.items(), p, 0, len(rhs_dict))
        for k, v in rhs_dict.items():
            if p < v:
                return k
            else:
                p -= v
        print "Alas, could not find.", l, p

    def decode_l33t(self, w, iterp):
        l = self.decode_rule('L', iterp.next())
        if l == 'Caps':
            return w.capitalize()
        elif l == 'lower':
            return w.lower()
        elif l == 'UPPER':
            return w.upper()
        else:
            return ''.join([self.decode_rule('L_%s' % c, iterp.next())
                            for c in w])

    def decode_pw(self, P):
        assert len(P) == hny_config.PASSWORD_LENGTH, \
            "Not correct length to decode, Expecting {}, got {}"\
            .format(hny_config.PASSWORD_LENGTH, len(P))
        iterp = iter(P)
        plaintext = ''
        stack = ['G']
        while stack:
            lhs = stack.pop()
            rhs = self.decode_rule(lhs, iterp.next())
            if lhs in ['G', 'T', 'W', 'R', 'Y', 'D']:
                arr = rhs.split(',') if lhs != 'T' \
                    else ['T_%s' % c for c in rhs.split(',')]
                arr.reverse()
                stack.extend(arr)
            elif lhs.startswith('W'):
                rhs = self.decode_l33t(rhs, iterp)
                plaintext += rhs
            else:
                plaintext += rhs
        return plaintext

    def encode_grammar(self, G):
        """Encodes a sub-grammar @G under the current grammar."""
        vd = VaultDistPCFG()
        stack = ['G']
        code_g = []
        done = list(G.default_keys())
        while stack:
            head = stack.pop()
            assert head not in done, \
                "head={} already in done={}".format(head, done)
            done.append(head)
            rule_dict = G[head]
            t_set = []
            for rhs, f in rule_dict.items():
                if rhs != '__total__':
                    r = filter(lambda x: x not in done + stack,
                               self.get_actual_NonTlist(head, rhs))
                    for x in r:
                        if x not in t_set:
                            t_set.append(x)
            t_set.reverse()
            stack.extend(t_set)
            n = len(rule_dict.keys()) - 1
            code_g.append(vd.encode_vault_size(head, n))
            if n < 0:
                print "Sorry I cannot encode your password ('{}')! \nPlease"\
                    " choose something different, like password12"\
                    .format((head, rule_dict.keys()))
                exit(0)
            assert n == vd.decode_vault_size(head, code_g[-1]), \
                "Vault size encoding mismatch.\nhead: \"{}\", code_g: {}, "\
                "n: {}, decoded_vault_size: {}"\
                .format(head, code_g[-1], n,
                        vd.decode_vault_size(head, code_g[-1]))
            code_g.extend([self.encode_rule(head, r)
                           for r in rule_dict.keys() if r != '__total__'])
        extra = hny_config.HONEY_VAULT_GRAMMAR_SIZE - len(code_g)
        code_g.extend([convert2group(0, 1) for x in range(extra)])
        return code_g

    def decode_grammar(self, P):
        """Decodes a sub-grammar under self.G using the random numbers from P."""
        g = SubGrammar(self)
        vd = VaultDistPCFG()
        iterp = iter(P)
        stack = ['G']
        done = []
        while stack:
            head = stack.pop()
            assert head not in done, \
                "@Head ({}) in @done. It should not!".format(head)
            done.append(head)
            p = iterp.next()
            # print "RuleSizeDecoding:", head, done
            n = vd.decode_vault_size(head, p)
            t_set = []
            for _ in range(n):
                rhs = self.decode_rule(head, iterp.next())
                # print "Decoding:", stack, head, '==>', rhs
                if rhs != '__total__':  # fixed typo: was '__totoal__'
                    r = filter(lambda x: x not in done + stack,
                               self.get_actual_NonTlist(head, rhs))
                    for x in r:
                        if x not in t_set:
                            t_set.append(x)
                g.add_rule(head, rhs)
            t_set.reverse()
            stack.extend(t_set)
        g.finalize()  # fixes the freq and some other bookkeeping
        return g

    def __getitem__(self, l):
        return self.G[l]

    def __contains__(self, k):
        return k in self.G

    def is_grammar(self):
        return bool(self.G['G'])

    def __str__(self):
        return json.dumps(self.G['G'], indent=2)

    def nonterminals(self):
        return self.G.keys()
class TrainedGrammar(object):
    l33t_replaces = DAWG.compile_replaces({
        '3': 'e', '4': 'a', '@': 'a',
        '$': 's', '0': 'o', '1': 'i', 'z': 's'
    })

    def __init__(self, g_file=grammar_file, cal_cdf=False):
        self.cal_cdf = cal_cdf
        self.load(g_file)
        self.NonT_set = filter(lambda x: x.find('_') < 0, self.G.keys())

    def load(self, filename):
        self.G = json.load(open_(filename), object_pairs_hook=OrderedDict)
        for k, v in self.G.items():
            if self.cal_cdf:
                print_err("Calculating CDF!")
                lf = 0
                for l, f in v.items():
                    v[l] += lf
                    lf += f
                v['__total__'] = lf
            else:
                v['__total__'] = sum(v.values())
        Wlist = [x for k, v in self.G.items()
                 for x in v if k.startswith('W')]
        self.date = Date()
        self.Wdawg = IntDAWG(Wlist)

    def get_prob(self, l, r):
        f = self.G.get(l, {}).get(r, 0)
        if f > 0:
            return float(f) / self.G[l]['__total__']

    def isNonTerm(self, lhs):  # this means given lhs, rhs will be in NonT
        return lhs in self.NonT_set

    def get_actual_NonTlist(self, lhs, rhs):
        if lhs == 'G':
            return rhs.split(',')
        elif lhs == 'T':
            return ['%s_%s' % (lhs, c) for c in rhs.split(',')]
        elif lhs == 'L':
            return ['%s_%s' % (lhs, c) for c in rhs]
        else:
            return []

    def get_freq(self, l, r):
        return self.G.get(l, {}).get(r, 0)

    def get_W_rule(self, word):
        w = unicode(word.lower())
        k = self.Wdawg.similar_keys(w, self.l33t_replaces)
        if k:
            k = k[0]
            L = NonT_L(k, word)
            sym = 'W%s' % get_nont_class('W', k)
            return (sym, [(k, L)], self.get_prob(sym, k))

    def get_T_rule(self, word):
        T = self.date.IsDate(word)
        if T:
            p = 10**(len(word) - 8)
            # for r in T.tree:
            #     p *= self.get_prob(*r)
            # p *= self.get_prob(*(T.get_rule()))
            return ('T', [(word, T)], p)

    def get_all_matches(self, word):
        rules = []
        for nt in self.NonT_set:
            if nt.startswith('W'):
                l = self.get_W_rule(word)
                if l:
                    rules.append(l)
            elif nt == 'T':
                l = self.get_T_rule(word)
                if l:
                    rules.append(l)
            else:
                f = self.G[nt].get(word, 0)
                if f > 0:
                    rules.append(
                        (nt, [(word)], float(f) / self.G[nt]['__total__']))
        rules = filter(lambda x: x and x[-1], rules)
        if rules:
            return max(rules, key=lambda x: x[-1])

    def join(self, r, s):
        not_startswith_L_T = lambda x: x and \
            not (x[0].startswith('L_') or x[0].startswith('T_'))
        if not_startswith_L_T(s) and not_startswith_L_T(r):
            k = ','.join([r[0], s[0]])
            p = r[-1] * s[-1]
            a = r[1] + s[1]
            return (k, a, p)

    def parse(self, word):
        A = {}
        for j in range(len(word)):
            for i in range(len(word) - j):
                A[(i, i + j)] = self.get_all_matches(word[i:j + i + 1])
                t = [A[(i, i + j)]]
                t.extend([
                    self.join(A[(i, k)], A[(k + 1, i + j)])
                    for k in range(i, i + j)
                ])
                if t:
                    A[(i, i + j)] = max(t, key=lambda x: x[-1] if x else 0)
                else:
                    A[(i, i + j)] = ()
                    # print "Not sure why it reached here. But it did!"
                    # print i, j, word[i: i+j+1]
        return A[(0, len(word) - 1)]

    def l_parse_tree(self, word):  # leftmost parse-tree
        pt = ParseTree()
        p = self.parse(word)
        if not p:
            print "Failing at ", word.encode('utf-8')
            return pt
        pt.add_rule(('G', p[0]))
        for l, each_r in zip(p[0].split(','), p[1]):
            if isinstance(each_r, basestring):
                pt.add_rule((l, each_r))
            elif l.startswith('W'):
                pt.add_rule((l, each_r[0]))
                L_parse_tree = each_r[1].parse_tree()
                pt.add_rule(L_parse_tree[0])
                if len(L_parse_tree.tree) > 1:
                    pt.tree.extend(L_parse_tree[1][1])
            elif l == 'T':
                p = each_r[1]
                rule_name = ','.join([r[0].replace('T_', '') for r in p])
                pt.add_rule((l, rule_name))
                pt.extend_rules(p)  # fixed: was extend_rule
            else:
                print "Something is severely wrong"
        return pt

    def rule_set(self, word):
        rs = RuleSet()
        pt = self.l_parse_tree(word)
        for p in pt.tree:
            rs.add_rule(*p)
        return rs

    def encode_rule(self, l, r):
        rhs_dict = self.G[l]
        i = rhs_dict.keys().index(r)
        assert i >= 0
        l_pt = sum(rhs_dict.values()[:i])
        r_pt = l_pt + rhs_dict[r] - 1  # -1 keeps the draw inside this rule's band
        return convert2group(random.randint(l_pt, r_pt),
                             rhs_dict['__total__'])

    def encode_pw(self, pw):
        pt = self.l_parse_tree(pw)
        code_g = [self.encode_rule(*p) for p in pt]
        extra = hny_config.PASSWORD_LENGTH - len(code_g)
        code_g.extend([convert2group(0, 1) for x in range(extra)])
        return code_g

    def decode_rule(self, l, p):
        rhs_dict = self.G[l]
        p %= rhs_dict['__total__']
        if self.cal_cdf:
            if len(rhs_dict) > 1000:
                print_once(l, len(rhs_dict))
            return bin_search(rhs_dict.items(), p, 0, len(rhs_dict))
        for k, v in rhs_dict.items():
            if p < v:
                return k
            else:
                p -= v
        print "Alas, could not find.", l, p

    def decode_l33t(self, w, iterp):
        l = self.decode_rule('L', iterp.next())
        if l == 'Caps':
            return w.capitalize()
        elif l == 'lower':
            return w.lower()
        elif l == 'UPPER':
            return w.upper()
        else:
            return ''.join(
                [self.decode_rule('L_%s' % c, iterp.next()) for c in w])

    def decode_pw(self, P):
        assert len(P) == hny_config.PASSWORD_LENGTH
        iterp = iter(P)
        plaintext = ''
        stack = ['G']
        while stack:
            lhs = stack.pop()
            rhs = self.decode_rule(lhs, iterp.next())
            if lhs in ['G', 'T']:
                arr = rhs.split(',') if lhs == 'G' \
                    else ['T_%s' % c for c in rhs.split(',')]
                arr.reverse()
                stack.extend(arr)
            elif lhs.startswith('W'):
                rhs = self.decode_l33t(rhs, iterp)
                plaintext += rhs
            else:
                plaintext += rhs
        return plaintext

    def __getitem__(self, l):
        return self.G[l]

    def __contains__(self, k):
        return k in self.G

    def is_grammar(self):
        return bool(self.G['G'])
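The `parse` method in all of these variants is a CKY-style dynamic program: A[(i, i+j)] holds the best-scoring rule for word[i : i+j+1], computed from a direct match plus every two-way split. The same shape in miniature, with hypothetical substring scores:

def best_parse(word, score):
    A = {}
    n = len(word)
    for j in range(n):                 # span length - 1
        for i in range(n - j):         # span start
            cands = [(word[i:i + j + 1], score(word[i:i + j + 1]))]
            for k in range(i, i + j):  # every split point inside the span
                (ls, lp), (rs, rp) = A[(i, k)], A[(k + 1, i + j)]
                cands.append((ls + ',' + rs, lp * rp))
            A[(i, i + j)] = max(cands, key=lambda t: t[1])
    return A[(0, n - 1)]

freq = {'pass': 0.5, 'word': 0.4, '123': 0.3}
print(best_parse('password123', lambda s: freq.get(s, 1e-6)))
# -> ('pass,word,123', 0.06)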
class Grammer(object):
    """Handles PCFG-based encoding and decoding of passwords.

    Attributes:
        G: OrderedDict holding every rule of the grammar.
        l33t_replaces: the table of allowed l33t substitutions.
    Methods:
        parse: analyse what a password is composed of.
    """
    G = ''
    l33t_replaces = DAWG.compile_replaces({
        '3': 'e', '4': 'a', '@': 'a',
        '$': 's', '0': 'o', '1': 'i', 'z': 's'
    })

    def __init__(self, filename, cfg=False):
        self.cal_cdf = cfg
        self.load(filename)
        self.Non_set = []
        for each in self.G:
            # Nonterminals are the keys of G without an underscore.
            if each.find('_') < 0:
                self.Non_set.append(each)

    # Read the grammar file and index every rule's symbols.
    def load(self, filename):
        self.G = json.load(open_(filename), object_pairs_hook=OrderedDict)
        # Walk over every rule dictionary of the grammar.
        for k, v in self.G.items():
            if self.cal_cdf:
                # print_err("Calculating CDF!")
                # lf is the running total; adding it to each entry turns
                # raw counts into a cumulative distribution.
                lf = 0
                for l, f in v.items():
                    # v[l] += lf
                    lf += f
                v['__total__'] = lf
            else:
                v['__total__'] = sum(v.values())
        # Collect every W (word) terminal for later string generation.
        self.Wlist = []
        for k, D in self.G.items():
            if k.startswith('W'):
                self.Wlist.extend([x for x in D])
        # A Date helper manages the date (T) rules.
        self.date = Date()
        # Build a DAWG over the word list for fast lookup of W rules.
        self.Wlist = IntDAWG(self.Wlist)

    # Probability of the rule l -> r, i.e. freq(l -> r) / total(l).
    def getProb(self, l, r):
        f = self.G.get(l, {}).get(r, 0)
        return max(float(f) / self.G[l]['__total__'], 0.0)

    # Find the rule that most likely produced the password (fragment).
    # Returns a tuple shaped like a parse node:
    #   word rule: ('W1', [(similar_key, NonT_L)], prob)
    #   date rule: ('T', [(passwd, Date)], prob)
    #   otherwise: (sym, [passwd], prob)
    def genRuleMatches(self, passwd):
        l = []  # all candidate rules
        # Check which rule this fragment could belong to.
        for rule in self.Non_set:
            # ---- word rules -------------------------------------------
            if rule.startswith('W'):
                # Look the fragment up in the word DAWG built earlier.
                k = self.Wlist.similar_keys(passwd.lower(),
                                            self.l33t_replaces)
                if k:  # take the closest match as the rule
                    sym = "W%s" % (get_nont_class('W', passwd))
                    prod = NonT_L(k, passwd)
                    prob = self.getProb(sym, passwd.lower())
                    l.append((sym, [(k[0], prod)], prob))
            # ---- date rules -------------------------------------------
            elif rule.startswith('T'):
                # IsDate returns something like
                # [('T_Y', '2013'), ('T_m', '10'), ('T_d', '26')].
                T = self.date.IsDate(passwd)
                if T:
                    sym = 'T'
                    prod = (passwd, T)
                    prob = 10**(len(passwd) - 8)
                    for each in T:
                        prob *= self.getProb(*each)
                    # print((sym, [prod], prob))
                    l.append((sym, [prod], prob))
            # Other rule kinds carry no child nodes;
            else:
                # only their probability matters.
                f = self.G[rule].get(passwd, 0)
                if f > 0:
                    l.append((rule, [(passwd)],
                              float(f) / self.G[rule]['__total__']))
        # Return the candidate with the highest probability.
        temp_prob = 0
        tu = ()
        for each in l:
            if temp_prob < each[2]:
                tu = each
                temp_prob = each[2]
        return tu

    def not_startswith_L_T(self, passwd):
        if passwd:
            if passwd[0].startswith('L_') or passwd[0].startswith('T_'):
                return False
            else:
                return True
        else:
            return passwd

    # Join two adjacent parse nodes into one.
    def join(self, l, r):
        # Only join nodes that are not special L_/T_ leaves.
        if self.not_startswith_L_T(l) and self.not_startswith_L_T(r):
            sym = ','.join([l[0], r[0]])
            prob = l[-1] * r[-1]
            prod = l[1] + r[1]
            return (sym, prod, prob)

    def parse(self, passwd):
        # Reject empty input up front.
        if not passwd:
            return ''
        nonTRule = {}
        # Dynamic programming over substrings (divide and conquer):
        # score each span first, then combine spans bottom-up.
        # rep is the span length minus one.
        for rep in range(len(passwd)):
            for start in range(len(passwd) - rep):
                # 1. (divide) score the span passwd[start : start+rep+1]
                nonTRule[(start, start + rep)] = self.genRuleMatches(
                    passwd[start:start + rep + 1])
                rule_list = [nonTRule[(start, start + rep)]]
                # 2. (combine) try every split point inside the span
                for bet in range(start, start + rep):
                    temp_non = self.join(nonTRule[(start, bet)],
                                         nonTRule[(bet + 1, start + rep)])
                    rule_list.append(temp_non)
                # 3. (choose) keep the highest-probability rule as the
                # value for [start : start+rep+1]
                # temp = filter(lambda k: k, rule_list)
                # Record the most likely rule for this span.
                if rule_list:
                    nonTRule[(start, start + rep)] = max(
                        rule_list, key=lambda x: x[-1] if x else 0)
                    # print(nonTRule[(start, start + rep)])
                else:
                    nonTRule[(start, start + rep)] = ()
        return nonTRule[(0, len(passwd) - 1)]

    # Fallback parse for trivial passwords (e.g. 123456) or anything the
    # grammar cannot otherwise explain.
    def defaultPasswordParse(self, word):
        # Force the shape G -> W1,G | D1,G | Y1,G | W1 | D1 | Y1.
        pt = ParseTree()
        n = len(word)
        for i, c in enumerate(word):
            r = whatchar(c) + '1'
            # if i < n - 1:
            #     r = r + ',G'
            pt.add_rule(('G', r))
            pt.add_rule((r[:2], c.lower()))
            if r.startswith('W'):
                nont_l = NonT_L(c, c)
                pt.extend_rules(nont_l.parse_tree())
        return pt

    # Build the parse tree needed for CFG lookups.
    def lParseTree(self, passwd):
        pt = ParseTree()
        rule = self.parse(passwd)
        print("our rule is")
        print(rule)
        # An empty result means the parse failed; log the password.
        if not rule:
            print("Failed encode %s" % passwd)
            return pt
        # If the top rule is not a known G production, fall back to the
        # simple per-character parse.
        if rule[0] not in self.G['G']:
            return self.defaultPasswordParse(passwd)
        # Otherwise install the top-level rule first ...
        pt.add_rule(('G', rule[0]))
        # ... then unpack every sub-rule and its content into the tree.
        for sym, rhs in zip(rule[0].split(','), rule[1]):
            # For rules other than W and T, rhs is just a string.
            if isinstance(rhs, str):
                pt.add_rule((sym, rhs))
            # A W rule carries (similar_key, NonT_L): store the matched
            # word first, then record its capitalization state.
            elif sym.startswith('W'):
                pt.add_rule((sym, rhs[0]))
                # parse_tree() yields the leaves describing the word's case.
                ltree = rhs[1].parse_tree()
                pt.add_rule(ltree[0])
                # Under the 'l33t' rule the substituted characters follow
                # as a pre-packaged list; append them too.
                if len(ltree) > 1:
                    pt.tree.extend(ltree[1][1])
            # A T rule looks like ('T', [('T_Y', '1993'), ...]); rebuild
            # the rule name so it matches the grammar file's format.
            elif sym.startswith('T'):
                temp_sym = ''
                for each_label in rhs[1]:
                    temp_sym += each_label[0].replace("T_", "")
                pt.add_rule((sym, temp_sym))
                # Then push the remaining nodes as well.
                pt.extend_rules(rhs[1])
            else:
                print("we can't figure out this word")
        return pt

    # Core encoder: replace the password with a vector of group elements.
    def encode_password(self, password):
        # First build the password's parse tree.
        ptree = self.lParseTree(password)
        print("our password is ", end='')
        print(ptree)
        if not ptree:
            print("encode failed, change the password")
        # Map every tree node to a number.
        encd = []
        for each_node in ptree:
            try:
                encd.append(self.encode_encd(*each_node))
                # print(encd)
            except ValueError as e:
                print("Error in encoding: \"{}\"".format(password))
                print(e)
                return []
        # Pad short encodings with blank values up to the fixed length.
        length = PASSWORD_MAX_LENGTH - len(encd)
        # If nothing was encoded at all, report failure.
        if length == PASSWORD_MAX_LENGTH:
            return []
        for i in range(length):
            encd.append(convert2group(0, 1))
        # Mapping complete; return the encoded numbers.
        return encd

    # Range encoder: pick a random number inside the rule's frequency band.
    def encode_encd(self, l, r):
        # The rule dictionary for LHS l.
        rhs_dict = self.G[l]
        # Index of r among the RHS keys.
        i = list(rhs_dict.keys()).index(r)
        # Sum the frequencies before r to get the left edge of its band.
        l_hs = 0
        for each_index in range(i):
            l_hs += list(rhs_dict.values())[each_index]
        # Right edge of the band (inclusive).
        r_hs = l_hs + rhs_dict[r] - 1
        # Draw a random point inside the band (endpoints included) ...
        rn = random.randint(l_hs, r_hs)
        # print("l_hs is %d, r_hs is %d and the random is %d" % (l_hs, r_hs, rn))
        # ... and lift it into a random residue class representative:
        # wn = rn + random.randint(0, int((3000000 - rn) / rhs_dict['__total__'])) * rhs_dict['__total__']
        wn = convert2group(rn, rhs_dict['__total__'])
        # print("the wn is %d and it comes back as %d" % (wn, wn % rhs_dict['__total__']))
        return wn

    # Range decoder.
    def decode_encd(self, l, r):
        # The rule dictionary for LHS l.
        rhs_dict = self.G[l]
        # It may be missing when decoding bogus input.
        if not rhs_dict:
            return ''
        # '__total__' must be present, otherwise r cannot be reduced.
        assert '__total__' in self.G[l], \
            "The __total__ was lost in {!r}, l = {!r}".format(rhs_dict, l)
        # Reduce r into the rule's frequency space.
        index = r % rhs_dict['__total__']
        # print("the r is %d, index is %d" % (r, index))
        # if self.cal_cdf:
        #     # For large rules, log the mapping and binary-search the CDF.
        #     if len(rhs_dict) > 1000:
        #         print_once(l, len(rhs_dict))
        #     return bin_search(list(rhs_dict.items()), index, 0, len(rhs_dict))
        # Without the CDF, walk the frequency bands linearly.
        for k, t in rhs_dict.items():
            if index < t:
                return k
            else:
                index -= t
        # Reaching here means nothing matched; inspect the input.
        print("rule not found! l is %s and r is %d" % (l, r))
        return ''

    def decode_password(self, passwd):
        """Decode an encoded vector of numbers back into the password.

        Each number is reduced modulo the rule's total to recover the rule
        it encodes; starting from G, rules are expanded depth-first.

        Key variables:
            stack: pending nonterminal nodes.
            plaintext: the decoded password so far.
            lhs: a parent node carrying a rule, never a terminal string.
            rhs: a child node; either another rule or a terminal string.
        """
        if not passwd:
            return ''
        # Decoding works like a stack machine.
        stack = []
        # Seed with G; even unparsable passwords have a G node.
        stack.append('G')
        plaintext = ''
        index = 0
        # Pop nodes and expand them until the stack is empty.
        while len(stack) > 0:
            lhs = stack.pop()
            # Decode the current number into this node's RHS.
            rhs = self.decode_encd(lhs, passwd[index])
            index += 1
            # Plain nonterminals (not T_y / L_s style leaves):
            if lhs in ['G', 'Y', 'R', 'D', 'T', 'W']:
                if lhs == 'T':
                    # !! may be wrong: iterates over the characters of rhs !!
                    sym = ['T_%s' % c for c in rhs]
                else:
                    # Ordinary rules list their parts separated by ','.
                    # print("the rhs is %s" % rhs)
                    sym = rhs.split(',')
                # Reverse so the parts pop in left-to-right order.
                sym.reverse()
                stack.extend(sym)
            # A W node's string may still need case/l33t restoration.
            elif lhs.startswith('W'):
                # The next number always encodes the capitalization rule.
                l = self.decode_encd('L', passwd[index])
                index += 1
                if l == "lower":
                    plaintext += rhs
                elif l == "Caps":
                    plaintext += rhs.capitalize()
                elif l == "UPPER":
                    plaintext += rhs.upper()
                # Under l33t every character was encoded separately;
                # decode them one by one.
                elif l == "l33t":
                    for c in rhs:
                        plaintext += self.decode_encd('L_%s' % c,
                                                      passwd[index])
                        index += 1
            # Otherwise this is already a terminal node.
            else:
                plaintext += rhs
        return plaintext
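Intended usage of the Grammer class, assuming a trained grammar JSON and the module-level PASSWORD_MAX_LENGTH / convert2group helpers:

g = Grammer('grammar.json')          # hypothetical grammar file
enc = g.encode_password('password123')
if enc:                              # an empty list signals an encoding failure
    assert g.decode_password(enc) == 'password123'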