class NonT_W(NonT):
    """Non-terminal for dictionary words.

    Matches a password (fragment) against an English word dictionary and a
    Chinese pinyin dictionary (both stored as IntDAWGs), tolerating common
    l33t-speak character substitutions.
    """
    # Default (unmatched) state: symbol 'W', empty production, zero probability.
    sym, prod, prob = 'W', '', 0.0
    english_dawg = IntDAWG().load(GRAMMAR_PATH + 'words.dawg')
    chinese_dawg = IntDAWG().load(GRAMMAR_PATH + 'pinyin.dawg')
    # Combined frequency mass of both dictionaries; normalizes word probabilities.
    total_f = english_dawg[u"__total__"] + chinese_dawg[u'__total__']
    # Map common l33t-speak characters back to the letters they stand for.
    l33t_replaces = DAWG.compile_replaces({
        '3': 'e', '4': 'a', '@': 'a',
        '$': 's', '0': 'o', '1': 'i',
        'z': 's'
    })

    def __init__(self, word):
        # `word` is the password (fragment) to be analyzed.
        # super(NonT_W, self).__init__()
        w = word.lower()
        dawg = []
        for d in [self.english_dawg, self.chinese_dawg]:
            # Using the l33t replacement table, find dictionary keys similar
            # to w; element [0] of the returned list is the closest match.
            k = d.similar_keys(w, self.l33t_replaces)
            if k:
                dawg.append((d, k[0]))
        # `dawg` now holds, per dictionary, the entry most similar to the password.
        if dawg:
            # The matched strings from the different dictionaries may repeat,
            # so deduplicate them.
            v = list(set([d[1] for d in dawg]))
            # Bail out if the dictionaries disagree on the match, or if the
            # match is not purely alphabetic.
            if len(v) > 1 or not v[0].isalpha():
                return
            # The string matched at least once; sum its frequency across all
            # dictionaries that contain it.
            v = v[0]
            f = sum([d[0][v] for d in dawg])
            self.prod = v
            self.sym = 'W%s' % get_nont_class('W', v)
            # NonT_L analyzes the capitalization pattern of the password.
            self.L = NonT_L(v, word)
            # print(self.L)
            # Scale by the capitalization probability and normalize by the
            # total dictionary mass.
            self.prob = self.L.prob * float(f) / self.total_f

    def parse_tree(self):
        """Return a ParseTree holding this W rule plus the capitalization rules."""
        pt = ParseTree()
        pt.add_rule((self.sym, self.prod))
        pt.extend_rules(self.L.parse_tree())
        return pt

    def rule_set(self):
        """Return a RuleSet containing this word production."""
        rs = RuleSet()
        rs.add_rule(self.sym, self.prod)
        # rs.update_set(self.L.rule_set())
        return rs

    def __str__(self):
        return '%s: %s<%s> (%g)' % (self.sym, self.prod, self.L, self.prob)
class NonT_W(NonT):
    """Non-terminal for dictionary words (single-dictionary variant).

    Matches a password (fragment) against dictionaries stored as IntDAWGs
    next to this package, tolerating common l33t-speak substitutions.
    """
    # Default (unmatched) state: symbol 'W', empty production, zero probability.
    sym, prod, prob = 'W', '', 0.0
    thisdir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    # BUG FIX: '{}name.dawg'.format(thisdir) concatenated the directory and
    # the file name without a path separator, producing a nonexistent path;
    # os.path.join inserts the separator correctly.
    word_dawg = IntDAWG().load(os.path.join(thisdir, 'dictionary1.1.dawg'))
    fname_dawg = IntDAWG().load(os.path.join(thisdir, 'eng_dict.dawg'))
    lname_dawg = IntDAWG().load(os.path.join(thisdir, 'eng_dict.dawg'))
    # Combined frequency mass of all dictionaries; normalizes word probabilities.
    total_f = word_dawg[u'__total__'] + \
        fname_dawg[u'__total__'] + \
        lname_dawg[u'__total__']
    # Map common l33t-speak characters back to the letters they stand for.
    l33t_replaces = DAWG.compile_replaces({
        '3': 'e', '4': 'a', '@': 'a',
        '$': 's', '0': 'o', '1': 'i',
        'z': 's'
    })

    def __init__(self, word):
        # `word` is the password (fragment) to be analyzed.
        # super(NonT_W, self).__init__()
        w = unicode(word.lower())
        dawg = []
        for d in [
                self.word_dawg,
                # self.fname_dawg,
                # self.lname_dawg
        ]:
            # Find dictionary keys similar to w under the l33t replacement
            # table; element [0] of the returned list is the closest match.
            k = d.similar_keys(w, self.l33t_replaces)
            if k:
                dawg.append((d, k[0]))
        if dawg:
            # Deduplicate the matches coming from the different dictionaries.
            v = list(set([d[1] for d in dawg]))
            # Bail out if the dictionaries disagree, or the match is not
            # purely alphabetic.
            if len(v) > 1 or not v[0].isalpha():
                return
            # Sum the word's frequency across all dictionaries containing it.
            v = v[0]
            f = sum([d[0][v] for d in dawg])
            self.prod = v
            self.sym = 'W%s' % get_nont_class('W', v)
            # BUG FIX: this assignment was commented out, yet self.L is used
            # immediately below (and in parse_tree/rule_set), which raised
            # AttributeError for every matched word. NonT_L analyzes the
            # capitalization pattern of the password.
            self.L = NonT_L(v, word)
            # Scale by the capitalization probability and normalize by the
            # total dictionary mass.
            self.prob = self.L.prob * float(f) / self.total_f

    def parse_tree(self):
        """Return a ParseTree holding this W rule plus the capitalization rules."""
        pt = ParseTree()
        pt.add_rule((self.sym, self.prod))
        pt.extend_rules(self.L.parse_tree())
        return pt

    def rule_set(self):
        """Return a RuleSet with this word production and its capitalization rules."""
        rs = RuleSet()
        rs.add_rule(self.sym, self.prod)
        rs.update_set(self.L.rule_set())
        return rs

    def __str__(self):
        return '%s: %s<%s> (%g)' % (self.sym, self.prod, self.L, self.prob)
class NonT_W(NonT):
    """Non-terminal for dictionary words (words + first/last names variant).

    Matches a password (fragment) against an English word list and
    Facebook first/last-name lists stored as IntDAWGs, tolerating common
    l33t-speak substitutions.
    """
    # Default (unmatched) state: symbol 'W', empty production, zero probability.
    sym, prod, prob = 'W', '', 0.0
    word_dawg = IntDAWG().load('data/English_30000.dawg')
    fname_dawg = IntDAWG().load('data/facebook-firstnames-withcount.dawg')
    lname_dawg = IntDAWG().load('data/facebook-lastnames-withcount.dawg')
    # Combined frequency mass of all dictionaries; normalizes word probabilities.
    total_f = word_dawg[u'__total__'] + \
        fname_dawg[u'__total__'] + \
        lname_dawg[u'__total__']
    # Map common l33t-speak characters back to the letters they stand for.
    l33t_replaces = DAWG.compile_replaces({
        '3': 'e', '4': 'a', '@': 'a',
        '$': 's', '0': 'o', '1': 'i',
        'z': 's'
    })

    def __init__(self, word):
        # `word` is the password (fragment) to be analyzed.
        # super(NonT_W, self).__init__()
        w = unicode(word.lower())
        dawg = []
        for d in [self.word_dawg, self.fname_dawg, self.lname_dawg]:
            # Find dictionary keys similar to w under the l33t replacement
            # table; element [0] of the returned list is the closest match.
            k = d.similar_keys(w, self.l33t_replaces)
            if k:
                dawg.append((d, k[0]))
        if dawg:
            # Deduplicate the matches coming from the different dictionaries.
            v = list(set([d[1] for d in dawg]))
            # Bail out if the dictionaries disagree, or the match is not
            # purely alphabetic.
            if len(v) > 1 or not v[0].isalpha():
                return
            # Sum the word's frequency across all dictionaries containing it.
            v = v[0]
            f = sum([d[0][v] for d in dawg])
            self.prod = v
            self.sym = 'W%s' % get_nont_class('W', v)
            # NonT_L analyzes the capitalization pattern of the password.
            self.L = NonT_L(v, word)
            self.prob = self.L.prob * float(f) / self.total_f

    def parse_tree(self):
        """Return a ParseTree holding this W rule plus the capitalization rules."""
        pt = ParseTree()
        pt.add_rule((self.sym, self.prod))
        # BUG FIX: was `pt.extend_rule(...)` (singular); the sibling NonT_W
        # variants call ParseTree.extend_rules, so this misspelling raised
        # AttributeError whenever a word parse tree was built.
        pt.extend_rules(self.L.parse_tree())
        return pt

    def rule_set(self):
        """Return a RuleSet with this word production and its capitalization rules."""
        rs = RuleSet()
        rs.add_rule(self.sym, self.prod)
        rs.update_set(self.L.rule_set())
        return rs

    def __str__(self):
        return '%s: %s<%s> (%g)' % (self.sym, self.prod, self.L, self.prob)
def load(self, filename): self.G = json.load(open_(filename), object_pairs_hook=OrderedDict) # 这里读取字典的键值 for k, v in self.G.items(): if self.cal_cdf: # print_err("Calculating CDF!") # lf表示的是当前规则中的数量 # 每一个规则都要把上一次的规则的数量加载其中(有点像是处理(5)-(0)就能求出从1~5的规则出现的次数 lf = 0 for l, f in v.items(): # v[l] += lf lf += f v['__total__'] = lf else: v['__total__'] = sum(v.values()) # 然后这里统计出现的所有的W字符串,在一会得字符串生成过程中使用 self.Wlist = [] for k, D in self.G.items(): if k.startswith('W'): self.Wlist.extend([x for x in D]) # 设定data变量,方便管理日期规则 self.date = Date() # 建立dawg,方便生成管理word规则 self.Wlist = IntDAWG(self.Wlist)
def finalize(self):
    """Freeze the grammar: fix frequencies, collect non-terminals and word
    productions, repair zero counts and missing '__total__' entries, and
    set up date-rule handling.
    """
    self.fix_freq()
    # Non-terminal symbols are exactly the grammar keys without an underscore.
    self.NonT_set = [x for x in list(self.G.keys()) if x.find('_') < 0]  # + list('Yymd')
    self.G = self.R.G
    # Every word production (RHS under a 'W*' key), gathered for DAWG lookup.
    Wlist = [
        x for k, v in list(self.G.items()) for x in v if k.startswith('W')
    ]
    self.Wdawg = IntDAWG(Wlist)
    for k, v in self.G.items():
        for rhs, f in v.items():
            # Repair non-positive counts: bump the rule to frequency 1 and
            # keep '__total__' consistent with the adjustment.
            if f <= 0:
                print("Zero frequency LHS added, setting frequency to 1")
                v[rhs] = 1
                if '__total__' in v:
                    v['__total__'] += 1
        # Every rule group must carry its total frequency mass; synthesize
        # it if the loaded grammar lacks one.
        if '__total__' not in v:
            print(
                '__total__ should be there in the keys!!. I am adding one.'
            )
            v['__total__'] = sum(v.values())
    if 'T' in self.G:
        # Date (T) rules, excluding the bookkeeping '__total__' entry.
        self.date = Date(T_rules=[
            x for x in list(self.G['T'].keys()) if x != '__total__'
        ])
    self.freeze = True
    # Disable defaultdict auto-creation so unknown keys now raise KeyError.
    self.R.G.default_factory = None
def finalize(self):
    """Freeze the grammar: fix frequencies, record the non-terminal set,
    build the word DAWG, and set up date-rule handling.
    """
    self.fix_freq()
    # Non-terminal symbols are exactly the grammar keys without an underscore.
    self.NonT_set = filter(lambda x: x.find('_') < 0,
                           self.G.keys())  #+ list('Yymd')
    self.G = self.R.G
    # Gather every word production (RHS entries under a 'W*' key) for
    # fast lookup via a DAWG.
    words = []
    for lhs, rules in self.G.items():
        if lhs.startswith('W'):
            words.extend(rules)
    self.Wdawg = IntDAWG(words)
    if 'T' in self.G:
        # Date (T) rules, excluding the bookkeeping '__total__' entry.
        t_rules = [r for r in self.G['T'].keys() if r != '__total__']
        self.date = Date(T_rules=t_rules)
    self.freeze = True
def load(self, filename):
    """Load a grammar from a JSON file into self.G, totalize (or
    cumulate) rule frequencies, and build the word DAWG.
    """
    # Read the grammar: a mapping of LHS symbol -> {RHS rule: frequency};
    # OrderedDict keeps key order stable for the cumulative computation.
    self.G = json.load(open_(filename), object_pairs_hook=OrderedDict)
    for k, v in self.G.items():
        if self.cal_cdf:
            print_err("Calculating CDF!")
            # Convert raw frequencies into cumulative counts in place:
            # each entry becomes its own count plus all preceding counts
            # (f is captured before the update, so lf sums the originals);
            # the final running total is stored as '__total__'.
            lf = 0
            for l, f in v.items():
                v[l] += lf
                lf += f
            v['__total__'] = lf
        else:
            v['__total__'] = sum(v.values())
    # Every word production (RHS under a 'W*' key), for parse-time lookup.
    Wlist = [x for k, v in self.G.items() for x in v if k.startswith('W')]
    # Date helper object for managing date (T) rules.
    self.date = Date()
    self.Wdawg = IntDAWG(Wlist)
def build_int_dawg(filename):
    """Build an IntDAWG of (word, frequency) pairs from a word-list file
    and write it next to the input as '<base>.dawg', then sanity-check it.
    """
    with open_(filename) as inpf:
        # Default line parser: "frequency word" style entries.
        freq_style = get_f_w_freq
        f_line = inpf.readline()
        w = []
        if f_line.startswith('#'):
            # A leading '#' header declares the file's column format;
            # select the matching line parser from it.
            words = f_line.strip().split()
            freq_style = get_file_data_format(words[1:])
        else:
            # No header: the first line is already a data row.
            w = [freq_style(f_line)]
        w.extend([freq_style(line) for line in inpf])
        # NOTE(review): `sum_freq` is not defined anywhere in this function —
        # it must be a module-level name, presumably the total of all
        # frequencies; verify it exists, otherwise this raises NameError.
        w.append(('__total__', sum_freq))
        int_dawg = IntDAWG(w)
        # NOTE(review): split('.')[0] truncates at the FIRST dot, so paths
        # like './data/words.txt' or 'v1.2.txt' produce surprising output
        # names — confirm input filenames never contain extra dots.
        of = filename.split('.')[0] + '.dawg'
        with open(of, 'wb') as o:
            int_dawg.write(o)
        # Verify a sample from both ends of the list round-trips from disk.
        test_dawg(of, w[:10] + w[-10:])
def finalize(self):
    """Freeze the grammar (Python 2 variant): fix frequencies, record the
    non-terminal set, build the word DAWG, repair missing '__total__'
    entries, and set up date-rule handling.
    """
    self.fix_freq()
    # Non-terminal symbols are exactly the grammar keys without an
    # underscore (Python 2 filter returns a list here).
    self.NonT_set = filter(lambda x: x.find('_') < 0, self.G.keys()) #+ list('Yymd')
    self.G = self.R.G
    # Every word production (RHS under a 'W*' key), for DAWG lookup.
    Wlist = [x for k,v in self.G.items() for x in v if k.startswith('W')]
    self.Wdawg = IntDAWG(Wlist)
    for k,v in self.G.items():
        # Every rule group must carry its total frequency mass; synthesize
        # it if the loaded grammar lacks one.
        if '__total__' not in v:
            print '__total__ should be there in the keys!!. I am adding one.'
            v['__total__'] = sum(v.values())
    if 'T' in self.G:
        # Date (T) rules, excluding the bookkeeping '__total__' entry.
        self.date = Date(T_rules=[x for x in self.G['T'].keys() if x != '__total__'])
    self.freeze = True
def test_dawg(filename, wlist):
    """Sanity-check a saved IntDAWG: every (key, value) pair in `wlist`
    must round-trip through the file at `filename`.
    """
    dawg = IntDAWG().load(filename)
    for key, expected in wlist:
        assert dawg[str(key)] == expected