def __init__(self, root):
    self.__root = root
    self.__squares = []
    self.__tw_squares = [(0,0),(0,7),(0,14),(7,0),(7,14),(14,0),(14,7),(14,14)]
    self.__dw_squares = [(1,1),(2,2),(3,3),(4,4),(7,7),(10,10),(11,11),(12,12),(13,13),
                         (13,1),(12,2),(11,3),(10,4),(4,10),(3,11),(2,12),(1,13)]
    self.__tl_squares = [(1,5),(1,9),(5,1),(5,5),(5,9),(5,13),
                         (9,1),(9,5),(9,9),(9,13),(13,5),(13,9)]
    self.__dl_squares = [(0,3),(0,11),(2,6),(2,8),(3,0),(3,7),(3,14),(6,2),(6,6),
                         (6,8),(6,12),(7,3),(7,11),(8,2),(8,6),(8,8),(8,12),
                         (11,0),(11,7),(11,14),(12,6),(12,8),(14,3),(14,11)]
    self.__settled_tiles = {}  # maps coordinates to tile frames (persisted GUI state)
    self.__placed_tiles = {}
    self.__dawg = DAWG(open('ospd-us.txt').read().split('\n'))
    for i in range(Board.SIZE):
        self.__squares.append([])
        for j in range(Board.SIZE):
            if (i, j) in self.__tw_squares:
                type_ = Board.TRIPLE_WORD
            elif (i, j) in self.__dw_squares:
                type_ = Board.DOUBLE_WORD
            elif (i, j) in self.__tl_squares:
                type_ = Board.TRIPLE_LETTER
            elif (i, j) in self.__dl_squares:
                type_ = Board.DOUBLE_LETTER
            else:
                type_ = Board.NORMAL
            self.__squares[i].append((None, type_))
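# Minimal sketch (not part of the original Board code) of how the dictionary DAWG
# built above can be queried when validating a play. It assumes the `dawg` package
# and uses a tiny hand-made word list in place of 'ospd-us.txt'.
from dawg import DAWG

board_dict = DAWG(['cat', 'cats', 'dog'])
print('cats' in board_dict)           # True  -> "cats" is a playable word
print(board_dict.prefixes('catsup'))  # ['cat', 'cats'] -> dictionary words that start this string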
class NonT_W(NonT):
    sym, prod, prob = 'W', '', 0.0
    english_dawg = IntDAWG().load(GRAMMAR_PATH + 'words.dawg')
    chinese_dawg = IntDAWG().load(GRAMMAR_PATH + 'pinyin.dawg')
    total_f = english_dawg[u"__total__"] + chinese_dawg[u'__total__']
    l33t_replaces = DAWG.compile_replaces({
        '3': 'e', '4': 'a', '@': 'a',
        '$': 's', '0': 'o', '1': 'i',
        'z': 's'
    })

    def __init__(self, word):
        # the argument is the password (or fragment) to analyse
        # super(NonT_W, self).__init__()
        w = word.lower()
        dawg = []
        for d in [self.english_dawg, self.chinese_dawg]:
            # using the replace table, find keys similar to w; the returned
            # list's [0] element is the closest match
            k = d.similar_keys(w, self.l33t_replaces)
            if k:
                dawg.append((d, k[0]))
        # dawg now holds, for each dictionary, the key most similar to the password
        if dawg:
            # d[1] holds the matched key; the same string may appear in several dictionaries
            v = list(set([d[1] for d in dawg]))
            # if more than one distinct string matched, or the first one is not
            # purely alphabetic (??? can that even happen), give up
            if len(v) > 1 or not v[0].isalpha():
                return
            # the string occurred at least once; sum its count across the dictionaries
            v = v[0]
            f = sum([d[0][v] for d in dawg])
            self.prod = v
            self.sym = 'W%s' % get_nont_class('W', v)
            self.L = NonT_L(v, word)  # NonT_L analyses the capitalisation of the password
            # print(self.L)
            self.prob = self.L.prob * float(f) / self.total_f  # factor in the effect of the special characters

    def parse_tree(self):
        pt = ParseTree()
        pt.add_rule((self.sym, self.prod))
        pt.extend_rules(self.L.parse_tree())
        return pt

    def rule_set(self):
        rs = RuleSet()
        rs.add_rule(self.sym, self.prod)
        # rs.update_set(self.L.rule_set())
        return rs

    def __str__(self):
        return '%s: %s<%s> (%g)' % (self.sym, self.prod, self.L, self.prob)
class NonT_W(NonT):
    sym, prod, prob = 'W', '', 0.0
    thisdir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    word_dawg = IntDAWG().load('{}/dictionary1.1.dawg'.format(thisdir))
    fname_dawg = IntDAWG().load('{}/eng_dict.dawg'.format(thisdir))
    lname_dawg = IntDAWG().load('{}/eng_dict.dawg'.format(thisdir))
    total_f = word_dawg[u'__total__'] + \
        fname_dawg[u'__total__'] + \
        lname_dawg[u'__total__']
    l33t_replaces = DAWG.compile_replaces({
        '3': 'e', '4': 'a', '@': 'a',
        '$': 's', '0': 'o', '1': 'i',
        'z': 's'
    })

    def __init__(self, word):
        # super(NonT_W, self).__init__()
        w = unicode(word.lower())
        dawg = []
        for d in [
                self.word_dawg,
                # self.fname_dawg,
                # self.lname_dawg
        ]:
            k = d.similar_keys(w, self.l33t_replaces)
            if k:
                dawg.append((d, k[0]))
        if dawg:
            v = list(set([d[1] for d in dawg]))
            if len(v) > 1 or not v[0].isalpha():
                return
            v = v[0]
            f = sum([d[0][v] for d in dawg])
            self.prod = v
            self.sym = 'W%s' % get_nont_class('W', v)
            self.L = NonT_L(v, word)
            self.prob = self.L.prob * float(f) / self.total_f

    def parse_tree(self):
        pt = ParseTree()
        pt.add_rule((self.sym, self.prod))
        pt.extend_rules(self.L.parse_tree())
        return pt

    def rule_set(self):
        rs = RuleSet()
        rs.add_rule(self.sym, self.prod)
        rs.update_set(self.L.rule_set())
        return rs

    def __str__(self):
        return '%s: %s<%s> (%g)' % (self.sym, self.prod, self.L, self.prob)
class NonT_W(NonT):
    sym, prod, prob = 'W', '', 0.0
    word_dawg = IntDAWG().load('data/English_30000.dawg')
    fname_dawg = IntDAWG().load('data/facebook-firstnames-withcount.dawg')
    lname_dawg = IntDAWG().load('data/facebook-lastnames-withcount.dawg')
    total_f = word_dawg[u'__total__'] + \
        fname_dawg[u'__total__'] + \
        lname_dawg[u'__total__']
    l33t_replaces = DAWG.compile_replaces({
        '3': 'e', '4': 'a', '@': 'a',
        '$': 's', '0': 'o', '1': 'i',
        'z': 's'
    })

    def __init__(self, word):
        # super(NonT_W, self).__init__()
        w = unicode(word.lower())
        dawg = []
        for d in [self.word_dawg, self.fname_dawg, self.lname_dawg]:
            k = d.similar_keys(w, self.l33t_replaces)
            if k:
                dawg.append((d, k[0]))
        if dawg:
            v = list(set([d[1] for d in dawg]))
            if len(v) > 1 or not v[0].isalpha():
                return
            v = v[0]
            f = sum([d[0][v] for d in dawg])
            self.prod = v
            self.sym = 'W%s' % get_nont_class('W', v)
            self.L = NonT_L(v, word)
            self.prob = self.L.prob * float(f) / self.total_f

    def parse_tree(self):
        pt = ParseTree()
        pt.add_rule((self.sym, self.prod))
        pt.extend_rules(self.L.parse_tree())
        return pt

    def rule_set(self):
        rs = RuleSet()
        rs.add_rule(self.sym, self.prod)
        rs.update_set(self.L.rule_set())
        return rs

    def __str__(self):
        return '%s: %s<%s> (%g)' % (self.sym, self.prod, self.L, self.prob)
class NonT_W(NonT):
    sym, prod, prob = 'W', '', 0.0
    thisdir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    word_dawg = load_dawg('{}/data/English_30000.dawg.gz'.format(thisdir))
    fname_dawg = load_dawg('{}/data/facebook-firstnames-withcount.dawg.gz'
                           .format(thisdir))
    lname_dawg = load_dawg('{}/data/facebook-lastnames-withcount.dawg.gz'
                           .format(thisdir))
    total_f = word_dawg['__total__'] + fname_dawg['__total__'] + lname_dawg['__total__']
    l33t_replaces = DAWG.compile_replaces(L33T)

    def __init__(self, word):
        # super(NonT_W, self).__init__()
        w = str(word.lower())
        dawg = []
        for d in [self.word_dawg, self.fname_dawg, self.lname_dawg]:
            k = d.similar_keys(w, self.l33t_replaces)
            if k:
                dawg.append((d, k[0]))
        if dawg:
            v = list(set([d[1] for d in dawg]))
            if len(v) > 1 or not v[0].isalpha():
                return
            v = v[0]
            f = sum([d[0][v] for d in dawg])
            self.prod = v
            self.sym = 'W%s' % get_nont_class('W', v)
            self.L = NonT_L(v, word)
            self.prob = self.L.prob * float(f) / self.total_f

    def parse_tree(self):
        pt = ParseTree()
        pt.add_rule((self.sym, self.prod))
        pt.extend_rules(self.L.parse_tree())
        return pt

    def rule_set(self):
        rs = RuleSet()
        rs.add_rule(self.sym, self.prod)
        rs.update_set(self.L.rule_set())
        return rs

    def __str__(self):
        return '%s: %s<%s> (%g)' % (self.sym, self.prod, self.L, self.prob)
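# Hedged illustration (not part of NonT_W) of the similar_keys/compile_replaces
# pattern the classes above rely on: l33t characters in a password are mapped
# back to letters while the word is looked up. Assumes the `dawg` package; the
# word list and the password are made up.
from dawg import DAWG, IntDAWG

word_freqs = IntDAWG([(u'password', 100), (u'pass', 40)])
l33t = DAWG.compile_replaces({'3': 'e', '4': 'a', '@': 'a', '$': 's', '0': 'o', '1': 'i'})
matches = word_freqs.similar_keys(u'p@$$w0rd', l33t)
if matches:
    best = matches[0]        # expected: u'password'
    freq = word_freqs[best]  # its stored count (100 here)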
if __name__ == '__main__':
    assert len(sys.argv) == 2
    source_dir = sys.argv[1]
    if source_dir.endswith("/"):
        source_dir = source_dir[:-1]
    assert exists(source_dir)
    target_dir = source_dir + "_dawg"
    if exists(target_dir):
        os.rmdir(target_dir)  # assumes any previous output directory is empty
    makedirs(target_dir)
    source_files = listdir(source_dir)
    for filename in source_files:
        print filename
        with open(join(source_dir, filename), 'r') as input_file:
            contents = input_file.read()
            if filename == 'mappings':
                # copy source to destination unchanged
                with open(join(target_dir, 'mappings'), 'w') as output_file:
                    output_file.write(contents)
            else:
                with open(join(target_dir, filename + ".dawg"), 'w') as output_file:
                    lines = contents.split("\n")
                    d = DAWG(l for l in lines if len(l) > 0)
                    d.write(output_file)
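# Hedged companion sketch (not in the original script): reading one of the .dawg
# files written above back in, the way the rest of the code base loads them.
# The file name is illustrative.
from dawg import DAWG

d = DAWG().load('symbols.dawg')       # a file produced by the loop above
print('some-entry' in d)              # membership test
print(d.prefixes('some-entry-xyz'))   # stored entries that prefix a longer string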
class TrainedGrammar(object):
    l33t_replaces = DAWG.compile_replaces({
        '3': 'e', '4': 'a', '@': 'a',
        '$': 's', '0': 'o', '1': 'i',
        'z': 's'
    })

    def __init__(self, g_file=GRAMMAR_FILE, cal_cdf=False):
        self.cal_cdf = cal_cdf
        self.load(g_file)
        self.NonT_set = filter(lambda x: x.find('_') < 0, self.G.keys())

    def load(self, filename):
        self.G = json.load(open_(filename), object_pairs_hook=OrderedDict)
        for k, v in self.G.items():
            if self.cal_cdf:
                print_err("Calculating CDF!")
                lf = 0
                for l, f in v.items():
                    v[l] += lf
                    lf += f
                v['__total__'] = lf
            else:
                v['__total__'] = sum(v.values())
        # Create dawg/trie of the Wlist items for fast retrieval
        Wlist = [x for k, v in self.G.items() for x in v if k.startswith('W')]
        self.date = Date()
        self.Wdawg = IntDAWG(Wlist)

    def get_prob(self, l, r):
        f = self.G.get(l, {}).get(r, 0)
        return max(float(f) / self.G[l]['__total__'], 0.0)

    def isNonTerm(self, lhs):  # this means given lhs, rhs will be in NonT
        return lhs in self.NonT_set

    def get_actual_NonTlist(self, lhs, rhs):
        if lhs == 'G':
            # Don't include "W1,G", "D1,G" etc.
            if rhs.endswith(',G'):
                return []
            return rhs.split(',')
        elif lhs == 'T':
            return ['%s_%s' % (lhs, c) for c in (rhs.split(',') if ',' in rhs else rhs)]
        elif lhs == 'L':
            return ['%s_%s' % (lhs, c) for c in rhs]
        elif lhs in ['W', 'D', 'Y', 'R', 'K']:
            return []
        else:
            return []

    def get_freq(self, l, r):
        return self.G.get(l, {}).get(r, 0)

    def get_W_rule(self, word):
        w = unicode(word.lower())
        k = self.Wdawg.similar_keys(w, self.l33t_replaces)
        if k:
            k = k[0]
            L = NonT_L(k, word)
            sym = 'W%s' % get_nont_class('W', k)
            return (sym, [(k, L)], self.get_prob(sym, k))

    def get_T_rule(self, word):
        T = self.date.IsDate(word)
        if T:
            p = 10**(len(word) - 8)
            for r in T.tree:
                p *= self.get_prob(*r)
            p *= self.get_prob(*(T.get_rule()))
            return ('T', [(word, T)], p)

    def get_all_matches(self, word):
        rules = []
        for nt in self.NonT_set:
            if nt.startswith('W'):
                l = self.get_W_rule(word)
                if l:
                    rules.append(l)
            elif nt == 'T':
                l = self.get_T_rule(word)
                if l:
                    rules.append(l)
            else:
                f = self.G[nt].get(word, 0)
                if f > 0:
                    rules.append((nt, [(word)], float(f) / self.G[nt]['__total__']))
        rules = filter(lambda x: x and x[-1], rules)
        if rules:
            return max(rules, key=lambda x: x[-1])

    def join(self, r, s):
        not_startswith_L_T = lambda x: x and \
            not (x[0].startswith('L_') or x[0].startswith('T_'))
        if not_startswith_L_T(s) and not_startswith_L_T(r):
            k = ','.join([r[0], s[0]])
            p = r[-1] * s[-1]
            a = r[1] + s[1]
            return (k, a, p)

    def random_parse(self, word, try_num=3):
        """
        Returns a random parse of the word following the grammar.
        """
        # First: rejection sampling, the most inefficient version.
        # Break the word into random parts and then see if that parse exists.
        print "\n^^^^^^^^^^^_______________^^^^^^^^^^^^^^"
        if try_num < 0:
            print "I am very sorry. I could not parse this :(!!"
            return None
        # NO IDEA HOW TO randomly pick a parse tree!! @@TODO

    def parse(self, word):
        A = {}
        if not word:
            return ()
        for j in range(len(word)):
            for i in range(len(word) - j):
                A[(i, i + j)] = self.get_all_matches(word[i:j + i + 1])
                t = [A[(i, i + j)]]
                t.extend([self.join(A[(i, k)], A[(k + 1, i + j)])
                          for k in range(i, i + j)])
                if t:
                    A[(i, i + j)] = max(t, key=lambda x: x[-1] if x else 0)
                else:
                    A[(i, i + j)] = ()
                    # print "Not sure why it reached here. But it did!"
                    # print i, j, word[i: i+j+1]
        return A[(0, len(word) - 1)]

    def default_parse_tree(self, word):
        """
        Returns the default parse of a word.
        Default parse is G -> W1,G | D1,G | Y1,G | W1 | D1 | Y1
        This parses any string over the allowed alphabet and returns a
        left-to-right traversed parse tree.
        """
        pt = ParseTree()
        n = len(word)
        for i, c in enumerate(word):
            r = whatchar(c) + '1'
            if i < n - 1:
                r = r + ',G'
            pt.add_rule(('G', r))
            pt.add_rule((r[:2], c.lower()))
            if r.startswith('W'):
                nont_l = NonT_L(c, c)
                pt.extend_rules(nont_l.parse_tree())
        return pt

    def l_parse_tree(self, word):  # leftmost parse-tree
        pt = ParseTree()
        p = self.parse(word)
        if not p:
            print "Failing at ", word.encode('utf-8')
            return pt
        # assert p[0] in self.G['G'], "Wrong rule: {} --> {}".format('G', p[0])
        if p[0] not in self.G['G']:
            return self.default_parse_tree(word)
        pt.add_rule(('G', p[0]))
        for l, each_r in zip(p[0].split(','), p[1]):
            if isinstance(each_r, basestring):
                pt.add_rule((l, each_r))
            elif l.startswith('W'):
                pt.add_rule((l, each_r[0]))
                L_parse_tree = each_r[1].parse_tree()
                pt.add_rule(L_parse_tree[0])
                if len(L_parse_tree.tree) > 1:
                    pt.tree.extend(L_parse_tree[1][1])
            elif l == 'T':
                p = each_r[1]
                rule_name = ','.join([r[0].replace('T_', '') for r in p])
                pt.add_rule((l, rule_name))
                pt.extend_rules(p)
            else:
                print "Something is severely wrong"
        return pt

    def rule_set(self, word):
        rs = RuleSet()
        pt = self.l_parse_tree(word)
        for p in pt.tree:
            rs.add_rule(*p)
        return rs

    def encode_rule(self, l, r):
        rhs_dict = self.G[l]
        try:
            i = rhs_dict.keys().index(r)
            if DEBUG:
                c = rhs_dict.keys()[i]
                assert c == r, "The index is wrong"
        except ValueError:
            print "'{}' not in the rhs_dict (l: '{}', rhs_dict: {})".format(r, l, self.G[l])
            raise ValueError
        l_pt = sum(rhs_dict.values()[:i])
        r_pt = l_pt + rhs_dict[r] - 1
        return convert2group(random.randint(l_pt, r_pt), rhs_dict['__total__'])

    def encode_pw(self, pw):
        pt = self.l_parse_tree(pw)
        try:
            code_g = [self.encode_rule(*p) for p in pt]
        except ValueError:
            print "Error in encoding: \"{}\"".format(pw)
            raise ValueError
            return []
        extra = hny_config.PASSWORD_LENGTH - len(code_g)
        code_g.extend([convert2group(0, 1) for x in range(extra)])
        return code_g

    def decode_rule(self, l, p):
        rhs_dict = self.G[l]
        if not rhs_dict:
            return ''
        assert '__total__' in rhs_dict, "__total__ not in {!r}, l={!r}"\
            .format(rhs_dict, l)
        p %= rhs_dict['__total__']
        if self.cal_cdf:
            if len(rhs_dict) > 1000:
                print_once(l, len(rhs_dict))
            return bin_search(rhs_dict.items(), p, 0, len(rhs_dict))
        for k, v in rhs_dict.items():
            if p < v:
                return k
            else:
                p -= v
        print "Alas, could not find.", l, p

    def decode_l33t(self, w, iterp):
        l = self.decode_rule('L', iterp.next())
        if l == 'Caps':
            return w.capitalize()
        elif l == 'lower':
            return w.lower()
        elif l == 'UPPER':
            return w.upper()
        else:
            nw = ''.join([self.decode_rule('L_%s' % c, iterp.next()) for c in w])
            return nw

    def decode_pw(self, P):
        assert len(P) == hny_config.PASSWORD_LENGTH, \
            "Not correct length to decode, Expecting {}, got {}"\
            .format(hny_config.PASSWORD_LENGTH, len(P))
        iterp = iter(P)
        plaintext = ''
        stack = ['G']
        while stack:
            lhs = stack.pop()
            rhs = self.decode_rule(lhs, iterp.next())
            if lhs in ['G', 'T', 'W', 'R', 'Y', 'D']:
                arr = rhs.split(',') if lhs != 'T' \
                    else ['T_%s' % c for c in rhs.split(',')]
                arr.reverse()
                stack.extend(arr)
            elif lhs.startswith('W'):
                rhs = self.decode_l33t(rhs, iterp)
                plaintext += rhs
            else:
                plaintext += rhs
        return plaintext

    def encode_grammar(self, G):
        """
        Encodes a sub-grammar @G under the current grammar.
        """
        vd = VaultDistPCFG()
        stack = ['G']
        code_g = []
        done = list(G.default_keys())
        while stack:
            head = stack.pop()
            assert head not in done, "head={} already in done={}".format(head, done)
            done.append(head)
            rule_dict = G[head]
            t_set = []
            for rhs, f in rule_dict.items():
                if rhs != '__total__':
                    r = filter(lambda x: x not in done + stack,
                               self.get_actual_NonTlist(head, rhs))
                    if r:
                        for x in r:
                            if x not in t_set:
                                t_set.append(x)
            t_set.reverse()
            stack.extend(t_set)
            n = len(rule_dict.keys()) - 1
            code_g.append(vd.encode_vault_size(head, n))
            if n < 0:
                print "Sorry I cannot encode your password ('{}')! \nPlease choose"\
                    " something different, like password12".format((head, rule_dict.keys()))
                exit(0)
            assert n == vd.decode_vault_size(head, code_g[-1]), "Vault size encoding mismatch. "\
                "\nhead: \"{}\", code_g: {}, n: {}, decoded_vault_size: {}"\
                .format(head, code_g[-1], n, vd.decode_vault_size(head, code_g[-1]))
            code_g.extend([self.encode_rule(head, r)
                           for r in rule_dict.keys() if r != '__total__'])
        extra = hny_config.HONEY_VAULT_GRAMMAR_SIZE - len(code_g)
        code_g.extend([convert2group(0, 1) for x in range(extra)])
        return code_g

    def decode_grammar(self, P):
        """
        Decodes a sub-grammar under self.G using the random numbers from P.
        """
        g = SubGrammar(self)
        vd = VaultDistPCFG()
        iterp = iter(P)
        stack = ['G']
        done = []
        while stack:
            head = stack.pop()
            assert head not in done, "@Head ({}) in @done. It should not!".format(head)
            done.append(head)
            p = iterp.next()
            # print "RuleSizeDecoding:", head, done
            n = vd.decode_vault_size(head, p)
            t_set = []
            for x in range(n):
                rhs = self.decode_rule(head, iterp.next())
                # print "Decoding:", stack, head, '==>', rhs
                if rhs != '__total__':
                    r = filter(lambda x: x not in done + stack,
                               self.get_actual_NonTlist(head, rhs))
                    if r:
                        for x in r:
                            if x not in t_set:
                                t_set.append(x)
                g.add_rule(head, rhs)
            t_set.reverse()
            stack.extend(t_set)
        g.finalize()  # fixes the frequencies and some other bookkeeping
        return g

    def __getitem__(self, l):
        return self.G[l]

    def __contains__(self, k):
        return k in self.G

    def is_grammar(self):
        return bool(self.G['G'])

    def __str__(self):
        return json.dumps(self.G['G'], indent=2)

    def nonterminals(self):
        return self.G.keys()
class Grammer(object):
    """
    Purpose: handles PCFG-based encoding and decoding.
    Attributes:
        G              OrderedDict holding all the rules
        l33t_replaces  the table of allowed character substitutions
    Methods:
        parse: analyses what a password is composed of
    """
    G = ''
    l33t_replaces = DAWG.compile_replaces({
        '3': 'e', '4': 'a', '@': 'a',
        '$': 's', '0': 'o', '1': 'i',
        'z': 's'
    })

    def __init__(self, filename, cfg=False):
        self.cal_cdf = cfg
        self.load(filename)
        self.Non_set = []
        for each in self.G:
            # keep only keys without an underscore (the non-terminals)
            if each.find('_') < 0:
                self.Non_set.append(each)

    # Load the grammar file and tally the totals for every rule's symbol.
    def load(self, filename):
        self.G = json.load(open_(filename), object_pairs_hook=OrderedDict)
        # iterate over the grammar's key/value pairs
        for k, v in self.G.items():
            if self.cal_cdf:
                # print_err("Calculating CDF!")
                # lf is the running count for the current rule; each entry would
                # accumulate the counts before it (so entry (5) minus entry (0)
                # gives the number of occurrences of rules 1..5)
                lf = 0
                for l, f in v.items():
                    # v[l] += lf
                    lf += f
                v['__total__'] = lf
            else:
                v['__total__'] = sum(v.values())
        # collect every W (word) production; they are used later when generating strings
        self.Wlist = []
        for k, D in self.G.items():
            if k.startswith('W'):
                self.Wlist.extend([x for x in D])
        # set up the Date helper to manage the date rules
        self.date = Date()
        # build a DAWG over the word list to manage the word rules
        self.Wlist = IntDAWG(self.Wlist)

    # Return the probability of a (sym, [word], prod)-style rule.
    def getProb(self, l, r):
        f = self.G.get(l, {}).get(r, 0)
        return max(float(f) / self.G[l]['__total__'], 0.0)

    # Method: find the rule that most likely matches the password.
    # Returns a tuple that acts like a parse node:
    #   word rules:  ('W1', [(similar_keys, NonT_L)], prob)
    #   date rules:  ('T',  [(passwd, Date)], prob)
    #   otherwise:   (sym,  [(passwd)], prob)
    def genRuleMatches(self, passwd):
        # holds every candidate rule
        l = []
        # first, figure out which rules this piece of the password could belong to
        for rule in self.Non_set:
            # ===================== word rules =====================
            if rule.startswith('W'):
                # look the password up in the DAWG built earlier
                k = self.Wlist.similar_keys(passwd.lower(), self.l33t_replaces)
                # take the most similar key as the rule
                if k:
                    sym = "W%s" % (get_nont_class('W', passwd))
                    prod = NonT_L(k, passwd)
                    prob = self.getProb(sym, passwd.lower())
                    l.append((sym, [(k[0], prod)], prob))
            # ===================== date rules =====================
            elif rule.startswith('T'):
                # hand the password to the Date helper; for a date it returns
                # something like [('T_Y', '2013'), ('T_m', '10'), ('T_d', '26')]
                T = self.date.IsDate(passwd)
                if T:
                    sym = 'T'
                    prod = (passwd, T)
                    prob = 10**(len(passwd) - 8)
                    for each in T:
                        prob *= self.getProb(*each)
                    # print((sym, [prod], prob))
                    l.append((sym, [prod], prob))
            # any other kind of rule has no child nodes
            else:
                # only the probability of occurrence is needed
                f = self.G[rule].get(passwd, 0)
                if f > 0:
                    l.append((rule, [(passwd)],
                              float(f) / self.G[rule]['__total__']))
        # finally, return the candidate with the highest probability
        temp_prob = 0
        tu = ()
        for each in l:
            if temp_prob < each[2]:
                tu = each
                temp_prob = each[2]
        return tu

    def not_startswith_L_T(self, passwd):
        if passwd:
            if passwd[0].startswith('L_') or passwd[0].startswith('T_'):
                return False
            else:
                return True
        else:
            return passwd

    # Method: join two different parse nodes into one.
    def join(self, l, r):
        # join them as long as neither is one of the special L_/T_ nodes
        if self.not_startswith_L_T(l) and self.not_startswith_L_T(r):
            sym = ','.join([l[0], r[0]])
            prob = l[-1] * r[-1]
            prod = l[1] + r[1]
            return (sym, prod, prob)

    def parse(self, passwd):
        # make sure the input string is not empty
        if not passwd:
            return ''
        nonTRule = {}
        # Analyse the string bottom-up: compute the best rule for every substring,
        # then combine them (a divide-and-conquer / dynamic-programming idea).
        index = 0
        first = True
        for rep in range(len(passwd)):
            for start in range(len(passwd) - rep):
                index += 1
                # 1. (divide) analyse the substring on its own
                #    (two-dimensional DP; rep is the number of characters spanned)
                nonTRule[(start, start + rep)] = self.genRuleMatches(
                    passwd[start:start + rep + 1])
                rule_list = []
                rule_list.append(nonTRule[(start, start + rep)])
                # 2. (combine) try every split point and record the probability
                #    of joining the two halves
                for bet in range(start, start + rep):
                    temp_non = self.join(nonTRule[(start, bet)],
                                         nonTRule[(bet + 1, start + rep)])
                    rule_list.append(temp_non)
                # 3. (choose) keep the most probable rule as the value for
                #    [start:start+rep+1]
                # temp = filter(lambda k: k, rule_list)
                # remember the most likely rule for this span
                if rule_list:
                    nonTRule[(start, start + rep)] = max(
                        rule_list, key=lambda x: x[-1] if x else 0)
                    # print(nonTRule[(start, start+rep)])
                else:
                    nonTRule[(start, start + rep)] = ()
        return nonTRule[(0, len(passwd) - 1)]

    # Fallback parser: handle passwords made of simple rules (e.g. 123456) or
    # anything the main parser cannot explain.
    def defaultPasswordParse(self, word):
        # every password is parsed as G -> W1,G | D1,G | Y1,G | W1 | D1 | Y1
        pt = ParseTree()
        n = len(word)
        for i, c in enumerate(word):
            r = whatchar(c) + '1'
            # if i < n-1:
            #     r = r + ',G'
            pt.add_rule(('G', r))
            pt.add_rule((r[:2], c.lower()))
            if r.startswith('W'):
                nont_l = NonT_L(c, c)
                pt.extend_rules(nont_l.parse_tree())
        return pt

    # Parser: turn the password into the parse tree we need, then look the
    # required values up in the cfg.
    def lParseTree(self, passwd):
        pt = ParseTree()
        rule = self.parse(passwd)
        print("our rule is ")
        print(rule)
        # an empty result means the parse failed; log the password
        if not rule:
            print("Failed encode %s" % passwd)
            return pt
        # if the rule is not a G production, i.e. a simple password, fall back
        # to the simple parser
        if rule[0] not in self.G['G']:
            return self.defaultPasswordParse(passwd)
        # otherwise start with the top-level rule
        pt.add_rule(('G', rule[0]))
        # then walk every sub-rule and its production and insert them into the parse tree
        for sym, rhs in zip(rule[0].split(','), rule[1]):
            # if the rule is neither W nor T, rhs should just be a string
            if isinstance(rhs, str):
                # insert the rule directly
                pt.add_rule((sym, rhs))
            # a W rule carries (similar_keys_list, NonT_L); record the closest
            # match first, then the capitalisation state
            elif sym.startswith('W'):
                pt.add_rule((sym, rhs[0]))
                # parse_tree() gives the capitalisation sub-tree of the word
                ltree = rhs[1].parse_tree()
                # insert its top rule first
                pt.add_rule(ltree[0])
                # for the 'l33t' rule the sub-tree also records the characters
                # that may have been substituted; add those as well (they are
                # already packaged)
                if len(ltree) > 1:
                    pt.tree.extend(ltree[1][1])
            # a T rule looks like ('T', [('T_Y', '1993'), ...], ...)
            elif sym.startswith('T'):
                # convert to the format used inside the cfg file
                temp_sym = ''
                for each_label in rhs[1]:
                    temp_sym += each_label[0].replace("T_", "")
                pt.add_rule((sym, temp_sym))
                # then add the remaining nodes
                pt.extend_rules(rhs[1])
            else:
                print("we can't figure out this word")
        # done
        return pt

    # Core encoding routine: replace the password with a list of numbers.
    def encode_password(self, password):
        # first build the parse tree of the password
        ptree = self.lParseTree(password)
        print("our password is ", end='')
        print(ptree)
        if not ptree:
            print("encode failed, change")
        # then map the parse tree onto numbers
        encd = []
        # print(ptree)
        for each_node in ptree:
            try:
                encd.append(self.encode_encd(*each_node))
                # print(encd)
            except ValueError as e:
                print("Error in encoding: \"{}\"".format(password))
                print(e)
                return []
        # the encoding may be shorter than the fixed password length (the
        # password itself is short), so pad it with filler values
        length = PASSWORD_MAX_LENGTH - len(encd)
        # if length still equals the full size, the encoding failed; return an empty list
        if length == PASSWORD_MAX_LENGTH:
            return []
        for i in range(length):
            encd.append(convert2group(0, 1))
        # mapping finished; return the encoded numbers
        return encd

    # Range encoder: pick a random number inside the bucket assigned to a rule.
    def encode_encd(self, l, r):
        # temporary dict holding the productions of rule l
        rhs_dict = self.G[l]
        # print(rhs_dict[r])
        # find the index of r
        i = list(rhs_dict.keys()).index(r)
        # sum the frequencies of everything before it to get the left edge
        l_hs = 0
        r_hs = 0
        for each_index in range(i):
            l_hs += list(rhs_dict.values())[each_index]
        # the right edge of the bucket
        r_hs = l_hs + rhs_dict[r] - 1
        # draw a random number between the two edges (inclusive)
        rn = random.randint(l_hs, r_hs)
        # print("l_hs is %d, r_hs is %d and the random is %d" % (l_hs, r_hs, rn))
        # wn = rn + random.randint(0, int((3000000-rn)/rhs_dict['__total__'])) * rhs_dict['__total__']
        wn = convert2group(rn, rhs_dict['__total__'])
        # print("the wn is %d and it comes back as %d" % (wn, wn % rhs_dict['__total__']))
        return wn

    # Range decoder.
    def decode_encd(self, l, r):
        # temporary dict holding the productions of rule l
        rhs_dict = self.G[l]
        # check that this rule has any productions (it usually does, but a
        # failed decode might not)
        if not rhs_dict:
            return ''
        # __total__ must be present, otherwise we cannot decode
        assert '__total__' in self.G[l], "The __total__ was lost in {!r}, l = {!r}"\
            .format(rhs_dict, l)
        # compute the position of the value inside the bucket
        index = r % rhs_dict['__total__']
        # print("the r is %d, index is %d" % (r, index))
        # choose the lookup strategy
        # if self.cal_cdf:
        #     # if this rule is large, log the lookup (not sure this is needed)
        #     if len(rhs_dict) > 1000:
        #         print_once(l, len(rhs_dict))
        #     # binary search for a fast lookup
        #     return bin_search(list(rhs_dict.items()), index, 0, len(rhs_dict))
        # otherwise fall back to the slower linear scan
        for k, t in rhs_dict.items():
            if index < t:
                return k
            else:
                index -= t
        # reaching this point means nothing was found; inspect the input
        print("not find the rule! l is %s and r is %d" % (l, r))
        return ''

    # Attempt to decode.
    def decode_password(self, passwd):
        """
        Function: decode the encoded sequence of random numbers.
        How: undo the modulo mapping to recover each original rule, use the G
        node to locate the previously encoded pieces of the password, and decode
        them in order.
        Important variables:
            stack:     holds the pending nodes
            plaintext: holds the decoded string
            lhs:       the parent node; it always carries a rule, never a string
            rhs:       the child node; either the next node or a string
        """
        if not passwd:
            return ''
        # decoding works much like a stack machine
        # start with a list (switch to a real stack if this works out)
        stack = []
        # push the first node (always G, even for passwords that matched no rule)
        stack.append('G')
        plaintext = ''
        index = 0
        # loop until the whole password has been decoded
        while len(stack) > 0:
            lhs = stack.pop()
            # decode the current node and get its value
            rhs = self.decode_encd(lhs, passwd[index])
            index += 1
            # inspect the decoded node
            # if it is an ordinary node (not something like T_y or L_s)
            if lhs in ['G', 'Y', 'R', 'D', 'T', 'W']:
                # its children follow directly
                if lhs == 'T':
                    # !! may be wrong !!
                    sym = ['T_%s' % c for c in rhs]
                # an ordinary node is followed by ordinary rules separated by ','
                else:
                    # print("the rhs is %s" % rhs)
                    sym = rhs.split(',')
                # either way, reverse them (the stack is LIFO)
                sym.reverse()
                # and push them
                stack.extend(sym)
            # if the node is a word node, the right-hand side is not fully
            # restored yet; some characters may still need substituting, which
            # the dedicated branch below handles
            elif lhs.startswith('W'):
                # the next number is consumed here because it always encodes
                # the capitalisation decision
                l = self.decode_encd('L', passwd[index])
                index += 1
                # branch on the capitalisation type
                if l == "lower":
                    plaintext += rhs
                elif l == "Caps":
                    plaintext += rhs.capitalize()
                elif l == "UPPER":
                    plaintext += rhs.upper()
                # for l33t every character was encoded individually; decode each one
                elif l == "l33t":
                    for c in rhs:
                        plaintext += self.decode_encd('L_%s' % c, passwd[index])
                        index += 1
            # otherwise this is already a terminal node
            else:
                plaintext += rhs
        return plaintext
class TrainedGrammar(object):
    l33t_replaces = DAWG.compile_replaces({
        '3': 'e', '4': 'a', '@': 'a',
        '$': 's', '0': 'o', '1': 'i',
        'z': 's'
    })

    def __init__(self, g_file=grammar_file, cal_cdf=False):
        self.cal_cdf = cal_cdf
        self.load(g_file)
        self.NonT_set = filter(lambda x: x.find('_') < 0, self.G.keys())

    def load(self, filename):
        self.G = json.load(open_(filename), object_pairs_hook=OrderedDict)
        for k, v in self.G.items():
            if self.cal_cdf:
                print_err("Calculating CDF!")
                lf = 0
                for l, f in v.items():
                    v[l] += lf
                    lf += f
                v['__total__'] = lf
            else:
                v['__total__'] = sum(v.values())
        Wlist = [x for k, v in self.G.items() for x in v if k.startswith('W')]
        self.date = Date()
        self.Wdawg = IntDAWG(Wlist)

    def get_prob(self, l, r):
        f = self.G.get(l, {}).get(r, 0)
        if f > 0:
            return float(f) / self.G[l]['__total__']

    def isNonTerm(self, lhs):  # this means given lhs, rhs will be in NonT
        return lhs in self.NonT_set

    def get_actual_NonTlist(self, lhs, rhs):
        if lhs == 'G':
            return rhs.split(',')
        elif lhs == 'T':
            return ['%s_%s' % (lhs, c) for c in rhs.split(',')]
        elif lhs == 'L':
            return ['%s_%s' % (lhs, c) for c in rhs]
        else:
            return []

    def get_freq(self, l, r):
        return self.G.get(l, {}).get(r, 0)

    def get_W_rule(self, word):
        w = unicode(word.lower())
        k = self.Wdawg.similar_keys(w, self.l33t_replaces)
        if k:
            k = k[0]
            L = NonT_L(k, word)
            sym = 'W%s' % get_nont_class('W', k)
            return (sym, [(k, L)], self.get_prob(sym, k))

    def get_T_rule(self, word):
        T = self.date.IsDate(word)
        if T:
            p = 10**(len(word) - 8)
            # for r in T.tree:
            #     p *= self.get_prob(*r)
            # p *= self.get_prob(*(T.get_rule()))
            return ('T', [(word, T)], p)

    def get_all_matches(self, word):
        rules = []
        for nt in self.NonT_set:
            if nt.startswith('W'):
                l = self.get_W_rule(word)
                if l:
                    rules.append(l)
            elif nt == 'T':
                l = self.get_T_rule(word)
                if l:
                    rules.append(l)
            else:
                f = self.G[nt].get(word, 0)
                if f > 0:
                    rules.append(
                        (nt, [(word)], float(f) / self.G[nt]['__total__']))
        rules = filter(lambda x: x and x[-1], rules)
        if rules:
            return max(rules, key=lambda x: x[-1])

    def join(self, r, s):
        not_startswith_L_T = lambda x: x and \
            not (x[0].startswith('L_') or x[0].startswith('T_'))
        if not_startswith_L_T(s) and not_startswith_L_T(r):
            k = ','.join([r[0], s[0]])
            p = r[-1] * s[-1]
            a = r[1] + s[1]
            return (k, a, p)

    def parse(self, word):
        A = {}
        for j in range(len(word)):
            for i in range(len(word) - j):
                A[(i, i + j)] = self.get_all_matches(word[i:j + i + 1])
                t = [A[(i, i + j)]]
                t.extend([
                    self.join(A[(i, k)], A[(k + 1, i + j)])
                    for k in range(i, i + j)
                ])
                if t:
                    A[(i, i + j)] = max(t, key=lambda x: x[-1] if x else 0)
                else:
                    A[(i, i + j)] = ()
                    # print "Not sure why it reached here. But it did!"
                    # print i, j, word[i: i+j+1]
        return A[(0, len(word) - 1)]

    def l_parse_tree(self, word):  # leftmost parse-tree
        pt = ParseTree()
        p = self.parse(word)
        if not p:
            print "Failing at ", word.encode('utf-8')
            return pt
        pt.add_rule(('G', p[0]))
        for l, each_r in zip(p[0].split(','), p[1]):
            if isinstance(each_r, basestring):
                pt.add_rule((l, each_r))
            elif l.startswith('W'):
                pt.add_rule((l, each_r[0]))
                L_parse_tree = each_r[1].parse_tree()
                pt.add_rule(L_parse_tree[0])
                if len(L_parse_tree.tree) > 1:
                    pt.tree.extend(L_parse_tree[1][1])
            elif l == 'T':
                p = each_r[1]
                rule_name = ','.join([r[0].replace('T_', '') for r in p])
                pt.add_rule((l, rule_name))
                pt.extend_rules(p)
            else:
                print "Something is severely wrong"
        return pt

    def rule_set(self, word):
        rs = RuleSet()
        pt = self.l_parse_tree(word)
        for p in pt.tree:
            rs.add_rule(*p)
        return rs

    def encode_rule(self, l, r):
        rhs_dict = self.G[l]
        i = rhs_dict.keys().index(r)
        assert i >= 0
        l_pt = sum(rhs_dict.values()[:i])
        r_pt = l_pt + rhs_dict[r] - 1  # the bucket [l_pt, r_pt] is inclusive at both ends
        return convert2group(random.randint(l_pt, r_pt), rhs_dict['__total__'])

    def encode_pw(self, pw):
        pt = self.l_parse_tree(pw)
        code_g = [self.encode_rule(*p) for p in pt]
        extra = hny_config.PASSWORD_LENGTH - len(code_g)
        code_g.extend([convert2group(0, 1) for x in range(extra)])
        return code_g

    def decode_rule(self, l, p):
        rhs_dict = self.G[l]
        p %= rhs_dict['__total__']
        if self.cal_cdf:
            if len(rhs_dict) > 1000:
                print_once(l, len(rhs_dict))
            return bin_search(rhs_dict.items(), p, 0, len(rhs_dict))
        for k, v in rhs_dict.items():
            if p < v:
                return k
            else:
                p -= v
        print "Alas, could not find.", l, p

    def decode_l33t(self, w, iterp):
        l = self.decode_rule('L', iterp.next())
        if l == 'Caps':
            return w.capitalize()
        elif l == 'lower':
            return w.lower()
        elif l == 'UPPER':
            return w.upper()
        else:
            nw = ''.join(
                [self.decode_rule('L_%s' % c, iterp.next()) for c in w])
            return nw

    def decode_pw(self, P):
        assert len(P) == hny_config.PASSWORD_LENGTH
        iterp = iter(P)
        plaintext = ''
        stack = ['G']
        while stack:
            lhs = stack.pop()
            rhs = self.decode_rule(lhs, iterp.next())
            if lhs in ['G', 'T']:
                arr = rhs.split(',') if lhs == 'G' \
                    else ['T_%s' % c for c in rhs.split(',')]
                arr.reverse()
                stack.extend(arr)
            elif lhs.startswith('W'):
                rhs = self.decode_l33t(rhs, iterp)
                plaintext += rhs
            else:
                plaintext += rhs
        return plaintext

    def __getitem__(self, l):
        return self.G[l]

    def __contains__(self, k):
        return k in self.G

    def is_grammar(self):
        return bool(self.G['G'])
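# Illustrative, self-contained sketch (not taken from the classes above) of the
# bucket encoding that encode_rule/decode_rule implement: each production owns a
# range of size freq inside [0, total); encoding draws a random point in that
# range and then lifts it by a random multiple of total (the role convert2group
# plays), so decoding only has to reduce modulo total and scan the cumulative
# counts. The `ceiling` value is a made-up stand-in for the real group size.
import random


def _encode(rhs_dict, r, ceiling=2**20):
    keys = list(rhs_dict)
    i = keys.index(r)
    lo = sum(rhs_dict[k] for k in keys[:i])
    hi = lo + rhs_dict[r] - 1
    total = sum(rhs_dict.values())
    point = random.randint(lo, hi)
    return point + random.randint(0, (ceiling - point) // total) * total


def _decode(rhs_dict, code):
    p = code % sum(rhs_dict.values())
    for k, v in rhs_dict.items():
        if p < v:
            return k
        p -= v


_rules = {'password': 5, 'letmein': 3, '123456': 2}
assert _decode(_rules, _encode(_rules, 'letmein')) == 'letmein'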
class Solve:
    def __init__(self):
        with open('enable1.txt', 'r') as file:
            self.valid_scrabble_words = set()
            # could use chain.from_iterable here when we start using wildcards again
            for string in file:
                # self.valid_scrabble_words |= self.wildcard_it(string.strip())
                self.valid_scrabble_words.add(string.strip())
        self.scrabble_tile_frequencies = {'e': 12, 'a': 9, 'i': 9, 'o': 8, 'n': 6, 'r': 6, 't': 6, 'l': 4,
                                          's': 4, 'u': 4, 'd': 4, 'g': 3, 'b': 2, 'c': 2, 'm': 2, 'p': 2,
                                          'f': 2, 'h': 2, 'v': 2, 'w': 2, 'y': 2, 'k': 1, 'j': 1, 'x': 1,
                                          'q': 1, 'z': 1}
        # dummy tiles representing wildcards
        self.scrabble_tile_frequencies.update(dict.fromkeys("ABCDEFGHIJKLMNOPQRSTUVWXYZ", 2))
        self.scrabble_tiles = [tile for tile in self.scrabble_tile_frequencies
                               for x in range(self.scrabble_tile_frequencies[tile])]
        self.test_solution = ""
        # wildcard tiles, which are represented by uppercase letters, will default to a value of 0
        self.letter_scores = defaultdict(int, {'e': 1, 'a': 1, 'i': 1, 'o': 1, 'n': 1, 'r': 1, 't': 1,
                                               'l': 1, 's': 1, 'u': 1, 'd': 2, 'g': 2, 'b': 3, 'c': 3,
                                               'm': 3, 'p': 3, 'f': 4, 'h': 4, 'v': 4, 'w': 4, 'y': 4,
                                               'k': 5, 'j': 8, 'x': 8, 'q': 10, 'z': 10})
        if not isfile("word scores.pkl"):
            with Pool(8) as p:
                self.word_scores = dict(
                    zip(self.valid_scrabble_words,
                        p.map(self.string_score_2, self.valid_scrabble_words)))
            with open("word scores.pkl", 'wb') as file:
                dump(self.word_scores, file, HIGHEST_PROTOCOL)
        else:
            with open("word scores.pkl", 'rb') as file:
                self.word_scores = load(file)
        if not isfile('word graph.dawg'):
            self.word_graph = DAWG(self.valid_scrabble_words)
            self.word_graph.save('word graph.dawg')
        else:
            self.word_graph = DAWG().load('word graph.dawg')

    def reset(self):
        self.test_solution = ""
        self.scrabble_tiles = [tile for tile in self.scrabble_tile_frequencies
                               for x in range(self.scrabble_tile_frequencies[tile])]

    def wildcard_it(self, string):
        return {str(string[:x].lower() + string[x:].capitalize())[:y] +
                str(string[:x].lower() + string[x:].capitalize())[y:].capitalize()
                for x in range(len(string)) for y in range(x, len(string))}

    def string_score(self, solution):
        """solution is a string that is worth points
        returns the point value of the string including subwords"""
        return sum(self.word_scores[word] for word in self.words_in_string(solution))

    def string_score_2(self, solution):
        """solution is a string that is worth points
        returns the point value of the string NOT including subwords"""
        return sum(self.letter_scores[letter] for letter in solution)

    def words_in_string(self, string):
        return {word for x in range(len(string)) for word in self.word_graph.prefixes(string[x:])}

    def evaluate_part(self, candidate_tiles):
        """candidate_tiles is a string
        returns a point-value of the string that candidate_tiles represent"""
        return self.string_score(self.test_solution + candidate_tiles)

    def make_solution_method_1(self):
        """returns a string that is worth as many points as possible"""
        def part_value(part):
            return part[1]

        def get_part(part):
            return part[0]

        with Pool(32) as p:
            while self.scrabble_tiles:
                # doesn't work with wildcard tiles represented by dummy tiles
                parts = ["".join(tiles) for tiles in set(permutations(self.scrabble_tiles, r=4))]
                possible_part_list = list(zip(parts, p.map(self.evaluate_part, parts)))
                best_part = max(possible_part_list, key=part_value)
                print(best_part)
                self.test_solution += get_part(best_part)
                for tile in get_part(best_part):
                    self.scrabble_tiles.remove(tile)
                print(self.test_solution)
        return self.test_solution

    def add_to_solution(self, part):
        self.test_solution += part
        for tile in part:
            # remove used tiles from bag of scrabble tiles
            if tile.isupper():
                for owned_tile in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
                    self.scrabble_tiles.remove(owned_tile)
            else:
                self.scrabble_tiles.remove(tile)

    def generate_word_combinations(self, words, max_length):
        """words is a list of words. max_length is a positive integer
        returns a generator of strings composed of permutations of words for each
        length up to the length specified by max_length"""
        return ("".join(word_tuple) for word_tuple in
                chain.from_iterable(permutations(words, r=length)
                                    for length in range(1, max_length + 1)))

    def get_feasible_parts(self, word_list):
        """returns the set of strings that can be made from the current set of tiles left"""
        current_tile_count = Counter(self.scrabble_tiles)
        return (words for words in word_list
                if all(current_tile_count[letter] >= Counter(words)[letter] for letter in words))

    def make_solution_method_2(self):
        """returns a string that is worth as many points as possible"""
        while self.scrabble_tiles:
            possible_part_list = self.get_feasible_parts(self.valid_scrabble_words)
            best_parts = nlargest(100, possible_part_list, self.evaluate_part)  # get top n words
            if best_parts:
                best_part = max(self.get_feasible_parts(self.generate_word_combinations(best_parts, 2)),
                                key=self.evaluate_part)
                self.add_to_solution(best_part)
                print(self.test_solution)
                print(self.string_score(self.test_solution))
            else:
                break
        return self.test_solution

    def make_solution_method_3(self):
        """returns a string that is worth as many points as possible"""
        while self.scrabble_tiles:
            possible_part_list = self.get_feasible_parts(self.valid_scrabble_words)
            best_parts = nlargest(int(20 - len(self.test_solution) / 5),
                                  possible_part_list, self.evaluate_part)  # get top n words
            if best_parts:
                best_part = choice(list(self.get_feasible_parts(self.generate_word_combinations(best_parts, 3))))
                self.add_to_solution(best_part)
                print(self.test_solution)
                print(self.string_score(self.test_solution))
            else:
                break
        return self.test_solution
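# Minimal driver sketch (not part of the class above) showing how Solve might be
# exercised. It assumes enable1.txt is present in the working directory, that the
# pickled word scores and the saved DAWG can be (re)built, and that method 2 is
# the entry point of interest.
if __name__ == '__main__':
    solver = Solve()
    solution = solver.make_solution_method_2()
    print(solution)
    print(solver.string_score(solution))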
def _count(sym2_id: defaultdict(set), pmid2_id: defaultdict(set)):
    # pruning: remove the "empty" symbol
    if '' in sym2_id:
        del sym2_id['']
    logging.info("initializing counters")
    symbols = {s: 0 for s in sym2_id.keys()}  # global count per symbol
    references = {}  # count per id & symbol in the referenced titles
    for sym, ids in sym2_id.items():
        for id_ in ids:
            if id_ in references:
                references[id_][sym] = 0
            else:
                references[id_] = {sym: 0}
    logging.info("initializing DAFSA graph")
    dawg = DAWG(sym2_id.keys())
    medline = MedlineSession()
    for pmid, known_ids in pmid2_id.items():
        logging.info("counting PMID %d", pmid)
        relevant = {}  # checked symbols
        while True:
            try:
                for (txt,) in medline.query(Section.content).filter(
                        Section.pmid == pmid).filter(
                        Section.name != 'Copyright').filter(
                        Section.name != 'Vernacular'):
                    offsets = set(TokenOffsets(txt))
                    # only attempt prefix matches at offsets
                    for idx in offsets:
                        keys = dawg.prefixes(txt[idx:])
                        if keys:
                            sym = keys[-1]
                            # only offset-delimited matches
                            if idx + len(sym) in offsets:
                                symbols[sym] += 1
                                if sym in relevant:
                                    if relevant[sym]:
                                        for id_ in known_ids & sym2_id[sym]:
                                            references[id_][sym] += 1
                                else:
                                    relevant[sym] = False
                                    for id_ in known_ids & sym2_id[sym]:
                                        references[id_][sym] += 1
                                        relevant[sym] = True
                break
            except DatabaseError:
                medline = MedlineSession()
    for _id, counts in references.items():
        for sym, count in counts.items():
            print("{}\t{}\t{}\t{}".format(_id, repr(sym)[1:-1], count, symbols[sym]))
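# Small stand-alone illustration (not from _count) of the longest-prefix lookup
# the loop above relies on: dawg.prefixes() returns every stored symbol that is a
# prefix of the text at a given offset, and the longest one is kept. Assumes the
# `dawg` package; the symbols are made up.
from dawg import DAWG

symbols = DAWG(['IL', 'IL-2', 'IL-2R'])
hits = symbols.prefixes('IL-2 receptor alpha')
longest = max(hits, key=len) if hits else None  # the original takes keys[-1]
print(hits)     # ['IL', 'IL-2']
print(longest)  # 'IL-2'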