def __init__(self, root):
    self.__root = root
    self.__squares = []
    self.__tw_squares = [(0,0),(0,7),(0,14),(7,0),(7,14),(14,0),(14,7),(14,14)]
    self.__dw_squares = [(1,1),(2,2),(3,3),(4,4),(7,7),(10,10),(11,11),(12,12),(13,13),
                         (13,1),(12,2),(11,3),(10,4),(4,10),(3,11),(2,12),(1,13)]
    self.__tl_squares = [(1,5),(1,9),(5,1),(5,5),(5,9),(5,13),
                         (9,1),(9,5),(9,9),(9,13),(13,5),(13,9)]
    self.__dl_squares = [(0,3),(0,11),(2,6),(2,8),(3,0),(3,7),(3,14),(6,2),(6,6),
                         (6,8),(6,12),(7,3),(7,11),(8,2),(8,6),(8,8),(8,12),
                         (11,0),(11,7),(11,14),(12,6),(12,8),(14,3),(14,11)]
    self.__settled_tiles = {}  # maps coordinates to tile frames (persisted GUI state)
    self.__placed_tiles = {}
    self.__dawg = DAWG(open('ospd-us.txt').read().split('\n'))
    for i in range(Board.SIZE):
        self.__squares.append([])
        for j in range(Board.SIZE):
            if (i, j) in self.__tw_squares:
                type_ = Board.TRIPLE_WORD
            elif (i, j) in self.__dw_squares:
                type_ = Board.DOUBLE_WORD
            elif (i, j) in self.__tl_squares:
                type_ = Board.TRIPLE_LETTER
            elif (i, j) in self.__dl_squares:
                type_ = Board.DOUBLE_LETTER
            else:
                type_ = Board.NORMAL
            self.__squares[i].append((None, type_))
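# Minimal sketch (not part of the original Board code) of how the dictionary DAWG
# built above can be queried when validating a play. It assumes the `dawg` package
# and uses a tiny hand-made word list in place of 'ospd-us.txt'.
from dawg import DAWG

board_dict = DAWG(['cat', 'cats', 'dog'])
print('cats' in board_dict)           # True  -> "cats" is a playable word
print(board_dict.prefixes('catsup'))  # ['cat', 'cats'] -> dictionary words that start this string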
class NonT_W(NonT):
    sym, prod, prob = 'W', '', 0.0
    english_dawg = IntDAWG().load(GRAMMAR_PATH + 'words.dawg')
    chinese_dawg = IntDAWG().load(GRAMMAR_PATH + 'pinyin.dawg')
    total_f = english_dawg[u"__total__"] + chinese_dawg[u'__total__']
    l33t_replaces = DAWG.compile_replaces({
        '3': 'e', '4': 'a', '@': 'a',
        '$': 's', '0': 'o', '1': 'i',
        'z': 's'
    })

    def __init__(self, word):
        # the argument is the password (or fragment) to analyse
        # super(NonT_W, self).__init__()
        w = word.lower()
        dawg = []
        for d in [self.english_dawg, self.chinese_dawg]:
            # using the replace table, find keys similar to w; the returned
            # list's [0] element is the closest match
            k = d.similar_keys(w, self.l33t_replaces)
            if k:
                dawg.append((d, k[0]))
        # dawg now holds, for each dictionary, the key most similar to the password
        if dawg:
            # d[1] holds the matched key; the same string may appear in several dictionaries
            v = list(set([d[1] for d in dawg]))
            # if more than one distinct string matched, or the first one is not
            # purely alphabetic (??? can that even happen), give up
            if len(v) > 1 or not v[0].isalpha():
                return
            # the string occurred at least once; sum its count across the dictionaries
            v = v[0]
            f = sum([d[0][v] for d in dawg])
            self.prod = v
            self.sym = 'W%s' % get_nont_class('W', v)
            self.L = NonT_L(v, word)  # NonT_L analyses the capitalisation of the password
            # print(self.L)
            self.prob = self.L.prob * float(f) / self.total_f  # factor in the effect of the special characters

    def parse_tree(self):
        pt = ParseTree()
        pt.add_rule((self.sym, self.prod))
        pt.extend_rules(self.L.parse_tree())
        return pt

    def rule_set(self):
        rs = RuleSet()
        rs.add_rule(self.sym, self.prod)
        # rs.update_set(self.L.rule_set())
        return rs

    def __str__(self):
        return '%s: %s<%s> (%g)' % (self.sym, self.prod, self.L, self.prob)
class NonT_W(NonT):
    sym, prod, prob = 'W', '', 0.0
    thisdir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    word_dawg = IntDAWG().load('{}/dictionary1.1.dawg'.format(thisdir))
    fname_dawg = IntDAWG().load('{}/eng_dict.dawg'.format(thisdir))
    lname_dawg = IntDAWG().load('{}/eng_dict.dawg'.format(thisdir))
    total_f = word_dawg[u'__total__'] + \
        fname_dawg[u'__total__'] + \
        lname_dawg[u'__total__']
    l33t_replaces = DAWG.compile_replaces({
        '3': 'e', '4': 'a', '@': 'a',
        '$': 's', '0': 'o', '1': 'i',
        'z': 's'
    })

    def __init__(self, word):
        # super(NonT_W, self).__init__()
        w = unicode(word.lower())
        dawg = []
        for d in [
                self.word_dawg,
                # self.fname_dawg,
                # self.lname_dawg
        ]:
            k = d.similar_keys(w, self.l33t_replaces)
            if k:
                dawg.append((d, k[0]))
        if dawg:
            v = list(set([d[1] for d in dawg]))
            if len(v) > 1 or not v[0].isalpha():
                return
            v = v[0]
            f = sum([d[0][v] for d in dawg])
            self.prod = v
            self.sym = 'W%s' % get_nont_class('W', v)
            self.L = NonT_L(v, word)
            self.prob = self.L.prob * float(f) / self.total_f

    def parse_tree(self):
        pt = ParseTree()
        pt.add_rule((self.sym, self.prod))
        pt.extend_rules(self.L.parse_tree())
        return pt

    def rule_set(self):
        rs = RuleSet()
        rs.add_rule(self.sym, self.prod)
        rs.update_set(self.L.rule_set())
        return rs

    def __str__(self):
        return '%s: %s<%s> (%g)' % (self.sym, self.prod, self.L, self.prob)
class NonT_W(NonT):
    sym, prod, prob = 'W', '', 0.0
    word_dawg = IntDAWG().load('data/English_30000.dawg')
    fname_dawg = IntDAWG().load('data/facebook-firstnames-withcount.dawg')
    lname_dawg = IntDAWG().load('data/facebook-lastnames-withcount.dawg')
    total_f = word_dawg[u'__total__'] + \
        fname_dawg[u'__total__'] + \
        lname_dawg[u'__total__']
    l33t_replaces = DAWG.compile_replaces({
        '3': 'e', '4': 'a', '@': 'a',
        '$': 's', '0': 'o', '1': 'i',
        'z': 's'
    })

    def __init__(self, word):
        # super(NonT_W, self).__init__()
        w = unicode(word.lower())
        dawg = []
        for d in [self.word_dawg, self.fname_dawg, self.lname_dawg]:
            k = d.similar_keys(w, self.l33t_replaces)
            if k:
                dawg.append((d, k[0]))
        if dawg:
            v = list(set([d[1] for d in dawg]))
            if len(v) > 1 or not v[0].isalpha():
                return
            v = v[0]
            f = sum([d[0][v] for d in dawg])
            self.prod = v
            self.sym = 'W%s' % get_nont_class('W', v)
            self.L = NonT_L(v, word)
            self.prob = self.L.prob * float(f) / self.total_f

    def parse_tree(self):
        pt = ParseTree()
        pt.add_rule((self.sym, self.prod))
        pt.extend_rules(self.L.parse_tree())
        return pt

    def rule_set(self):
        rs = RuleSet()
        rs.add_rule(self.sym, self.prod)
        rs.update_set(self.L.rule_set())
        return rs

    def __str__(self):
        return '%s: %s<%s> (%g)' % (self.sym, self.prod, self.L, self.prob)
class NonT_W(NonT):
    sym, prod, prob = 'W', '', 0.0
    thisdir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    word_dawg = load_dawg('{}/data/English_30000.dawg.gz'.format(thisdir))
    fname_dawg = load_dawg('{}/data/facebook-firstnames-withcount.dawg.gz'
                           .format(thisdir))
    lname_dawg = load_dawg('{}/data/facebook-lastnames-withcount.dawg.gz'
                           .format(thisdir))
    total_f = word_dawg['__total__'] + fname_dawg['__total__'] + lname_dawg['__total__']
    l33t_replaces = DAWG.compile_replaces(L33T)

    def __init__(self, word):
        # super(NonT_W, self).__init__()
        w = str(word.lower())
        dawg = []
        for d in [self.word_dawg, self.fname_dawg, self.lname_dawg]:
            k = d.similar_keys(w, self.l33t_replaces)
            if k:
                dawg.append((d, k[0]))
        if dawg:
            v = list(set([d[1] for d in dawg]))
            if len(v) > 1 or not v[0].isalpha():
                return
            v = v[0]
            f = sum([d[0][v] for d in dawg])
            self.prod = v
            self.sym = 'W%s' % get_nont_class('W', v)
            self.L = NonT_L(v, word)
            self.prob = self.L.prob * float(f) / self.total_f

    def parse_tree(self):
        pt = ParseTree()
        pt.add_rule((self.sym, self.prod))
        pt.extend_rules(self.L.parse_tree())
        return pt

    def rule_set(self):
        rs = RuleSet()
        rs.add_rule(self.sym, self.prod)
        rs.update_set(self.L.rule_set())
        return rs

    def __str__(self):
        return '%s: %s<%s> (%g)' % (self.sym, self.prod, self.L, self.prob)
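# Hedged illustration (not part of NonT_W) of the similar_keys/compile_replaces
# pattern the classes above rely on: l33t characters in a password are mapped
# back to letters while the word is looked up. Assumes the `dawg` package; the
# word list and the password are made up.
from dawg import DAWG, IntDAWG

word_freqs = IntDAWG([(u'password', 100), (u'pass', 40)])
l33t = DAWG.compile_replaces({'3': 'e', '4': 'a', '@': 'a', '$': 's', '0': 'o', '1': 'i'})
matches = word_freqs.similar_keys(u'p@$$w0rd', l33t)
if matches:
    best = matches[0]        # expected: u'password'
    freq = word_freqs[best]  # its stored count (100 here)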
if __name__ == '__main__':
    assert len(sys.argv) == 2
    source_dir = sys.argv[1]
    if source_dir.endswith("/"):
        source_dir = source_dir[:-1]
    assert exists(source_dir)
    target_dir = source_dir + "_dawg"
    if exists(target_dir):
        os.rmdir(target_dir)  # assumes any previous output directory is empty
    makedirs(target_dir)
    source_files = listdir(source_dir)
    for filename in source_files:
        print filename
        with open(join(source_dir, filename), 'r') as input_file:
            contents = input_file.read()
            if filename == 'mappings':
                # copy source to destination unchanged
                with open(join(target_dir, 'mappings'), 'w') as output_file:
                    output_file.write(contents)
            else:
                with open(join(target_dir, filename + ".dawg"), 'w') as output_file:
                    lines = contents.split("\n")
                    d = DAWG(l for l in lines if len(l) > 0)
                    d.write(output_file)
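# Hedged companion sketch (not in the original script): reading one of the .dawg
# files written above back in, the way the rest of the code base loads them.
# The file name is illustrative.
from dawg import DAWG

d = DAWG().load('symbols.dawg')       # a file produced by the loop above
print('some-entry' in d)              # membership test
print(d.prefixes('some-entry-xyz'))   # stored entries that prefix a longer string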
class TrainedGrammar(object):
    l33t_replaces = DAWG.compile_replaces({
        '3': 'e', '4': 'a', '@': 'a',
        '$': 's', '0': 'o', '1': 'i',
        'z': 's'
    })

    def __init__(self, g_file=GRAMMAR_FILE, cal_cdf=False):
        self.cal_cdf = cal_cdf
        self.load(g_file)
        self.NonT_set = filter(lambda x: x.find('_') < 0, self.G.keys())

    def load(self, filename):
        self.G = json.load(open_(filename), object_pairs_hook=OrderedDict)
        for k, v in self.G.items():
            if self.cal_cdf:
                print_err("Calculating CDF!")
                lf = 0
                for l, f in v.items():
                    v[l] += lf
                    lf += f
                v['__total__'] = lf
            else:
                v['__total__'] = sum(v.values())
        # Create dawg/trie of the Wlist items for fast retrieval
        Wlist = [x for k, v in self.G.items() for x in v if k.startswith('W')]
        self.date = Date()
        self.Wdawg = IntDAWG(Wlist)

    def get_prob(self, l, r):
        f = self.G.get(l, {}).get(r, 0)
        return max(float(f) / self.G[l]['__total__'], 0.0)

    def isNonTerm(self, lhs):  # this means given lhs, rhs will be in NonT
        return lhs in self.NonT_set

    def get_actual_NonTlist(self, lhs, rhs):
        if lhs == 'G':
            # Don't include "W1,G", "D1,G" etc.
            if rhs.endswith(',G'):
                return []
            return rhs.split(',')
        elif lhs == 'T':
            return ['%s_%s' % (lhs, c) for c in (rhs.split(',') if ',' in rhs else rhs)]
        elif lhs == 'L':
            return ['%s_%s' % (lhs, c) for c in rhs]
        elif lhs in ['W', 'D', 'Y', 'R', 'K']:
            return []
        else:
            return []

    def get_freq(self, l, r):
        return self.G.get(l, {}).get(r, 0)

    def get_W_rule(self, word):
        w = unicode(word.lower())
        k = self.Wdawg.similar_keys(w, self.l33t_replaces)
        if k:
            k = k[0]
            L = NonT_L(k, word)
            sym = 'W%s' % get_nont_class('W', k)
            return (sym, [(k, L)], self.get_prob(sym, k))

    def get_T_rule(self, word):
        T = self.date.IsDate(word)
        if T:
            p = 10**(len(word) - 8)
            for r in T.tree:
                p *= self.get_prob(*r)
            p *= self.get_prob(*(T.get_rule()))
            return ('T', [(word, T)], p)

    def get_all_matches(self, word):
        rules = []
        for nt in self.NonT_set:
            if nt.startswith('W'):
                l = self.get_W_rule(word)
                if l:
                    rules.append(l)
            elif nt == 'T':
                l = self.get_T_rule(word)
                if l:
                    rules.append(l)
            else:
                f = self.G[nt].get(word, 0)
                if f > 0:
                    rules.append((nt, [(word)], float(f) / self.G[nt]['__total__']))
        rules = filter(lambda x: x and x[-1], rules)
        if rules:
            return max(rules, key=lambda x: x[-1])

    def join(self, r, s):
        not_startswith_L_T = lambda x: x and \
            not (x[0].startswith('L_') or x[0].startswith('T_'))
        if not_startswith_L_T(s) and not_startswith_L_T(r):
            k = ','.join([r[0], s[0]])
            p = r[-1] * s[-1]
            a = r[1] + s[1]
            return (k, a, p)

    def random_parse(self, word, try_num=3):
        """
        Returns a random parse of the word following the grammar.
        """
        # First: rejection sampling, the most inefficient version.
        # Break the word into random parts and then see if that parse exists.
        print "\n^^^^^^^^^^^_______________^^^^^^^^^^^^^^"
        if try_num < 0:
            print "I am very sorry. I could not parse this :(!!"
            return None
        # NO IDEA HOW TO randomly pick a parse tree!! @@TODO

    def parse(self, word):
        A = {}
        if not word:
            return ()
        for j in range(len(word)):
            for i in range(len(word) - j):
                A[(i, i + j)] = self.get_all_matches(word[i:j + i + 1])
                t = [A[(i, i + j)]]
                t.extend([self.join(A[(i, k)], A[(k + 1, i + j)])
                          for k in range(i, i + j)])
                if t:
                    A[(i, i + j)] = max(t, key=lambda x: x[-1] if x else 0)
                else:
                    A[(i, i + j)] = ()
                    # print "Not sure why it reached here. But it did!"
                    # print i, j, word[i: i+j+1]
        return A[(0, len(word) - 1)]

    def default_parse_tree(self, word):
        """
        Returns the default parse of a word.
        Default parse is G -> W1,G | D1,G | Y1,G | W1 | D1 | Y1
        This parses any string over the allowed alphabet and returns a
        left-to-right traversed parse tree.
        """
        pt = ParseTree()
        n = len(word)
        for i, c in enumerate(word):
            r = whatchar(c) + '1'
            if i < n - 1:
                r = r + ',G'
            pt.add_rule(('G', r))
            pt.add_rule((r[:2], c.lower()))
            if r.startswith('W'):
                nont_l = NonT_L(c, c)
                pt.extend_rules(nont_l.parse_tree())
        return pt

    def l_parse_tree(self, word):  # leftmost parse-tree
        pt = ParseTree()
        p = self.parse(word)
        if not p:
            print "Failing at ", word.encode('utf-8')
            return pt
        # assert p[0] in self.G['G'], "Wrong rule: {} --> {}".format('G', p[0])
        if p[0] not in self.G['G']:
            return self.default_parse_tree(word)
        pt.add_rule(('G', p[0]))
        for l, each_r in zip(p[0].split(','), p[1]):
            if isinstance(each_r, basestring):
                pt.add_rule((l, each_r))
            elif l.startswith('W'):
                pt.add_rule((l, each_r[0]))
                L_parse_tree = each_r[1].parse_tree()
                pt.add_rule(L_parse_tree[0])
                if len(L_parse_tree.tree) > 1:
                    pt.tree.extend(L_parse_tree[1][1])
            elif l == 'T':
                p = each_r[1]
                rule_name = ','.join([r[0].replace('T_', '') for r in p])
                pt.add_rule((l, rule_name))
                pt.extend_rules(p)
            else:
                print "Something is severely wrong"
        return pt

    def rule_set(self, word):
        rs = RuleSet()
        pt = self.l_parse_tree(word)
        for p in pt.tree:
            rs.add_rule(*p)
        return rs

    def encode_rule(self, l, r):
        rhs_dict = self.G[l]
        try:
            i = rhs_dict.keys().index(r)
            if DEBUG:
                c = rhs_dict.keys()[i]
                assert c == r, "The index is wrong"
        except ValueError:
            print "'{}' not in the rhs_dict (l: '{}', rhs_dict: {})".format(r, l, self.G[l])
            raise ValueError
        l_pt = sum(rhs_dict.values()[:i])
        r_pt = l_pt + rhs_dict[r] - 1
        return convert2group(random.randint(l_pt, r_pt), rhs_dict['__total__'])

    def encode_pw(self, pw):
        pt = self.l_parse_tree(pw)
        try:
            code_g = [self.encode_rule(*p) for p in pt]
        except ValueError:
            print "Error in encoding: \"{}\"".format(pw)
            raise ValueError
            return []
        extra = hny_config.PASSWORD_LENGTH - len(code_g)
        code_g.extend([convert2group(0, 1) for x in range(extra)])
        return code_g

    def decode_rule(self, l, p):
        rhs_dict = self.G[l]
        if not rhs_dict:
            return ''
        assert '__total__' in rhs_dict, "__total__ not in {!r}, l={!r}"\
            .format(rhs_dict, l)
        p %= rhs_dict['__total__']
        if self.cal_cdf:
            if len(rhs_dict) > 1000:
                print_once(l, len(rhs_dict))
            return bin_search(rhs_dict.items(), p, 0, len(rhs_dict))
        for k, v in rhs_dict.items():
            if p < v:
                return k
            else:
                p -= v
        print "Alas, could not find.", l, p

    def decode_l33t(self, w, iterp):
        l = self.decode_rule('L', iterp.next())
        if l == 'Caps':
            return w.capitalize()
        elif l == 'lower':
            return w.lower()
        elif l == 'UPPER':
            return w.upper()
        else:
            nw = ''.join([self.decode_rule('L_%s' % c, iterp.next()) for c in w])
            return nw

    def decode_pw(self, P):
        assert len(P) == hny_config.PASSWORD_LENGTH, \
            "Not correct length to decode, Expecting {}, got {}"\
            .format(hny_config.PASSWORD_LENGTH, len(P))
        iterp = iter(P)
        plaintext = ''
        stack = ['G']
        while stack:
            lhs = stack.pop()
            rhs = self.decode_rule(lhs, iterp.next())
            if lhs in ['G', 'T', 'W', 'R', 'Y', 'D']:
                arr = rhs.split(',') if lhs != 'T' \
                    else ['T_%s' % c for c in rhs.split(',')]
                arr.reverse()
                stack.extend(arr)
            elif lhs.startswith('W'):
                rhs = self.decode_l33t(rhs, iterp)
                plaintext += rhs
            else:
                plaintext += rhs
        return plaintext

    def encode_grammar(self, G):
        """
        Encodes a sub-grammar @G under the current grammar.
        """
        vd = VaultDistPCFG()
        stack = ['G']
        code_g = []
        done = list(G.default_keys())
        while stack:
            head = stack.pop()
            assert head not in done, "head={} already in done={}".format(head, done)
            done.append(head)
            rule_dict = G[head]
            t_set = []
            for rhs, f in rule_dict.items():
                if rhs != '__total__':
                    r = filter(lambda x: x not in done + stack,
                               self.get_actual_NonTlist(head, rhs))
                    if r:
                        for x in r:
                            if x not in t_set:
                                t_set.append(x)
            t_set.reverse()
            stack.extend(t_set)
            n = len(rule_dict.keys()) - 1
            code_g.append(vd.encode_vault_size(head, n))
            if n < 0:
                print "Sorry I cannot encode your password ('{}')! \nPlease choose"\
                    " something different, like password12".format((head, rule_dict.keys()))
                exit(0)
            assert n == vd.decode_vault_size(head, code_g[-1]), "Vault size encoding mismatch. "\
                "\nhead: \"{}\", code_g: {}, n: {}, decoded_vault_size: {}"\
                .format(head, code_g[-1], n, vd.decode_vault_size(head, code_g[-1]))
            code_g.extend([self.encode_rule(head, r)
                           for r in rule_dict.keys() if r != '__total__'])
        extra = hny_config.HONEY_VAULT_GRAMMAR_SIZE - len(code_g)
        code_g.extend([convert2group(0, 1) for x in range(extra)])
        return code_g

    def decode_grammar(self, P):
        """
        Decodes a sub-grammar under self.G using the random numbers from P.
        """
        g = SubGrammar(self)
        vd = VaultDistPCFG()
        iterp = iter(P)
        stack = ['G']
        done = []
        while stack:
            head = stack.pop()
            assert head not in done, "@Head ({}) in @done. It should not!".format(head)
            done.append(head)
            p = iterp.next()
            # print "RuleSizeDecoding:", head, done
            n = vd.decode_vault_size(head, p)
            t_set = []
            for x in range(n):
                rhs = self.decode_rule(head, iterp.next())
                # print "Decoding:", stack, head, '==>', rhs
                if rhs != '__total__':
                    r = filter(lambda x: x not in done + stack,
                               self.get_actual_NonTlist(head, rhs))
                    if r:
                        for x in r:
                            if x not in t_set:
                                t_set.append(x)
                g.add_rule(head, rhs)
            t_set.reverse()
            stack.extend(t_set)
        g.finalize()  # fixes the frequencies and some other bookkeeping
        return g

    def __getitem__(self, l):
        return self.G[l]

    def __contains__(self, k):
        return k in self.G

    def is_grammar(self):
        return bool(self.G['G'])

    def __str__(self):
        return json.dumps(self.G['G'], indent=2)

    def nonterminals(self):
        return self.G.keys()
class Grammer(object):
    """
    Purpose: handles PCFG-based encoding and decoding.
    Attributes:
        G              OrderedDict holding all the rules
        l33t_replaces  the table of allowed character substitutions
    Methods:
        parse: analyses what a password is composed of
    """
    G = ''
    l33t_replaces = DAWG.compile_replaces({
        '3': 'e', '4': 'a', '@': 'a',
        '$': 's', '0': 'o', '1': 'i',
        'z': 's'
    })

    def __init__(self, filename, cfg=False):
        self.cal_cdf = cfg
        self.load(filename)
        self.Non_set = []
        for each in self.G:
            # keep only keys without an underscore (the non-terminals)
            if each.find('_') < 0:
                self.Non_set.append(each)

    # Load the grammar file and tally the totals for every rule's symbol.
    def load(self, filename):
        self.G = json.load(open_(filename), object_pairs_hook=OrderedDict)
        # iterate over the grammar's key/value pairs
        for k, v in self.G.items():
            if self.cal_cdf:
                # print_err("Calculating CDF!")
                # lf is the running count for the current rule; each entry would
                # accumulate the counts before it (so entry (5) minus entry (0)
                # gives the number of occurrences of rules 1..5)
                lf = 0
                for l, f in v.items():
                    # v[l] += lf
                    lf += f
                v['__total__'] = lf
            else:
                v['__total__'] = sum(v.values())
        # collect every W (word) production; they are used later when generating strings
        self.Wlist = []
        for k, D in self.G.items():
            if k.startswith('W'):
                self.Wlist.extend([x for x in D])
        # set up the Date helper to manage the date rules
        self.date = Date()
        # build a DAWG over the word list to manage the word rules
        self.Wlist = IntDAWG(self.Wlist)

    # Return the probability of a (sym, [word], prod)-style rule.
    def getProb(self, l, r):
        f = self.G.get(l, {}).get(r, 0)
        return max(float(f) / self.G[l]['__total__'], 0.0)

    # Method: find the rule that most likely matches the password.
    # Returns a tuple that acts like a parse node:
    #   word rules:  ('W1', [(similar_keys, NonT_L)], prob)
    #   date rules:  ('T',  [(passwd, Date)], prob)
    #   otherwise:   (sym,  [(passwd)], prob)
    def genRuleMatches(self, passwd):
        # holds every candidate rule
        l = []
        # first, figure out which rules this piece of the password could belong to
        for rule in self.Non_set:
            # ===================== word rules =====================
            if rule.startswith('W'):
                # look the password up in the DAWG built earlier
                k = self.Wlist.similar_keys(passwd.lower(), self.l33t_replaces)
                # take the most similar key as the rule
                if k:
                    sym = "W%s" % (get_nont_class('W', passwd))
                    prod = NonT_L(k, passwd)
                    prob = self.getProb(sym, passwd.lower())
                    l.append((sym, [(k[0], prod)], prob))
            # ===================== date rules =====================
            elif rule.startswith('T'):
                # hand the password to the Date helper; for a date it returns
                # something like [('T_Y', '2013'), ('T_m', '10'), ('T_d', '26')]
                T = self.date.IsDate(passwd)
                if T:
                    sym = 'T'
                    prod = (passwd, T)
                    prob = 10**(len(passwd) - 8)
                    for each in T:
                        prob *= self.getProb(*each)
                    # print((sym, [prod], prob))
                    l.append((sym, [prod], prob))
            # any other kind of rule has no child nodes
            else:
                # only the probability of occurrence is needed
                f = self.G[rule].get(passwd, 0)
                if f > 0:
                    l.append((rule, [(passwd)],
                              float(f) / self.G[rule]['__total__']))
        # finally, return the candidate with the highest probability
        temp_prob = 0
        tu = ()
        for each in l:
            if temp_prob < each[2]:
                tu = each
                temp_prob = each[2]
        return tu

    def not_startswith_L_T(self, passwd):
        if passwd:
            if passwd[0].startswith('L_') or passwd[0].startswith('T_'):
                return False
            else:
                return True
        else:
            return passwd

    # Method: join two different parse nodes into one.
    def join(self, l, r):
        # join them as long as neither is one of the special L_/T_ nodes
        if self.not_startswith_L_T(l) and self.not_startswith_L_T(r):
            sym = ','.join([l[0], r[0]])
            prob = l[-1] * r[-1]
            prod = l[1] + r[1]
            return (sym, prod, prob)

    def parse(self, passwd):
        # make sure the input string is not empty
        if not passwd:
            return ''
        nonTRule = {}
        # Analyse the string bottom-up: compute the best rule for every substring,
        # then combine them (a divide-and-conquer / dynamic-programming idea).
        index = 0
        first = True
        for rep in range(len(passwd)):
            for start in range(len(passwd) - rep):
                index += 1
                # 1. (divide) analyse the substring on its own
                #    (two-dimensional DP; rep is the number of characters spanned)
                nonTRule[(start, start + rep)] = self.genRuleMatches(
                    passwd[start:start + rep + 1])
                rule_list = []
                rule_list.append(nonTRule[(start, start + rep)])
                # 2. (combine) try every split point and record the probability
                #    of joining the two halves
                for bet in range(start, start + rep):
                    temp_non = self.join(nonTRule[(start, bet)],
                                         nonTRule[(bet + 1, start + rep)])
                    rule_list.append(temp_non)
                # 3. (choose) keep the most probable rule as the value for
                #    [start:start+rep+1]
                # temp = filter(lambda k: k, rule_list)
                # remember the most likely rule for this span
                if rule_list:
                    nonTRule[(start, start + rep)] = max(
                        rule_list, key=lambda x: x[-1] if x else 0)
                    # print(nonTRule[(start, start+rep)])
                else:
                    nonTRule[(start, start + rep)] = ()
        return nonTRule[(0, len(passwd) - 1)]

    # Fallback parser: handle passwords made of simple rules (e.g. 123456) or
    # anything the main parser cannot explain.
    def defaultPasswordParse(self, word):
        # every password is parsed as G -> W1,G | D1,G | Y1,G | W1 | D1 | Y1
        pt = ParseTree()
        n = len(word)
        for i, c in enumerate(word):
            r = whatchar(c) + '1'
            # if i < n-1:
            #     r = r + ',G'
            pt.add_rule(('G', r))
            pt.add_rule((r[:2], c.lower()))
            if r.startswith('W'):
                nont_l = NonT_L(c, c)
                pt.extend_rules(nont_l.parse_tree())
        return pt

    # Parser: turn the password into the parse tree we need, then look the
    # required values up in the cfg.
    def lParseTree(self, passwd):
        pt = ParseTree()
        rule = self.parse(passwd)
        print("our rule is ")
        print(rule)
        # an empty result means the parse failed; log the password
        if not rule:
            print("Failed encode %s" % passwd)
            return pt
        # if the rule is not a G production, i.e. a simple password, fall back
        # to the simple parser
        if rule[0] not in self.G['G']:
            return self.defaultPasswordParse(passwd)
        # otherwise start with the top-level rule
        pt.add_rule(('G', rule[0]))
        # then walk every sub-rule and its production and insert them into the parse tree
        for sym, rhs in zip(rule[0].split(','), rule[1]):
            # if the rule is neither W nor T, rhs should just be a string
            if isinstance(rhs, str):
                # insert the rule directly
                pt.add_rule((sym, rhs))
            # a W rule carries (similar_keys_list, NonT_L); record the closest
            # match first, then the capitalisation state
            elif sym.startswith('W'):
                pt.add_rule((sym, rhs[0]))
                # parse_tree() gives the capitalisation sub-tree of the word
                ltree = rhs[1].parse_tree()
                # insert its top rule first
                pt.add_rule(ltree[0])
                # for the 'l33t' rule the sub-tree also records the characters
                # that may have been substituted; add those as well (they are
                # already packaged)
                if len(ltree) > 1:
                    pt.tree.extend(ltree[1][1])
            # a T rule looks like ('T', [('T_Y', '1993'), ...], ...)
            elif sym.startswith('T'):
                # convert to the format used inside the cfg file
                temp_sym = ''
                for each_label in rhs[1]:
                    temp_sym += each_label[0].replace("T_", "")
                pt.add_rule((sym, temp_sym))
                # then add the remaining nodes
                pt.extend_rules(rhs[1])
            else:
                print("we can't figure out this word")
        # done
        return pt

    # Core encoding routine: replace the password with a list of numbers.
    def encode_password(self, password):
        # first build the parse tree of the password
        ptree = self.lParseTree(password)
        print("our password is ", end='')
        print(ptree)
        if not ptree:
            print("encode failed, change")
        # then map the parse tree onto numbers
        encd = []
        # print(ptree)
        for each_node in ptree:
            try:
                encd.append(self.encode_encd(*each_node))
                # print(encd)
            except ValueError as e:
                print("Error in encoding: \"{}\"".format(password))
                print(e)
                return []
        # the encoding may be shorter than the fixed password length (the
        # password itself is short), so pad it with filler values
        length = PASSWORD_MAX_LENGTH - len(encd)
        # if length still equals the full size, the encoding failed; return an empty list
        if length == PASSWORD_MAX_LENGTH:
            return []
        for i in range(length):
            encd.append(convert2group(0, 1))
        # mapping finished; return the encoded numbers
        return encd

    # Range encoder: pick a random number inside the bucket assigned to a rule.
    def encode_encd(self, l, r):
        # temporary dict holding the productions of rule l
        rhs_dict = self.G[l]
        # print(rhs_dict[r])
        # find the index of r
        i = list(rhs_dict.keys()).index(r)
        # sum the frequencies of everything before it to get the left edge
        l_hs = 0
        r_hs = 0
        for each_index in range(i):
            l_hs += list(rhs_dict.values())[each_index]
        # the right edge of the bucket
        r_hs = l_hs + rhs_dict[r] - 1
        # draw a random number between the two edges (inclusive)
        rn = random.randint(l_hs, r_hs)
        # print("l_hs is %d, r_hs is %d and the random is %d" % (l_hs, r_hs, rn))
        # wn = rn + random.randint(0, int((3000000-rn)/rhs_dict['__total__'])) * rhs_dict['__total__']
        wn = convert2group(rn, rhs_dict['__total__'])
        # print("the wn is %d and it comes back as %d" % (wn, wn % rhs_dict['__total__']))
        return wn

    # Range decoder.
    def decode_encd(self, l, r):
        # temporary dict holding the productions of rule l
        rhs_dict = self.G[l]
        # check that this rule has any productions (it usually does, but a
        # failed decode might not)
        if not rhs_dict:
            return ''
        # __total__ must be present, otherwise we cannot decode
        assert '__total__' in self.G[l], "The __total__ was lost in {!r}, l = {!r}"\
            .format(rhs_dict, l)
        # compute the position of the value inside the bucket
        index = r % rhs_dict['__total__']
        # print("the r is %d, index is %d" % (r, index))
        # choose the lookup strategy
        # if self.cal_cdf:
        #     # if this rule is large, log the lookup (not sure this is needed)
        #     if len(rhs_dict) > 1000:
        #         print_once(l, len(rhs_dict))
        #     # binary search for a fast lookup
        #     return bin_search(list(rhs_dict.items()), index, 0, len(rhs_dict))
        # otherwise fall back to the slower linear scan
        for k, t in rhs_dict.items():
            if index < t:
                return k
            else:
                index -= t
        # reaching this point means nothing was found; inspect the input
        print("not find the rule! l is %s and r is %d" % (l, r))
        return ''

    # Attempt to decode.
    def decode_password(self, passwd):
        """
        Function: decode the encoded sequence of random numbers.
        How: undo the modulo mapping to recover each original rule, use the G
        node to locate the previously encoded pieces of the password, and decode
        them in order.
        Important variables:
            stack:     holds the pending nodes
            plaintext: holds the decoded string
            lhs:       the parent node; it always carries a rule, never a string
            rhs:       the child node; either the next node or a string
        """
        if not passwd:
            return ''
        # decoding works much like a stack machine
        # start with a list (switch to a real stack if this works out)
        stack = []
        # push the first node (always G, even for passwords that matched no rule)
        stack.append('G')
        plaintext = ''
        index = 0
        # loop until the whole password has been decoded
        while len(stack) > 0:
            lhs = stack.pop()
            # decode the current node and get its value
            rhs = self.decode_encd(lhs, passwd[index])
            index += 1
            # inspect the decoded node
            # if it is an ordinary node (not something like T_y or L_s)
            if lhs in ['G', 'Y', 'R', 'D', 'T', 'W']:
                # its children follow directly
                if lhs == 'T':
                    # !! may be wrong !!
                    sym = ['T_%s' % c for c in rhs]
                # an ordinary node is followed by ordinary rules separated by ','
                else:
                    # print("the rhs is %s" % rhs)
                    sym = rhs.split(',')
                # either way, reverse them (the stack is LIFO)
                sym.reverse()
                # and push them
                stack.extend(sym)
            # if the node is a word node, the right-hand side is not fully
            # restored yet; some characters may still need substituting, which
            # the dedicated branch below handles
            elif lhs.startswith('W'):
                # the next number is consumed here because it always encodes
                # the capitalisation decision
                l = self.decode_encd('L', passwd[index])
                index += 1
                # branch on the capitalisation type
                if l == "lower":
                    plaintext += rhs
                elif l == "Caps":
                    plaintext += rhs.capitalize()
                elif l == "UPPER":
                    plaintext += rhs.upper()
                # for l33t every character was encoded individually; decode each one
                elif l == "l33t":
                    for c in rhs:
                        plaintext += self.decode_encd('L_%s' % c, passwd[index])
                        index += 1
            # otherwise this is already a terminal node
            else:
                plaintext += rhs
        return plaintext
class TrainedGrammar(object):
    l33t_replaces = DAWG.compile_replaces({
        '3': 'e', '4': 'a', '@': 'a',
        '$': 's', '0': 'o', '1': 'i',
        'z': 's'
    })

    def __init__(self, g_file=grammar_file, cal_cdf=False):
        self.cal_cdf = cal_cdf
        self.load(g_file)
        self.NonT_set = filter(lambda x: x.find('_') < 0, self.G.keys())

    def load(self, filename):
        self.G = json.load(open_(filename), object_pairs_hook=OrderedDict)
        for k, v in self.G.items():
            if self.cal_cdf:
                print_err("Calculating CDF!")
                lf = 0
                for l, f in v.items():
                    v[l] += lf
                    lf += f
                v['__total__'] = lf
            else:
                v['__total__'] = sum(v.values())
        Wlist = [x for k, v in self.G.items() for x in v if k.startswith('W')]
        self.date = Date()
        self.Wdawg = IntDAWG(Wlist)

    def get_prob(self, l, r):
        f = self.G.get(l, {}).get(r, 0)
        if f > 0:
            return float(f) / self.G[l]['__total__']

    def isNonTerm(self, lhs):  # this means given lhs, rhs will be in NonT
        return lhs in self.NonT_set

    def get_actual_NonTlist(self, lhs, rhs):
        if lhs == 'G':
            return rhs.split(',')
        elif lhs == 'T':
            return ['%s_%s' % (lhs, c) for c in rhs.split(',')]
        elif lhs == 'L':
            return ['%s_%s' % (lhs, c) for c in rhs]
        else:
            return []

    def get_freq(self, l, r):
        return self.G.get(l, {}).get(r, 0)

    def get_W_rule(self, word):
        w = unicode(word.lower())
        k = self.Wdawg.similar_keys(w, self.l33t_replaces)
        if k:
            k = k[0]
            L = NonT_L(k, word)
            sym = 'W%s' % get_nont_class('W', k)
            return (sym, [(k, L)], self.get_prob(sym, k))

    def get_T_rule(self, word):
        T = self.date.IsDate(word)
        if T:
            p = 10**(len(word) - 8)
            # for r in T.tree:
            #     p *= self.get_prob(*r)
            # p *= self.get_prob(*(T.get_rule()))
            return ('T', [(word, T)], p)

    def get_all_matches(self, word):
        rules = []
        for nt in self.NonT_set:
            if nt.startswith('W'):
                l = self.get_W_rule(word)
                if l:
                    rules.append(l)
            elif nt == 'T':
                l = self.get_T_rule(word)
                if l:
                    rules.append(l)
            else:
                f = self.G[nt].get(word, 0)
                if f > 0:
                    rules.append(
                        (nt, [(word)], float(f) / self.G[nt]['__total__']))
        rules = filter(lambda x: x and x[-1], rules)
        if rules:
            return max(rules, key=lambda x: x[-1])

    def join(self, r, s):
        not_startswith_L_T = lambda x: x and \
            not (x[0].startswith('L_') or x[0].startswith('T_'))
        if not_startswith_L_T(s) and not_startswith_L_T(r):
            k = ','.join([r[0], s[0]])
            p = r[-1] * s[-1]
            a = r[1] + s[1]
            return (k, a, p)

    def parse(self, word):
        A = {}
        for j in range(len(word)):
            for i in range(len(word) - j):
                A[(i, i + j)] = self.get_all_matches(word[i:j + i + 1])
                t = [A[(i, i + j)]]
                t.extend([
                    self.join(A[(i, k)], A[(k + 1, i + j)])
                    for k in range(i, i + j)
                ])
                if t:
                    A[(i, i + j)] = max(t, key=lambda x: x[-1] if x else 0)
                else:
                    A[(i, i + j)] = ()
                    # print "Not sure why it reached here. But it did!"
                    # print i, j, word[i: i+j+1]
        return A[(0, len(word) - 1)]

    def l_parse_tree(self, word):  # leftmost parse-tree
        pt = ParseTree()
        p = self.parse(word)
        if not p:
            print "Failing at ", word.encode('utf-8')
            return pt
        pt.add_rule(('G', p[0]))
        for l, each_r in zip(p[0].split(','), p[1]):
            if isinstance(each_r, basestring):
                pt.add_rule((l, each_r))
            elif l.startswith('W'):
                pt.add_rule((l, each_r[0]))
                L_parse_tree = each_r[1].parse_tree()
                pt.add_rule(L_parse_tree[0])
                if len(L_parse_tree.tree) > 1:
                    pt.tree.extend(L_parse_tree[1][1])
            elif l == 'T':
                p = each_r[1]
                rule_name = ','.join([r[0].replace('T_', '') for r in p])
                pt.add_rule((l, rule_name))
                pt.extend_rules(p)
            else:
                print "Something is severely wrong"
        return pt

    def rule_set(self, word):
        rs = RuleSet()
        pt = self.l_parse_tree(word)
        for p in pt.tree:
            rs.add_rule(*p)
        return rs

    def encode_rule(self, l, r):
        rhs_dict = self.G[l]
        i = rhs_dict.keys().index(r)
        assert i >= 0
        l_pt = sum(rhs_dict.values()[:i])
        r_pt = l_pt + rhs_dict[r] - 1  # the bucket [l_pt, r_pt] is inclusive at both ends
        return convert2group(random.randint(l_pt, r_pt), rhs_dict['__total__'])

    def encode_pw(self, pw):
        pt = self.l_parse_tree(pw)
        code_g = [self.encode_rule(*p) for p in pt]
        extra = hny_config.PASSWORD_LENGTH - len(code_g)
        code_g.extend([convert2group(0, 1) for x in range(extra)])
        return code_g

    def decode_rule(self, l, p):
        rhs_dict = self.G[l]
        p %= rhs_dict['__total__']
        if self.cal_cdf:
            if len(rhs_dict) > 1000:
                print_once(l, len(rhs_dict))
            return bin_search(rhs_dict.items(), p, 0, len(rhs_dict))
        for k, v in rhs_dict.items():
            if p < v:
                return k
            else:
                p -= v
        print "Alas, could not find.", l, p

    def decode_l33t(self, w, iterp):
        l = self.decode_rule('L', iterp.next())
        if l == 'Caps':
            return w.capitalize()
        elif l == 'lower':
            return w.lower()
        elif l == 'UPPER':
            return w.upper()
        else:
            nw = ''.join(
                [self.decode_rule('L_%s' % c, iterp.next()) for c in w])
            return nw

    def decode_pw(self, P):
        assert len(P) == hny_config.PASSWORD_LENGTH
        iterp = iter(P)
        plaintext = ''
        stack = ['G']
        while stack:
            lhs = stack.pop()
            rhs = self.decode_rule(lhs, iterp.next())
            if lhs in ['G', 'T']:
                arr = rhs.split(',') if lhs == 'G' \
                    else ['T_%s' % c for c in rhs.split(',')]
                arr.reverse()
                stack.extend(arr)
            elif lhs.startswith('W'):
                rhs = self.decode_l33t(rhs, iterp)
                plaintext += rhs
            else:
                plaintext += rhs
        return plaintext

    def __getitem__(self, l):
        return self.G[l]

    def __contains__(self, k):
        return k in self.G

    def is_grammar(self):
        return bool(self.G['G'])
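# Illustrative, self-contained sketch (not taken from the classes above) of the
# bucket encoding that encode_rule/decode_rule implement: each production owns a
# range of size freq inside [0, total); encoding draws a random point in that
# range and then lifts it by a random multiple of total (the role convert2group
# plays), so decoding only has to reduce modulo total and scan the cumulative
# counts. The `ceiling` value is a made-up stand-in for the real group size.
import random


def _encode(rhs_dict, r, ceiling=2**20):
    keys = list(rhs_dict)
    i = keys.index(r)
    lo = sum(rhs_dict[k] for k in keys[:i])
    hi = lo + rhs_dict[r] - 1
    total = sum(rhs_dict.values())
    point = random.randint(lo, hi)
    return point + random.randint(0, (ceiling - point) // total) * total


def _decode(rhs_dict, code):
    p = code % sum(rhs_dict.values())
    for k, v in rhs_dict.items():
        if p < v:
            return k
        p -= v


_rules = {'password': 5, 'letmein': 3, '123456': 2}
assert _decode(_rules, _encode(_rules, 'letmein')) == 'letmein'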
class Solve:
    def __init__(self):
        with open('enable1.txt', 'r') as file:
            self.valid_scrabble_words = set()
            # could use chain.from_iterable here when we start using wildcards again
            for string in file:
                # self.valid_scrabble_words |= self.wildcard_it(string.strip())
                self.valid_scrabble_words.add(string.strip())
        self.scrabble_tile_frequencies = {'e': 12, 'a': 9, 'i': 9, 'o': 8, 'n': 6, 'r': 6, 't': 6, 'l': 4,
                                          's': 4, 'u': 4, 'd': 4, 'g': 3, 'b': 2, 'c': 2, 'm': 2, 'p': 2,
                                          'f': 2, 'h': 2, 'v': 2, 'w': 2, 'y': 2, 'k': 1, 'j': 1, 'x': 1,
                                          'q': 1, 'z': 1}
        # dummy tiles representing wildcards
        self.scrabble_tile_frequencies.update(dict.fromkeys("ABCDEFGHIJKLMNOPQRSTUVWXYZ", 2))
        self.scrabble_tiles = [tile for tile in self.scrabble_tile_frequencies
                               for x in range(self.scrabble_tile_frequencies[tile])]
        self.test_solution = ""
        # wildcard tiles, which are represented by uppercase letters, will default to a value of 0
        self.letter_scores = defaultdict(int, {'e': 1, 'a': 1, 'i': 1, 'o': 1, 'n': 1, 'r': 1, 't': 1,
                                               'l': 1, 's': 1, 'u': 1, 'd': 2, 'g': 2, 'b': 3, 'c': 3,
                                               'm': 3, 'p': 3, 'f': 4, 'h': 4, 'v': 4, 'w': 4, 'y': 4,
                                               'k': 5, 'j': 8, 'x': 8, 'q': 10, 'z': 10})
        if not isfile("word scores.pkl"):
            with Pool(8) as p:
                self.word_scores = dict(
                    zip(self.valid_scrabble_words,
                        p.map(self.string_score_2, self.valid_scrabble_words)))
            with open("word scores.pkl", 'wb') as file:
                dump(self.word_scores, file, HIGHEST_PROTOCOL)
        else:
            with open("word scores.pkl", 'rb') as file:
                self.word_scores = load(file)
        if not isfile('word graph.dawg'):
            self.word_graph = DAWG(self.valid_scrabble_words)
            self.word_graph.save('word graph.dawg')
        else:
            self.word_graph = DAWG().load('word graph.dawg')

    def reset(self):
        self.test_solution = ""
        self.scrabble_tiles = [tile for tile in self.scrabble_tile_frequencies
                               for x in range(self.scrabble_tile_frequencies[tile])]

    def wildcard_it(self, string):
        return {str(string[:x].lower() + string[x:].capitalize())[:y] +
                str(string[:x].lower() + string[x:].capitalize())[y:].capitalize()
                for x in range(len(string)) for y in range(x, len(string))}

    def string_score(self, solution):
        """solution is a string that is worth points
        returns the point value of the string including subwords"""
        return sum(self.word_scores[word] for word in self.words_in_string(solution))

    def string_score_2(self, solution):
        """solution is a string that is worth points
        returns the point value of the string NOT including subwords"""
        return sum(self.letter_scores[letter] for letter in solution)

    def words_in_string(self, string):
        return {word for x in range(len(string)) for word in self.word_graph.prefixes(string[x:])}

    def evaluate_part(self, candidate_tiles):
        """candidate_tiles is a string
        returns a point-value of the string that candidate_tiles represent"""
        return self.string_score(self.test_solution + candidate_tiles)

    def make_solution_method_1(self):
        """returns a string that is worth as many points as possible"""
        def part_value(part):
            return part[1]

        def get_part(part):
            return part[0]

        with Pool(32) as p:
            while self.scrabble_tiles:
                # doesn't work with wildcard tiles represented by dummy tiles
                parts = ["".join(tiles) for tiles in set(permutations(self.scrabble_tiles, r=4))]
                possible_part_list = list(zip(parts, p.map(self.evaluate_part, parts)))
                best_part = max(possible_part_list, key=part_value)
                print(best_part)
                self.test_solution += get_part(best_part)
                for tile in get_part(best_part):
                    self.scrabble_tiles.remove(tile)
                print(self.test_solution)
        return self.test_solution

    def add_to_solution(self, part):
        self.test_solution += part
        for tile in part:
            # remove used tiles from bag of scrabble tiles
            if tile.isupper():
                for owned_tile in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
                    self.scrabble_tiles.remove(owned_tile)
            else:
                self.scrabble_tiles.remove(tile)

    def generate_word_combinations(self, words, max_length):
        """words is a list of words. max_length is a positive integer
        returns a generator of strings composed of permutations of words for each
        length up to the length specified by max_length"""
        return ("".join(word_tuple) for word_tuple in
                chain.from_iterable(permutations(words, r=length)
                                    for length in range(1, max_length + 1)))

    def get_feasible_parts(self, word_list):
        """returns the set of strings that can be made from the current set of tiles left"""
        current_tile_count = Counter(self.scrabble_tiles)
        return (words for words in word_list
                if all(current_tile_count[letter] >= Counter(words)[letter] for letter in words))

    def make_solution_method_2(self):
        """returns a string that is worth as many points as possible"""
        while self.scrabble_tiles:
            possible_part_list = self.get_feasible_parts(self.valid_scrabble_words)
            best_parts = nlargest(100, possible_part_list, self.evaluate_part)  # get top n words
            if best_parts:
                best_part = max(self.get_feasible_parts(self.generate_word_combinations(best_parts, 2)),
                                key=self.evaluate_part)
                self.add_to_solution(best_part)
                print(self.test_solution)
                print(self.string_score(self.test_solution))
            else:
                break
        return self.test_solution

    def make_solution_method_3(self):
        """returns a string that is worth as many points as possible"""
        while self.scrabble_tiles:
            possible_part_list = self.get_feasible_parts(self.valid_scrabble_words)
            best_parts = nlargest(int(20 - len(self.test_solution) / 5),
                                  possible_part_list, self.evaluate_part)  # get top n words
            if best_parts:
                best_part = choice(list(self.get_feasible_parts(self.generate_word_combinations(best_parts, 3))))
                self.add_to_solution(best_part)
                print(self.test_solution)
                print(self.string_score(self.test_solution))
            else:
                break
        return self.test_solution
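# Minimal driver sketch (not part of the class above) showing how Solve might be
# exercised. It assumes enable1.txt is present in the working directory, that the
# pickled word scores and the saved DAWG can be (re)built, and that method 2 is
# the entry point of interest.
if __name__ == '__main__':
    solver = Solve()
    solution = solver.make_solution_method_2()
    print(solution)
    print(solver.string_score(solution))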
def _count(sym2_id: defaultdict(set), pmid2_id: defaultdict(set)):
    # pruning: remove the "empty" symbol
    if '' in sym2_id:
        del sym2_id['']
    logging.info("initializing counters")
    symbols = {s: 0 for s in sym2_id.keys()}  # global count per symbol
    references = {}  # count per id & symbol in the referenced titles
    for sym, ids in sym2_id.items():
        for id_ in ids:
            if id_ in references:
                references[id_][sym] = 0
            else:
                references[id_] = {sym: 0}
    logging.info("initializing DAFSA graph")
    dawg = DAWG(sym2_id.keys())
    medline = MedlineSession()
    for pmid, known_ids in pmid2_id.items():
        logging.info("counting PMID %d", pmid)
        relevant = {}  # checked symbols
        while True:
            try:
                for (txt,) in medline.query(Section.content).filter(
                        Section.pmid == pmid).filter(
                        Section.name != 'Copyright').filter(
                        Section.name != 'Vernacular'):
                    offsets = set(TokenOffsets(txt))
                    # only attempt prefix matches at offsets
                    for idx in offsets:
                        keys = dawg.prefixes(txt[idx:])
                        if keys:
                            sym = keys[-1]
                            # only offset-delimited matches
                            if idx + len(sym) in offsets:
                                symbols[sym] += 1
                                if sym in relevant:
                                    if relevant[sym]:
                                        for id_ in known_ids & sym2_id[sym]:
                                            references[id_][sym] += 1
                                else:
                                    relevant[sym] = False
                                    for id_ in known_ids & sym2_id[sym]:
                                        references[id_][sym] += 1
                                        relevant[sym] = True
                break
            except DatabaseError:
                medline = MedlineSession()
    for _id, counts in references.items():
        for sym, count in counts.items():
            print("{}\t{}\t{}\t{}".format(_id, repr(sym)[1:-1], count, symbols[sym]))
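# Small stand-alone illustration (not from _count) of the longest-prefix lookup
# the loop above relies on: dawg.prefixes() returns every stored symbol that is a
# prefix of the text at a given offset, and the longest one is kept. Assumes the
# `dawg` package; the symbols are made up.
from dawg import DAWG

symbols = DAWG(['IL', 'IL-2', 'IL-2R'])
hits = symbols.prefixes('IL-2 receptor alpha')
longest = max(hits, key=len) if hits else None  # the original takes keys[-1]
print(hits)     # ['IL', 'IL-2']
print(longest)  # 'IL-2'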