def ngrams_plus_wikipedia():
    """Add the titles listed in ``enwiki-titles`` to the dictionary-augmented table.

    Starts from the table returned by ``ngrams_plus_dictionary()`` and reads
    ``enwiki-titles`` (UTF-8) line by line.  Each line is stripped and
    upper-cased; lines containing any code point above 127 are skipped.
    From the remaining lines:

    * anything inside parentheses (nesting is tracked) is dropped,
    * the letters A-Z are kept,
    * every run of other characters becomes a single space.

    The cleaned text is stored as ``(text, nwords, 1000)`` under
    ``make_bag(text)`` when that key is absent or when the existing entry's
    third element is below 1000.  Every stored text is printed.

    Returns:
        The updated table (the same dict object that
        ``ngrams_plus_dictionary()`` returned).
    """
    ngrams = ngrams_plus_dictionary()
    # ascii_uppercase instead of the locale-dependent string.uppercase: the
    # lines that reach the character loop are pure ASCII, so only A-Z can match.
    letters = string.ascii_uppercase
    # The context manager closes the titles file even if processing fails.
    with codecs.open("enwiki-titles", encoding="utf-8") as titles:
        for line in titles:
            line = line.strip().upper()
            if not line:
                continue
            if any(ord(c) > 127 for c in line):
                continue
            chars = []
            space = True  # True while the last emitted char is a separator
            parens = 0    # current parenthesis nesting depth
            for ch in line:
                if parens:
                    # Inside a parenthetical: only track nesting, emit nothing.
                    if ch == "(":
                        parens += 1
                    if ch == ")":
                        parens -= 1
                elif ch in letters:
                    chars.append(ch)
                    space = False
                elif ch == "(":
                    parens += 1
                elif not space:
                    chars.append(" ")
                    space = True
            if chars:
                # A title ending in a separator or a parenthetical leaves one
                # trailing space in chars; trim it so the stored text is clean.
                # (Assumes make_bag ignores spaces -- TODO confirm.)
                text = "".join(chars).rstrip()
                nwords = len(text.split())
                bagnum = make_bag(text)
                if (bagnum not in ngrams) or (ngrams[bagnum][2] < 1000):
                    ngrams[bagnum] = (text, nwords, 1000)
                    print(text)
    return ngrams
def ngrams_plus_wikipedia():
    """Add the titles listed in ``enwiki-titles`` to the pickled anagram table.

    Loads the table from ``coanagram_data.pickle`` and reads ``enwiki-titles``
    (UTF-8) line by line.  Each line is stripped and upper-cased; lines
    containing any code point above 127 are skipped.  From the remaining
    lines:

    * anything inside parentheses (nesting is tracked) is dropped,
    * the letters A-Z are kept,
    * every run of other characters becomes a single space.

    Table values are lists of ``(text, nwords, freq)`` tuples.  The cleaned
    text replaces the whole list with ``[(text, nwords, 1000)]`` under
    ``make_bag(text)`` when that key is absent or when the first tuple's
    frequency is below 1000.  Every stored text is printed.

    Returns:
        The updated table (the object loaded from the pickle).
    """
    print('Loading.')
    # SECURITY: pickle.load can execute arbitrary code; the pickle file must
    # come from a trusted source.  The open mode is left at the default, as in
    # the original, to stay consistent with however the file was written.
    with open('coanagram_data.pickle') as pickled:
        ngrams = pickle.load(pickled)
    print('Done loading.')
    # ascii_uppercase instead of the locale-dependent string.uppercase: the
    # lines that reach the character loop are pure ASCII, so only A-Z can match.
    letters = string.ascii_uppercase
    # The context manager closes the titles file even if processing fails.
    with codecs.open('enwiki-titles', encoding='utf-8') as titles:
        for line in titles:
            line = line.strip().upper()
            if not line:
                continue
            if any(ord(c) > 127 for c in line):
                continue
            chars = []
            space = True  # True while the last emitted char is a separator
            parens = 0    # current parenthesis nesting depth
            for ch in line:
                if parens:
                    # Inside a parenthetical: only track nesting, emit nothing.
                    if ch == '(':
                        parens += 1
                    if ch == ')':
                        parens -= 1
                elif ch in letters:
                    chars.append(ch)
                    space = False
                elif ch == '(':
                    parens += 1
                elif not space:
                    chars.append(' ')
                    space = True
            if chars:
                # A title ending in a separator or a parenthetical leaves one
                # trailing space in chars; trim it so the stored text is clean.
                # (Assumes make_bag ignores spaces -- TODO confirm.)
                text = ''.join(chars).rstrip()
                nwords = len(text.split())
                bagnum = make_bag(text)
                if (bagnum not in ngrams) or (ngrams[bagnum][0][2] < 1000):
                    ngrams[bagnum] = [(text, nwords, 1000)]
                    print(text)
    return ngrams
def ngrams_plus_dictionary():
    """Add the words in ``enable1.txt`` to the n-gram table.

    Starts from the table returned by ``ngram_data()``.  Every non-blank line
    of ``enable1.txt`` is stripped and upper-cased; if ``make_bag`` of that
    word is not already a key, the word is stored as ``(text, 1, 100)`` and
    printed.  Existing entries are never overwritten.

    Returns:
        The updated table (the same dict object that ``ngram_data()``
        returned).
    """
    ngrams = ngram_data()
    # The context manager closes the word list even if processing fails.
    with open("enable1.txt") as wordlist:
        for line in wordlist:
            text = line.strip().upper()
            if not text:
                continue
            bagnum = make_bag(text)
            if bagnum not in ngrams:
                ngrams[bagnum] = (text, 1, 100)
                print(text)
    return ngrams
def complex_anagram(text):
    """Return the best anagram found for *text* as a 3-tuple.

    The letters are first looked up directly with
    ``simple_anagram_numeric``; a truthy result is returned as
    ``(result[0], 1, result[2])``.  Otherwise every candidate produced by
    ``complex_anagram_gen`` is scanned, candidates whose text contains ``"/"``
    are ignored, and the first one with the highest frequency above zero wins.

    Returns:
        ``(text, 1, freq)`` for a direct hit, else ``(text, 2, freq)``;
        ``(None, 2, 0)`` when no candidate qualifies.
    """
    bagnum = make_bag(text)
    direct = simple_anagram_numeric(bagnum)
    if direct:
        return direct[0], 1, direct[2]

    winner = None
    top_freq = 0
    for phrase, _nwords, phrase_freq in complex_anagram_gen(bagnum):
        if "/" in phrase:
            continue
        if phrase_freq > top_freq:
            winner = phrase
            top_freq = phrase_freq
    return winner, 2, top_freq
def ngram_data():
    """Build the anagram table from the n-gram frequency files.

    Reads ``ngrams/1grams.txt``, ``ngrams/2grams.txt`` and
    ``ngrams/3grams.txt`` (UTF-8).  Each non-blank line is evaluated as a
    Python expression and unpacked into ``(words, freq)``.  Entries with a
    frequency below 10000 are ignored.  The words are joined with spaces and
    stored as ``(text, nwords, freq)`` under ``make_bag(text)``, keeping only
    the highest-frequency phrase per key.  When a phrase displaces an
    existing one, ``(text, freq, bagnum)`` is printed.

    Returns:
        dict mapping ``make_bag`` keys to ``(text, nwords, freq)`` tuples.
    """
    ngrams = {}
    for filename in "1grams.txt", "2grams.txt", "3grams.txt":
        # The context manager closes each data file even if parsing fails.
        with codecs.open("ngrams/" + filename, encoding="utf-8") as ngram_file:
            for line in ngram_file:
                if not line.strip():
                    continue
                # SECURITY: eval executes whatever the data file contains;
                # only use n-gram files from a trusted source.
                words, freq = eval(line)
                nwords = len(words)
                if freq >= 10000:
                    text = " ".join(words)
                    bagnum = make_bag(text)
                    if bagnum not in ngrams:
                        ngrams[bagnum] = (text, nwords, freq)
                    elif freq > ngrams[bagnum][2]:
                        # we found a better anagram, let's see it
                        ngrams[bagnum] = (text, nwords, freq)
                        print((text, freq, bagnum))
    return ngrams
def ngram_data():
    """Build the anagram table, keeping up to three phrases per key.

    Reads ``ngrams/1grams.txt``, ``ngrams/2grams.txt`` and
    ``ngrams/3grams.txt`` (UTF-8).  Each non-blank line is evaluated as a
    Python expression and unpacked into ``(words, freq)``.  Entries with a
    frequency below 10000 are ignored.  The words are joined with spaces into
    a ``(text, nwords, freq)`` tuple filed under ``make_bag(text)``:

    * a new key gets a one-element list;
    * for an existing key, a tuple whose frequency beats the list's last
      entry is appended, the list is re-sorted with ``bagtuple_compare`` as
      the sort key, and truncated to its first three entries.  The tuple is
      printed if it survived the truncation.

    Returns:
        dict mapping ``make_bag`` keys to lists of at most three
        ``(text, nwords, freq)`` tuples.
    """
    ngrams = {}
    for filename in ['1grams.txt', '2grams.txt', '3grams.txt']:
        # The context manager closes each data file even if parsing fails.
        with codecs.open('ngrams/' + filename, encoding='utf-8') as ngram_file:
            for line in ngram_file:
                if not line.strip():
                    continue
                # SECURITY: eval executes whatever the data file contains;
                # only use n-gram files from a trusted source.
                words, freq = eval(line)
                nwords = len(words)
                if freq >= 10000:
                    text = ' '.join(words)
                    bagnum = make_bag(text)
                    bagtuple = (text, nwords, freq)
                    if bagnum not in ngrams:
                        ngrams[bagnum] = [bagtuple]
                    elif freq > ngrams[bagnum][-1][2]:
                        # we found a better anagram, let's see if it stays
                        # in the top n
                        # NOTE(review): a list holding fewer than three
                        # entries never gains a phrase that does not beat its
                        # last entry -- confirm this is intended.
                        ngrams[bagnum].append(bagtuple)
                        ngrams[bagnum].sort(key=bagtuple_compare)
                        ngrams[bagnum] = ngrams[bagnum][:3]
                        if bagtuple in ngrams[bagnum]:
                            print(bagtuple)
    return ngrams
def multi_anagram(text, n=10):
    """Return up to *n* distinct anagram candidates for *text*, best first.

    Candidates are ``(rank, freq, text)`` tuples.  A truthy result from
    ``simple_anagram_numeric`` contributes one entry with rank -1; every item
    from ``complex_anagram_gen`` contributes an entry with rank -2.  The
    entries are sorted and walked from the largest downwards, so rank -1
    comes before rank -2 and higher frequencies come first within a rank.
    Two candidates made of the same words in a different order count as
    duplicates; only the first one seen is kept.

    Returns:
        list of ``(rank, freq, text)`` tuples; the walk stops once *n*
        distinct word sets have been collected.
    """
    bagnum = make_bag(text)
    candidates = []

    direct = simple_anagram_numeric(bagnum)
    if direct:
        candidates.append((-1, direct[2], direct[0]))
    candidates.extend(
        (-2, phrase_freq, phrase)
        for phrase, _nwords, phrase_freq in complex_anagram_gen(bagnum)
    )
    candidates.sort()

    chosen = []
    seen = set()
    for entry in reversed(candidates):
        # Word order is irrelevant when deciding whether two phrases match.
        signature = " ".join(sorted(entry[2].split()))
        if signature in seen:
            continue
        seen.add(signature)
        chosen.append(entry)
        if len(seen) >= n:
            break
    return chosen
def multi_anagram(text, n=10):
    """Return up to *n* distinct anagram candidates for *text*, best first.

    Candidates are ``(rank, freq, text)`` tuples: rank -1 for each entry
    returned by ``simple_anagram_numeric``, rank -2 for each item from
    ``complex_anagram_gen``.  After sorting, the top ``2*n - 1`` candidates
    are inspected.  A text of the form ``before/after`` has its ``after``
    part re-anagrammed recursively; on success the candidate is replaced by
    ``(-3, min(both frequencies), before + ' ' + best re-anagram)``.

    The list is then re-sorted and walked from the largest entry downwards.
    Texts that still contain ``'/'`` are skipped, and candidates made of the
    same words in a different order are treated as duplicates.

    Args:
        text: the letters to anagram.
        n: how many distinct results to collect before stopping.

    Returns:
        list of ``(rank, freq, text)`` tuples, possibly empty.
    """
    got = []
    bagnum = make_bag(text)
    firsttry = simple_anagram_numeric(bagnum)
    # ``or ()`` tolerates a falsy "no match" value (None or empty) --
    # presumably a list otherwise; verify against simple_anagram_numeric.
    for phrase, _words, freq in firsttry or ():
        got.append((-1, freq, phrase))
    for phrase, _words, freq in complex_anagram_gen(bagnum):
        got.append((-2, freq, phrase))
    got.sort()

    # Clamp the scan to the candidates that exist.  The unclamped
    # range(1, n*2) indexed past the start of ``got`` and raised IndexError
    # when there were fewer than 2n-1 candidates, and always when it was empty.
    for i in range(1, min(n * 2, len(got) + 1)):
        text = got[-i][2]
        while '/' in text:
            # got a reasonable phrase plus a garble of leftover letters
            # try to do something with the rest
            print('\tRe-anagramming:' + ' ' + text)
            before, after = text.split('/')
            reanagram = multi_anagram(after, 1)
            if reanagram:
                newtext = before + ' ' + reanagram[0][2]
                newfreq = min(got[-i][1], reanagram[0][1])
                got[-i] = (-3, newfreq, newtext)
                text = got[-i][2]
            else:
                break
    got.sort()

    best = []
    used = set()
    for entry in reversed(got):
        text = entry[2]
        if '/' in text:
            # edge case where we have a garble we didn't expand
            # because it was too far down the list
            continue
        # Word order is irrelevant when deciding whether two phrases match.
        ordered = ' '.join(sorted(text.split()))
        if ordered not in used:
            used.add(ordered)
            best.append(entry)
            if len(used) >= n:
                break
    return best
def simple_anagram(text):
    """Look up *text* directly in the anagram table.

    Converts *text* with ``make_bag`` and returns whatever
    ``simple_anagram_numeric`` yields for that key.
    """
    return simple_anagram_numeric(make_bag(text))