def createWordsObjects(self, dataDict, language, translateTo):
    wordObjects = []
    for k in dataDict.keys():
        if int(k) < 7:
            wordObjects.append(Words(dataDict[k][language], dataDict[k][translateTo]))
        elif int(k) < 12:
            wordObjects.append(Words(dataDict[k][language], dataDict[k][translateTo], 1))
        else:
            wordObjects.append(Words(dataDict[k][language], dataDict[k][translateTo], 2))
    return wordObjects
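# Hedged usage sketch (illustration only, not from the original source):
# createWordsObjects buckets entries by their numeric key -- keys 0-6 build a
# plain Words(term, translation), keys 7-11 pass category 1, and 12+ pass 2.
# The dictionary layout and the 'en'/'es' language keys below are assumptions.
#
#   dataDict = {
#       '0':  {'en': 'house', 'es': 'casa'},
#       '8':  {'en': 'tree',  'es': 'arbol'},
#       '13': {'en': 'river', 'es': 'rio'},
#   }
#   wordObjects = self.createWordsObjects(dataDict, 'en', 'es')
#   # -> [Words('house','casa'), Words('tree','arbol',1), Words('river','rio',2)]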
class Ghost:
    def __init__(self):
        emotion_list = ["Angry", "Fear", "Happy", "Sad", "Surprise"]
        self.feeling = random.choice(emotion_list)
        emotion_list.remove(self.feeling)
        self.opposite = random.choice(emotion_list)
        self.read_rate = random.randint(1, 10)  # number of sentences to read
        #self.current_power = random.randint(0,30)  # starting power?
        self.sentences = []
        self.words = Words(self.feeling, self.opposite)
        self.markov_blob = None

    # def get_power_words(self):
    #     new_set = []
    #     for word in self.words:
    #         if word.type == "Power":
    #             new_set.append(word)
    #     return new_set

    # def get_opposite_words(self):
    #     return np.select([self.words.type == "Obstacle"], self.words)

    # def get_other_words(self):
    #     return np.select([self.words.type == "Null"], self.words)

    # def get_power(self):
    #     return np.sum(self.get_power_words().power)

    def read(self, story, timestep):
        # Get the story text
        #f = open(story, "r")
        #text_blob_object = TextBlob(f.read())
        #f.close()

        # Get the current state of the story and read from where the ghost is
        # in the story, at its read rate
        start_sentence = timestep * self.read_rate
        # Check that the start sentence isn't greater than the number of sentences;
        # if it is, we're done
        if len(story.sentences) < start_sentence:
            return str(story), 'DONE'
        new_story = ""
        for i in range(start_sentence):
            new_story += str(story.sentences[i]) + " "
        # otherwise start reading until either the story is over or we've read our read rate
        # Get the chunk starting at start_sentence and read
        for i in range(start_sentence, start_sentence + self.read_rate):
            if i < len(story.sentences):
                #print("------------")
                #print("Original Sentence: " + str(story.sentences[i]))
                new_sentence = self.parse_sentence(story.sentences[i])
                story.sentences[i] = new_sentence
                new_story += str(new_sentence) + " "
                #if str(new_sentence) != str(story.sentences[i]):
                #    print("------------")
                #    print("String mismatch", flush=True)
                #    print("New Sentence: " + str(story.sentences[i]), flush=True)
        if start_sentence + self.read_rate < len(story.sentences):
            for i in range(start_sentence + self.read_rate, len(story.sentences)):
                new_story += str(story.sentences[i]) + " "
        #self.write(story, story.text)
        # If still reading, return READING; otherwise return DONE
        if start_sentence + self.read_rate >= len(story.sentences):
            return new_story, 'DONE'
        else:
            return new_story, 'READING'

    def parse_sentence(self, sentence):
        emotion_analysis = te.get_emotion(str(sentence))
        if emotion_analysis[self.opposite] > emotion_analysis[self.feeling]:
            # Rewrite it
            sentence = self.rewrite(sentence)
        elif emotion_analysis[self.feeling] > emotion_analysis[self.opposite]:
            # Take its words
            #self.words_markov = markovify.combine([self.words_markov, markovify.Text(str(sentence))], [1, 1])
            self.sentences.append(str(sentence))
            self.markov_blob = markovify.Text(self.sentences)
            for word in sentence.words:
                #self.words.append(Word(word, te.get_emotion(word), self.feeling, self.opposite))
                self.words.add_word(word)
            #for word in word_bag:
            #    if word == self.SCREAM:  # power surge?
        return sentence

    def rewrite(self, sentence):
        if self.markov_blob is None:
            return sentence
        emotion_analysis = te.get_emotion(str(sentence))
        #b.sentence =
        #used_words = []
        #words_to_use = self.get_power_words().copy()
        #no_power = False
        i = 0
        #while emotion_analysis[self.opposite] >= emotion_analysis[self.feeling] and not no_power:
        while emotion_analysis[self.opposite] >= emotion_analysis[self.feeling] and i < 5:
            used_words = []
            #words_to_use = self.get_power_words().copy()
            # Change the sentence
            test_sentence = self.markov_blob.make_short_sentence(len(str(sentence)))
            if test_sentence is not None:
                emotion_analysis = te.get_emotion(test_sentence)
                if emotion_analysis[self.opposite] < emotion_analysis[self.feeling]:
                    new_blob = TextBlob(test_sentence)
                    if self.words.has_words(new_blob.sentences[0].words):
                        sentence = new_blob.sentences[0]
                        self.words.remove_words(new_blob.sentences[0].words)
                        i = 5
                    else:
                        i = 5
            i += 1
            # Remove negative words? Find markovify fits for good words?
            # Tell it to just change words until it gets a good score set?
            # for word in sentence.words:
            #     pos, neg = self.parse_word(word)
            #     if neg > pos:
            #         if len(words_to_use) > 0:
            #             new = random.choice(words_to_use)
            #             words_to_use.remove(new)
            #             used_words.append(new)
            #             word = new
            #         if len(words_to_use) < 1:
            #             no_power = True
            #             used_words = []
            #             break
            #emotion_analysis = te.get_emotion(str(sentence))
        #if i < 5 or (emotion_analysis[self.opposite] < emotion_analysis[self.feeling]):
        #    # Remove all the words we used
        #    for word in used_words:
        #        self.words.remove(word)
        return sentence

    def parse_word(self, word):
        analyse = te.get_emotion(word)
        positive = analyse[self.feeling]
        negative = analyse[self.opposite]
        return positive, negative

    def write(self, story, text):
        f = open(story, "w")
        f.write(text)
        f.close()
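# Hedged usage sketch (illustration only, not from the original source):
# Ghost.read() consumes a TextBlob-like story read_rate sentences per timestep,
# rewriting sentences that score higher on the ghost's opposite emotion and
# harvesting words/markov state from sentences that match its own feeling.
# The driver loop and 'story.txt' path below are assumptions about intended use.
#
#   story = TextBlob(open('story.txt').read())
#   ghost = Ghost()
#   timestep, state = 0, 'READING'
#   while state == 'READING':
#       text, state = ghost.read(story, timestep)
#       timestep += 1
#   print(text)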
            t = ' '.join(snd[:j])
            words = self.phon.get(t)
            if words:
                for s in self.soundsToWords(snd[j:]):
                    yield [words] + s


if __name__ == '__main__':
    def words(str):
        return re.findall('[a-z\']+', str.lower())

    def pron(wl, wd):
        print(' '.join([str(wd[w][0]) if w in wd else '<%s>' % (w,) for w in wl]))

    P = Phon(Words())
    for a in sys.argv[1:]:
        pron(words(a), P.W)
    print(P.word['there'])
    print(P.phon[P.word['there'][0]])
    P.phraseSound(['making', 'mistake'])
    P.phraseSound(['may', 'king', 'mist', 'ache'])
    x = P.phraseSound(['making', 'miss', 'steak'])
    from itertools import product
    for f in P.soundsToWords(x):
        print(f)
        #print(list(product(*f)))
        if x == y:
            return 0
        damlev = ngd.diff.damlev
        sx, sy = p.phraseSound([x]), p.phraseSound([y])
        if sx == sy and sx:
            # sound the same, e.g. there/their. consider these equal.
            return damlev
        # otherwise, calculate phonic/edit difference
        return max(damlev, min(NGramDiffScore.overlap(sx, sy), abs(len(x) - len(y))))


if __name__ == '__main__':
    import sys
    sys.path.append('..')
    from grambin import GramsBin
    from word import Words, NGram3BinWordCounter
    from phon import Phon
    import logging
    logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)
    logging.debug('loading...')
    g = GramsBin(
        '/home/pizza/proj/spill-chick/data/corpus/google-ngrams/word.bin',
        '/home/pizza/proj/spill-chick/data/corpus/google-ngrams/ngram3.bin')
    w = Words(NGram3BinWordCounter(g.ng))
    p = Phon(w, g)
    logging.debug('loaded.')
    pass
class Chick:
    def __init__(self):
        # initialize all "global" data
        logger.debug('loading...')
        logger.debug(' corpus...')
        # FIXME: using absolute paths is the easiest way to make us work from the cmdline and
        # when invoked in a web app. perhaps we could set up softlinks in /var/ to make this
        # slightly more respectable.
        self.g = GramsBin(
            '/home/pizza/proj/spill-chick/data/corpus/google-ngrams/word.bin',
            '/home/pizza/proj/spill-chick/data/corpus/google-ngrams/ngram3.bin')
        self.w = Words(NGram3BinWordCounter(self.g.ng))
        logger.debug(' phon')
        self.p = Phon(self.w, self.g)
        logger.debug('done.')
        # sanity-check junk
        """
        logger.debug('w.correct(naieve)=%s' % self.w.correct(u'naieve'))
        logger.debug('w.correct(refridgerator)=%s' % self.w.correct(u'refridgerator'))
        logger.debug('g.freqs(refridgerator)=%s' % self.g.freqs(u'refridgerator'))
        logger.debug('g.freqs(refrigerator)=%s' % self.g.freqs(u'refrigerator'))
        logger.debug('g.freq((didn))=%s' % self.g.freq((u'didn',)))
        logger.debug('g.freq((a,mistake))=%s' % self.g.freq((u'a',u'mistake')))
        logger.debug('g.freq((undoubtedly,be,changed))=%s' % self.g.freq((u'undoubtedly',u'be',u'changed')))
        logger.debug('g.freq((undoubtedly,be))=%s' % self.g.freq((u'undoubtedly',u'be')))
        logger.debug('g.freq((be,changed))=%s' % self.g.freq((u'be',u'changed')))
        logger.debug('g.freq((it,it,did))=%s' % self.g.freq((u'it',u'it',u'did')))
        logger.debug('g.freq((it,it))=%s' % self.g.freq((u'it',u'it')))
        logger.debug('g.freq((it,did))=%s' % self.g.freq((u'it',u'did')))
        logger.debug('g.freq((hello,there,sir))=%s' % self.g.freq((u'hello',u'there',u'sir')))
        logger.debug('g.freq((hello,there))=%s' % self.g.freq((u'hello',u'there')))
        logger.debug('g.freq((hello,there,,))=%s' % self.g.freq((u'hello',u'there',u',')))
        logger.debug('g.freq((they,\',re))=%s' % self.g.freq((u'they',u"'",u're')))
        """

    # FIXME: soundsToWords is expensive and should only be run as a last resort
    def phonGuess(self, toks, minfreq):
        """
        given a list of tokens, search for a list of words with similar
        pronunciation having g.freq(x) > minfreq
        """
        # create a phonetic signature of the ngram
        phonsig = self.p.phraseSound(toks)
        logger.debug('phonsig=%s' % phonsig)
        phonwords = list(self.p.soundsToWords(phonsig))
        logger.debug('phonwords=%s' % (phonwords,))
        if phonwords == [[]]:
            phonpop = []
        else:
            # remove any words that do not meet the minimum frequency;
            # they cannot possibly be part of the answer
            phonwords2 = [[[w for w in p if self.g.freq(tuple(w)) > minfreq]
                           for p in pw] for pw in phonwords]
            logger.debug('phonwords2 lengths=%s product=%u' % \
                (' '.join([str(len(p)) for p in phonwords2[0]]),
                 reduce(lambda x, y: x*y, [len(p) for p in phonwords2[0]])))
            if not all(phonwords2):
                return []
            #logger.debug('phonwords2=(%u)%s...' % (len(phonwords2), phonwords2[:10],))
            # remove any signatures that contain completely empty items after previous
            phonwords3 = phonwords2
            #logger.debug('phonwords3=(%u)%s...' % (len(phonwords3), phonwords3))
            # FIXME: product() function is handy in this case but is potentially hazardous.
            # we should force a limit to the length of any list passed to it to ensure
            # the avoidance of any pathological, memory-filling, swap-inducing behavior
            phonwords4 = list(flatten([list(product(*pw)) for pw in phonwords3]))
            logger.debug('phonwords4=(%u)%s...' % (len(phonwords4), phonwords4[:20]))
            # look up ngram popularity, toss anything not more popular than original and sort
            phonwordsx = [tuple(flatten(p)) for p in phonwords4]
            phonpop = rsort1([(pw, self.g.freq(pw, min)) for pw in phonwordsx])
            #logger.debug('phonpop=(%u)%s...' % (len(phonpop), phonpop[:10]))
            phonpop = list(takewhile(lambda x: x[1] > minfreq, phonpop))
            #logger.debug('phonpop=%s...' % (phonpop[:10],))
        if phonpop == []:
            return []
        best = phonpop[0][0]
        return [[x] for x in best]

    """
    return a list of ngrampos permutations where each token has been replaced
    by a word with similar pronunciation, and g.freqs(word) > minfreq
    """
    def permphon(self, ngrampos, minfreq):
        perms = []
        for i in range(len(ngrampos)):
            prefix = ngrampos[:i]
            suffix = ngrampos[i+1:]
            tokpos = ngrampos[i]
            tok = tokpos[0]
            sounds = self.p.word[tok]
            if not sounds:
                continue
            #logger.debug('tok=%s sounds=%s' % (tok, sounds))
            for sound in sounds:
                soundslikes = self.p.phon[sound]
                #logger.debug('tok=%s soundslikes=%s' % (tok, soundslikes))
                for soundslike in soundslikes:
                    if len(soundslike) > 1:
                        continue
                    soundslike = soundslike[0]
                    if soundslike == tok:
                        continue
                    #logger.debug('soundslike %s -> %s' % (tok, soundslike))
                    if self.g.freqs(soundslike) <= minfreq:
                        continue
                    newtok = (soundslike,) + tokpos[1:]
                    damlev = damerau_levenshtein(tok, soundslike)
                    td = TokenDiff([tokpos], [newtok], damlev)
                    perms.append(NGramDiff(prefix, td, suffix, self.g, soundalike=True))
        return perms

    @staticmethod
    def ngrampos_merge(x, y):
        return (x[0]+y[0], x[1], x[2], x[3])

    def permjoin(self, l, minfreq):
        """
        given a list of strings, produce permutations by joining two tokens together
        example: [a,b,c,d] -> [[ab,c,d],[a,bc,d],[a,b,cd]]
        """
        perms = []
        if len(l) > 1:
            for i in range(len(l)-1):
                joined = Chick.ngrampos_merge(l[i], l[i+1])
                if self.g.freqs(joined[0]) > minfreq:
                    td = TokenDiff(l[i:i+2], [joined], 1)
                    ngd = NGramDiff(l[:i], td, l[i+2:], self.g)
                    perms.append(ngd)
        return perms

    @staticmethod
    def ngrampos_split_back(x, y):
        return (x[0]+y[0][:1], x[1], x[2], x[3]), (y[0][1:], y[1], y[2], y[3])

    @staticmethod
    def ngrampos_split_forward(x, y):
        return (x[0][:-1], x[1], x[2], x[3]), (x[0][-1:]+y[0], y[1], y[2], y[3])

    def intertoken_letterswap(self, l, target_freq):
        # generate permutations of the token list with the beginning and ending letter
        # of each token swapped between adjacent tokens
        if len(l) < 2:
            return []
        perms = []
        for i in range(len(l)-1):
            if len(l[i][0]) > 1:
                x, y = Chick.ngrampos_split_forward(l[i], l[i+1])
                if self.g.freq((x[0], y[0])) >= target_freq:
                    td = TokenDiff(l[i:i+2], [x, y], 0)
                    ngd = NGramDiff(l[:i], td, l[i+2:], self.g)
                    perms.append(ngd)
            if len(l[i+1][0]) > 1:
                x, y = Chick.ngrampos_split_back(l[i], l[i+1])
                if self.g.freq((x[0], y[0])) >= target_freq:
                    td = TokenDiff(l[i:i+2], [x, y], 0)
                    ngd = NGramDiff(l[:i], td, l[i+2:], self.g)
                    perms.append(ngd)
        #print 'intertoken_letterswap=',perms
        return perms

    def do_suggest(self, target_ngram, target_freq, ctx, d, max_suggest=5):
        """
        given an infrequent ngram from a document, attempt to calculate a more
        frequent one that is similar textually and/or phonetically
        """
        target_ngram = list(target_ngram)
        part = []
        # permutations via token joining
        # expense: cheap, though rarely useful
        # TODO: smarter token joining; pre-calculate based on tokens
        part += self.permjoin(target_ngram, target_freq)
        #logger.debug('permjoin(%s)=%s' % (target_ngram, part,))
        part += self.intertoken_letterswap(target_ngram, target_freq)
        part += self.permphon(target_ngram, target_freq)
        part += self.g.ngram_like(target_ngram, target_freq)
        logger.debug('part after ngram_like=(%u)%s...' % (len(part), part[:5],))
        # calculate the closest, best ngram in part
        sim = sorted([NGramDiffScore(ngd, self.p) for ngd in part])
        for s in sim[:25]:
            logger.debug('sim %4.1f %2u %u %6u %6u %s' % \
                (s.score, s.ediff, s.sl, s.ngd.oldfreq, s.ngd.newfreq, ' '.join(s.ngd.newtoks())))
        best = list(takewhile(lambda s: s.score > 0, sim))[:max_suggest]
        for b in best:
            logger.debug('best %s' % (b,))
        return best

    def ngram_suggest(self, target_ngram, target_freq, d, max_suggest=1):
        """
        we calculate ngram context and collect solutions for each context
        containing the target, then merge them into a cohesive, best suggestion.
            c d e
        a b c d e f g
        given ngram (c,d,e), calculate context and solve:
        [S(a,b,c), S(b,c,d), S(c,d,e), S(d,e,f), S(e,f,g)]
        """
        logger.debug('target_ngram=%s' % (target_ngram,))
        tlen = len(target_ngram)
        context = list(d.ngram_context(target_ngram, tlen))
        logger.debug('context=%s' % (context,))
        ctoks = [c[0] for c in context]
        clen = len(context)
        logger.debug('tlen=%d clen=%d' % (tlen, clen))
        context_ngrams = list2ngrams(context, tlen)
        logger.debug('context_ngrams=%s' % (context_ngrams,))
        # gather suggestions for each ngram overlapping target_ngram
        sugg = [(ng, self.do_suggest(ng, self.g.freq([x[0] for x in ng]), context_ngrams, d))
                for ng in [target_ngram]]  #context_ngrams]
        for ng, su in sugg:
            for s in su:
                logger.debug('sugg %s' % (s,))
        """
        previously we leaned heavily on ngram frequencies and the sums of them
        for evaluating suggestions in context. instead, we will focus specifically
        on making the smallest changes which have the largest improvements, and on
        trying to normalize a document, i.e. "filling in the gaps" of as many
        0-freq ngrams as possible.
        """
        # merge suggestions based on what they change
        realdiff = {}
        for ng, su in sugg:
            for s in su:
                rstr = ' '.join(s.ngd.newtoks())
                if rstr in realdiff:
                    realdiff[rstr] += s
                else:
                    realdiff[rstr] = s
                logger.debug('real %s %s' % (rstr, realdiff[rstr]))
        # sort the merged suggestions based on their combined score
        rdbest = sorted(realdiff.values(), key=lambda x: x.score, reverse=True)
        # finally, allow frequency to overcome small differences in score, but only
        # for scores that are within 1 to begin with.
        # if we account for frequency too much, the common language idioms always crush
        # valid but less common phrases; if we don't account for frequency at all, we often
        # recommend very similar but uncommon and weird phrases. this attempts to strike a balance.
        rdbest.sort(lambda x, y: y.score - x.score if abs(x.score - y.score) > 1 \
                    else (y.score + int(log(y.ngd.newfreq))) - \
                         (x.score + int(log(x.ngd.newfreq))))
        for ngds in rdbest:
            logger.debug('best %s' % (ngds,))
        return rdbest

    def suggest(self, txt, max_suggest=1, skip=[]):
        """
        given a string, identify the least-common n-grams not present in 'skip'
        and yield suggested replacements for them
        """
        logger.debug('Chick.suggest(txt=%s max_suggest=%s, skip=%s)' % (txt, max_suggest, skip))
        d = Doc(txt, self.w)
        logger.debug('doc=%s' % d)
        """
        locate uncommon n-gram sequences which may indicate grammatical errors
        see if we can determine better replacements for them given their context
        """
        # order n-grams by unpopularity
        ngsize = min(3, d.totalTokens())
        logger.debug('ngsize=%s d.totalTokens()=%s' % (ngsize, d.totalTokens()))
        logger.debug('ngram(1) freq=%s' % list(d.ngramfreqctx(self.g, 1)))
        # locate the least-common ngrams
        # TODO: in some cases an ngram is unpopular, but overlapping ngrams on either side
        # are relatively popular.
        # is this useful in differentiating uncommon but valid phrases from invalid ones?
""" sugg did the future 156 sugg the future would 3162 sugg future would undoubtedly 0 sugg would undoubtedly be 3111 sugg undoubtedly be changed 0 """ least_common = sort1(d.ngramfreqctx(self.g, ngsize)) logger.debug('least_common=%s' % least_common[:20]) # remove any ngrams present in 'skip' least_common = list(dropwhile(lambda x: x[0] in skip, least_common)) # filter ngrams containing numeric tokens or periods, they generate too many poor suggestions least_common = list(filter( lambda ng: not any(re.match('^(?:\d+|\.)$', n[0][0], re.U) for n in ng[0]), least_common)) # FIXME: limit to reduce work least_common = least_common[:max(20, len(least_common)/2)] # gather all suggestions for all least_common ngrams suggestions = [] for target_ngram,target_freq in least_common: suggs = self.ngram_suggest(target_ngram, target_freq, d, max_suggest) if suggs: suggestions.append(suggs) if not suggestions: """ """ ut = list(d.unknownToks()) logger.debug('unknownToks=%s' % ut) utChanges = [(u, (self.w.correct(u[0]), u[1], u[2], u[3])) for u in ut] logger.debug('utChanges=%s' % utChanges) utChanges2 = list(filter(lambda x: x not in skip, utChanges)) for old,new in utChanges2: td = TokenDiff([old], [new], damerau_levenshtein(old[0], new[0])) ngd = NGramDiff([], td, [], self.g) ngds = NGramDiffScore(ngd, None, 1) suggestions.append([ngds]) logger.debug('------------') logger.debug('suggestions=%s' % (suggestions,)) suggs = filter(lambda x:x and x[0].ngd.newfreq != x[0].ngd.oldfreq, suggestions) logger.debug('suggs=%s' % (suggs,)) # sort suggestions by their score, highest first bestsuggs = rsort(suggs, key=lambda x: x[0].score) # by total new frequency... bestsuggs = rsort(bestsuggs, key=lambda x: x[0].ngd.newfreq) # then by improvement pct. for infinite improvements this results in # the most frequent recommendation coming to the top bestsuggs = rsort(bestsuggs, key=lambda x: x[0].improve_pct()) # finally, allow frequency to overcome small differences in score, but only # for scores that are within 1 to begin with. # if we account for frequency too much the common language idioms always crush # valid but less common phrases; if we don't account for frequency at all we often # recommend very similar but uncommon and weird phrases. this attempts to strike a balance. 
""" bestsuggs.sort(lambda x,y: x[0].score - y[0].score if abs(x[0].score - y[0].score) > 1 \ else \ (y[0].score + int(log(y[0].ngd.newfreq))) - \ (x[0].score + int(log(x[0].ngd.newfreq)))) """ for bs in bestsuggs: for bss in bs: logger.debug('bestsugg %6.2f %2u %2u %7u %6.0f%% %s' % \ (bss.score, bss.ediff, bss.ngd.diff.damlev, bss.ngd.newfreq, bss.improve_pct(), ' '.join(bss.ngd.newtoks()))) for bs in bestsuggs: logger.debug('> bs=%s' % (bs,)) yield bs # TODO: now the trick is to a) associate these together based on target_ngram # to make them persist along with the document # and to recalculate them as necessary when a change is applied to the document that # affects anything they overlap def correct(self, txt): """ given a string, identify the least-common n-gram not present in 'skip' and return a list of suggested replacements """ d = Doc(txt, self.w) changes = list(self.suggest(d, 1)) for ch in changes: logger.debug('ch=%s' % (ch,)) change = [ch[0].ngd] logger.debug('change=%s' % (change,)) d.applyChanges(change) logger.debug('change=%s after applyChanges d=%s' % (change, d)) d = Doc(d, self.w) break # FIXME: loops forever changes = list(self.suggest(d, 1)) res = str(d).decode('utf8') logger.debug('correct res=%s %s' % (type(res),res)) return res