Example #1
	def createWordsObjects(self, dataDict, language, translateTo):
		wordObjects = []
		for k in dataDict.keys():
			if int(k) < 7:
				wordObjects.append(Words(dataDict[k][language], dataDict[k][translateTo]))
			elif int(k) < 12:
				wordObjects.append(Words(dataDict[k][language], dataDict[k][translateTo], 1))
			else:
				wordObjects.append(Words(dataDict[k][language], dataDict[k][translateTo], 2))
		return wordObjects
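A minimal usage sketch for the tiering logic above. The dictionary shape, the meaning of the key thresholds, and the Words stub below are assumptions made purely for illustration; the real Words class (and whatever its third constructor argument represents) lives elsewhere in the project.

# Illustrative only: a hypothetical data dict and a stand-in Words class.
class Words:
    def __init__(self, word, translation, tier=0):
        # 'tier' is a guess at what the third positional argument means
        self.word, self.translation, self.tier = word, translation, tier

data = {
    "0":  {"en": "cat",  "de": "Katze"},
    "7":  {"en": "dog",  "de": "Hund"},
    "12": {"en": "bird", "de": "Vogel"},
}

# Reproduces the branch structure of createWordsObjects for language="en", translateTo="de"
word_objects = []
for k, entry in data.items():
    tier = 0 if int(k) < 7 else (1 if int(k) < 12 else 2)
    word_objects.append(Words(entry["en"], entry["de"], tier))

for w in word_objects:
    print(w.word, "->", w.translation, "tier", w.tier)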
Example #2
 def __init__(self):
     emotion_list = ["Angry", "Fear", "Happy", "Sad", "Surprise"]
     self.feeling = random.choice(emotion_list)
     emotion_list.remove(self.feeling)
     self.opposite = random.choice(emotion_list)
     self.read_rate = random.randint(1, 10)  # number of sentences to read
     #self.current_power = random.randint(0,30) # starting power?
     self.sentences = []
     self.words = Words(self.feeling, self.opposite)
     self.markov_blob = None
Example #3
File: chick.py Project: rflynn/spill-chick
 def __init__(self):
     # initialize all "global" data
     logger.debug('loading...')
     logger.debug('  corpus...')
     # FIXME: using absolute paths is the easiest way to make us work from cmdline and invoked
     # in a web app. perhaps we could set up softlinks in /var/ to make this slightly more respectable.
     self.g = GramsBin(
         '/home/pizza/proj/spill-chick/data/corpus/google-ngrams/word.bin',
         '/home/pizza/proj/spill-chick/data/corpus/google-ngrams/ngram3.bin'
     )
     self.w = Words(NGram3BinWordCounter(self.g.ng))
     logger.debug('  phon')
     self.p = Phon(self.w, self.g)
     logger.debug('done.')
     # sanity-check junk
     """
Example #4
class Ghost:
    def __init__(self):
        emotion_list = ["Angry", "Fear", "Happy", "Sad", "Surprise"]
        self.feeling = random.choice(emotion_list)
        emotion_list.remove(self.feeling)
        self.opposite = random.choice(emotion_list)
        self.read_rate = random.randint(1, 10)  # number of sentences to read
        #self.current_power = random.randint(0,30) # starting power?
        self.sentences = []
        self.words = Words(self.feeling, self.opposite)
        self.markov_blob = None

    # def get_power_words(self):
    #     new_set = []
    #     for word in self.words:
    #         if word.type == "Power":
    #             new_set.append(word)
    #     return new_set

    # def get_opposite_words(self):
    #     return np.select([self.words.type == "Obstacle"],self.words)

    # def get_other_words(self):
    #     return np.select([self.words.type == "Null"], self.words)

    # def get_power(self):
    #     return np.sum(self.get_power_words().power)

    def read(self, story, timestep):
        # Get the story text
        #f = open(story, "r")
        #text_blob_object = TextBlob(f.read())
        #f.close()

        # Get the current state of the story and read from where the ghost is in the story, at its read rate
        start_sentence = timestep * self.read_rate

        # Check that the start sentence isn't past the end of the story;
        # if it is, reading is done
        if (len(story.sentences) < start_sentence):
            return str(story), 'DONE'

        new_story = ""
        for i in range(start_sentence):
            new_story += str(story.sentences[i]) + " "
        # otherwise start reading until either the story is over or you've read your read rate
        # Get chunk starting at start_word and read
        for i in range(start_sentence, start_sentence + self.read_rate):
            if (i < len(story.sentences)):
                #print("------------")
                #print("Original Sentence: " + str(story.sentences[i]))
                new_sentence = self.parse_sentence(story.sentences[i])
                story.sentences[i] = new_sentence
                new_story += str(new_sentence) + " "
                #if(str(new_sentence) != str(story.sentences[i])):
                #print("------------")
                #print("String mismatch", flush=True)
                #print("New Sentence: " + str(story.sentences[i]), flush=True)

        if (start_sentence + self.read_rate < len(story.sentences)):
            for i in range(start_sentence + self.read_rate,
                           len(story.sentences)):
                new_story += str(story.sentences[i]) + " "

        #self.write(story, story.text)

        # If still reading, return READING, otherwise, return DONE
        if (start_sentence + self.read_rate >= len(story.sentences)):
            return new_story, 'DONE'
        else:
            return new_story, 'READING'

    def parse_sentence(self, sentence):
        emotion_analysis = te.get_emotion(str(sentence))
        if emotion_analysis[self.opposite] > emotion_analysis[self.feeling]:
            # Rewrite it
            sentence = self.rewrite(sentence)
        elif emotion_analysis[self.feeling] > emotion_analysis[self.opposite]:
            # Take its words
            #self.words_markov = markovify.combine([self.words_markov,markovify.Text(str(sentence))],[1,1])
            self.sentences.append(str(sentence))
            self.markov_blob = markovify.Text(self.sentences)
            for word in sentence.words:
                #self.words.append(Word(word,te.get_emotion(word),self.feeling,self.opposite))
                self.words.add_word(word)

        #for word in word_bag:
        #if word == self.SCREAM:
        # power surge?

        return sentence

    def rewrite(self, sentence):
        if self.markov_blob is None:
            return sentence
        emotion_analysis = te.get_emotion(str(sentence))
        #b.sentence =
        #used_words = []
        #words_to_use = self.get_power_words().copy()
        #no_power = False

        i = 0
        #while emotion_analysis[self.opposite] >= emotion_analysis[self.feeling] and not no_power:
        while emotion_analysis[self.opposite] >= emotion_analysis[
                self.feeling] and i < 5:
            used_words = []
            #words_to_use = self.get_power_words().copy()
            # Change the sentence

            test_sentence = self.markov_blob.make_short_sentence(
                len(str(sentence)))
            if test_sentence is not None:
                emotion_analysis = te.get_emotion(test_sentence)
                if (emotion_analysis[self.opposite] <
                        emotion_analysis[self.feeling]):
                    new_blob = TextBlob(test_sentence)
                    if (self.words.has_words(new_blob.sentences[0].words)):
                        sentence = new_blob.sentences[0]
                        self.words.remove_words(new_blob.sentences[0].words)
                        i = 5
            else:
                i = 5

            i += 1
            # Remove negative words? Find markovify fits for good words?
            # Tell it to just change words until it gets a good score set?
            # for word in sentence.words:
            #     pos, neg = self.parse_word(word)
            #     if(neg > pos):
            #         if len(words_to_use) > 0:
            #             new = random.choice(words_to_use)
            #             words_to_use.remove(new)
            #             used_words.append(new)
            #             word = new
            #     if len(words_to_use) < 1:
            #         no_power = True
            #         used_words = []
            #         break
            #emotion_analysis = te.get_emotion(str(sentence))

        #if(i < 5 or (emotion_analysis[self.opposite] < emotion_analysis[self.feeling])):

        # Remove all the words we used
        #for word in used_words:
        #    self.words.remove(word)

        return sentence

    def parse_word(self, word):
        analyse = te.get_emotion(word)
        positive = analyse[self.feeling]
        negative = analyse[self.opposite]
        return positive, negative

    def write(self, story, text):
        f = open(story, "w")
        f.write(text)
        f.close()
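A hypothetical driver loop for the class above. It assumes the module-level dependencies the class relies on are importable (random, markovify, text2emotion as te, textblob, and the project's own Words class); the sample text and the loop itself are invented for illustration.

# Illustrative sketch: let a Ghost read a short TextBlob story until it reports DONE.
from textblob import TextBlob

story = TextBlob("It was a dark night. The wind howled outside. She smiled anyway.")
ghost = Ghost()

timestep, state = 0, 'READING'
while state == 'READING':
    text, state = ghost.read(story, timestep)  # reads read_rate sentences per timestep
    timestep += 1

print(text)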
Example #5
File: phon.py Project: rflynn/spill-chick
            t = ' '.join(snd[:j])
            words = self.phon.get(t)
            if words:
                for s in self.soundsToWords(snd[j:]):
                    yield [words] + s


if __name__ == '__main__':

    def words(str):
        return re.findall('[a-z\']+', str.lower())

    def pron(wl, wd):
        print(' '.join(
            [str(wd[w][0]) if w in wd else '<%s>' % (w, ) for w in wl]))

    P = Phon(Words())
    for a in sys.argv[1:]:
        pron(words(a), P.W)

    print(P.word['there'])
    print(P.phon[P.word['there'][0]])

    P.phraseSound(['making', 'mistake'])
    P.phraseSound(['may', 'king', 'mist', 'ache'])
    x = P.phraseSound(['making', 'miss', 'steak'])
    from itertools import product
    for f in P.soundsToWords(x):
        print(f)
        #print(list(product(*f)))
Example #6
        if x == y:
            return 0
        damlev = ngd.diff.damlev
        sx, sy = p.phraseSound([x]), p.phraseSound([y])
        if sx == sy and sx:
            # sound the same, e.g. there/their. consider these equal.
            return damlev
        # otherwise, calculate phonic/edit difference
        return max(damlev,
                   min(NGramDiffScore.overlap(sx, sy), abs(len(x) - len(y))))


if __name__ == '__main__':
    import sys
    sys.path.append('..')
    from grambin import GramsBin
    from word import Words, NGram3BinWordCounter
    from phon import Phon
    import logging

    logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)
    logging.debug('loading...')
    g = GramsBin(
        '/home/pizza/proj/spill-chick/data/corpus/google-ngrams/word.bin',
        '/home/pizza/proj/spill-chick/data/corpus/google-ngrams/ngram3.bin')
    w = Words(NGram3BinWordCounter(g.ng))
    p = Phon(w, g)
    logging.debug('loaded.')

    pass
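A toy restatement of the distance rule in the fragment above, with the Damerau-Levenshtein distance and the phonic overlap score replaced by plain integers. It is only meant to show how the max/min combination caps the phonic component at the length difference; the real values come from ngd.diff.damlev and NGramDiffScore.overlap.

def combined_diff(x, y, damlev, overlap, same_sound):
    # Mirrors the branches above with stubbed-in inputs (illustrative only)
    if x == y:
        return 0
    if same_sound:
        # spelled differently but pronounced identically, e.g. there/their
        return damlev
    return max(damlev, min(overlap, abs(len(x) - len(y))))

print(combined_diff("there", "their", 2, 4, True))   # 2: homophones keep the edit distance
print(combined_diff("making", "may", 2, 5, False))   # max(2, min(5, 3)) == 3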
Example #7
File: chick.py Project: rflynn/spill-chick
class Chick:
	def __init__(self):
		# initialize all "global" data
		logger.debug('loading...')
		logger.debug('  corpus...')
		# FIXME: using absolute paths is the easiest way to make us work from cmdline and invoked
		# in a web app. perhaps we could set up softlinks in /var/ to make this slightly more respectable.
		self.g = GramsBin(
			'/home/pizza/proj/spill-chick/data/corpus/google-ngrams/word.bin',
			'/home/pizza/proj/spill-chick/data/corpus/google-ngrams/ngram3.bin')
		self.w = Words(NGram3BinWordCounter(self.g.ng))
		logger.debug('  phon')
		self.p = Phon(self.w, self.g)
		logger.debug('done.')
		# sanity-check junk
		"""
		logger.debug('w.correct(naieve)=%s' % self.w.correct(u'naieve'))
		logger.debug('w.correct(refridgerator)=%s' % self.w.correct(u'refridgerator'))
		logger.debug('g.freqs(refridgerator)=%s' % self.g.freqs(u'refridgerator'))
		logger.debug('g.freqs(refrigerator)=%s' % self.g.freqs(u'refrigerator'))
		logger.debug('g.freq((didn))=%s' % self.g.freq((u'didn',)))
		logger.debug('g.freq((a,mistake))=%s' % self.g.freq((u'a',u'mistake')))
		logger.debug('g.freq((undoubtedly,be,changed))=%s' % self.g.freq((u'undoubtedly',u'be',u'changed')))
		logger.debug('g.freq((undoubtedly,be))=%s' % self.g.freq((u'undoubtedly',u'be')))
		logger.debug('g.freq((be,changed))=%s' % self.g.freq((u'be',u'changed')))
		logger.debug('g.freq((it,it,did))=%s' % self.g.freq((u'it',u'it',u'did')))
		logger.debug('g.freq((it,it))=%s' % self.g.freq((u'it',u'it')))
		logger.debug('g.freq((it,did))=%s' % self.g.freq((u'it',u'did')))
		logger.debug('g.freq((hello,there,sir))=%s' % self.g.freq((u'hello',u'there',u'sir')))
		logger.debug('g.freq((hello,there))=%s' % self.g.freq((u'hello',u'there')))
		logger.debug('g.freq((hello,there,,))=%s' % self.g.freq((u'hello',u'there',u',')))
		logger.debug('g.freq((they,\',re))=%s' % self.g.freq((u'they',u"'",u're')))
		"""

	# FIXME: soundsToWords is expensive and should only be run as a last resort
	def phonGuess(self, toks, minfreq):
		"""
		given a list of tokens search for a list of words with similar pronunciation
		having g.freq(x) > minfreq
		"""
		# create a phonetic signature of the ngram
		phonsig = self.p.phraseSound(toks)
		logger.debug('phonsig=%s' % phonsig)
		phonwords = list(self.p.soundsToWords(phonsig))
		logger.debug('phonwords=%s' % (phonwords,))
		if phonwords == [[]]:
			phonpop = []
		else:
			# remove any words that do not meet the minimum frequency;
			# they cannot possibly be part of the answer
			phonwords2 = [[[w for w in p if self.g.freq(tuple(w)) > minfreq]
						for p in pw]
							for pw in phonwords]
			logger.debug('phonwords2 lengths=%s product=%u' % \
				(' '.join([str(len(p)) for p in phonwords2[0]]),
				 reduce(lambda x,y:x*y, [len(p) for p in phonwords2[0]])))
			if not all(phonwords2):
				return []
			#logger.debug('phonwords2=(%u)%s...' % (len(phonwords2), phonwords2[:10],))
			# remove any signatures that contain completely empty items after previous
			phonwords3 = phonwords2
			#logger.debug('phonwords3=(%u)%s...' % (len(phonwords3), phonwords3))
			# FIXME: product() function is handy in this case but is potentially hazardous.
			# we should force a limit to the length of any list passed to it to ensure
			# the avoidance of any pathological, memory-filling, swap-inducing behavior
			phonwords4 = list(flatten([list(product(*pw)) for pw in phonwords3]))
			logger.debug('phonwords4=(%u)%s...' % (len(phonwords4), phonwords4[:20]))
			# look up ngram popularity, toss anything not more popular than original and sort
			phonwordsx = [tuple(flatten(p)) for p in phonwords4]

			phonpop = rsort1([(pw, self.g.freq(pw, min)) for pw in phonwordsx])
			#logger.debug('phonpop=(%u)%s...' % (len(phonpop), phonpop[:10]))
			phonpop = list(takewhile(lambda x:x[1] > minfreq, phonpop))
			#logger.debug('phonpop=%s...' % (phonpop[:10],))
		if phonpop == []:
			return []
		best = phonpop[0][0]
		return [[x] for x in best]

	"""
	return a list of ngrampos permutations where each token has been replaced by a word with
	similar pronunciation, and g.freqs(word) > minfreq
	"""
	def permphon(self, ngrampos, minfreq):
		perms = []
		for i in range(len(ngrampos)):
			prefix = ngrampos[:i]
			suffix = ngrampos[i+1:]
			tokpos = ngrampos[i]
			tok = tokpos[0]
			sounds = self.p.word[tok]
			if not sounds:
				continue
			#logger.debug('tok=%s sounds=%s' % (tok, sounds))
			for sound in sounds:
				soundslikes = self.p.phon[sound]
				#logger.debug('tok=%s soundslikes=%s' % (tok, soundslikes))
				for soundslike in soundslikes:
					if len(soundslike) > 1:
						continue
					soundslike = soundslike[0]
					if soundslike == tok:
						continue
					#logger.debug('soundslike %s -> %s' % (tok, soundslike))
					if self.g.freqs(soundslike) <= minfreq:
						continue
					newtok = (soundslike,) + tokpos[1:]
					damlev = damerau_levenshtein(tok, soundslike)
					td = TokenDiff([tokpos], [newtok], damlev)
					perms.append(NGramDiff(prefix, td, suffix, self.g, soundalike=True))
		return perms

	@staticmethod
	def ngrampos_merge(x, y):
		return (x[0]+y[0], x[1], x[2], x[3])

	def permjoin(self, l, minfreq):
		"""
		given a list of strings, produce permutations by joining two tokens together
		example [a,b,c,d] -> [[ab,c,d],[a,bc,d],[a,b,cd]
		"""
		perms = []
		if len(l) > 1:
			for i in range(len(l)-1):
				joined = Chick.ngrampos_merge(l[i],l[i+1])
				if self.g.freqs(joined[0]) > minfreq:
					td = TokenDiff(l[i:i+2], [joined], 1)
					ngd = NGramDiff(l[:i], td, l[i+2:], self.g)
					perms.append(ngd)
		return perms

	@staticmethod
	def ngrampos_split_back(x, y):
		return (x[0]+y[0][:1], x[1], x[2], x[3]), (y[0][1:], y[1], y[2], y[3])

	@staticmethod
	def ngrampos_split_forward(x, y):
		return (x[0][:-1], x[1], x[2], x[3]), (x[0][-1:]+y[0], y[1], y[2], y[3])

	def intertoken_letterswap(self, l, target_freq):
		# generate permutations of token list with the beginning and ending letter of each
		# token swapped between adjacent tokens
		if len(l) < 2:
			return []
		perms = []
		for i in range(len(l)-1):
			if len(l[i][0]) > 1:
				x,y = Chick.ngrampos_split_forward(l[i], l[i+1])
				if self.g.freq((x[0],y[0])) >= target_freq:
					td = TokenDiff(l[i:i+2], [x,y], 0)
					ngd = NGramDiff(l[:i], td, l[i+2:], self.g)
					perms.append(ngd)
			if len(l[i+1][0]) > 1:
				x,y = Chick.ngrampos_split_back(l[i], l[i+1])
				if self.g.freq((x[0],y[0])) >= target_freq:
					td = TokenDiff(l[i:i+2], [x,y], 0)
					ngd = NGramDiff(l[:i], td, l[i+2:], self.g)
					perms.append(ngd)
		#print 'intertoken_letterswap=',perms
		return perms

	def do_suggest(self, target_ngram, target_freq, ctx, d, max_suggest=5):
		"""
		given an infrequent ngram from a document, attempt to calculate a more frequent one
		that is similar textually and/or phonetically but is more frequent
		"""

		target_ngram = list(target_ngram)
		part = []

		# permutations via token joining
		# expense: cheap, though rarely useful
		# TODO: smarter token joining; pre-calculate based on tokens
		part += self.permjoin(target_ngram, target_freq)
		#logger.debug('permjoin(%s)=%s' % (target_ngram, part,))

		part += self.intertoken_letterswap(target_ngram, target_freq)

		part += self.permphon(target_ngram, target_freq)

		part += self.g.ngram_like(target_ngram, target_freq)

		logger.debug('part after ngram_like=(%u)%s...' % (len(part), part[:5],))

		# calculate the closest, best ngram in part
		sim = sorted([NGramDiffScore(ngd, self.p) for ngd in part])
		for s in sim[:25]:
			logger.debug('sim %4.1f %2u %u %6u %6u %s' % \
				(s.score, s.ediff, s.sl, s.ngd.oldfreq, s.ngd.newfreq, ' '.join(s.ngd.newtoks())))

		best = list(takewhile(lambda s:s.score > 0, sim))[:max_suggest]
		for b in best:
			logger.debug('best %s' % (b,))
		return best

	def ngram_suggest(self, target_ngram, target_freq, d, max_suggest=1):
		"""
		we calculate ngram context and collect solutions for each context
		containing the target, then merge them into a cohesive, best suggestion.
			c d e
		    a b c d e f g
		given ngram (c,d,e), calculate context and solve:
		[S(a,b,c), S(b,c,d), S(c,d,e), S(d,e,f), S(e,f,g)]
		"""

		logger.debug('target_ngram=%s' % (target_ngram,))
		tlen = len(target_ngram)

		context = list(d.ngram_context(target_ngram, tlen))
		logger.debug('context=%s' % (context,))
		ctoks = [c[0] for c in context]
		clen = len(context)

		logger.debug('tlen=%d clen=%d' % (tlen, clen))
		context_ngrams = list2ngrams(context, tlen)
		logger.debug('context_ngrams=%s' % (context_ngrams,))

		# gather suggestions for each ngram overlapping target_ngram
		sugg = [(ng, self.do_suggest(ng, self.g.freq([x[0] for x in ng]), context_ngrams, d))
			for ng in [target_ngram]] #context_ngrams]

		for ng,su in sugg:
			for s in su:
				logger.debug('sugg %s' % (s,))

		"""
		previously we leaned heavily on ngram frequencies and the sums of them for
		evaluating suggestions in context.
		instead, we will focus specifically on making the smallest changes which have the
		largest improvements, and in trying to normalize a document, i.e.
		"filling in the gaps" of as many 0-freq ngrams as possible.
		"""

		# merge suggestions based on what they change
		realdiff = {}
		for ng,su in sugg:
			for s in su:
				rstr = ' '.join(s.ngd.newtoks())
				if rstr in realdiff:
					realdiff[rstr] += s
				else:
					realdiff[rstr] = s
				logger.debug('real %s %s' % (rstr, realdiff[rstr]))

		# sort the merged suggestions based on their combined score
		rdbest = sorted(realdiff.values(), key=lambda x:x.score, reverse=True)

		# finally, allow frequency to overcome small differences in score, but only
		# for scores that are within 1 to begin with.
		# if we account for frequency too much the common language idioms always crush
		# valid but less common phrases; if we don't account for frequency at all we often
		# recommend very similar but uncommon and weird phrases. this attempts to strike a balance.
		rdbest.sort(lambda x,y:
			y.score - x.score if abs(x.score - y.score) > 1	\
			else	(y.score + int(log(y.ngd.newfreq))) -	\
				(x.score + int(log(x.ngd.newfreq))))

		for ngds in rdbest:
			logger.debug('best %s' % (ngds,))

		return rdbest

	def suggest(self, txt, max_suggest=1, skip=[]):
		"""
		given a string, run suggest() and apply the first suggestion
		"""
		logger.debug('Chick.suggest(txt=%s max_suggest=%s, skip=%s)' % (txt, max_suggest, skip))

		d = Doc(txt, self.w)
		logger.debug('doc=%s' % d)

		"""
		locate uncommon n-gram sequences which may indicate grammatical errors
		see if we can determine better replacements for them given their context
		"""

		# order n-grams by unpopularity
		ngsize = min(3, d.totalTokens())
		logger.debug('ngsize=%s d.totalTokens()=%s' % (ngsize, d.totalTokens()))
		logger.debug('ngram(1) freq=%s' % list(d.ngramfreqctx(self.g,1)))

		# locate the least-common ngrams
		# TODO: in some cases an ngram is unpopular, but overlapping ngrams on either side
		# are relatively popular.
		# is this useful in differentiating between uncommon but valid phrases from invalid ones?
		"""
sugg       did the future 156
sugg            the future would 3162
sugg                future would undoubtedly 0
sugg                       would undoubtedly be 3111
sugg                             undoubtedly be changed 0
		"""

		least_common = sort1(d.ngramfreqctx(self.g, ngsize))
		logger.debug('least_common=%s' % least_common[:20])
		# remove any ngrams present in 'skip'
		least_common = list(dropwhile(lambda x: x[0] in skip, least_common))
		# filter ngrams containing numeric tokens or periods, they generate too many poor suggestions
		least_common = list(filter(
					lambda ng: not any(re.match('^(?:\d+|\.)$', n[0][0], re.U)
							for n in ng[0]),
					least_common))

		# FIXME: limit to reduce work
		least_common = least_common[:max(20, len(least_common)/2)]

		# gather all suggestions for all least_common ngrams
		suggestions = []
		for target_ngram,target_freq in least_common:
			suggs = self.ngram_suggest(target_ngram, target_freq, d, max_suggest)
			if suggs:
				suggestions.append(suggs)

		if not suggestions:
			"""
			"""
			ut = list(d.unknownToks())
			logger.debug('unknownToks=%s' % ut)
			utChanges = [(u, (self.w.correct(u[0]), u[1], u[2], u[3])) for u in ut]
			logger.debug('utChanges=%s' % utChanges)
			utChanges2 = list(filter(lambda x: x not in skip, utChanges))
			for old,new in utChanges2:
				td = TokenDiff([old], [new], damerau_levenshtein(old[0], new[0]))
				ngd = NGramDiff([], td, [], self.g)
				ngds = NGramDiffScore(ngd, None, 1)
				suggestions.append([ngds])

		logger.debug('------------')
		logger.debug('suggestions=%s' % (suggestions,))
		suggs = filter(lambda x:x and x[0].ngd.newfreq != x[0].ngd.oldfreq, suggestions)
		logger.debug('suggs=%s' % (suggs,))
		# sort suggestions by their score, highest first
		bestsuggs = rsort(suggs, key=lambda x: x[0].score)
		# by total new frequency...
		bestsuggs = rsort(bestsuggs, key=lambda x: x[0].ngd.newfreq)
		# then by improvement pct. for infinite improvements this results in
		# the most frequent recommendation coming to the top
		bestsuggs = rsort(bestsuggs, key=lambda x: x[0].improve_pct())

		# finally, allow frequency to overcome small differences in score, but only
		# for scores that are within 1 to begin with.
		# if we account for frequency too much the common language idioms always crush
		# valid but less common phrases; if we don't account for frequency at all we often
		# recommend very similar but uncommon and weird phrases. this attempts to strike a balance.
		"""
		bestsuggs.sort(lambda x,y:
			x[0].score - y[0].score if abs(x[0].score - y[0].score) > 1 \
			else \
				(y[0].score + int(log(y[0].ngd.newfreq))) - \
				(x[0].score + int(log(x[0].ngd.newfreq))))
		"""

		for bs in bestsuggs:
			for bss in bs:
				logger.debug('bestsugg %6.2f %2u %2u %7u %6.0f%% %s' % \
					(bss.score, bss.ediff, bss.ngd.diff.damlev,
					 bss.ngd.newfreq, bss.improve_pct(), ' '.join(bss.ngd.newtoks())))

		for bs in bestsuggs:
			logger.debug('> bs=%s' % (bs,))
			yield bs

		# TODO: now the trick is to a) associate these together based on target_ngram
		# to make them persist along with the document
		# and to recalculate them as necessary when a change is applied to the document that
		# affects anything they overlap

	def correct(self, txt):
		"""
		given a string, identify the least-common n-gram not present in 'skip'
		and return a list of suggested replacements
		"""
		d = Doc(txt, self.w)
		changes = list(self.suggest(d, 1))
		for ch in changes:
			logger.debug('ch=%s' % (ch,))
			change = [ch[0].ngd]
			logger.debug('change=%s' % (change,))
			d.applyChanges(change)
			logger.debug('change=%s after applyChanges d=%s' % (change, d))
			d = Doc(d, self.w)
			break # FIXME: loops forever
			changes = list(self.suggest(d, 1))
		res = str(d).decode('utf8')
		logger.debug('correct res=%s %s' % (type(res),res))
		return res
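A hypothetical invocation of the class above. It assumes the google-ngrams corpus binaries exist at the hard-coded paths and that the module's other imports (Doc, GramsBin, Phon, logger, rsort, and so on) are in place; the sample phrase and max_suggest value are invented for illustration. Note the class is Python 2-era code (cmp-style sort, str.decode), so it would need to run under Python 2 as written.

# Illustrative only: requires the corpus files referenced in __init__ to be present.
chick = Chick()

# suggest() yields groups of scored NGramDiffScore objects for the least-common ngrams.
for group in chick.suggest(u'the futur would undoubtedly be changed', max_suggest=3):
    for s in group:
        print('%s (score=%s)' % (' '.join(s.ngd.newtoks()), s.score))

# correct() applies the top suggestion and returns the corrected text.
print(chick.correct(u'the futur would undoubtedly be changed'))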