def get_tweet(self, msger=None, query_string=None, followed=False):
    """Select a pending tweet, optionally as a reply to ``msger``.

    When both ``msger`` and ``query_string`` are given, the query is
    normalized with the conversation's normalizer, pronoun-inverted, and
    scored against ``self.pending_tweets``; the best match is only used if
    its score reaches the configured minimum.  Otherwise a plain text query
    (possibly with no query text at all) is run against the pending tweets.

    The chosen tweet is appended to ``self.remove_tweets`` and its token
    set to ``self.already_tweeted``.

    Args:
        msger: Name of the user being answered.  Used as the key into
            ``self.conversation_contexts`` and for the "@name " prefix.
        query_string: Text to answer, or None.
        followed: Selects the "min_follow_reply_score" threshold instead of
            "min_msg_reply_score", and skips priming/decaying the
            conversation memory before the lookup.

    Returns:
        The tweet text, prefixed with "@" + msger + " " when ``msger`` is
        set, or None when the best match scores below the threshold.
    """
    self.__lock()
    # try/finally guarantees the lock is released on the early "score too
    # low" return and when any helper raises.
    try:
        if msger and msger not in self.conversation_contexts:
            self.conversation_contexts[msger] = ConversationContext(msger)

        max_len = config.getint("brain", "tweet_len")

        if query_string and msger:
            context = self.conversation_contexts[msger]
            # XXX: nltk.pos_tag doesn't do so well if the first word in a
            # question is capitalized. Should we add an option to the
            # normalizer for this?
            query_string = word_detokenize(
                context.normalizer.normalize_tokens(word_tokenize(query_string))
            )
            query_string = PronounInverter.invert_all(query_string)
            print("Normalized Inverted Query: " + query_string)
            query_text = SearchableText(query_string, strip=True)
            curr_vect = self.pending_tweets.score_query(query_text)

            if followed:
                qvect = curr_vect
            else:
                if self.last_vect is not None:
                    context.prime_memory(self.last_vect)
                qvect = context.decay_query(curr_vect, query_text)

            # Leave room for the "@<msger> " prefix added on return.
            max_len -= len("@" + msger + " ")
            (score, last_vect, ret) = self.pending_tweets.vector_query(
                qvect, exclude=self.remove_tweets, max_len=max_len
            )

            if followed:
                min_score = config.getfloat("query", "min_follow_reply_score")
            else:
                min_score = config.getfloat("query", "min_msg_reply_score")

            if score >= min_score:
                self.last_vect = last_vect
            else:
                print("Minimum score of " + str(min_score) + " not met: " + str(score))
                print(str(ret.tagged_tokens))
                print("Not responding with: " + ret.text)
                return None

            if followed:
                # If this was a followed tweet, we should now record that it
                # made us say something.
                context.decay_query(curr_vect, query_text)
            # Remember the last thing we said.
            context.remember_query(self.last_vect)
        else:
            # query should be None here
            if query_string:
                query_text = SearchableText(query_string, strip=True)
            else:
                query_text = None
            (score, self.last_vect, ret) = self.pending_tweets.text_query(
                query_text, exclude=self.remove_tweets, max_len=max_len
            )

        # Record the pick so it is excluded from later queries.
        self.remove_tweets.append(ret)
        self.already_tweeted.append(set(ret.tokens()))
    finally:
        self.__unlock()

    print(str(ret.tagged_tokens))
    if msger:
        return "@" + msger + " " + ret.text
    return ret.text
def __init__(self, directory):
    """Build the corpus from every recognized file under ``directory``.

    The tree is walked with ``os.walk``.  Files are handled by extension:

    * ``.jtwt``  - json-encoded twitter tweets, 1 per line ('text' key used)
    * ``.twt``   - plain-text tweets, 1 per line
    * ``.post``  - long-winded material, split via ``self.post_to_tweets``
    * ``.irclog`` / ``.4sq`` - recognized but currently ignored

    Sets ``self.normalizer``, ``self.quote_engine_only``, ``self.vocab`` and
    ``self.tagged_tweets``.  When the configured "tweet_topics" is greater
    than 1, ``self.cluster_tweets`` is called with the detokenized texts;
    otherwise everything is placed in a single cluster ``0`` in
    ``self.clustered_tweets`` / ``self.cluster_rates``.

    Args:
        directory: Root directory to scan for corpus files.
    """
    self.normalizer = TokenNormalizer()
    self.quote_engine_only = config.getboolean('soul', 'quote_engine_only')
    # FIXME: http://www.w3schools.com/HTML/html_entities.asp
    # Order matters: "&amp;" is decoded last so that "&amp;lt;" ends up as
    # the literal text "&lt;" rather than "<".
    clean_ents = [("&lt;", "<"), ("&gt;", ">"), ("&amp;", "&")]
    tagged_tweets = []
    tweet_texts = []
    self.vocab = set()

    for root, dirs, files in os.walk(directory):
        for f in files:
            path = os.path.join(root, f)
            # .jtwt: json-encoded twitter tweets, 1 per line
            # TODO: Add @msgs to this user as hidden text
            if f.endswith(".jtwt"):
                with open(path, "r") as fl:
                    for jtweet in fl:
                        if not jtweet.strip():
                            # Blank lines are not valid JSON; skip them.
                            continue
                        tweet = json.loads(jtweet)
                        txt = tweet['text'].encode('ascii', 'ignore')
                        if re.search("( |^)RT(:| )", txt, re.IGNORECASE):
                            continue
                        # The anchored pattern only fires on a leading
                        # "@name ", and is safe on empty text.
                        txt = re.sub(r'^@\S+ ', '', txt)
                        self._ingest_text(txt, clean_ents, tagged_tweets,
                                          tweet_texts)
                        print("Loaded tweet #" + str(len(tagged_tweets)))
            # .twt: plain-text tweets, 1 per line
            elif f.endswith(".twt"):
                with open(path, "r") as fl:
                    for tweet in fl:
                        # NOTE(review): under Python 2, .encode() on a byte
                        # string first decodes it as ASCII, so non-ASCII
                        # bytes would raise here - confirm the expected
                        # input encoding.
                        txt = tweet.encode('ascii', 'ignore')
                        if txt.startswith('RT'):
                            continue
                        txt = re.sub(r'^@\S+ ', '', txt)
                        self._ingest_text(txt, clean_ents, tagged_tweets,
                                          tweet_texts)
                        print("Loaded tweet #" + str(len(tagged_tweets)))
            # .post: long-winded material (blog/mailinglist posts, essays,
            # articles, etc)
            elif f.endswith(".post"):
                with open(path, "r") as fl:
                    post = fl.read()
                for txt in self.post_to_tweets(post):
                    self._ingest_text(txt, clean_ents, tagged_tweets,
                                      tweet_texts)
                    print("Loaded post-tweet #" + str(len(tagged_tweets)))
            # .irclog: irc log files. irssi format.
            elif f.endswith(".irclog"):
                pass
            # .4sq: foursquare data
            elif f.endswith(".4sq"):
                pass

    self.tagged_tweets = tagged_tweets
    num_clusters = config.getint('soul', 'tweet_topics')
    if num_clusters > 1:
        self.cluster_tweets(tweet_texts, num_clusters)
    else:
        self.clustered_tweets = {0: tagged_tweets}
        self.cluster_rates = {0: len(self.tagged_tweets)}


def _ingest_text(self, txt, clean_ents, tagged_tweets, tweet_texts):
    """Decode entities in ``txt`` and append it to the corpus accumulators.

    In quote-engine-only mode the cleaned text itself is appended to
    ``tagged_tweets``.  Otherwise the text is tokenized and normalized, its
    tokens are added to ``self.vocab``, and - if tagging yields a result -
    the tagged form is appended to ``tagged_tweets`` and the detokenized
    text to ``tweet_texts``.  Both lists are modified in place.
    """
    for entity, char in clean_ents:
        txt = txt.replace(entity, char)

    if self.quote_engine_only:
        tagged_tweets.append(txt)
        return

    tokens = self.normalizer.normalize_tokens(word_tokenize(txt))
    if not tokens:
        return

    self.vocab.update(tokens)
    tagged_tweet = pos_tag(tokens,
                           config.getboolean("soul", "attempt_agfl"),
                           config.getboolean("soul", "reject_agfl_failures"),
                           config.getboolean("soul", "agfl_nltk_fallback"))
    if tagged_tweet:
        tweet_texts.append(word_detokenize(tokens))
        tagged_tweets.append(tagged_tweet)