def tokens(self):
    # FIXME: If we decide to drop tagged_tokens, switch to saving
    # just the tokens
    if self.tagged_tokens:
        retlist = [t[0] for t in self.tagged_tokens]
    else:
        retlist = word_tokenize(self.text)
    if self.hidden_text:
        retlist.extend(word_tokenize(self.hidden_text))
    return retlist
def get_tweet(self, msger=None, query_string=None, followed=False):
    self.__lock()
    if msger and msger not in self.conversation_contexts:
        self.conversation_contexts[msger] = ConversationContext(msger)
    max_len = config.getint("brain", "tweet_len")
    if query_string and msger:
        # XXX: nltk.pos_tag doesn't do so well if the first word in a question
        # is capitalized. Should we add an option to the normalizer for this?
        query_string = word_detokenize(
            self.conversation_contexts[msger].normalizer.normalize_tokens(
                word_tokenize(query_string)))
        query_string = PronounInverter.invert_all(query_string)
        print "Normalized Inverted Query: " + query_string
        query_text = SearchableText(query_string, strip=True)
        curr_vect = self.pending_tweets.score_query(query_text)
        if followed:
            qvect = curr_vect
        else:
            if self.last_vect is not None:
                self.conversation_contexts[msger].prime_memory(self.last_vect)
            qvect = self.conversation_contexts[msger].decay_query(curr_vect,
                                                                  query_text)
        # Leave room for the "@msger " reply prefix.
        max_len -= len("@" + msger + " ")
        (score, last_vect, ret) = self.pending_tweets.vector_query(
            qvect, exclude=self.remove_tweets, max_len=max_len)
        if followed:
            min_score = config.getfloat("query", "min_follow_reply_score")
        else:
            min_score = config.getfloat("query", "min_msg_reply_score")
        if score >= min_score:
            self.last_vect = last_vect
        else:
            print "Minimum score of " + str(min_score) + " not met: " + str(score)
            print str(ret.tagged_tokens)
            print "Not responding with: " + ret.text
            # Release the lock before the early exit.
            self.__unlock()
            return None
        if followed:
            # If this was a followed tweet, we should now record that it made
            # us say something.
            self.conversation_contexts[msger].decay_query(curr_vect, query_text)
        # Remember the last thing we said.
        self.conversation_contexts[msger].remember_query(self.last_vect)
    else:
        # query_string is usually None here; it is only set when there is no
        # msger to address.
        if query_string:
            query_text = SearchableText(query_string, strip=True)
        else:
            query_text = None
        (score, self.last_vect, ret) = self.pending_tweets.text_query(
            query_text, exclude=self.remove_tweets, max_len=max_len)
    self.remove_tweets.append(ret)
    tokens = ret.tokens()
    self.already_tweeted.append(set(tokens))
    self.__unlock()
    print str(ret.tagged_tokens)
    if msger:
        return "@" + msger + " " + ret.text
    else:
        return ret.text
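
# Usage sketch (illustrative, not part of the original source): get_tweet()
# either returns a ready-to-post string or None when nothing in
# pending_tweets scores above the configured minimum. "brain" and "api" are
# placeholder names for the enclosing class instance and a twitter client:
#
#   reply = brain.get_tweet(msger="alice", query_string="How are you?")
#   if reply:
#       api.update_status(reply)    # already prefixed with "@alice "
#   unprompted = brain.get_tweet()  # no msger: plain tweet, full tweet_len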
def __init__(
    self,
    text,
    tokens=None,
    tagged_tokens=None,
    strip=False,
    hidden_text="",
    generalize_terms=config.getboolean("query", "generalize_terms"),
):
    hidden_text = hidden_text.rstrip()
    if hidden_text:
        # End the hidden text with punctuation so it parses as its own sentence.
        if not curses.ascii.ispunct(hidden_text[-1]):
            hidden_text += ". "
        else:
            hidden_text += " "
    self.hidden_text = hidden_text
    if text and not curses.ascii.ispunct(text[-1]):
        text += "."
    self.tagged_tokens = tagged_tokens
    if not tokens:
        tokens = word_tokenize(text)
    self.text = text
    if hidden_text:
        tokens.extend(word_tokenize(hidden_text))  # Include hidden text in search tokens
    pos_tags = nltk.pos_tag(tokens)
    # Keep the tagged tokens and the stemmed search tokens aligned by index so
    # each token's POS tag can be looked up below, even when stripping.
    if strip:
        search_tags = QueryStripper.strip_tagged_query(pos_tags)
    else:
        search_tags = pos_tags
    search_tokens = [porter.stem(t[0]).lower() for t in search_tags]
    self.word_info = {}
    self.total_words = 0
    if generalize_terms:
        # Add senses, antonyms, and hypernyms to this list with
        # http://nodebox.net/code/index.php/Linguistics
        # Also add normalized versions with en.spelling() first
        # FIXME: This is biasing results. Words with lots of hyponyms are being
        # favored by TF-IDF. We need word sense disambiguation to prune this
        # down.
        # http://groups.google.com/group/nltk-users/browse_thread/thread/ad191241e5d9ee78
        for v in xrange(len(search_tokens)):
            sv = search_tokens[v]
            add_terms = set([sv])  # en.spelling.correct(v)
            tag = POSTrim.trim(search_tags[v][1])
            mod = None
            if tag == "NN":
                mod = en.noun
            elif tag == "JJ":
                mod = en.adjective
            elif tag == "VB":
                mod = en.verb
            elif tag == "RB":
                mod = en.adverb
            else:
                mod = en.wordnet
            if mod:
                # add_terms.update(en.list.flatten(mod.senses(sv)))
                add_terms.update(en.list.flatten(mod.antonym(sv)))
                add_terms.update(en.list.flatten(mod.hypernym(sv)))
                add_terms.update(en.list.flatten(mod.hyponym(sv)))
            for t in add_terms:
                if t not in self.word_info:
                    self.word_info[t] = TextWordInfo()
                if tag not in self.word_info[t].pos_counts:
                    self.word_info[t].pos_counts[tag] = 0
                self.word_info[t].count += 1
                self.word_info[t].pos_counts[tag] += 1
                self.total_words += 1
    else:
        for v in xrange(len(search_tokens)):
            t = search_tokens[v]
            # The tag must be recomputed per token on this path too.
            tag = POSTrim.trim(search_tags[v][1])
            if t not in self.word_info:
                self.word_info[t] = TextWordInfo()
            if tag not in self.word_info[t].pos_counts:
                self.word_info[t].pos_counts[tag] = 0
            self.word_info[t].count += 1
            self.word_info[t].pos_counts[tag] += 1
            self.total_words += 1
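
# Illustrative helper (an assumption, not in the original source): word_info
# maps each stemmed search term to a TextWordInfo carrying .count and
# .pos_counts, and total_words is the running tally over all terms, so a
# term-frequency lookup for TF-IDF-style scoring against a SearchableText
# would look roughly like this (reusing the module-level porter stemmer):
def example_term_frequency(searchable, term):
    # Stem and lowercase the query term the same way search tokens are built.
    info = searchable.word_info.get(porter.stem(term).lower())
    if not info or not searchable.total_words:
        return 0.0
    return float(info.count) / searchable.total_words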
def __init__(self, directory):
    self.normalizer = TokenNormalizer()
    self.quote_engine_only = config.getboolean('soul', 'quote_engine_only')
    # FIXME: http://www.w3schools.com/HTML/html_entities.asp
    clean_ents = [("&lt;", "<"), ("&gt;", ">"), ("&amp;", "&")]
    tagged_tweets = []
    self.vocab = set([])
    for root, dirs, files in os.walk(directory):
        for f in files:
            # .jtwt: json-encoded twitter tweets, 1 per line
            # TODO: Add @msgs to this user as hidden text
            if f.endswith(".jtwt"):
                fl = open(root+"/"+f, "r")
                for jtweet in fl.readlines():
                    tweet = json.loads(jtweet)
                    txt = tweet['text'].encode('ascii', 'ignore')
                    if re.search("( |^)RT(:| )", txt, re.IGNORECASE):
                        continue
                    if txt.startswith('@'):
                        txt = re.sub('^@[\S]+ ', '', txt)
                    for e in clean_ents:
                        txt = re.sub(e[0], e[1], txt)
                    if self.quote_engine_only:
                        tagged_tweets.append(txt)
                    else:
                        tokens = self.normalizer.normalize_tokens(word_tokenize(txt))
                        self.vocab.update(tokens)
                        tagged_tweet = pos_tag(tokens,
                                config.getboolean("soul", "attempt_agfl"),
                                config.getboolean("soul", "reject_agfl_failures"),
                                config.getboolean("soul", "agfl_nltk_fallback"))
                        if tagged_tweet:
                            tagged_tweets.append(tagged_tweet)
                            print "Loaded tweet #"+str(len(tagged_tweets)) #+"/"+str(len(files))
                fl.close()
            # .twt: plain-text tweets, 1 per line
            elif f.endswith(".twt"):
                fl = open(root+"/"+f, "r")
                for tweet in fl.readlines():
                    txt = tweet.encode('ascii', 'ignore')
                    if txt.startswith('RT'):
                        continue
                    if txt.startswith('@'):
                        txt = re.sub('^@[\S]+ ', '', txt)
                    for e in clean_ents:
                        txt = re.sub(e[0], e[1], txt)
                    if self.quote_engine_only:
                        tagged_tweets.append(txt)
                    else:
                        tokens = self.normalizer.normalize_tokens(word_tokenize(txt))
                        self.vocab.update(tokens)
                        tagged_tweet = pos_tag(tokens,
                                config.getboolean("soul", "attempt_agfl"),
                                config.getboolean("soul", "reject_agfl_failures"),
                                config.getboolean("soul", "agfl_nltk_fallback"))
                        if tagged_tweet:
                            tagged_tweets.append(tagged_tweet)
                            print "Loaded tweet #"+str(len(tagged_tweets)) #+"/"+str(len(files))
                fl.close()
            # .post: long-winded material (blog/mailinglist posts, essays, articles, etc)
            elif f.endswith(".post"):
                fl = open(root+"/"+f, "r")
                post = fl.read()
                fl.close()
                tweets = self.post_to_tweets(post)
                for txt in tweets:
                    #txt = txt.encode('ascii', 'ignore')
                    for e in clean_ents:
                        txt = re.sub(e[0], e[1], txt)
                    if self.quote_engine_only:
                        tagged_tweets.append(txt)
                    else:
                        tokens = self.normalizer.normalize_tokens(word_tokenize(txt))
                        if tokens:
                            self.vocab.update(tokens)
                            tagged_tweet = pos_tag(tokens,
                                    config.getboolean("soul", "attempt_agfl"),
                                    config.getboolean("soul", "reject_agfl_failures"),
                                    config.getboolean("soul", "agfl_nltk_fallback"))
                            if tagged_tweet:
                                tagged_tweets.append(tagged_tweet)
                                print "Loaded post-tweet #"+str(len(tagged_tweets))
            # .irclog: irc log files. irssi format.
            elif f.endswith(".irclog"):
                pass
            # .4sq: foursquare data
            elif f.endswith(".4sq"):
                pass
    self.tagged_tweets = tagged_tweets
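
# Usage sketch (illustrative; the enclosing class is not named in this
# excerpt, so "Soul" is a placeholder): point the loader at a directory tree
# of .jtwt/.twt/.post corpus files and it populates tagged_tweets and vocab
# for the brain to draw on.
#
#   soul = Soul("corpus/")
#   print "Loaded "+str(len(soul.tagged_tweets))+" tweets, "\
#         +str(len(soul.vocab))+" vocab tokens"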