# Example 1
    def get_tweet(self, msger=None, query_string=None, followed=False):
        self.__lock()
        if msger and msger not in self.conversation_contexts:
            self.conversation_contexts[msger] = ConversationContext(msger)
        max_len = config.getint("brain", "tweet_len")
        if query_string and msger:
            # XXX: nltk.pos_tag doesn't do so well if the first word in a question
            # is capitalized. Should we add an option to the normalizer for this?
            query_string = word_detokenize(
                self.conversation_contexts[msger].normalizer.normalize_tokens(word_tokenize(query_string))
            )
            query_string = PronounInverter.invert_all(query_string)

            print "Normalized Inverted Query: " + query_string
            query_text = SearchableText(query_string, strip=True)
            curr_vect = self.pending_tweets.score_query(query_text)

            if followed:
                qvect = curr_vect
            else:
                if self.last_vect != None:
                    self.conversation_contexts[msger].prime_memory(self.last_vect)

                qvect = self.conversation_contexts[msger].decay_query(curr_vect, query_text)

            max_len -= len("@" + msger + " ")
            (score, last_vect, ret) = self.pending_tweets.vector_query(
                qvect, exclude=self.remove_tweets, max_len=max_len
            )
            if followed:
                min_score = config.getfloat("query", "min_follow_reply_score")
            else:
                min_score = config.getfloat("query", "min_msg_reply_score")

            if score >= min_score:
                self.last_vect = last_vect
            else:
                print "Minimum score of " + str(min_score) + " not met: " + str(score)
                print str(ret.tagged_tokens)
                print "Not responding with: " + ret.text
                return None
            if followed:
                # If this was a followed tweet, we should now record that it made
                # us say something.
                self.conversation_contexts[msger].decay_query(curr_vect, query_text)

            # Remember the last thing we said.
            self.conversation_contexts[msger].remember_query(self.last_vect)
        else:
            # query should be None here
            if query_string:
                query_text = SearchableText(query_string, strip=True)
            else:
                query_text = None
            (score, self.last_vect, ret) = self.pending_tweets.text_query(
                query_text, exclude=self.remove_tweets, max_len=max_len
            )
        self.remove_tweets.append(ret)
        tokens = ret.tokens()
        self.already_tweeted.append(set(tokens))
        self.__unlock()
        print str(ret.tagged_tokens)
        if msger:
            return "@" + msger + " " + ret.text
        else:
            return ret.text
# Example 2
  def __init__(self, directory):
    self.normalizer = TokenNormalizer()
    self.quote_engine_only = config.getboolean('soul', 'quote_engine_only')
    # FIXME: http://www.w3schools.com/HTML/html_entities.asp
    clean_ents = [("&lt;", "<"), ("&gt;", ">"), ("&amp;", "&")]
    tagged_tweets = []
    tweet_texts = []
    self.vocab = set([])
    for root, dirs, files in os.walk(directory):
      for f in files:
        # .jtwt: json-encoded twitter tweets, 1 per line
        # TODO: Add @msgs to this user as hidden text
        if f.endswith(".jtwt"):
          fl = open(root+"/"+f, "r")
          for jtweet in fl.readlines():
            tweet = json.loads(jtweet)
            txt = tweet['text'].encode('ascii', 'ignore')
            if re.search("( |^)RT(:| )", txt, re.IGNORECASE): continue
            if txt[0] == '@': txt = re.sub('^@[\S]+ ', '', txt)
            for e in clean_ents:
              txt = re.sub(e[0], e[1], txt)
            if self.quote_engine_only:
              tagged_tweets.append(txt)
            else:
              tokens = self.normalizer.normalize_tokens(word_tokenize(txt))
              if tokens:
                self.vocab.update(tokens)
                tagged_tweet = pos_tag(tokens,
                                 config.getboolean("soul","attempt_agfl"),
                                 config.getboolean("soul","reject_agfl_failures"),
                                 config.getboolean("soul","agfl_nltk_fallback"))
                if tagged_tweet:
                  tweet_texts.append(word_detokenize(tokens))
                  tagged_tweets.append(tagged_tweet)
            print "Loaded tweet #"+str(len(tagged_tweets)) #+"/"+str(len(files))
        # .twt: plain-text tweets, 1 per line
        elif f.endswith(".twt"):
          fl = open(root+"/"+f, "r")
          for tweet in fl.readlines():
            txt = tweet.encode('ascii', 'ignore')
            if txt.startswith('RT'): continue
            if txt[0] == '@': txt = re.sub('^@[\S]+ ', '', txt)
            for e in clean_ents:
              txt = re.sub(e[0], e[1], txt)
            if self.quote_engine_only:
              tagged_tweets.append(txt)
            else:
              tokens = self.normalizer.normalize_tokens(word_tokenize(txt))
              if tokens:
                self.vocab.update(tokens)
                tagged_tweet = pos_tag(tokens,
                              config.getboolean("soul","attempt_agfl"),
                              config.getboolean("soul","reject_agfl_failures"),
                              config.getboolean("soul","agfl_nltk_fallback"))
                if tagged_tweet:
                  tweet_texts.append(word_detokenize(tokens))
                  tagged_tweets.append(tagged_tweet)
            print "Loaded tweet #"+str(len(tagged_tweets)) #+"/"+str(len(files))
          pass
        # .post: long-winded material (blog/mailinglist posts, essays, articles, etc)
        elif f.endswith(".post"):
          fl = open(root+"/"+f, "r")
          post = fl.read()
          tweets = self.post_to_tweets(post)
          for txt in tweets:
            #txt = txt.encode('ascii', 'ignore')
            for e in clean_ents:
              txt = re.sub(e[0], e[1], txt)
            if self.quote_engine_only:
              tagged_tweets.append(txt)
            else:
              tokens = self.normalizer.normalize_tokens(word_tokenize(txt))
              if tokens:
                self.vocab.update(tokens)
                tagged_tweet = pos_tag(tokens,
                               config.getboolean("soul","attempt_agfl"),
                               config.getboolean("soul","reject_agfl_failures"),
                               config.getboolean("soul","agfl_nltk_fallback"))
                if tagged_tweet:
                  tweet_texts.append(word_detokenize(tokens))
                  tagged_tweets.append(tagged_tweet)
            print "Loaded post-tweet #"+str(len(tagged_tweets))
        # .irclog: irc log files. irssi format.
        elif f.endswith(".irclog"):
          pass
        # .4sq: foursquare data
        elif f.endswith(".4sq"):
          pass

    self.tagged_tweets = tagged_tweets

    num_clusters = config.getint('soul', 'tweet_topics')
    if num_clusters > 1:
      self.cluster_tweets(tweet_texts, num_clusters)
    else:
      self.cluster_rates = {}
      self.clustered_tweets = {}
      self.clustered_tweets[0] = tagged_tweets
      self.cluster_rates[0] = len(self.tagged_tweets)