示例#1
0
 def tokens(self):
     """Return the word tokens of this text as a list.

     When ``self.tagged_tokens`` is non-empty, the first element of each
     tagged entry is used; otherwise ``self.text`` is run through
     ``word_tokenize``. If ``self.hidden_text`` is non-empty, its tokens
     are appended to the end of the result.
     """
     # FIXME: If we decide to drop tagged_tokens, switch to saving
     # just the tokens
     tagged = self.tagged_tokens
     if not tagged:
         words = word_tokenize(self.text)
     else:
         words = [entry[0] for entry in tagged]

     extra = self.hidden_text
     if extra:
         words.extend(word_tokenize(extra))
     return words
示例#2
0
    def get_tweet(self, msger=None, query_string=None, followed=False):
        """Select the next pending tweet, optionally as a reply to ``msger``.

        When both ``query_string`` and ``msger`` are given, the query is
        normalized, pronoun-inverted and scored against the pending tweets
        through the per-user conversation context; the best match is only
        used if its score reaches the configured minimum
        (``min_follow_reply_score`` when ``followed`` is true, otherwise
        ``min_msg_reply_score``). In every other case the pending tweets are
        queried with ``text_query`` directly.

        Args:
            msger: Screen name of the user being answered, or None.
            query_string: Text being responded to, or None.
            followed: True when the query came from a followed tweet rather
                than a direct message to us.

        Returns:
            The chosen tweet text, prefixed with ``"@<msger> "`` when
            ``msger`` is given, or None when the best candidate scored below
            the configured minimum.
        """
        self.__lock()
        # Release the lock on every exit path: the low-score early return
        # below and any exception used to leave it held.
        try:
            if msger and msger not in self.conversation_contexts:
                self.conversation_contexts[msger] = ConversationContext(msger)
            max_len = config.getint("brain", "tweet_len")
            if query_string and msger:
                context = self.conversation_contexts[msger]
                # XXX: nltk.pos_tag doesn't do so well if the first word in a
                # question is capitalized. Should we add an option to the
                # normalizer for this?
                query_string = word_detokenize(
                    context.normalizer.normalize_tokens(word_tokenize(query_string))
                )
                query_string = PronounInverter.invert_all(query_string)

                print("Normalized Inverted Query: " + query_string)
                query_text = SearchableText(query_string, strip=True)
                curr_vect = self.pending_tweets.score_query(query_text)

                if followed:
                    qvect = curr_vect
                else:
                    if self.last_vect is not None:
                        context.prime_memory(self.last_vect)

                    qvect = context.decay_query(curr_vect, query_text)

                # Leave room for the "@<msger> " prefix added on return.
                max_len -= len("@" + msger + " ")
                (score, last_vect, ret) = self.pending_tweets.vector_query(
                    qvect, exclude=self.remove_tweets, max_len=max_len
                )
                if followed:
                    min_score = config.getfloat("query", "min_follow_reply_score")
                else:
                    min_score = config.getfloat("query", "min_msg_reply_score")

                if score >= min_score:
                    self.last_vect = last_vect
                else:
                    print("Minimum score of " + str(min_score) + " not met: " + str(score))
                    print(str(ret.tagged_tokens))
                    print("Not responding with: " + ret.text)
                    return None
                if followed:
                    # If this was a followed tweet, we should now record that
                    # it made us say something.
                    context.decay_query(curr_vect, query_text)

                # Remember the last thing we said.
                context.remember_query(self.last_vect)
            else:
                # query should be None here
                if query_string:
                    query_text = SearchableText(query_string, strip=True)
                else:
                    query_text = None
                (score, self.last_vect, ret) = self.pending_tweets.text_query(
                    query_text, exclude=self.remove_tweets, max_len=max_len
                )
            # Mark the chosen tweet as used so later queries exclude it.
            self.remove_tweets.append(ret)
            tokens = ret.tokens()
            self.already_tweeted.append(set(tokens))
        finally:
            self.__unlock()
        print(str(ret.tagged_tokens))
        if msger:
            return "@" + msger + " " + ret.text
        else:
            return ret.text
示例#3
0
    def __init__(
        self,
        text,
        tokens=None,
        tagged_tokens=None,
        strip=False,
        hidden_text="",
        generalize_terms=config.getboolean("query", "generalize_terms"),
    ):
        """Index ``text`` (plus optional hidden text) for term-based search.

        The text and hidden text are tokenized, POS-tagged with
        ``nltk.pos_tag`` and stemmed; per-term counts are collected in
        ``self.word_info`` and the overall number of counted terms in
        ``self.total_words``.

        Args:
            text: The visible text. A trailing "." is appended when it does
                not already end in punctuation.
            tokens: Optional pre-computed tokens for ``text``. The list is
                copied, never modified in place.
            tagged_tokens: Optional tagged tokens, stored unchanged on
                ``self.tagged_tokens``.
            strip: When true, the tagged tokens are filtered through
                ``QueryStripper.strip_tagged_query`` before indexing.
            hidden_text: Extra text that is searched but kept separately in
                ``self.hidden_text``.
            generalize_terms: When true, each stem is expanded with its
                antonyms, hypernyms and hyponyms before counting.
        """
        if hidden_text:
            hidden_text = hidden_text.rstrip()
        # Re-check after rstrip(): whitespace-only hidden text is now empty
        # and must not be indexed with [-1].
        if hidden_text:
            if not curses.ascii.ispunct(hidden_text[-1]):
                hidden_text += ". "
            else:
                hidden_text += " "
        self.hidden_text = hidden_text

        # Guard against empty text, for which text[-1] would raise.
        if text and not curses.ascii.ispunct(text[-1]):
            text += "."

        self.tagged_tokens = tagged_tokens
        if tokens:
            # Copy so that extending with the hidden-text tokens below does
            # not modify the caller's list.
            tokens = list(tokens)
        else:
            tokens = word_tokenize(text)
        self.text = text

        if hidden_text:
            tokens.extend(word_tokenize(hidden_text))

        # Include hidden text in search tokens
        pos_tags = nltk.pos_tag(tokens)

        # Keep word and tag together so that every stem is counted under its
        # own tag even after stripping removed entries.
        # NOTE(review): assumes strip_tagged_query returns (word, tag) pairs
        # like its input -- confirm against QueryStripper.
        if strip:
            tagged = QueryStripper.strip_tagged_query(pos_tags)
        else:
            tagged = pos_tags

        self.word_info = {}
        self.total_words = 0

        if generalize_terms:
            # Trimmed POS tag -> module used to look up related words; any
            # other tag falls back to en.wordnet.
            related_by_tag = {
                "NN": en.noun,
                "JJ": en.adjective,
                "VB": en.verb,
                "RB": en.adverb,
            }

        for entry in tagged:
            stem = porter.stem(entry[0]).lower()
            tag = POSTrim.trim(entry[1])

            if generalize_terms:
                # Add senses, antonyms, and hypernyms to this list with
                # http://nodebox.net/code/index.php/Linguistics
                # Also add normalized versions with en.spelling() first

                # FIXME: This is biasing results. Words with lots of hyponyms
                # are being favored by TF-IDF. We need word sense
                # disambiguation to prune this down.
                # http://groups.google.com/group/nltk-users/browse_thread/thread/ad191241e5d9ee78
                mod = related_by_tag.get(tag, en.wordnet)
                terms = set([stem])
                # en.spelling.correct(v)
                # terms.update(en.list.flatten(mod.senses(stem)))
                terms.update(en.list.flatten(mod.antonym(stem)))
                terms.update(en.list.flatten(mod.hypernym(stem)))
                terms.update(en.list.flatten(mod.hyponym(stem)))
            else:
                terms = [stem]

            for term in terms:
                if term not in self.word_info:
                    self.word_info[term] = TextWordInfo()
                info = self.word_info[term]
                if tag not in info.pos_counts:
                    info.pos_counts[tag] = 0
                info.count += 1
                info.pos_counts[tag] += 1
                self.total_words += 1
示例#4
0
  def __init__(self, directory):
    """Load every corpus file found under ``directory``.

    Walks ``directory`` recursively and reads three kinds of files:

    * ``.jtwt``: JSON-encoded tweets, one per line (the ``text`` field is
      used). Lines matching the retweet pattern are skipped.
    * ``.twt``: plain-text tweets, one per line. Lines starting with "RT"
      are skipped.
    * ``.post``: long-form text, split into tweets by
      ``self.post_to_tweets``.

    ``.irclog`` and ``.4sq`` files are recognised but not loaded yet.

    When the ``soul.quote_engine_only`` option is set, the cleaned text of
    each tweet is stored as-is; otherwise each tweet is normalized,
    tokenized and POS-tagged, and its tokens are added to ``self.vocab``.
    The results end up in ``self.tagged_tweets``.

    Args:
      directory: Root directory to scan for corpus files.
    """
    self.normalizer = TokenNormalizer()
    self.quote_engine_only = config.getboolean('soul', 'quote_engine_only')
    self.vocab = set([])
    tagged_tweets = []

    # The tagger options do not change while loading, so read them once
    # instead of once per tweet. They are only needed when tagging.
    if self.quote_engine_only:
      tagger_args = ()
    else:
      tagger_args = (config.getboolean("soul","attempt_agfl"),
                     config.getboolean("soul","reject_agfl_failures"),
                     config.getboolean("soul","agfl_nltk_fallback"))

    for root, dirs, files in os.walk(directory):
      for f in files:
        path = os.path.join(root, f)
        # .jtwt: json-encoded twitter tweets, 1 per line
        # TODO: Add @msgs to this user as hidden text
        if f.endswith(".jtwt"):
          with open(path, "r") as fl:
            for jtweet in fl:
              # A blank (e.g. trailing) line is not valid JSON; skip it.
              if not jtweet.strip(): continue
              tweet = json.loads(jtweet)
              # NOTE(review): the encode() calls here rely on Python 2
              # string semantics -- revisit if this is ported to Python 3.
              txt = tweet['text'].encode('ascii', 'ignore')
              if re.search("( |^)RT(:| )", txt, re.IGNORECASE): continue
              txt = self._strip_reply_prefix(txt)
              self._add_text(txt, tagged_tweets, tagger_args)
              print("Loaded tweet #"+str(len(tagged_tweets)))
        # .twt: plain-text tweets, 1 per line
        elif f.endswith(".twt"):
          with open(path, "r") as fl:
            for tweet in fl:
              txt = tweet.encode('ascii', 'ignore')
              if txt.startswith('RT'): continue
              txt = self._strip_reply_prefix(txt)
              self._add_text(txt, tagged_tweets, tagger_args)
              print("Loaded tweet #"+str(len(tagged_tweets)))
        # .post: long-winded material (blog/mailinglist posts, essays, articles, etc)
        elif f.endswith(".post"):
          with open(path, "r") as fl:
            post = fl.read()
          for txt in self.post_to_tweets(post):
            self._add_text(txt, tagged_tweets, tagger_args)
            print("Loaded post-tweet #"+str(len(tagged_tweets)))
        # .irclog: irc log files. irssi format.
        elif f.endswith(".irclog"):
          pass
        # .4sq: foursquare data
        elif f.endswith(".4sq"):
          pass

    self.tagged_tweets = tagged_tweets

  def _strip_reply_prefix(self, txt):
    """Return ``txt`` without a leading "@user " reply prefix, if any."""
    # startswith() is safe on empty text, unlike indexing txt[0].
    if txt.startswith('@'):
      return re.sub(r'^@[\S]+ ', '', txt)
    return txt

  def _add_text(self, txt, tagged_tweets, tagger_args):
    """Clean one tweet text and append its stored form to ``tagged_tweets``.

    In quote-engine-only mode the cleaned text itself is appended.
    Otherwise the text is normalized and tokenized, the tokens are added to
    ``self.vocab``, and the POS-tagged result is appended when the tagger
    returns a non-empty value. ``tagger_args`` are the extra positional
    arguments passed to ``pos_tag`` after the tokens.
    """
    # FIXME: http://www.w3schools.com/HTML/html_entities.asp
    # "&amp;" is replaced last so "&amp;lt;" becomes "&lt;", not "<".
    for entity, char in (("&lt;", "<"), ("&gt;", ">"), ("&amp;", "&")):
      txt = txt.replace(entity, char)
    if self.quote_engine_only:
      tagged_tweets.append(txt)
      return
    tokens = self.normalizer.normalize_tokens(word_tokenize(txt))
    # Nothing to tag when normalization leaves no tokens.
    if not tokens:
      return
    self.vocab.update(tokens)
    tagged_tweet = pos_tag(tokens, *tagger_args)
    if tagged_tweet: tagged_tweets.append(tagged_tweet)