Example #1
# Snippet from a larger project: config, TokenNormalizer, word_tokenize and
# the AGFL-aware pos_tag are assumed to be defined elsewhere in the project.
import os
import re
import json

class CorpusSoul:
  def __init__(self, directory):
    self.normalizer = TokenNormalizer()
    self.quote_engine_only = config.getboolean('soul', 'quote_engine_only')
    # FIXME: http://www.w3schools.com/HTML/html_entities.asp
    clean_ents = [("&lt;", "<"), ("&gt;", ">"), ("&amp;", "&")]
    tagged_tweets = []
    self.vocab = set()
    for root, dirs, files in os.walk(directory):
      for f in files:
        # .jtwt: json-encoded twitter tweets, 1 per line
        # TODO: Add @msgs to this user as hidden text
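        # e.g. one input line (only the "text" field is used below):
        #   {"text": "I &lt;3 writing bots &amp; taggers", "id": 1234567890}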
        if f.endswith(".jtwt"):
          fl = open(os.path.join(root, f), "r")
          for jtweet in fl:
            tweet = json.loads(jtweet)
            txt = tweet['text'].encode('ascii', 'ignore')
            if re.search("( |^)RT(:| )", txt, re.IGNORECASE): continue
            if txt.startswith('@'): txt = re.sub(r'^@\S+ ', '', txt)
            for e in clean_ents:
              txt = re.sub(e[0], e[1], txt)
            if self.quote_engine_only:
              tagged_tweets.append(txt)
            else:
              tokens = self.normalizer.normalize_tokens(word_tokenize(txt))
              self.vocab.update(tokens)
              tagged_tweet = pos_tag(tokens,
                               config.getboolean("soul","attempt_agfl"),
                               config.getboolean("soul","reject_agfl_failures"),
                               config.getboolean("soul","agfl_nltk_fallback"))
              if tagged_tweet: tagged_tweets.append(tagged_tweet)
            print "Loaded tweet #"+str(len(tagged_tweets)) #+"/"+str(len(files))
        # .twt: plain-text tweets, 1 per line
        elif f.endswith(".twt"):
          fl = open(os.path.join(root, f), "r")
          for tweet in fl:
            # Decode before re-encoding: .encode() on a raw str implicitly
            # decodes it as ASCII and can raise UnicodeDecodeError.
            txt = tweet.decode('utf-8', 'ignore').encode('ascii', 'ignore')
            if txt.startswith('RT'): continue
            if txt.startswith('@'): txt = re.sub(r'^@\S+ ', '', txt)
            for e in clean_ents:
              txt = re.sub(e[0], e[1], txt)
            if self.quote_engine_only:
              tagged_tweets.append(txt)
            else:
              tokens = self.normalizer.normalize_tokens(word_tokenize(txt))
              self.vocab.update(tokens)
              tagged_tweet = pos_tag(tokens,
                               config.getboolean("soul","attempt_agfl"),
                               config.getboolean("soul","reject_agfl_failures"),
                               config.getboolean("soul","agfl_nltk_fallback"))
              if tagged_tweet: tagged_tweets.append(tagged_tweet)
            print "Loaded tweet #"+str(len(tagged_tweets)) #+"/"+str(len(files))
        # .post: long-winded material (blog/mailinglist posts, essays, articles, etc)
        elif f.endswith(".post"):
          fl = open(os.path.join(root, f), "r")
          post = fl.read()
          tweets = self.post_to_tweets(post)
          for txt in tweets:
            #txt = txt.encode('ascii', 'ignore')
            for e in clean_ents:
              txt = re.sub(e[0], e[1], txt)
            if self.quote_engine_only:
              tagged_tweets.append(txt)
            else:
              tokens = self.normalizer.normalize_tokens(word_tokenize(txt))
              if tokens:
                self.vocab.update(tokens)
                tagged_tweet = pos_tag(tokens,
                               config.getboolean("soul","attempt_agfl"),
                               config.getboolean("soul","reject_agfl_failures"),
                               config.getboolean("soul","agfl_nltk_fallback"))
                if tagged_tweet: tagged_tweets.append(tagged_tweet)
            print "Loaded post-tweet #"+str(len(tagged_tweets))
        # .irclog: irc log files. irssi format.
        elif f.endswith(".irclog"):
          pass
        # .4sq: foursquare data
        elif f.endswith(".4sq"):
          pass

    self.tagged_tweets = tagged_tweets
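
Both examples read their tuning knobs from a shared ConfigParser-style config object. A minimal sketch of the [soul] section these calls assume; the option names are taken from the code above, but the values are illustrative guesses:

import ConfigParser
import StringIO

# Illustrative values only; the real project ships its own config file.
SOUL_INI = """
[soul]
quote_engine_only = false
attempt_agfl = true
reject_agfl_failures = false
agfl_nltk_fallback = true
tweet_topics = 3
post_summarize_len = 10
post_len = 140
"""

config = ConfigParser.SafeConfigParser()
config.readfp(StringIO.StringIO(SOUL_INI))
print config.getboolean('soul', 'quote_engine_only')  # False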
Example #2
# As above, a snippet from a larger project: config, TokenNormalizer,
# word_tokenize, word_detokenize, SimpleSummarizer and the AGFL-aware
# pos_tag are assumed to be defined elsewhere in the project.
import os
import re
import json
import nltk
import nltk.cluster

class CorpusSoul:
  def __init__(self, directory):
    self.normalizer = TokenNormalizer()
    self.quote_engine_only = config.getboolean('soul', 'quote_engine_only')
    # FIXME: http://www.w3schools.com/HTML/html_entities.asp
    clean_ents = [("&lt;", "<"), ("&gt;", ">"), ("&amp;", "&")]
    tagged_tweets = []
    tweet_texts = []
    self.vocab = set()
    for root, dirs, files in os.walk(directory):
      for f in files:
        # .jtwt: json-encoded twitter tweets, 1 per line
        # TODO: Add @msgs to this user as hidden text
        if f.endswith(".jtwt"):
          fl = open(os.path.join(root, f), "r")
          for jtweet in fl:
            tweet = json.loads(jtweet)
            txt = tweet['text'].encode('ascii', 'ignore')
            if re.search("( |^)RT(:| )", txt, re.IGNORECASE): continue
            if txt.startswith('@'): txt = re.sub(r'^@\S+ ', '', txt)
            for e in clean_ents:
              txt = re.sub(e[0], e[1], txt)
            if self.quote_engine_only:
              tagged_tweets.append(txt)
            else:
              tokens = self.normalizer.normalize_tokens(word_tokenize(txt))
              if tokens:
                self.vocab.update(tokens)
                tagged_tweet = pos_tag(tokens,
                                 config.getboolean("soul","attempt_agfl"),
                                 config.getboolean("soul","reject_agfl_failures"),
                                 config.getboolean("soul","agfl_nltk_fallback"))
                if tagged_tweet:
                  tweet_texts.append(word_detokenize(tokens))
                  tagged_tweets.append(tagged_tweet)
            print "Loaded tweet #"+str(len(tagged_tweets)) #+"/"+str(len(files))
        # .twt: plain-text tweets, 1 per line
        elif f.endswith(".twt"):
          fl = open(os.path.join(root, f), "r")
          for tweet in fl:
            # Decode before re-encoding: .encode() on a raw str implicitly
            # decodes it as ASCII and can raise UnicodeDecodeError.
            txt = tweet.decode('utf-8', 'ignore').encode('ascii', 'ignore')
            if txt.startswith('RT'): continue
            if txt.startswith('@'): txt = re.sub(r'^@\S+ ', '', txt)
            for e in clean_ents:
              txt = re.sub(e[0], e[1], txt)
            if self.quote_engine_only:
              tagged_tweets.append(txt)
            else:
              tokens = self.normalizer.normalize_tokens(word_tokenize(txt))
              if tokens:
                self.vocab.update(tokens)
                tagged_tweet = pos_tag(tokens,
                              config.getboolean("soul","attempt_agfl"),
                              config.getboolean("soul","reject_agfl_failures"),
                              config.getboolean("soul","agfl_nltk_fallback"))
                if tagged_tweet:
                  tweet_texts.append(word_detokenize(tokens))
                  tagged_tweets.append(tagged_tweet)
            print "Loaded tweet #"+str(len(tagged_tweets)) #+"/"+str(len(files))
        # .post: long-winded material (blog/mailinglist posts, essays, articles, etc)
        elif f.endswith(".post"):
          fl = open(os.path.join(root, f), "r")
          post = fl.read()
          tweets = self.post_to_tweets(post)
          for txt in tweets:
            #txt = txt.encode('ascii', 'ignore')
            for e in clean_ents:
              txt = re.sub(e[0], e[1], txt)
            if self.quote_engine_only:
              tagged_tweets.append(txt)
            else:
              tokens = self.normalizer.normalize_tokens(word_tokenize(txt))
              if tokens:
                self.vocab.update(tokens)
                tagged_tweet = pos_tag(tokens,
                               config.getboolean("soul","attempt_agfl"),
                               config.getboolean("soul","reject_agfl_failures"),
                               config.getboolean("soul","agfl_nltk_fallback"))
                if tagged_tweet:
                  tweet_texts.append(word_detokenize(tokens))
                  tagged_tweets.append(tagged_tweet)
            print "Loaded post-tweet #"+str(len(tagged_tweets))
        # .irclog: irc log files. irssi format.
        elif f.endswith(".irclog"):
          pass
        # .4sq: foursquare data
        elif f.endswith(".4sq"):
          pass

    self.tagged_tweets = tagged_tweets

    num_clusters = config.getint('soul', 'tweet_topics')
    if num_clusters > 1:
      self.cluster_tweets(tweet_texts, num_clusters)
    else:
      self.cluster_rates = {}
      self.clustered_tweets = {}
      self.clustered_tweets[0] = tagged_tweets
      self.cluster_rates[0] = len(self.tagged_tweets)

  def post_to_tweets(self, post, summarize=False):
    # We do poorly with parentheticals. Just kill them.
    post = re.sub(r"\([^\)]+\)", "", post)
    if summarize:
      summ = SimpleSummarizer()
      post = summ.summarize(post, config.getint("soul", "post_summarize_len"))
    sentences = nltk.sent_tokenize(post)
    tweets = []
    tweet = ""
    post_len = config.getint("soul", "post_len")
    for s in sentences:
      if len(s) > post_len: continue
      if len(tweet + s) < post_len:
        tweet += s+" "
      else:
        if tweet: tweets.append(tweet)
        # Start the next chunk with the sentence that didn't fit,
        # rather than silently dropping it.
        tweet = s+" "
    if tweet: tweets.append(tweet)  # keep the final partial chunk
    return tweets
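  # Hypothetical usage sketch (paths are illustrative; post_len assumed 140):
  #   soul = CorpusSoul("./corpus")
  #   for chunk in soul.post_to_tweets(open("essay.post").read()):
  #     print len(chunk), chunk
  # Sentences are packed greedily; any single sentence longer than post_len
  # is skipped outright rather than split.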

  def cluster_tweets(self, tweet_texts, num_clusters=3):
    # XXX: move SearchableTextCollection to libs
    from resurrect import SearchableTextCollection,SearchableText

    print "Scoring tweets.."
    tc = SearchableTextCollection(self.vocab)
    for tweet in tweet_texts:
      txt = SearchableText(tweet)
      tc.add_text(txt)
    tc.update_matrix()
    print "Scored tweets.."
    print "Clustering tweets.."
    cluster = nltk.cluster.KMeansClusterer(num_clusters,
                 nltk.cluster.util.euclidean_distance,
                 repeats=20*num_clusters)
    # EM takes waaaaaayy too long, even with SVD
    #cluster = nltk.cluster.EMClusterer(means, svd_dimensions=100)
    clustered = cluster.cluster(tc.D, assign_clusters=True)
    print "Clustered tweets.."
    clustered_tweets = {}
    for i, c in enumerate(clustered):
      clustered_tweets.setdefault(c, []).append(self.tagged_tweets[i])
    self.cluster_rates = {}
    for i in clustered_tweets.iterkeys():
      self.cluster_rates[i] = len(clustered_tweets[i])
      print
      print "Cluster "+str(i)+": "+str(len(clustered_tweets[i]))
      for t in clustered_tweets[i]:
        print t
    self.clustered_tweets = clustered_tweets
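
cluster_tweets leans on SearchableTextCollection from the project's resurrect module, which is not shown here. The sketch below is an assumption about the interface it relies on: add_text() collects texts and update_matrix() fills D with one numeric vector per text, since nltk.cluster.KMeansClusterer.cluster() consumes numpy vectors. The real class may weight terms differently, e.g. with TF-IDF.

import numpy

class SearchableText:
  # Assumed shape: wraps one tweet's text as a bag of lowercase tokens.
  def __init__(self, text):
    self.tokens = text.lower().split()

class SearchableTextCollection:
  def __init__(self, vocab):
    self.vocab = sorted(vocab)
    self.index = dict((w, i) for i, w in enumerate(self.vocab))
    self.texts = []
    self.D = []

  def add_text(self, text):
    self.texts.append(text)

  def update_matrix(self):
    # One raw term-frequency vector per text, indexed by vocab position.
    self.D = []
    for t in self.texts:
      v = numpy.zeros(len(self.vocab))
      for tok in t.tokens:
        if tok in self.index:
          v[self.index[tok]] += 1
      self.D.append(v)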