import os
import re
import json

import nltk
from nltk import word_tokenize

# NOTE: TokenNormalizer, pos_tag, word_detokenize, config, and
# SimpleSummarizer are project-internal names defined in sibling modules;
# their exact import paths are not shown in this file.


class CorpusSoul:
    def __init__(self, directory):
        self.normalizer = TokenNormalizer()
        self.quote_engine_only = config.getboolean('soul', 'quote_engine_only')
        # FIXME: Handle the full entity list:
        # http://www.w3schools.com/HTML/html_entities.asp
        clean_ents = [("&lt;", "<"), ("&gt;", ">"), ("&amp;", "&")]
        tagged_tweets = []
        tweet_texts = []
        self.vocab = set([])
        for root, dirs, files in os.walk(directory):
            for f in files:
                # .jtwt: json-encoded twitter tweets, 1 per line
                # TODO: Add @msgs to this user as hidden text
                if f.endswith(".jtwt"):
                    fl = open(root+"/"+f, "r")
                    for jtweet in fl.readlines():
                        tweet = json.loads(jtweet)
                        txt = tweet['text'].encode('ascii', 'ignore')
                        if re.search("( |^)RT(:| )", txt, re.IGNORECASE):
                            continue
                        # startswith() avoids an IndexError on empty tweets
                        if txt.startswith('@'):
                            txt = re.sub(r'^@[\S]+ ', '', txt)
                        for e in clean_ents:
                            txt = re.sub(e[0], e[1], txt)
                        if self.quote_engine_only:
                            tagged_tweets.append(txt)
                        else:
                            tokens = self.normalizer.normalize_tokens(
                                    word_tokenize(txt))
                            if tokens:
                                self.vocab.update(tokens)
                                tagged_tweet = pos_tag(tokens,
                                        config.getboolean("soul", "attempt_agfl"),
                                        config.getboolean("soul", "reject_agfl_failures"),
                                        config.getboolean("soul", "agfl_nltk_fallback"))
                                if tagged_tweet:
                                    tweet_texts.append(word_detokenize(tokens))
                                    tagged_tweets.append(tagged_tweet)
                                    print "Loaded tweet #"+str(len(tagged_tweets)) #+"/"+str(len(files))
                    fl.close()
                # .twt: plain-text tweets, 1 per line
                elif f.endswith(".twt"):
                    fl = open(root+"/"+f, "r")
                    for tweet in fl.readlines():
                        txt = tweet.encode('ascii', 'ignore')
                        if txt.startswith('RT'):
                            continue
                        if txt.startswith('@'):
                            txt = re.sub(r'^@[\S]+ ', '', txt)
                        for e in clean_ents:
                            txt = re.sub(e[0], e[1], txt)
                        if self.quote_engine_only:
                            tagged_tweets.append(txt)
                        else:
                            tokens = self.normalizer.normalize_tokens(
                                    word_tokenize(txt))
                            if tokens:
                                self.vocab.update(tokens)
                                tagged_tweet = pos_tag(tokens,
                                        config.getboolean("soul", "attempt_agfl"),
                                        config.getboolean("soul", "reject_agfl_failures"),
                                        config.getboolean("soul", "agfl_nltk_fallback"))
                                if tagged_tweet:
                                    tweet_texts.append(word_detokenize(tokens))
                                    tagged_tweets.append(tagged_tweet)
                                    print "Loaded tweet #"+str(len(tagged_tweets)) #+"/"+str(len(files))
                    fl.close()
                # .post: long-winded material (blog/mailinglist posts,
                # essays, articles, etc)
                elif f.endswith(".post"):
                    fl = open(root+"/"+f, "r")
                    post = fl.read()
                    fl.close()
                    tweets = self.post_to_tweets(post)
                    for txt in tweets:
                        #txt = txt.encode('ascii', 'ignore')
                        for e in clean_ents:
                            txt = re.sub(e[0], e[1], txt)
                        if self.quote_engine_only:
                            tagged_tweets.append(txt)
                        else:
                            tokens = self.normalizer.normalize_tokens(
                                    word_tokenize(txt))
                            if tokens:
                                self.vocab.update(tokens)
                                tagged_tweet = pos_tag(tokens,
                                        config.getboolean("soul", "attempt_agfl"),
                                        config.getboolean("soul", "reject_agfl_failures"),
                                        config.getboolean("soul", "agfl_nltk_fallback"))
                                if tagged_tweet:
                                    tweet_texts.append(word_detokenize(tokens))
                                    tagged_tweets.append(tagged_tweet)
                                    print "Loaded post-tweet #"+str(len(tagged_tweets))
                # .irclog: irc log files. irssi format.
                elif f.endswith(".irclog"):
                    pass
                # .4sq: foursquare data
                elif f.endswith(".4sq"):
                    pass
        self.tagged_tweets = tagged_tweets
        num_clusters = config.getint('soul', 'tweet_topics')
        if num_clusters > 1:
            self.cluster_tweets(tweet_texts, num_clusters)
        else:
            self.cluster_rates = {}
            self.clustered_tweets = {}
            self.clustered_tweets[0] = tagged_tweets
            self.cluster_rates[0] = len(self.tagged_tweets)

    def post_to_tweets(self, post, summarize=False):
        # We do poorly with parentheticals. Just kill them.
        post = re.sub(r"\([^\)]+\)", "", post)
        if summarize:
            summ = SimpleSummarizer()
            post = summ.summarize(post,
                    config.getint("soul", "post_summarize_len"))
        sentences = nltk.sent_tokenize(post)
        tweets = []
        tweet = ""
        for s in sentences:
            # Skip sentences that can never fit within a single tweet
            if len(s) > config.getint("soul", "post_len"):
                continue
            if len(tweet + s) < config.getint("soul", "post_len"):
                tweet += s+" "
            else:
                if tweet:
                    tweets.append(tweet)
                # Bugfix: start the next tweet with the current sentence
                # instead of silently dropping it.
                tweet = s+" "
        # Bugfix: keep the final partial tweet rather than discarding it.
        if tweet:
            tweets.append(tweet)
        return tweets

    def cluster_tweets(self, tweet_texts, num_clusters=3):
        # XXX: move SearchableTextCollection to libs
        from resurrect import SearchableTextCollection, SearchableText
        print "Scoring tweets.."
        tc = SearchableTextCollection(self.vocab)
        for tweet in tweet_texts:
            txt = SearchableText(tweet)
            tc.add_text(txt)
        tc.update_matrix()
        print "Scored tweets.."
        print "Clustering tweets.."
        cluster = nltk.cluster.KMeansClusterer(num_clusters,
                nltk.cluster.util.euclidean_distance,
                repeats=20*num_clusters)
        # EM takes waaaaaayy too long, even with SVD
        #cluster = nltk.cluster.EMClusterer(means, svd_dimensions=100)
        clustered = cluster.cluster(tc.D, assign_clusters=True)
        print "Clustered tweets.."
        clustered_tweets = {}
        for i in xrange(len(clustered)):
            if clustered[i] not in clustered_tweets:
                clustered_tweets[clustered[i]] = []
            clustered_tweets[clustered[i]].append(self.tagged_tweets[i])
        self.cluster_rates = {}
        for i in clustered_tweets.iterkeys():
            self.cluster_rates[i] = len(clustered_tweets[i])
            print
            print "Cluster "+str(i)+": "+str(len(clustered_tweets[i]))
            for t in clustered_tweets[i]:
                print t
        self.clustered_tweets = clustered_tweets
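
# Minimal usage sketch (illustrative only, not part of the original module):
# assumes the project's `config` has already been loaded with a [soul]
# section, and that NLTK's "punkt" sentence tokenizer data is installed.
# The directory name is a placeholder.
if __name__ == "__main__":
    soul = CorpusSoul("corpus")  # walks corpus/ for .jtwt, .twt and .post files
    for topic in soul.clustered_tweets:
        print "Topic "+str(topic)+": "+str(soul.cluster_rates[topic])+" tweets"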