def __init__(self, emoticon, searcher, analyzer, english_only=False):
    """Run the co-occurrence query for `emoticon` and load corpus totals.

    emoticon     -- emoticon string whose co-occurring terms are collected
    searcher     -- index searcher used to run the queries (Lucene-style)
    analyzer     -- analyzer used when parsing the emoticon query
    english_only -- truthy restricts counting to "United States" tweets and
                    tags output filenames with a "US" prefix

    NOTE(review): this appears to duplicate PMICalculator.__init__ defined
    later in the file -- confirm which copy is authoritative.
    """
    super(PMICalculator, self).__init__()
    self.field = "emoticons"
    self.emoticon = emoticon
    self.searcher = searcher
    self.analyzer = analyzer
    self.escaped_emoticon = QueryParser.escape(self.emoticon)
    self.query = QueryParser("emoticons", self.analyzer).parse(self.escaped_emoticon)
    # NOTE(review): hard-coded local volume path -- confirm before reuse.
    self.raw_stats_dir = "/Volumes/TerraFirma/SharedData/vdb5/emoticons_raw_files/"
    if english_only:
        country = "United States"
        country_prefix = "US"
    else:
        country = None
        country_prefix = ""
    # `("_%s" % country_prefix) * english_only` appends "_US" only when
    # english_only is truthy (string-by-bool multiplication).
    self.pmi_file_name = (
        self.raw_stats_dir
        + normalizeEmoticonName(self.emoticon).rstrip("_")
        + ("_%s" % (country_prefix)) * english_only
        + ".pmidata"
    )
    self.sample_tweets_name = (
        self.raw_stats_dir
        + normalizeEmoticonName(self.emoticon).rstrip("_")
        + ("_%s" % (country_prefix)) * english_only
        + ".samptweets"
    )
    self.sample_tweets_file = codecs.open(self.sample_tweets_name, encoding="utf-8", mode="w")
    self.term_count_collector = TermCountCollector(searcher, emoticon, country)
    print "starting query at: ", time.time()
    hits = self.searcher.search(self.query, self.term_count_collector)
    # print "terms: ", self.terms
    # A few emoticons have a common variant spelling; fold its hits into the
    # same collector by issuing a second query.
    if emoticon == ":P":
        ee_two = QueryParser.escape(":p")
    elif emoticon == "T_T":
        ee_two = QueryParser.escape("TT")
    elif emoticon == "^_^":
        ee_two = QueryParser.escape("^^")
    if emoticon in [":P", "T_T", "^_^"]:
        q_two = QueryParser("emoticons", self.analyzer).parse(ee_two)
        hits_two = self.searcher.search(q_two, self.term_count_collector)
    self.terms = self.term_count_collector.getTerms()
    self.query_result_count = self.term_count_collector.getDocCount()
    # Dump sample tweets per popular term for later inspection.
    for p_term, p_term_tweets in self.term_count_collector.popular_terms_hash.items():
        for p_term_tweet in p_term_tweets:
            self.sample_tweets_file.write("term: " + p_term + " tweet: " + p_term_tweet + "\n")
    self.sample_tweets_file.close()
    self.base_stats_file = open(
        "/Volumes/TerraFirma/SharedData/vdb5/emoticons_raw_files/emoticon_pmi_stats.txt", "r"
    )
    # Corpus size N is stored as "label:<count>"; take the part after ":".
    # NOTE(review): the file handle is never closed here -- leak.
    self.n = int(self.base_stats_file.read().strip().split(":")[1])
    print "computing PMI for query: ", self.emoticon, " at: ", time.time()
    # P(query) = fraction of corpus documents matching the emoticon query.
    self.p_query_result = self.query_result_count * 1.0 / self.n
class PMICalculator(object): def __init__(self, emoticon, searcher, analyzer, english_only=False): super(PMICalculator, self).__init__() self.field = "emoticons" self.emoticon = emoticon self.searcher = searcher self.analyzer = analyzer self.escaped_emoticon = QueryParser.escape(self.emoticon) self.query = QueryParser("emoticons", self.analyzer).parse(self.escaped_emoticon) self.raw_stats_dir = "/Volumes/TerraFirma/SharedData/vdb5/emoticons_raw_files/" if english_only: country = "United States" country_prefix = "US" else: country = None country_prefix = "" self.pmi_file_name = ( self.raw_stats_dir + normalizeEmoticonName(self.emoticon).rstrip("_") + ("_%s" % (country_prefix)) * english_only + ".pmidata" ) self.sample_tweets_name = ( self.raw_stats_dir + normalizeEmoticonName(self.emoticon).rstrip("_") + ("_%s" % (country_prefix)) * english_only + ".samptweets" ) self.sample_tweets_file = codecs.open(self.sample_tweets_name, encoding="utf-8", mode="w") self.term_count_collector = TermCountCollector(searcher, emoticon, country) print "starting query at: ", time.time() hits = self.searcher.search(self.query, self.term_count_collector) # print "terms: ", self.terms if emoticon == ":P": ee_two = QueryParser.escape(":p") elif emoticon == "T_T": ee_two = QueryParser.escape("TT") elif emoticon == "^_^": ee_two = QueryParser.escape("^^") if emoticon in [":P", "T_T", "^_^"]: q_two = QueryParser("emoticons", self.analyzer).parse(ee_two) hits_two = self.searcher.search(q_two, self.term_count_collector) self.terms = self.term_count_collector.getTerms() self.query_result_count = self.term_count_collector.getDocCount() for p_term, p_term_tweets in self.term_count_collector.popular_terms_hash.items(): for p_term_tweet in p_term_tweets: self.sample_tweets_file.write("term: " + p_term + " tweet: " + p_term_tweet + "\n") self.sample_tweets_file.close() self.base_stats_file = open( "/Volumes/TerraFirma/SharedData/vdb5/emoticons_raw_files/emoticon_pmi_stats.txt", "r" ) self.n = 
int(self.base_stats_file.read().strip().split(":")[1]) print "computing PMI for query: ", self.emoticon, " at: ", time.time() self.p_query_result = self.query_result_count * 1.0 / self.n def getTermPMI(self, min_cooccurrence): self.emoticon = self.emoticon.replace('"', "") self.pmi_file = codecs.open(self.pmi_file_name, encoding="utf-8", mode="w") term_re = "([a-z]+)|([#]\\w+)" cnt = 0 result_set = set() for co_occurring_term in self.terms: cnt += 1 if (self.terms[co_occurring_term] >= min_cooccurrence) and re.match(term_re, co_occurring_term): term_result = self.getPMI(co_occurring_term) result_set.add(term_result) if cnt % 10000 == 0: print "processed term number: ", cnt, " out of: ", len(self.terms), " at: ", time.time() print "number of results: ", len(result_set) sorted_result_set = sorted(list(result_set), key=lambda x: x.getPMI(), reverse=True) for tr in sorted_result_set: self.pmi_file.write( tr.getTerm() + "," + str(tr.getPMI()) + "," + str(tr.getNormPMI()) + "," + str(tr.getCooccurrenceCount()) + "\n" ) self.pmi_file.close() def getPMI(self, co_term): pmi = -1.0 cooccurrence_count = 0 term_count = 0 try: cooccurrence_count = self.terms[co_term] * 1.0 term_count = self.getTermCount(co_term) * 1.0 if cooccurrence_count > 0: p_cooccurrence = cooccurrence_count / self.n p_term = term_count / self.n + 0.00000001 else: p_cooccurrence = 0 p_term = 0 pmi = math.log(p_cooccurrence / (self.p_query_result * p_term), 2) norm_pmi = pmi * 1.0 / (-1 * math.log(p_cooccurrence)) # print "term: ", co_term, " term count: ", term_count, " cooccurrence_count: ", cooccurrence_count, " P(seed-term,term): ", p_cooccurrence, " P(seedterm): ", p_term, " PMI: ", pmi except Exception, e: print "failed to calculate PMI: ", e return PMIResult(co_term, pmi, norm_pmi, cooccurrence_count)