def __init__(self, emoticon, searcher, analyzer, english_only=False):
    """Run the emoticon query and gather the counts used for PMI.

    The constructor searches the "emoticons" field for ``emoticon``
    through a ``TermCountCollector``, writes the collector's sample tweets
    to ``<raw_stats_dir>/<normalized emoticon>[_US].samptweets`` (the file
    is created/truncated), reads the corpus size from
    ``emoticon_pmi_stats.txt`` and stores the query probability in
    ``self.p_query_result``.

    Args:
        emoticon: Emoticon string to query for.
        searcher: Searcher object; must provide ``search(query, collector)``.
        analyzer: Analyzer handed to ``QueryParser``.
        english_only: When truthy, the collector is given the country
            "United States" and output file names get a ``_US`` suffix.

    Raises:
        IOError, ValueError, IndexError: if the stats file is missing or
            its content cannot be parsed.
        ZeroDivisionError: if the stats file reports a size of 0.
    """
    super(PMICalculator, self).__init__()
    self.field = "emoticons"
    self.emoticon = emoticon
    self.searcher = searcher
    self.analyzer = analyzer
    self.escaped_emoticon = QueryParser.escape(self.emoticon)
    self.query = QueryParser("emoticons", self.analyzer).parse(self.escaped_emoticon)
    self.raw_stats_dir = "/Volumes/TerraFirma/SharedData/vdb5/emoticons_raw_files/"

    # Pick the suffix with a plain branch instead of multiplying a string by
    # the flag: a truthy non-bool flag (e.g. 2) must not repeat the suffix.
    if english_only:
        country = "United States"
        name_suffix = "_US"
    else:
        country = None
        name_suffix = ""

    base_name = (
        self.raw_stats_dir
        + normalizeEmoticonName(self.emoticon).rstrip("_")
        + name_suffix
    )
    self.pmi_file_name = base_name + ".pmidata"
    self.sample_tweets_name = base_name + ".samptweets"

    # Alternative spellings that are searched with the same collector so
    # their hits are merged into the counts of the primary emoticon.
    alternate_spellings = {":P": ":p", "T_T": "TT", "^_^": "^^"}

    self.sample_tweets_file = codecs.open(
        self.sample_tweets_name, encoding="utf-8", mode="w"
    )
    try:
        self.term_count_collector = TermCountCollector(searcher, emoticon, country)
        # "%s %s" reproduces the single space the print statement put
        # between its comma-separated items.
        print("%s %s" % ("starting query at: ", time.time()))
        self.searcher.search(self.query, self.term_count_collector)

        alternate = alternate_spellings.get(emoticon)
        if alternate is not None:
            alternate_query = QueryParser("emoticons", self.analyzer).parse(
                QueryParser.escape(alternate)
            )
            self.searcher.search(alternate_query, self.term_count_collector)

        self.terms = self.term_count_collector.getTerms()
        self.query_result_count = self.term_count_collector.getDocCount()

        popular_terms = self.term_count_collector.popular_terms_hash
        for p_term, p_term_tweets in popular_terms.items():
            for p_term_tweet in p_term_tweets:
                self.sample_tweets_file.write(
                    "term: " + p_term + " tweet: " + p_term_tweet + "\n"
                )
    finally:
        # Close the sample file even when the search or the write loop fails.
        self.sample_tweets_file.close()

    # The file content is parsed as "<label>:<integer>"; the integer is used
    # as the denominator of the query probability.
    # NOTE: the attribute is kept for compatibility but now refers to a
    # closed file object.
    self.base_stats_file = open(self.raw_stats_dir + "emoticon_pmi_stats.txt", "r")
    try:
        self.n = int(self.base_stats_file.read().strip().split(":")[1])
    finally:
        self.base_stats_file.close()

    print(
        "%s %s %s %s"
        % ("computing PMI for query: ", self.emoticon, " at: ", time.time())
    )
    self.p_query_result = self.query_result_count * 1.0 / self.n
def calculateEmoticonDiffusion(emoticon, searcher, analyzer, user_location_hash, usage_threshold = 1, comm_threshold = 1):
    """Search for ``emoticon`` and group the usage timestamps by user.

    Reads the total user count from ``emoticon_diffusion_stats.txt``, runs
    the query against the "emoticons" field and, for every hit, appends the
    integer timestamp to the list kept for that hit's ``user_id``.

    Args:
        emoticon: Emoticon string to query for.
        searcher: Searcher object; ``search(query)`` must return an iterable
            of hits that also provides ``length()``.
        analyzer: Analyzer handed to ``QueryParser``.
        user_location_hash: Not used by the code visible in this block.
        usage_threshold: Not used by the code visible in this block.
        comm_threshold: Not used by the code visible in this block.

    Returns:
        None. The function returns early when the query has no hits.
    """
    raw_stats_dir = "/Volumes/TerraFirma/SharedData/vdb5/emoticons_raw_files/"

    emoticon_stats_file = open(raw_stats_dir + "emoticon_diffusion_stats.txt", "r")
    try:
        # The file is expected to hold a single integer.
        total_users = int(emoticon_stats_file.read().strip())
    finally:
        # Close the handle even when the content is not a valid integer.
        emoticon_stats_file.close()

    emoticon_file_name = raw_stats_dir + normalizeEmoticonName(emoticon).rstrip('_') + ".diffusion_bidir"

    # "%s %s ..." reproduces the single space the print statement put
    # between its comma-separated items.
    print("%s %s %s %s" % ("Calculating Diffusion for: ", emoticon, " at: ", time.time()))
    escaped_emoticon = QueryParser.escape(emoticon)
    query = QueryParser("emoticons", analyzer).parse(escaped_emoticon)
    hits = searcher.search(query)

    total_hits = hits.length()
    print("%s total matching documents." % total_hits)
    if total_hits == 0:
        return

    print("%s %s" % ("compiling diffusion stats at: ", time.time()))
    emoticon_users_by_time_hash = {}
    # NOTE(review): the four containers below are initialised but never
    # filled by the code visible in this block.
    emoticon_users_adopters_hash = {}
    emoticon_users_non_adopters_hash = {}
    users_exposure_hash = {}
    reverse_users_exposure_hash = {}

    try:
        hctr = 0
        for hit in hits:
            hctr += 1
            if hctr % 100000 == 0:
                print("%s %s" % ("on hit: ", hctr))

            uid = hit.get("user_id")
            timestamp = int(hit.get("timestamp"))
            # Append in place; rebuilding the list on every hit was quadratic
            # for users with many tweets.
            emoticon_users_by_time_hash.setdefault(uid, []).append(timestamp)

            # Stop explicitly after the final hit. The check used to run
            # before the hit was recorded, which dropped the last hit.
            if hctr == total_hits:
                break
    except Exception as e:
        # Report the failure instead of swallowing it silently; the partial
        # result collected so far is kept.
        print("%s %s" % ("failed to list hit: ", e))
def getEmoticonPropagationCurves(emoticon, searcher, analyzer):
    """Search for ``emoticon`` and bucket the matching tweets per UTC day.

    For every hit the tweet is assigned to a day index and the per-day
    record in ``emoticon_propagation_hash`` is updated: ``'total'`` and the
    per-country entry are incremented by one, ``'total_in_replies'`` by one
    when the hit's ``user_id_replied`` is not ``'0'``. ``daytshash`` maps the
    start-of-day timestamp to its day index and the next day's timestamp.

    Args:
        emoticon: Emoticon string to query for.
        searcher: Searcher object; ``search(query)`` must return an iterable
            of hits that also provides ``length()``.
        analyzer: Analyzer handed to ``QueryParser``.

    Returns:
        None. The function returns early when the query has no hits.
    """
    raw_stats_dir = "/Volumes/TerraFirma/SharedData/vdb5/emoticons_raw_files/"
    emoticon_file_name = raw_stats_dir + normalizeEmoticonName(emoticon).rstrip('_') + ".timehash"

    emoticon_stats_file = open(raw_stats_dir + "emoticon_stats.json", "r")
    try:
        emoticon_stats_hash = json.loads(emoticon_stats_file.read())
    finally:
        # The handle used to stay open for the lifetime of the process.
        emoticon_stats_file.close()

    # "%s %s ..." reproduces the single space the print statement put
    # between its comma-separated items.
    print("%s %s %s %s" % ("Searching for: ", emoticon, " at: ", time.time()))
    escaped_emoticon = QueryParser.escape(emoticon)
    query = QueryParser("emoticons", analyzer).parse(escaped_emoticon)
    hits = searcher.search(query)

    total_hits = hits.length()
    print("%s total matching documents." % total_hits)
    if total_hits == 0:
        return

    print("%s %s" % (" compiling propagation curve at: ", time.time()))
    emoticon_propagation_hash = {}
    countryset = set()
    daytshash = {}

    try:
        hctr = 0
        for hit in hits:
            hctr += 1
            if hctr % 100000 == 0:
                print("%s %s" % ("on hit: ", hctr))

            timestamp = int(hit.get("timestamp"))
            country = hit.get('country')
            num_replies = int(hit.get('user_id_replied') != '0')
            countryset.add(country)

            timestruct = time.gmtime(timestamp)
            # Day index: years since 2005 times 365 plus the day of the year
            # (struct_time index 7).
            # NOTE(review): leap years are ignored, so 2008-12-31 and
            # 2009-01-01 both map to 1461. Left unchanged because consumers
            # of this index may depend on the current numbering.
            daysincestart = (timestruct[0] - 2005) * 365 + timestruct[7]
            # Strip hours, minutes and seconds to get midnight UTC.
            daystartts = timestamp - 60 * 60 * timestruct[3] - 60 * timestruct[4] - timestruct[5]
            nextdaystartts = daystartts + 86400
            daytshash[daystartts] = {'days since start': daysincestart, 'next day ts': nextdaystartts}

            day_stats = emoticon_propagation_hash.get(daysincestart)
            if day_stats is None:
                day_stats = {'total': 0, 'total_in_replies': 0,
                             'total tweets': 0, 'total emoticon tweets': 0, 'total http emoticons': 0}
                emoticon_propagation_hash[daysincestart] = day_stats

            # Every tweet counts once, including the first tweet of a day.
            # The first tweet used to seed the counters with the number of
            # emoticon occurrences in the tweet while later tweets added 1.
            day_stats['total'] += 1
            day_stats[country] = day_stats.get(country, 0) + 1
            day_stats['total_in_replies'] += num_replies

            # Stop explicitly after the final hit. The check used to run
            # before the hit was counted, which dropped the last hit.
            if hctr == total_hits:
                break
    except Exception as e:
        print("%s %s" % ("failed to list hit: ", e))