# Example no. 1
# 0
    def __init__(self, emoticon, searcher, analyzer, english_only=False):
        """Run a Lucene query for *emoticon* and collect term counts for PMI.

        Builds the output file paths, executes the search (plus a variant
        query for the :P / T_T / ^_^ aliases), dumps sample tweets for the
        most popular co-occurring terms, and reads the corpus size N used
        to compute the query probability.

        emoticon     -- raw emoticon string to search for
        searcher     -- Lucene searcher over the tweet index
        analyzer     -- Lucene analyzer used to parse the query
        english_only -- when True, restrict stats to "United States" tweets
                        and append a "_US" suffix to the output file names
        """
        super(PMICalculator, self).__init__()

        # Index field that holds the emoticons extracted from each tweet.
        self.field = "emoticons"
        self.emoticon = emoticon
        self.searcher = searcher
        self.analyzer = analyzer
        # Escape Lucene query-syntax characters (e.g. ':', '^') before parsing.
        self.escaped_emoticon = QueryParser.escape(self.emoticon)
        self.query = QueryParser("emoticons", self.analyzer).parse(self.escaped_emoticon)
        self.raw_stats_dir = "/Volumes/TerraFirma/SharedData/vdb5/emoticons_raw_files/"
        if english_only:
            country = "United States"
            country_prefix = "US"
        else:
            country = None
            country_prefix = ""
        # ("_%s" % prefix) * english_only multiplies a string by a bool:
        # the "_US" suffix is appended only when english_only is True.
        self.pmi_file_name = (
            self.raw_stats_dir
            + normalizeEmoticonName(self.emoticon).rstrip("_")
            + ("_%s" % (country_prefix)) * english_only
            + ".pmidata"
        )
        self.sample_tweets_name = (
            self.raw_stats_dir
            + normalizeEmoticonName(self.emoticon).rstrip("_")
            + ("_%s" % (country_prefix)) * english_only
            + ".samptweets"
        )
        self.sample_tweets_file = codecs.open(self.sample_tweets_name, encoding="utf-8", mode="w")
        # Collector accumulates term counts across the search(es) below;
        # `country` (or None) filters which tweets are counted.
        self.term_count_collector = TermCountCollector(searcher, emoticon, country)
        print "starting query at: ", time.time()
        hits = self.searcher.search(self.query, self.term_count_collector)
        # print "terms: ", self.terms
        # A few emoticons have a common alias; run a second query with the
        # same collector so both spellings are folded into one count.
        if emoticon == ":P":
            ee_two = QueryParser.escape(":p")
        elif emoticon == "T_T":
            ee_two = QueryParser.escape("TT")
        elif emoticon == "^_^":
            ee_two = QueryParser.escape("^^")
        if emoticon in [":P", "T_T", "^_^"]:
            q_two = QueryParser("emoticons", self.analyzer).parse(ee_two)
            hits_two = self.searcher.search(q_two, self.term_count_collector)
        self.terms = self.term_count_collector.getTerms()
        self.query_result_count = self.term_count_collector.getDocCount()
        # Persist example tweets for each popular co-occurring term.
        for p_term, p_term_tweets in self.term_count_collector.popular_terms_hash.items():
            for p_term_tweet in p_term_tweets:
                self.sample_tweets_file.write("term: " + p_term + " tweet: " + p_term_tweet + "\n")
        self.sample_tweets_file.close()
        # Corpus size N; file format is "<label>:<count>" — only the count is used.
        self.base_stats_file = open(
            "/Volumes/TerraFirma/SharedData/vdb5/emoticons_raw_files/emoticon_pmi_stats.txt", "r"
        )
        self.n = int(self.base_stats_file.read().strip().split(":")[1])

        print "computing PMI for query: ", self.emoticon, " at: ", time.time()

        # P(query) = matching docs / total docs; * 1.0 forces float division
        # under Python 2 integer-division semantics.
        self.p_query_result = self.query_result_count * 1.0 / self.n
def calculateEmoticonDiffusion(emoticon, searcher, analyzer, user_location_hash, usage_threshold = 1, comm_threshold = 1):
    raw_stats_dir = "/Volumes/TerraFirma/SharedData/vdb5/emoticons_raw_files/"
    emoticon_stats_file = open("/Volumes/TerraFirma/SharedData/vdb5/emoticons_raw_files/emoticon_diffusion_stats.txt","r") 
    total_users = int(emoticon_stats_file.read().strip())
    emoticon_stats_file.close()

    emoticon_file_name = raw_stats_dir + normalizeEmoticonName(emoticon).rstrip('_')+".diffusion_bidir"
    print "Calculating Diffusion for: ", emoticon, " at: ", time.time()
    escaped_emoticon = QueryParser.escape(emoticon)
    query = QueryParser("emoticons", analyzer).parse(escaped_emoticon)
    hits = searcher.search(query)
    print "%s total matching documents." % hits.length()
    if hits.length() == 0: return

    print "compiling diffusion stats at: ", time.time()
    emoticon_users_by_time_hash = {}
    emoticon_users_adopters_hash = {}
    emoticon_users_non_adopters_hash = {}
    users_exposure_hash = {}
    reverse_users_exposure_hash = {}
    try:
        hctr = 0
        for hit in hits:
            hctr += 1
            if hctr%100000==0: print "on hit: ", hctr
            #if hctr > 100000: break
            if hctr == hits.length(): break
            uid, timestamp, country, emoticons, user_id_replied = hit.get("user_id"), int(hit.get("timestamp")), hit.get('country'), hit.get('emoticons'), hit.get('user_id_replied')
            emoticon_users_by_time_hash[uid] = emoticon_users_by_time_hash.get(uid,[])+[timestamp]
    except Exception, e:
        pass
def getEmoticonPropagationCurves(emoticon, searcher, analyzer):
    """Build a per-day propagation histogram for *emoticon*.

    Searches the tweet index for the emoticon and tallies, per day since
    2005, the number of matching tweets (total, per country, and how many
    were replies).

    NOTE(review): this function appears to continue beyond this excerpt —
    the hashes built here (emoticon_propagation_hash, daytshash, countryset)
    are presumably consumed/saved by code past the visible lines.
    """
    raw_stats_dir = "/Volumes/TerraFirma/SharedData/vdb5/emoticons_raw_files/"
    # Output path for the per-day time hash (not written in the visible lines).
    emoticon_file_name = raw_stats_dir + normalizeEmoticonName(emoticon).rstrip('_')+".timehash"
    # Precomputed per-emoticon stats, stored as JSON.
    emoticon_stats_file = open("/Volumes/TerraFirma/SharedData/vdb5/emoticons_raw_files/emoticon_stats.json","r") 
    emoticon_stats_hash = json.loads(emoticon_stats_file.read())
    print "Searching for: ", emoticon, " at: ", time.time()
    # Escape Lucene query-syntax characters (e.g. ':', '^') before parsing.
    escaped_emoticon = QueryParser.escape(emoticon)
    query = QueryParser("emoticons", analyzer).parse(escaped_emoticon)
    hits = searcher.search(query)
    print "%s total matching documents." % hits.length()
    if hits.length() == 0: return

    print " compiling propagation curve at: ", time.time()
    emoticon_propagation_hash = {}   # day-index -> {'total', 'total_in_replies', country, ...}
    countryset = set()               # every country seen among matching tweets
    daytshash = {}                   # day-start timestamp -> day metadata
    try:
        hctr = 0
        for hit in hits:
            hctr += 1
            if hctr%100000==0: print "on hit: ", hctr
            # Stop before the reported length to avoid overrunning the
            # Hits iterator.
            if hctr == hits.length(): break
            uid, timestamp, country, emoticons, user_id_replied = hit.get("user_id"), hit.get("timestamp"), hit.get('country'), hit.get('emoticons'), hit.get('user_id_replied')
            # 1 if this tweet was a reply ('0' marks "not a reply"), else 0.
            num_replies = int(user_id_replied != '0')
            countryset.add(country)
            timestruct = time.gmtime(int(timestamp))
            # Approximate day index since 2005: (years * 365) + day-of-year.
            # Ignores leap years.
            daysincestart = (timestruct[0]-2005)*365+timestruct[7]
            # Midnight (UTC) of the tweet's day, from hour/min/sec fields.
            daystartts = int(timestamp)-60*60*timestruct[3]-60*timestruct[4]-timestruct[5]
            nextdaystartts = daystartts+86400
            daytshash[daystartts] = {'days since start':daysincestart, 'next day ts':nextdaystartts}
            # Occurrences of this emoticon within the tweet's emoticon string;
            # only used to seed the hash entry, increments below count tweets.
            total_emoticon_count = string.count(emoticons, emoticon)
            if daysincestart in emoticon_propagation_hash:
                #emoticon_propagation_hash[daysincestart]['total'] += total_emoticon_count
                emoticon_propagation_hash[daysincestart]['total'] += 1
                #emoticon_propagation_hash[daysincestart][country] = emoticon_propagation_hash[daysincestart].get(country,0) + total_emoticon_count
                emoticon_propagation_hash[daysincestart][country] = emoticon_propagation_hash[daysincestart].get(country,0) + 1
                emoticon_propagation_hash[daysincestart]['total_in_replies'] += num_replies
            else:
                # First tweet on this day: seed with the raw emoticon count
                # (NOTE(review): inconsistent with the +1 increments above —
                # presumably intentional after the commented-out lines, but
                # worth confirming).
                emoticon_propagation_hash[daysincestart] = {'total':total_emoticon_count, 'total_in_replies':num_replies, country:total_emoticon_count, \
                                                            'total tweets':0, 'total emoticon tweets':0, 'total http emoticons':0}
    except Exception, e: 
        print "failed to list hit: ", e