예제 #1
0
def sentiment_kcount_redis2mysql(end_ts, during=Fifteenminutes):
    topics = get_keywords()

    global_emotions_data ={}
    for k, v in emotions_kv.iteritems():
        global_emotions_data[v] = [end_ts, r.zrange(TOP_KEYWORDS_RANK % v, 0, 50, desc=True, withscores=True)]
    print '%s global saved keywords: %s' % (time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())), len(global_emotions_data))
    save_global_keywords(global_emotions_data, during, TOP_KEYWORDS_LIMIT) 
    
    for topic in topics:
        topic_emotions_data = {}
        for k, v in emotions_kv.iteritems():
##            print topic, v
##            print r.zrange(KEYWORD_TOP_KEYWORDS_RANK % (topic, v), 0, 50, desc=True, withscores=True)
            topic_emotions_data[v] = [end_ts, r.zrange(KEYWORD_TOP_KEYWORDS_RANK % (topic, v), 0, 50, desc=True, withscores=True)]
        #print topic_emotions_data
        print '%s topic %s saved keywords: %s' % (time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())), topic, len(topic_emotions_data))
        save_topic_results('kcount', topic, topic_emotions_data, during)

    for fieldid, field in enumerate(DOMAIN_LIST):
        domain_emotions_data = {}
        for k, v in emotions_kv.iteritems():
            domain_emotions_data[v] = [end_ts, r.zrange(DOMAIN_TOP_KEYWORDS_RANK % (fieldid, v), 0, 50, desc=True, withscores=True)]
        print '%s domain %s saved keywords: %s' % (time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())), fieldid, len(domain_emotions_data))
        save_domain_kcount(int(fieldid), domain_emotions_data, during, TOP_KEYWORDS_LIMIT)       
예제 #2
0
def sentiment_weibo_redis2mysql(end_ts, during=Fifteenminutes):
    topics = get_keywords()

    global_emotions_data ={}
    for k, v in emotions_kv.iteritems():
        weiboids = r.zrange(TOP_WEIBO_REPOSTS_COUNT_RANK % v, 0, 50, desc=True, withscores=False)
        weibos = [pickle.loads(zlib.decompress(r.get(TOP_WEIBO_KEY % mid))) for mid in weiboids]
        global_emotions_data[v] = [end_ts, weibos]
    print '%s global saved weibos: %s' % (time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())), len(global_emotions_data))
    save_global_weibos(global_emotions_data, during, TOP_WEIBOS_LIMIT)    
    
    for topic in topics:
        topic_emotions_data = {}
        for k, v in emotions_kv.iteritems():
            weiboids = r.zrange(KEYWORD_TOP_WEIBO_REPOSTS_COUNT_RANK % (topic, v), 0, 50, desc=True, withscores=False)
            weibos = [pickle.loads(zlib.decompress(r.get(TOP_WEIBO_KEY % mid))) for mid in weiboids]
            topic_emotions_data[v] = [end_ts, weibos]
        #print topic_emotions_data
        print '%s topic %s saved weibos: %s' % (time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())), topic, len(topic_emotions_data))
        save_topic_results('weibos', topic, topic_emotions_data, during)
        
    for fieldid, field in enumerate(DOMAIN_LIST):
        domain_emotions_data = {}
        for k, v in emotions_kv.iteritems():
            weiboids = r.zrange(DOMAIN_TOP_WEIBO_REPOSTS_COUNT_RANK % (fieldid, v), 0, 50, desc=True, withscores=False)
            weibos = [pickle.loads(zlib.decompress(r.get(TOP_WEIBO_KEY % mid))) for mid in weiboids]    
            domain_emotions_data[v] = [end_ts, weibos]
        print '%s domain %s saved weibos: %s' % (time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())), fieldid, len(domain_emotions_data))
        save_domain_weibos(int(fieldid), domain_emotions_data, during, TOP_WEIBOS_LIMIT)
예제 #3
0
def sentiment_count_redis2mysql(end_ts, during=Fifteenminutes):
    keywords = get_keywords()
    
    global_emotions_data ={}
    for k, v in emotions_kv.iteritems():
        global_count = r.get(GLOBAL_SENTIMENT_COUNT % v)
        if not global_count:
            global_count = 0
        global_emotions_data[v] = [end_ts, global_count]
    print '%s global saved count: %s' % (time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())), len(global_emotions_data))
    save_global_count(global_emotions_data, during)

    for keyword in keywords:
        topic_emotions_data = {}
        for k, v in emotions_kv.iteritems():
            topic_count = r.get(KEYWORD_SENTIMENT_COUNT % (keyword, v))
            if not topic_count:
                topic_count = 0
            topic_emotions_data[v] = [end_ts, topic_count]
        print '%s topic %s saved count: %s' % (time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())), keyword, len(topic_emotions_data))
        save_topic_results('count', keyword, topic_emotions_data, during)

    for fieldid, field in enumerate(DOMAIN_LIST):
        domain_emotions_data = {}
        for k, v in emotions_kv.iteritems():
            domain_count = r.get(DOMAIN_SENTIMENT_COUNT % (fieldid, v))
            if not domain_count:
                domain_count = 0
            domain_emotions_data[v] = [end_ts, domain_count]
        print '%s domain %s saved: %s' % (time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())), fieldid, len(domain_emotions_data))
        save_domain_count(fieldid, domain_emotions_data, during)        
예제 #4
0
def quota_sentiment(topic, xapian_search_weibo, start_ts, end_ts):
    """Compute and save the 'sad' and 'angry' ratios for a topic.

    topic               -- comma-separated topic terms; every term must match
                           (they are combined under ``$and``).
    xapian_search_weibo -- search object; ``search(query=..., fields=...)``
                           must return ``(count, results_callable)``.
    start_ts, end_ts    -- exclusive bounds of the timestamp filter.

    For each sentiment value in ``emotions_kv`` the matching weibos are
    passed through ``get_nad`` and the first element of its result is used
    as that sentiment's count.  The ratios are the counts for sentiment 3
    ('sad') and sentiment 2 ('angry') divided by the total over all
    sentiments.

    Returns None.  Nothing is queried or saved when ``topic`` is empty.
    """
    if not topic:
        # Without terms there is nothing to query; previously this path hit
        # an unbound ``topics`` and raised NameError.
        return

    topics = topic.strip().split(',')
    query_dict = {
        'timestamp': {'$gt': start_ts, '$lt': end_ts},
        '$and': [{'topics': ctopic} for ctopic in topics],  # just test ---topics
    }

    sentiment_count_dict = {}
    allcount = 0
    for _, v in emotions_kv.iteritems():
        query_dict['sentiment'] = v
        scount, weibo_results = xapian_search_weibo.search(query=query_dict, fields=fields_list)
        scount_new = 0
        if scount:
            results_list = [[weibo_result['_id'], weibo_result['text'].encode('utf-8')]
                            for weibo_result in weibo_results()]
            # get_nad returns a pair; only its first element is used here.
            scount_new = get_nad(results_list)[0]
        sentiment_count_dict[v] = scount_new
        allcount += scount_new
    print('sentiment_count_dict: %s' % (sentiment_count_dict,))

    # With no matching weibos at all the ratios are reported as 0.0 instead of
    # dividing by zero.
    # NOTE(review): assumes emotions_kv contains the values 2 and 3 - confirm.
    if allcount:
        total = float(allcount)
        emotion_ratio_dict = {
            'sad': float(sentiment_count_dict[3]) / total,
            'angry': float(sentiment_count_dict[2]) / total,
        }
    else:
        emotion_ratio_dict = {'sad': 0.0, 'angry': 0.0}

    # The sentiment_quota table schema needs to be changed.
    save_sentiment_quota(topic, start_ts, end_ts, emotion_ratio_dict)
예제 #5
0
def sentiment_keywords(xapian_search_weibo, start_ts, over_ts, during=Hour, sort_field='reposts_count', save_fields=RESP_ITER_KEYS, w_limit=TOP_WEIBOS_LIMIT, k_limit=TOP_KEYWORDS_LIMIT):
    start_ts = int(start_ts)
    over_ts = int(over_ts)

    over_ts = ts2HourlyTime(over_ts, during)
    interval = (over_ts - start_ts) / during

    for i in range(interval, 0, -1):
    	emotions_data = {}
        emotions_weibo = {}
        
        begin_ts = over_ts - during * i
        end_ts = begin_ts + during
        print begin_ts, end_ts, ' starts calculate'

        query_dict = {
	        'timestamp': {'$gt': begin_ts, '$lt': end_ts},
	    }

        for k, v in emotions_kv.iteritems():
            query_dict['sentiment'] = v
            print xapian_search_weibo.search(query=query_dict, count_only=True)
            mset = xapian_search_weibo.search(query=query_dict, sort_by=[sort_field], max_offset=w_limit, mset_direct=True)
            top_ws = top_weibos(gen_mset_iter(xapian_search_weibo, mset, fields=save_fields), top=w_limit)
            keywords_with_count = top_keywords(gen_mset_iter(xapian_search_weibo, mset, fields=['terms']), top=k_limit)

            emotions_data[v] = [end_ts, keywords_with_count]
            emotions_weibo[v] = [end_ts, top_ws]

            print k, v, ', emotion keywords length: ', len(keywords_with_count), ', emotion weibos length: ', len(top_ws)

        print date, '%s %s saved emotions keywords and weibos' % (begin_ts, end_ts)
        save_count_results(emotions_data, during, TOP_KEYWORDS_LIMIT)
        save_weibos_results(emotions_weibo, during, TOP_WEIBOS_LIMIT)
예제 #6
0
def sentiment_count(xapian_search_weibo, start_ts, over_ts, during=Hour):
    start_ts = int(start_ts)
    over_ts = int(over_ts)

    over_ts = ts2HourlyTime(over_ts, during)
    interval = (over_ts - start_ts) / during

    for i in range(interval, 0, -1):
        emotions_data = {}

        begin_ts = over_ts - during * i
        end_ts = begin_ts + during
        print begin_ts, end_ts, ' starts calculate'

        query_dict = {
            'timestamp': {'$gt': begin_ts, '$lt': end_ts},
        }

        for k, v in emotions_kv.iteritems():
            query_dict['sentiment'] = v
            count = xapian_search_weibo.search(query=query_dict, count_only=True)
            emotions_data[v] = [end_ts, count]

        print date, 'saved: ', emotions_data 
        save_count_results(emotions_data, during)
예제 #7
0
def sentimentCronTopic(topic, xapian_search_weibo, start_ts, over_ts, sort_field=SORT_FIELD, save_fields=RESP_ITER_KEYS, during=Fifteenminutes, w_limit=TOP_WEIBOS_LIMIT, k_limit=TOP_KEYWORDS_LIMIT):
    if topic and topic != '':
        start_ts = int(start_ts)
        over_ts = int(over_ts)

        over_ts = ts2HourlyTime(over_ts, during)
        interval = (over_ts - start_ts) / during

        topics = topic.strip().split(',')

        for i in range(interval, 0, -1):
            emotions_count = {}
            emotions_kcount = {}
            emotions_weibo = {}

            begin_ts = over_ts - during * i
            end_ts = begin_ts + during
            print begin_ts, end_ts, 'topic %s starts calculate' % topic.encode('utf-8')

            query_dict = {
                'timestamp': {'$gt': begin_ts, '$lt': end_ts},
                '$or': []
            }

            for topic in topics:
                query_dict['$or'].append({'text': topic})

            for k, v in emotions_kv.iteritems():
                query_dict['sentiment'] = v
                scount = xapian_search_weibo.search(query=query_dict, count_only=True)
                mset = xapian_search_weibo.search(query=query_dict, sort_by=[sort_field], \
                                                  max_offset=w_limit, mset_direct=True)
                kcount = top_keywords(gen_mset_iter(xapian_search_weibo, mset, fields=['terms']), top=k_limit)
                top_ws = top_weibos(gen_mset_iter(xapian_search_weibo, mset, fields=save_fields), top=w_limit)

                emotions_count[v] = [end_ts, scount]
                emotions_kcount[v] = [end_ts, kcount]
                emotions_weibo[v] = [end_ts, top_ws]

                print k, v, ', emotions count: ', emotions_count, ', emotion keywords length: ', len(kcount), ', emotion weibos length: ', len(top_ws)

            print '%s %s saved emotions counts, keywords and weibos' % (begin_ts, end_ts)
            save_rt_results('count', topic, emotions_count, during)
            save_rt_results('kcount', topic, emotions_kcount, during, k_limit, w_limit)
            save_rt_results('weibos', topic, emotions_weibo, during, k_limit, w_limit)
예제 #8
0
def sentiment_field(domain, xapian_search_weibo, start_ts, over_ts, sort_field='reposts_count', save_fields=RESP_ITER_KEYS, during=Hour, w_limit=TOP_WEIBOS_LIMIT, k_limit=TOP_KEYWORDS_LIMIT):
    if domain_uids != []:
        start_ts = int(start_ts)
        over_ts = int(over_ts)

        over_ts = ts2HourlyTime(over_ts, during)
        interval = (over_ts - start_ts) / during

        for i in range(interval, 0, -1):
            emotions_count = {}
            emotions_kcount = {}
            emotions_weibo = {}

            begin_ts = over_ts - during * i
            end_ts = begin_ts + during
            print begin_ts, end_ts, 'domain %s starts calculate' % domain

            query_dict = {
                'timestamp': {'$gt': begin_ts, '$lt': end_ts},
                '$or': []
            }

            for uid in domain_uids:
                query_dict['$or'].append({'user': uid})

            for k, v in emotions_kv.iteritems():
                query_dict['sentiment'] = v
                scount = xapian_search_weibo.search(query=query_dict, count_only=True)
                mset = xapian_search_weibo.search(query=query_dict, sort_by=[sort_field], \
                                                  max_offset=w_limit, mset_direct=True)
                kcount = top_keywords(gen_mset_iter(xapian_search_weibo, mset, fields=['terms']), top=k_limit)
                top_ws = top_weibos(gen_mset_iter(xapian_search_weibo, mset, fields=save_fields), top=w_limit)

                emotions_count[v] = [end_ts, scount]
                emotions_kcount[v] = [end_ts, kcount]
                emotions_weibo[v] = [end_ts, top_ws]

                print k, v, ', emotions count: ', emotions_count, ', emotion keywords length: ', len(kcount), ', emotion weibos length: ', len(top_ws)

            print domain, date, ' %s %s saved emotions counts, keywords and weibos' % (begin_ts, end_ts)
            save_count_results(DOMAIN_LIST.index(domain), emotions_count, during)
            save_kcount_results(DOMAIN_LIST.index(domain), emotions_kcount, during, TOP_KEYWORDS_LIMIT)
            save_weibos_results(DOMAIN_LIST.index(domain), emotions_weibo, during, TOP_WEIBOS_LIMIT)
예제 #9
0
            for k, v in kcount_dict.iteritems():
                try:
                    kcounts_dict[k] += v
                except KeyError:
                    kcounts_dict[k] = v

    kcounts_dict = _top_keywords(kcounts_dict, top)

    return kcounts_dict


if __name__ == '__main__':
	end_ts = datetime2ts('2013-09-03')
	during = Day / 2

	for k, v in emotions_kv.iteritems():
            print v
	    keywords_with_count = search_keywords(end_ts, during, v)
            for key, value in keywords_with_count.iteritems():
                print key, value
                
        print 'topic'
        for k, v in emotions_kv.iteritems():
            print v
	    keywords_with_count = search_topic_keywords(end_ts, during, v, query='钓鱼岛')
            for key, value in keywords_with_count.iteritems():
                print key, value

        
        print 'domain'
        for k, v in emotions_kv.iteritems():          
def sentimentRealTimeTopic(query, start_ts, end_ts, save_fields=RESP_ITER_KEYS, during=Fifteenminutes, calc='all', w_limit=TOP_WEIBOS_LIMIT, k_limit=TOP_KEYWORDS_LIMIT, sort_field=SORT_FIELD):
    if query and query != '':

    	start_ts = int(start_ts)
    	over_ts = int(end_ts)
    	
        over_ts = ts2HourlyTime(over_ts, during)
        interval = (over_ts - start_ts) / during

        for i in range(interval, 0, -1):

    	    emotions_count = {}
    	    emotions_kcount = {}
    	    emotions_weibo = {}

    	    begin_ts = over_ts - during * i
            xapian_datestr = datetime.date.fromtimestamp(begin_ts).isoformat()
            xapian_search_weibo = getXapianWeiboByDate(xapian_datestr.replace('-', ''))
            if not xapian_search_weibo:
                return

            end_ts = begin_ts + during
            print begin_ts, end_ts, 'topic realtime %s starts calculate' % query.encode('utf-8')

            query_dict = {
                'timestamp': {'$gt': begin_ts, '$lt': end_ts},
                '$or': []
            }

            for term in query.strip().split(','):
                if term:
                    query_dict['$or'].append({'text': [term]})

            if calc == 'all':
                for k, v in emotions_kv.iteritems():
                    query_dict['sentiment'] = v
                    scount = xapian_search_weibo.search(query=query_dict, count_only=True)
                    mset = xapian_search_weibo.search(query=query_dict, sort_by=[sort_field], \
                                                      max_offset=w_limit, mset_direct=True)
                    kcount = top_keywords(gen_mset_iter(xapian_search_weibo, mset, fields=['terms']), top=k_limit)
                    top_ws = top_weibos(gen_mset_iter(xapian_search_weibo, mset, fields=save_fields), top=w_limit)
                    emotions_count[v] = [end_ts, scount]
                    emotions_kcount[v] = [end_ts, kcount]
                    emotions_weibo[v] = [end_ts, top_ws]

                    print k, v, ', emotions count: ', emotions_count, ', keywords length: ', len(kcount), ', weibos count: ', len(top_ws)
                
                print 'save emotions count, keywords and weibo'
                save_rt_results('count', query, emotions_count, during)
                save_rt_results('kcount', query, emotions_kcount, during, klimit=k_limit)
                save_rt_results('weibos', query, emotions_weibo, during, wlimit=w_limit)
                
            elif calc == 'count':
                for k, v in emotions_kv.iteritems():
                    query_dict['sentiment'] = v
                    scount = xapian_search_weibo.search(query=query_dict, count_only=True)
                    emotions_count[v] = [end_ts, scount]

                save_rt_results('count', query, emotions_count, during)
            
            else:
                if calc == 'kcount':
                    for k, v in emotions_kv.iteritems():
                        query_dict['sentiment'] = v
                        count, get_results = xapian_search_weibo.search(query=query_dict, fields=RESP_ITER_KEYS, \
                                                                            sort_by=[SORT_FIELD], max_offset=w_limit)
                        kcount = top_keywords(get_results, top=k_limit)
                        emotions_kcount[v] = [end_ts, kcount]
                    
                    save_rt_results('kcount', query, emotions_kcount, during, TOP_KEYWORDS_LIMIT)

                if calc == 'weibos':
                    for k, v in emotions_kv.iteritems():
                        query_dict['sentiment'] = v
                        count, get_results = xapian_search_weibo.search(query=query_dict, fields=RESP_ITER_KEYS, \
                                                                        sort_by=[sort_field], max_offset=w_limit)
                        top_ws = top_weibos(get_results, top=w_limit)
                        emotions_weibo[v] = [end_ts, top_ws]
                    
                    save_rt_results('weibos', query, emotions_weibo, during, TOP_WEIBOS_LIMIT)
예제 #11
0
def sentimentCronTopic(topic, weibos_list, start_ts, over_ts, sort_field=SORT_FIELD,
                       save_fields=RESP_ITER_KEYS, during=Fifteenminutes, w_limit=TOP_WEIBOS_LIMIT,
                       k_limit=TOP_KEYWORDS_LIMIT):
    import sys

    sys.path.append('../triple_classifier/')
    from triple_sentiment_classifier import triple_classifier

    start_ts = int(start_ts)
    over_ts = int(over_ts)

    over_ts = ts2HourlyTime(over_ts, during)
    interval = (over_ts - start_ts) / during
    logFile.write('start_ts: ' + str(start_ts) + '\r\n')
    logFile.write('over_ts: ' + str(over_ts) + '\r\n')
    logFile.write('during: ' + str(during) + '\r\n')
    logFile.write('interval: ' + str(interval) + '\r\n')

    for i in range(interval, 0, -1):
        begin_ts = over_ts - during * i
        end_ts = begin_ts + during

        emotions_count = {}
        emotions_kcount = {}
        emotions_weibo = {}
        emotions_rcount = {}
        weiboIDs = {}

        for k, v in emotions_kv.iteritems():
            zero = 0
            emotions_count[v] = [end_ts, 0]
            emotions_kcount[v] = [end_ts, '']
            emotions_weibo[v] = [end_ts, []]
            weiboIDs[v] = [end_ts, []]
            # print begin_ts, end_ts, 'topic %s starts calculate' % topic.encode('utf-8')
        slide = get_weibos_slide(weibos_list, begin_ts, end_ts)
        string = ['', '', '', '']

        emo0 = 0
        emo1 = 1

        for weibo in slide:
            sentiment, emo = triple_classifier(weibo)

            if sentiment != 0 and emo == 0:
                emo0 += 1
            elif sentiment != 0 and emo == 1:
                emo1 += 1
            # words = jieba.cut(weibo['text'], cut_all=False)
            weibo['sentiment'] = sentiment
            string[sentiment] = string[sentiment] + weibo['text']

            if sentiment != 0:
                emotions_count[sentiment][1] += 1
                #                kcount = emotions_kcount[sentiment][1]
                emotions_weibo[sentiment][1].append(weibo)
            else:
                zero += 1

        for k, v in emotions_kv.iteritems():
            #            sorted_kcount = sorted(emotions_kcount[v][1].iteritems(), key=lambda(k, v):v, reverse=False)
            #            sorted_kcount = { k: v for k, v in sorted_kcount[len(sorted_kcount)-k_limit:]}
            #            emotions_kcount[v][1] = sorted_kcount
            sorted_weibos = sorted(emotions_weibo[v][1], key=lambda i: i[sort_field], reverse=False)
            emotions_weibo[v][1] = sorted_weibos[len(sorted_weibos) - w_limit:]

            for item in emotions_weibo[v][1]:
                weiboIDs[v][1].append(item['key'])

            wordd = {}

            if string[v] != '':
                words = GetKeyWords(string[v].encode('utf-8'), 5, True)

            word_list = words.split('#')
            for word in word_list:
                token = word.split(r'/')
                if (len(token) == 3 and not (token[0] in STOPWORDS)):
                    #                    wordd.append({token[0]:token[2]})
                    wordd[token[0]] = token[2]
            emotions_kcount[v][1] = wordd

        print emo0, emo1
        print zero, emotions_count[1][1], emotions_count[2][1], emotions_count[3][1]
        save_rt_results('count', topic, emotions_count, during)
        save_rt_results('kcount', topic, emotions_kcount, during, k_limit, w_limit)
        save_rt_results('weibos', topic, weiboIDs, during, k_limit, w_limit)

        j = interval - i
        logFile.write('finish ' + str(j) + ' slide' + '\r\n')