# -*- coding: utf-8 -*-

import time
import zlib
import pickle
import datetime

# Project-level names used below -- the Redis client `r`, `emotions_kv`, the
# interval constants (Fifteenminutes, Hour, Day), the Redis key templates
# (*_RANK, *_COUNT, TOP_WEIBO_KEY), DOMAIN_LIST, STOPWORDS, and the various
# save_* / search helpers -- are assumed to be imported from this repo's
# config and utility modules.


def sentiment_kcount_redis2mysql(end_ts, during=Fifteenminutes):
    """Flush per-emotion top-keyword rankings from Redis into MySQL."""
    topics = get_keywords()

    # Global ranking: top 50 keywords per emotion.
    global_emotions_data = {}
    for k, v in emotions_kv.iteritems():
        global_emotions_data[v] = [end_ts, r.zrange(TOP_KEYWORDS_RANK % v, 0, 50, desc=True, withscores=True)]
    print '%s global saved keywords: %s' % (time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())), len(global_emotions_data))
    save_global_keywords(global_emotions_data, during, TOP_KEYWORDS_LIMIT)

    # Per-topic rankings.
    for topic in topics:
        topic_emotions_data = {}
        for k, v in emotions_kv.iteritems():
            topic_emotions_data[v] = [end_ts, r.zrange(KEYWORD_TOP_KEYWORDS_RANK % (topic, v), 0, 50, desc=True, withscores=True)]
        print '%s topic %s saved keywords: %s' % (time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())), topic, len(topic_emotions_data))
        save_topic_results('kcount', topic, topic_emotions_data, during)

    # Per-domain rankings.
    for fieldid, field in enumerate(DOMAIN_LIST):
        domain_emotions_data = {}
        for k, v in emotions_kv.iteritems():
            domain_emotions_data[v] = [end_ts, r.zrange(DOMAIN_TOP_KEYWORDS_RANK % (fieldid, v), 0, 50, desc=True, withscores=True)]
        print '%s domain %s saved keywords: %s' % (time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())), fieldid, len(domain_emotions_data))
        save_domain_kcount(fieldid, domain_emotions_data, during, TOP_KEYWORDS_LIMIT)
def sentiment_weibo_redis2mysql(end_ts, during=Fifteenminutes):
    """Flush per-emotion top-weibo rankings from Redis into MySQL.

    Each weibo body is stored in Redis as a zlib-compressed pickle keyed by mid.
    """
    topics = get_keywords()

    global_emotions_data = {}
    for k, v in emotions_kv.iteritems():
        weiboids = r.zrange(TOP_WEIBO_REPOSTS_COUNT_RANK % v, 0, 50, desc=True, withscores=False)
        weibos = [pickle.loads(zlib.decompress(r.get(TOP_WEIBO_KEY % mid))) for mid in weiboids]
        global_emotions_data[v] = [end_ts, weibos]
    print '%s global saved weibos: %s' % (time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())), len(global_emotions_data))
    save_global_weibos(global_emotions_data, during, TOP_WEIBOS_LIMIT)

    for topic in topics:
        topic_emotions_data = {}
        for k, v in emotions_kv.iteritems():
            weiboids = r.zrange(KEYWORD_TOP_WEIBO_REPOSTS_COUNT_RANK % (topic, v), 0, 50, desc=True, withscores=False)
            weibos = [pickle.loads(zlib.decompress(r.get(TOP_WEIBO_KEY % mid))) for mid in weiboids]
            topic_emotions_data[v] = [end_ts, weibos]
        print '%s topic %s saved weibos: %s' % (time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())), topic, len(topic_emotions_data))
        save_topic_results('weibos', topic, topic_emotions_data, during)

    for fieldid, field in enumerate(DOMAIN_LIST):
        domain_emotions_data = {}
        for k, v in emotions_kv.iteritems():
            weiboids = r.zrange(DOMAIN_TOP_WEIBO_REPOSTS_COUNT_RANK % (fieldid, v), 0, 50, desc=True, withscores=False)
            weibos = [pickle.loads(zlib.decompress(r.get(TOP_WEIBO_KEY % mid))) for mid in weiboids]
            domain_emotions_data[v] = [end_ts, weibos]
        print '%s domain %s saved weibos: %s' % (time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())), fieldid, len(domain_emotions_data))
        save_domain_weibos(fieldid, domain_emotions_data, during, TOP_WEIBOS_LIMIT)
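# For reference, a minimal sketch of the writer side that the loader above
# assumes: each top weibo is stored under TOP_WEIBO_KEY % mid as a
# zlib-compressed pickle. cache_top_weibo is hypothetical (not part of this
# repo); only the compress/pickle round-trip mirrors what the
# pickle.loads(zlib.decompress(...)) calls above expect.
def cache_top_weibo(mid, weibo):
    r.set(TOP_WEIBO_KEY % mid, zlib.compress(pickle.dumps(weibo)))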
def sentiment_count_redis2mysql(end_ts, during=Fifteenminutes):
    """Flush per-emotion sentiment counts from Redis into MySQL."""
    keywords = get_keywords()

    global_emotions_data = {}
    for k, v in emotions_kv.iteritems():
        # r.get returns None when the counter has not been written yet.
        global_count = r.get(GLOBAL_SENTIMENT_COUNT % v) or 0
        global_emotions_data[v] = [end_ts, global_count]
    print '%s global saved count: %s' % (time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())), len(global_emotions_data))
    save_global_count(global_emotions_data, during)

    for keyword in keywords:
        topic_emotions_data = {}
        for k, v in emotions_kv.iteritems():
            topic_count = r.get(KEYWORD_SENTIMENT_COUNT % (keyword, v)) or 0
            topic_emotions_data[v] = [end_ts, topic_count]
        print '%s topic %s saved count: %s' % (time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())), keyword, len(topic_emotions_data))
        save_topic_results('count', keyword, topic_emotions_data, during)

    for fieldid, field in enumerate(DOMAIN_LIST):
        domain_emotions_data = {}
        for k, v in emotions_kv.iteritems():
            domain_count = r.get(DOMAIN_SENTIMENT_COUNT % (fieldid, v)) or 0
            domain_emotions_data[v] = [end_ts, domain_count]
        print '%s domain %s saved count: %s' % (time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())), fieldid, len(domain_emotions_data))
        save_domain_count(fieldid, domain_emotions_data, during)
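# A minimal driver sketch, assuming the three movers above are fired from cron
# on each 15-minute boundary; the floor-to-boundary arithmetic here is an
# assumption about how end_ts is produced upstream.
def run_redis2mysql_cron():
    now = int(time.time())
    end_ts = now - now % Fifteenminutes  # align to the window boundary
    sentiment_count_redis2mysql(end_ts, during=Fifteenminutes)
    sentiment_kcount_redis2mysql(end_ts, during=Fifteenminutes)
    sentiment_weibo_redis2mysql(end_ts, during=Fifteenminutes)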
def quota_sentiment(topic, xapian_search_weibo, start_ts, end_ts):
    if not topic:
        return
    topics = topic.strip().split(',')
    query_dict = {
        'timestamp': {'$gt': start_ts, '$lt': end_ts},
        '$and': [],
    }
    for ctopic in topics:
        query_dict['$and'].append({'topics': ctopic})

    sentiment_count_dict = {}
    allcount = 0
    for k, v in emotions_kv.iteritems():
        query_dict['sentiment'] = v
        scount, weibo_results = xapian_search_weibo.search(query=query_dict, fields=fields_list)
        results_list = []
        if scount:
            for weibo_result in weibo_results():
                results_list.append([weibo_result['_id'], weibo_result['text'].encode('utf-8')])
            scount_new, data_wid = get_nad(results_list)
        else:
            scount_new = 0
        sentiment_count_dict[v] = scount_new
        allcount += scount_new
    print 'sentiment_count_dict:', sentiment_count_dict

    # Guard against division by zero when no weibos matched in the window.
    emotion_ratio_dict = {}
    if allcount:
        emotion_ratio_dict['sad'] = float(sentiment_count_dict[3]) / allcount
        emotion_ratio_dict['angry'] = float(sentiment_count_dict[2]) / allcount
    else:
        emotion_ratio_dict['sad'] = emotion_ratio_dict['angry'] = 0.0
    save_sentiment_quota(topic, start_ts, end_ts, emotion_ratio_dict)  # TODO: the sentiment_quota table schema needs updating
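# For illustration, the query the loop above assembles for topic 'A,B' and
# sentiment v (placeholder values; the DSL keys mirror the code above):
#
#     {'timestamp': {'$gt': start_ts, '$lt': end_ts},
#      '$and': [{'topics': 'A'}, {'topics': 'B'}],
#      'sentiment': v}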
def sentiment_keywords(xapian_search_weibo, start_ts, over_ts, during=Hour, sort_field='reposts_count',
                       save_fields=RESP_ITER_KEYS, w_limit=TOP_WEIBOS_LIMIT, k_limit=TOP_KEYWORDS_LIMIT):
    """Walk [start_ts, over_ts] in `during`-sized windows and save per-emotion
    top keywords and top weibos for each window."""
    start_ts = int(start_ts)
    over_ts = ts2HourlyTime(int(over_ts), during)
    interval = (over_ts - start_ts) / during
    for i in range(interval, 0, -1):
        emotions_data = {}
        emotions_weibo = {}
        begin_ts = over_ts - during * i
        end_ts = begin_ts + during
        print begin_ts, end_ts, ' starts calculate'
        query_dict = {
            'timestamp': {'$gt': begin_ts, '$lt': end_ts},
        }
        for k, v in emotions_kv.iteritems():
            query_dict['sentiment'] = v
            print xapian_search_weibo.search(query=query_dict, count_only=True)
            mset = xapian_search_weibo.search(query=query_dict, sort_by=[sort_field],
                                              max_offset=w_limit, mset_direct=True)
            top_ws = top_weibos(gen_mset_iter(xapian_search_weibo, mset, fields=save_fields), top=w_limit)
            keywords_with_count = top_keywords(gen_mset_iter(xapian_search_weibo, mset, fields=['terms']), top=k_limit)
            emotions_data[v] = [end_ts, keywords_with_count]
            emotions_weibo[v] = [end_ts, top_ws]
            print k, v, ', emotion keywords length: ', len(keywords_with_count), ', emotion weibos length: ', len(top_ws)

        print '%s %s saved emotions keywords and weibos' % (begin_ts, end_ts)
        save_count_results(emotions_data, during, TOP_KEYWORDS_LIMIT)
        save_weibos_results(emotions_weibo, during, TOP_WEIBOS_LIMIT)
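# top_keywords and top_weibos are imported helpers, not defined in this file.
# A minimal sketch of the contract top_keywords is used with above (fold each
# weibo's term->count dict into a global tally and keep the top entries); the
# 'terms' field layout is an assumption:
def top_keywords_sketch(weibo_iter, top=TOP_KEYWORDS_LIMIT):
    term_counts = {}
    for weibo in weibo_iter:
        for term, tf in weibo['terms'].iteritems():
            term_counts[term] = term_counts.get(term, 0) + tf
    return sorted(term_counts.iteritems(), key=lambda x: x[1], reverse=True)[:top]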
def sentiment_count(xapian_search_weibo, start_ts, over_ts, during=Hour):
    """Save per-emotion weibo counts for each `during`-sized window."""
    start_ts = int(start_ts)
    over_ts = ts2HourlyTime(int(over_ts), during)
    interval = (over_ts - start_ts) / during
    for i in range(interval, 0, -1):
        emotions_data = {}
        begin_ts = over_ts - during * i
        end_ts = begin_ts + during
        print begin_ts, end_ts, ' starts calculate'
        query_dict = {
            'timestamp': {'$gt': begin_ts, '$lt': end_ts},
        }
        for k, v in emotions_kv.iteritems():
            query_dict['sentiment'] = v
            count = xapian_search_weibo.search(query=query_dict, count_only=True)
            emotions_data[v] = [end_ts, count]
        print 'saved: ', emotions_data
        save_count_results(emotions_data, during)
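# The windowing arithmetic above, in isolation: over_ts is floored to a
# `during` boundary and the range is walked oldest-to-newest in `during`-sized
# slices. ts2HourlyTime is assumed to floor to the boundary (ts - ts % during).
def iter_windows(start_ts, over_ts, during=Hour):
    over_ts = over_ts - over_ts % during
    interval = (over_ts - start_ts) / during
    for i in range(interval, 0, -1):
        begin_ts = over_ts - during * i
        yield begin_ts, begin_ts + during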
def sentimentCronTopic(topic, xapian_search_weibo, start_ts, over_ts, sort_field=SORT_FIELD,
                       save_fields=RESP_ITER_KEYS, during=Fifteenminutes,
                       w_limit=TOP_WEIBOS_LIMIT, k_limit=TOP_KEYWORDS_LIMIT):
    if not topic:
        return
    start_ts = int(start_ts)
    over_ts = ts2HourlyTime(int(over_ts), during)
    interval = (over_ts - start_ts) / during
    topics = topic.strip().split(',')
    for i in range(interval, 0, -1):
        emotions_count = {}
        emotions_kcount = {}
        emotions_weibo = {}
        begin_ts = over_ts - during * i
        end_ts = begin_ts + during
        print begin_ts, end_ts, 'topic %s starts calculate' % topic.encode('utf-8')
        query_dict = {
            'timestamp': {'$gt': begin_ts, '$lt': end_ts},
            '$or': []
        }
        # NB: use a distinct loop variable; reusing `topic` here would clobber
        # the parameter that the save_rt_results calls below rely on.
        for ctopic in topics:
            query_dict['$or'].append({'text': ctopic})
        for k, v in emotions_kv.iteritems():
            query_dict['sentiment'] = v
            scount = xapian_search_weibo.search(query=query_dict, count_only=True)
            mset = xapian_search_weibo.search(query=query_dict, sort_by=[sort_field],
                                              max_offset=w_limit, mset_direct=True)
            kcount = top_keywords(gen_mset_iter(xapian_search_weibo, mset, fields=['terms']), top=k_limit)
            top_ws = top_weibos(gen_mset_iter(xapian_search_weibo, mset, fields=save_fields), top=w_limit)
            emotions_count[v] = [end_ts, scount]
            emotions_kcount[v] = [end_ts, kcount]
            emotions_weibo[v] = [end_ts, top_ws]
            print k, v, ', emotions count: ', emotions_count, ', emotion keywords length: ', len(kcount), ', emotion weibos length: ', len(top_ws)

        print '%s %s saved emotions counts, keywords and weibos' % (begin_ts, end_ts)
        save_rt_results('count', topic, emotions_count, during)
        save_rt_results('kcount', topic, emotions_kcount, during, k_limit, w_limit)
        save_rt_results('weibos', topic, emotions_weibo, during, k_limit, w_limit)
def sentiment_field(domain, xapian_search_weibo, start_ts, over_ts, sort_field='reposts_count',
                    save_fields=RESP_ITER_KEYS, during=Hour,
                    w_limit=TOP_WEIBOS_LIMIT, k_limit=TOP_KEYWORDS_LIMIT):
    # domain_uids -- the user ids belonging to this domain -- is assumed to be
    # populated at module level elsewhere in the repo before this runs.
    if domain_uids == []:
        return
    start_ts = int(start_ts)
    over_ts = ts2HourlyTime(int(over_ts), during)
    interval = (over_ts - start_ts) / during
    domain_id = DOMAIN_LIST.index(domain)
    for i in range(interval, 0, -1):
        emotions_count = {}
        emotions_kcount = {}
        emotions_weibo = {}
        begin_ts = over_ts - during * i
        end_ts = begin_ts + during
        print begin_ts, end_ts, 'domain %s starts calculate' % domain
        query_dict = {
            'timestamp': {'$gt': begin_ts, '$lt': end_ts},
            '$or': []
        }
        for uid in domain_uids:
            query_dict['$or'].append({'user': uid})
        for k, v in emotions_kv.iteritems():
            query_dict['sentiment'] = v
            scount = xapian_search_weibo.search(query=query_dict, count_only=True)
            mset = xapian_search_weibo.search(query=query_dict, sort_by=[sort_field],
                                              max_offset=w_limit, mset_direct=True)
            kcount = top_keywords(gen_mset_iter(xapian_search_weibo, mset, fields=['terms']), top=k_limit)
            top_ws = top_weibos(gen_mset_iter(xapian_search_weibo, mset, fields=save_fields), top=w_limit)
            emotions_count[v] = [end_ts, scount]
            emotions_kcount[v] = [end_ts, kcount]
            emotions_weibo[v] = [end_ts, top_ws]
            print k, v, ', emotions count: ', emotions_count, ', emotion keywords length: ', len(kcount), ', emotion weibos length: ', len(top_ws)

        print domain, '%s %s saved emotions counts, keywords and weibos' % (begin_ts, end_ts)
        save_count_results(domain_id, emotions_count, during)
        save_kcount_results(domain_id, emotions_kcount, during, TOP_KEYWORDS_LIMIT)
        save_weibos_results(domain_id, emotions_weibo, during, TOP_WEIBOS_LIMIT)
# The enclosing function definition for this merge step was truncated upstream;
# the reconstructed head below (name and arguments are assumptions) folds each
# window's {keyword: count} dict into a running total and returns the top entries.
def _merge_keyword_counts(kcount_dicts, top):
    kcounts_dict = {}
    for kcount_dict in kcount_dicts:
        for k, v in kcount_dict.iteritems():
            try:
                kcounts_dict[k] += v
            except KeyError:
                kcounts_dict[k] = v
    kcounts_dict = _top_keywords(kcounts_dict, top)
    return kcounts_dict


if __name__ == '__main__':
    end_ts = datetime2ts('2013-09-03')
    during = Day / 2

    for k, v in emotions_kv.iteritems():
        print v
        keywords_with_count = search_keywords(end_ts, during, v)
        for key, value in keywords_with_count.iteritems():
            print key, value

    print 'topic'
    for k, v in emotions_kv.iteritems():
        print v
        keywords_with_count = search_topic_keywords(end_ts, during, v, query='钓鱼岛')
        for key, value in keywords_with_count.iteritems():
            print key, value

    print 'domain'
    for k, v in emotions_kv.iteritems():
        # (the per-domain lookup that followed here was truncated upstream)
        pass
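# _top_keywords is referenced above but not defined in this fragment; a sketch
# consistent with how it is called (trim a {keyword: count} dict to its `top`
# highest-count entries) -- the real implementation may differ:
def _top_keywords_sketch(kcounts_dict, top):
    ranked = sorted(kcounts_dict.iteritems(), key=lambda x: x[1], reverse=True)
    return dict(ranked[:top])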
def sentimentRealTimeTopic(query, start_ts, end_ts, save_fields=RESP_ITER_KEYS, during=Fifteenminutes,
                           calc='all', w_limit=TOP_WEIBOS_LIMIT, k_limit=TOP_KEYWORDS_LIMIT,
                           sort_field=SORT_FIELD):
    if not query:
        return
    start_ts = int(start_ts)
    over_ts = ts2HourlyTime(int(end_ts), during)
    interval = (over_ts - start_ts) / during
    for i in range(interval, 0, -1):
        emotions_count = {}
        emotions_kcount = {}
        emotions_weibo = {}
        begin_ts = over_ts - during * i
        # Real-time mode reads the per-day xapian index for the window's date.
        xapian_datestr = datetime.date.fromtimestamp(begin_ts).isoformat()
        xapian_search_weibo = getXapianWeiboByDate(xapian_datestr.replace('-', ''))
        if not xapian_search_weibo:
            return
        end_ts = begin_ts + during
        print begin_ts, end_ts, 'topic realtime %s starts calculate' % query.encode('utf-8')
        query_dict = {
            'timestamp': {'$gt': begin_ts, '$lt': end_ts},
            '$or': []
        }
        for term in query.strip().split(','):
            if term:
                query_dict['$or'].append({'text': [term]})

        if calc == 'all':
            for k, v in emotions_kv.iteritems():
                query_dict['sentiment'] = v
                scount = xapian_search_weibo.search(query=query_dict, count_only=True)
                mset = xapian_search_weibo.search(query=query_dict, sort_by=[sort_field],
                                                  max_offset=w_limit, mset_direct=True)
                kcount = top_keywords(gen_mset_iter(xapian_search_weibo, mset, fields=['terms']), top=k_limit)
                top_ws = top_weibos(gen_mset_iter(xapian_search_weibo, mset, fields=save_fields), top=w_limit)
                emotions_count[v] = [end_ts, scount]
                emotions_kcount[v] = [end_ts, kcount]
                emotions_weibo[v] = [end_ts, top_ws]
                print k, v, ', emotions count: ', emotions_count, ', keywords length: ', len(kcount), ', weibos count: ', len(top_ws)

            print 'save emotions count, keywords and weibo'
            save_rt_results('count', query, emotions_count, during)
            save_rt_results('kcount', query, emotions_kcount, during, klimit=k_limit)
            save_rt_results('weibos', query, emotions_weibo, during, wlimit=w_limit)
        elif calc == 'count':
            for k, v in emotions_kv.iteritems():
                query_dict['sentiment'] = v
                scount = xapian_search_weibo.search(query=query_dict, count_only=True)
                emotions_count[v] = [end_ts, scount]
            save_rt_results('count', query, emotions_count, during)
        elif calc == 'kcount':
            for k, v in emotions_kv.iteritems():
                query_dict['sentiment'] = v
                count, get_results = xapian_search_weibo.search(query=query_dict, fields=RESP_ITER_KEYS,
                                                                sort_by=[sort_field], max_offset=w_limit)
                kcount = top_keywords(get_results, top=k_limit)
                emotions_kcount[v] = [end_ts, kcount]
            save_rt_results('kcount', query, emotions_kcount, during, TOP_KEYWORDS_LIMIT)
        elif calc == 'weibos':
            for k, v in emotions_kv.iteritems():
                query_dict['sentiment'] = v
                count, get_results = xapian_search_weibo.search(query=query_dict, fields=RESP_ITER_KEYS,
                                                                sort_by=[sort_field], max_offset=w_limit)
                top_ws = top_weibos(get_results, top=w_limit)
                emotions_weibo[v] = [end_ts, top_ws]
            save_rt_results('weibos', query, emotions_weibo, during, TOP_WEIBOS_LIMIT)
# Variant of sentimentCronTopic: classifies weibos in-process with
# triple_classifier instead of querying a pre-labelled xapian index.
def sentimentCronTopic(topic, weibos_list, start_ts, over_ts, sort_field=SORT_FIELD,
                       save_fields=RESP_ITER_KEYS, during=Fifteenminutes,
                       w_limit=TOP_WEIBOS_LIMIT, k_limit=TOP_KEYWORDS_LIMIT):
    import sys
    sys.path.append('../triple_classifier/')
    from triple_sentiment_classifier import triple_classifier

    start_ts = int(start_ts)
    over_ts = ts2HourlyTime(int(over_ts), during)
    interval = (over_ts - start_ts) / during
    # logFile is assumed to be an open module-level file handle.
    logFile.write('start_ts: ' + str(start_ts) + '\r\n')
    logFile.write('over_ts: ' + str(over_ts) + '\r\n')
    logFile.write('during: ' + str(during) + '\r\n')
    logFile.write('interval: ' + str(interval) + '\r\n')

    for i in range(interval, 0, -1):
        begin_ts = over_ts - during * i
        end_ts = begin_ts + during
        emotions_count = {}
        emotions_kcount = {}
        emotions_weibo = {}
        weiboIDs = {}
        for k, v in emotions_kv.iteritems():
            emotions_count[v] = [end_ts, 0]
            emotions_kcount[v] = [end_ts, '']
            emotions_weibo[v] = [end_ts, []]
            weiboIDs[v] = [end_ts, []]
        zero = 0  # weibos classified as neutral
        emo0 = 0
        emo1 = 0  # was initialised to 1 upstream, which skewed the counter

        slide = get_weibos_slide(weibos_list, begin_ts, end_ts)
        string = ['', '', '', '']  # concatenated text per sentiment label
        for weibo in slide:
            sentiment, emo = triple_classifier(weibo)
            if sentiment != 0 and emo == 0:
                emo0 += 1
            elif sentiment != 0 and emo == 1:
                emo1 += 1
            weibo['sentiment'] = sentiment
            string[sentiment] = string[sentiment] + weibo['text']
            if sentiment != 0:
                emotions_count[sentiment][1] += 1
                emotions_weibo[sentiment][1].append(weibo)
            else:
                zero += 1

        for k, v in emotions_kv.iteritems():
            # Keep the w_limit weibos with the highest sort_field value.
            sorted_weibos = sorted(emotions_weibo[v][1], key=lambda i: i[sort_field])
            emotions_weibo[v][1] = sorted_weibos[len(sorted_weibos) - w_limit:]
            for item in emotions_weibo[v][1]:
                weiboIDs[v][1].append(item['key'])
            wordd = {}
            if string[v] != '':
                # GetKeyWords appears to return '#'-joined 'word/flag/weight' triples.
                words = GetKeyWords(string[v].encode('utf-8'), 5, True)
                for word in words.split('#'):
                    token = word.split(r'/')
                    if len(token) == 3 and token[0] not in STOPWORDS:
                        wordd[token[0]] = token[2]
            emotions_kcount[v][1] = wordd

        print emo0, emo1
        print zero, emotions_count[1][1], emotions_count[2][1], emotions_count[3][1]
        save_rt_results('count', topic, emotions_count, during)
        save_rt_results('kcount', topic, emotions_kcount, during, k_limit, w_limit)
        save_rt_results('weibos', topic, weiboIDs, during, k_limit, w_limit)
        j = interval - i
        logFile.write('finish ' + str(j) + ' slide' + '\r\n')
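# get_weibos_slide is imported elsewhere in the repo; a sketch of the contract
# assumed above (select the weibos whose timestamp falls inside the window);
# the boundary handling is a guess:
def get_weibos_slide_sketch(weibos_list, begin_ts, end_ts):
    return [w for w in weibos_list if begin_ts < w['timestamp'] <= end_ts]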