def sentiment_keywords(xapian_search_weibo, start_ts, over_ts, during=Hour, sort_field='reposts_count',
                       save_fields=RESP_ITER_KEYS, w_limit=TOP_WEIBOS_LIMIT, k_limit=TOP_KEYWORDS_LIMIT):
    start_ts = int(start_ts)
    over_ts = int(over_ts)
    over_ts = ts2HourlyTime(over_ts, during)
    interval = (over_ts - start_ts) / during
    for i in range(interval, 0, -1):
        emotions_data = {}
        emotions_weibo = {}
        begin_ts = over_ts - during * i
        end_ts = begin_ts + during
        print begin_ts, end_ts, ' starts calculate'
        query_dict = {
            'timestamp': {'$gt': begin_ts, '$lt': end_ts},
        }
        for k, v in emotions_kv.iteritems():
            query_dict['sentiment'] = v
            print xapian_search_weibo.search(query=query_dict, count_only=True)
            mset = xapian_search_weibo.search(query=query_dict, sort_by=[sort_field],
                                              max_offset=w_limit, mset_direct=True)
            top_ws = top_weibos(gen_mset_iter(xapian_search_weibo, mset, fields=save_fields), top=w_limit)
            keywords_with_count = top_keywords(gen_mset_iter(xapian_search_weibo, mset, fields=['terms']), top=k_limit)
            emotions_data[v] = [end_ts, keywords_with_count]
            emotions_weibo[v] = [end_ts, top_ws]
            print k, v, ', emotion keywords length: ', len(keywords_with_count), ', emotion weibos length: ', len(top_ws)
        print '%s %s saved emotions keywords and weibos' % (begin_ts, end_ts)
        save_count_results(emotions_data, during, TOP_KEYWORDS_LIMIT)
        save_weibos_results(emotions_weibo, during, TOP_WEIBOS_LIMIT)

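# Every job in this module slices [start_ts, over_ts) into fixed-width
# windows with the same arithmetic. A minimal self-contained sketch of that
# shared pattern (iter_time_windows is a hypothetical helper, not part of
# the original code):
def iter_time_windows(start_ts, over_ts, during):
    # yield (begin_ts, end_ts) pairs, oldest window first
    interval = (over_ts - start_ts) / during
    for i in range(interval, 0, -1):
        begin_ts = over_ts - during * i
        yield begin_ts, begin_ts + during

# Example: 3600 seconds in 900-second steps gives four windows;
# list(iter_time_windows(1400000000, 1400003600, 900))[0] == (1400000000, 1400000900)
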
def propagateCronTopic(topic, start_ts, over_ts, sort_field=SORT_FIELD, save_fields=RESP_ITER_KEYS,
                       during=Fifteenminutes, w_limit=TOP_WEIBOS_LIMIT, k_limit=TOP_KEYWORDS_LIMIT):
    if topic and topic != '':
        start_ts = int(start_ts)
        over_ts = int(over_ts)
        over_ts = ts2HourlyTime(over_ts, during)
        interval = (over_ts - start_ts) / during
        for i in range(interval, 0, -1):  # compute once per 15-minute window
            begin_ts = over_ts - during * i
            end_ts = begin_ts + during
            mtype_count = compute_mtype_count(topic, begin_ts, end_ts)               # count per message type
            mtype_kcount = compute_mtype_keywords(topic, begin_ts, end_ts, k_limit)  # top-k keywords per message type
            mtype_weibo = compute_mtype_weibo(topic, begin_ts, end_ts, w_limit)      # weibos of the three message types
            save_results('count', topic, mtype_count, during)
            save_results('kcount', topic, mtype_kcount, during, k_limit, w_limit)
            save_results('weibo', topic, mtype_weibo, during, k_limit)

def propagateCronTopic(news_id, start_ts, over_ts, during=Fifteenminutes,
                       w_limit=TOP_MESSAGE_LIMIT, k_limit=TOP_KEYWORDS_LIMIT):
    start_ts = int(start_ts)
    over_ts = int(over_ts)
    over_ts = ts2HourlyTime(over_ts, during)
    interval = (over_ts - start_ts) / during
    for i in range(interval, 0, -1):  # compute once per 15-minute window
        begin_ts = over_ts - during * i
        end_ts = begin_ts + during
        # count of the five message types in this window
        mtype_count = compute_mtype_count(news_id, begin_ts, end_ts)
        # Disabled in this variant: per-type top-k keywords and the all-source
        # traceback (the original system sorted contents by repost count; its
        # effect was untested).
        # mtype_kcount = compute_mtype_keywords(news_id, begin_ts, end_ts, k_limit)
        # allsource_traceback = compute_allsource_traceback(news_id, begin_ts, end_ts, w_limit)
        save_results('count', news_id, mtype_count, during)

def sentiment_count(xapian_search_weibo, start_ts, over_ts, during=Hour):
    start_ts = int(start_ts)
    over_ts = int(over_ts)
    over_ts = ts2HourlyTime(over_ts, during)
    interval = (over_ts - start_ts) / during
    for i in range(interval, 0, -1):
        emotions_data = {}
        begin_ts = over_ts - during * i
        end_ts = begin_ts + during
        print begin_ts, end_ts, ' starts calculate'
        query_dict = {
            'timestamp': {'$gt': begin_ts, '$lt': end_ts},
        }
        for k, v in emotions_kv.iteritems():
            query_dict['sentiment'] = v
            count = xapian_search_weibo.search(query=query_dict, count_only=True)
            emotions_data[v] = [end_ts, count]
        print 'saved: ', emotions_data
        save_count_results(emotions_data, during)

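# ts2HourlyTime is imported by these jobs rather than defined here. From the
# way it is used (aligning over_ts to a window boundary before the integer
# division), it plausibly floors a timestamp to a multiple of the interval.
# A sketch under that assumption, named *_sketch so it cannot shadow the
# real import:
def ts2HourlyTime_sketch(ts, interval):
    ts = int(ts)
    return ts - ts % interval  # e.g. ts2HourlyTime_sketch(1400003723, 900) -> 1400003100
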
def propagateCronTopic(topic, start_ts, over_ts, during=Fifteenminutes,
                       w_limit=TOP_MESSAGE_LIMIT, k_limit=TOP_KEYWORDS_LIMIT):
    if topic and topic != '':
        start_ts = int(start_ts)
        over_ts = int(over_ts)
        over_ts = ts2HourlyTime(over_ts, during)
        interval = (over_ts - start_ts) / during
        for i in range(interval, 0, -1):  # compute once per 15-minute window
            begin_ts = over_ts - during * i
            end_ts = begin_ts + during
            print begin_ts, end_ts
            # count of the five message types in this window
            mtype_count = compute_mtype_count(topic, begin_ts, end_ts)
            # mtype_kcount = compute_mtype_keywords(topic, begin_ts, end_ts, k_limit)  # disabled: top-k keywords
            allsource_traceback = compute_allsource_traceback(topic, begin_ts, end_ts, w_limit)
            save_results('count', topic, mtype_count, during)

def sentimentCronTopic(topic, xapian_search_weibo, start_ts, over_ts, sort_field=SORT_FIELD,
                       save_fields=RESP_ITER_KEYS, during=Fifteenminutes,
                       w_limit=TOP_WEIBOS_LIMIT, k_limit=TOP_KEYWORDS_LIMIT):
    if topic and topic != '':
        start_ts = int(start_ts)
        over_ts = int(over_ts)
        over_ts = ts2HourlyTime(over_ts, during)
        interval = (over_ts - start_ts) / during
        for i in range(interval, 0, -1):
            emotions_kcount = {}
            emotions_count = {}
            emotions_weibo = {}
            begin_ts = over_ts - during * i
            end_ts = begin_ts + during
            print begin_ts, end_ts, 'topic %s starts calculate' % topic.encode('utf-8')
            query_dict = {
                'timestamp': {'$gt': begin_ts, '$lt': end_ts},
                '$and': [
                    {'$or': [{'message_type': 1}, {'message_type': 3}]},
                ]
            }
            for k, v in emotions_kv.iteritems():
                query_dict['sentiment'] = v
                count, results = xapian_search_weibo.search(query=query_dict, fields=save_fields)
                mset = xapian_search_weibo.search(query=query_dict, sort_by=[sort_field],
                                                  max_offset=w_limit, mset_direct=True)
                kcount = top_keywords(gen_mset_iter(xapian_search_weibo, mset, fields=['terms']), top=k_limit)
                top_ws = top_weibos(results, top=w_limit)
                emotions_count[v] = [end_ts, count]
                emotions_kcount[v] = [end_ts, kcount]
                emotions_weibo[v] = [end_ts, top_ws]
            save_rt_results('count', topic, emotions_count, during)
            save_rt_results('kcount', topic, emotions_kcount, during, k_limit, w_limit)
            save_rt_results('weibos', topic, emotions_weibo, during, k_limit, w_limit)

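# For reference, the query that reaches xapian_search_weibo on each pass of
# the sentiment loop above looks like this (timestamps and the sentiment
# label 1 are hypothetical; emotions_kv maps sentiment names to such labels):
example_query_dict = {
    'timestamp': {'$gt': 1400000000, '$lt': 1400000900},
    '$and': [{'$or': [{'message_type': 1}, {'message_type': 3}]}],
    'sentiment': 1,  # overwritten in place on every loop iteration
}
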
def cul_key_weibo_time_count(task_source, task_id, news_topics, start_ts, over_ts, during):
    if S_TYPE == 'test':
        if task_source == 'weibo':
            start_ts = datetime2ts(S_DATE) - 5 * 24 * 3600
            over_ts = datetime2ts(S_DATE)
        else:
            start_ts = datetime2ts(S_DATE_FB) - 5 * 24 * 3600
            over_ts = datetime2ts(S_DATE_FB)
    key_weibo_time_count = {}
    during = Day  # forced to daily windows, overriding the `during` argument
    # news_topics maps cluster id -> keyword list, e.g.
    # {u'd2e97cf7-fc43-4982-8405-2d215b3e1fea': [u'知识', u'广州', u'选手']}
    for clusterid, keywords in news_topics.iteritems():
        time_dict = {}  # one time series per cluster
        if len(keywords) > 0:
            start_ts = int(start_ts)
            over_ts = int(over_ts)
            over_ts = ts2HourlyTime(over_ts, during)
            interval = (over_ts - start_ts) / during
            for i in range(interval, 0, -1):  # one window per `during` seconds
                begin_ts = over_ts - during * i
                end_ts = begin_ts + during
                must_list = []
                must_list.append({'range': {'timestamp': {'gte': begin_ts, 'lt': end_ts}}})
                temp = []
                for word in keywords:
                    sentence = {'wildcard': {'keywords_string': '*' + word + '*'}}
                    temp.append(sentence)
                must_list.append({'bool': {'should': temp}})
                query_body = {'query': {'bool': {'must': must_list}}}
                key_weibo = es_intel.search(index=task_id, doc_type=task_source, body=query_body)
                key_weibo_count = key_weibo['hits']['total']  # cluster size within this window
                time_dict[ts2datetime(end_ts)] = key_weibo_count
            key_weibo_time_count[clusterid] = sorted(time_dict.items(), key=lambda x: x[0])
    return key_weibo_time_count

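# The Elasticsearch query built above combines a time-range filter with an
# OR over per-keyword wildcard matches. For a hypothetical cluster with
# keywords [u'知识', u'广州'], the body sent to es_intel.search would be:
example_es_body = {
    'query': {
        'bool': {
            'must': [
                {'range': {'timestamp': {'gte': 1400000000, 'lt': 1400086400}}},
                {'bool': {'should': [
                    {'wildcard': {'keywords_string': u'*知识*'}},
                    {'wildcard': {'keywords_string': u'*广州*'}},
                ]}},
            ]
        }
    }
}
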
def sentimentCronTopic(topic, xapian_search_weibo, start_ts, over_ts, sort_field=SORT_FIELD,
                       save_fields=RESP_ITER_KEYS, during=Fifteenminutes,
                       w_limit=TOP_WEIBOS_LIMIT, k_limit=TOP_KEYWORDS_LIMIT):
    if topic and topic != '':
        start_ts = int(start_ts)
        over_ts = int(over_ts)
        over_ts = ts2HourlyTime(over_ts, during)
        interval = (over_ts - start_ts) / during
        topics = topic.strip().split(',')
        for i in range(interval, 0, -1):
            emotions_count = {}
            emotions_kcount = {}
            emotions_weibo = {}
            begin_ts = over_ts - during * i
            end_ts = begin_ts + during
            print begin_ts, end_ts, 'topic %s starts calculate' % topic.encode('utf-8')
            query_dict = {
                'timestamp': {'$gt': begin_ts, '$lt': end_ts},
                '$or': []
            }
            for term in topics:  # loop variable renamed from `topic` to avoid shadowing the argument
                query_dict['$or'].append({'text': term})
            for k, v in emotions_kv.iteritems():
                query_dict['sentiment'] = v
                scount = xapian_search_weibo.search(query=query_dict, count_only=True)
                mset = xapian_search_weibo.search(query=query_dict, sort_by=[sort_field],
                                                  max_offset=w_limit, mset_direct=True)
                kcount = top_keywords(gen_mset_iter(xapian_search_weibo, mset, fields=['terms']), top=k_limit)
                top_ws = top_weibos(gen_mset_iter(xapian_search_weibo, mset, fields=save_fields), top=w_limit)
                emotions_count[v] = [end_ts, scount]
                emotions_kcount[v] = [end_ts, kcount]
                emotions_weibo[v] = [end_ts, top_ws]
                print k, v, ', emotions count: ', emotions_count, ', emotion keywords length: ', len(kcount), ', emotion weibos length: ', len(top_ws)
            print '%s %s saved emotions counts, keywords and weibos' % (begin_ts, end_ts)
            save_rt_results('count', topic, emotions_count, during)
            save_rt_results('kcount', topic, emotions_kcount, during, k_limit, w_limit)
            save_rt_results('weibos', topic, emotions_weibo, during, k_limit, w_limit)

def sentimentTopic_new(topic, start_ts, over_ts, sort_field=SORT_FIELD, save_fields=RESP_ITER_KEYS,
                       during=Fifteenminutes, w_limit=TOP_WEIBOS_LIMIT, k_limit=TOP_KEYWORDS_LIMIT):
    if topic and topic != '':
        start_ts = int(start_ts)
        over_ts = int(over_ts)
        over_ts = ts2HourlyTime(over_ts, during)
        interval = (over_ts - start_ts) / during
        sentiment_results = {}
        sentiment_results['during'] = during
        sentiment_results['count'] = {}
        sentiment_results['geo_count'] = {}
        for i in range(interval, 0, -1):  # one window per 900 seconds
            begin_ts = over_ts - during * i
            end_ts = begin_ts + during
            print begin_ts, end_ts
            # count of each sentiment class within this window
            emotions_count = compute_sentiment_count(topic, begin_ts, end_ts, during)
            sentiment_results['count'][end_ts] = emotions_count
            # Disabled in this variant: per-sentiment top-k keywords and weibos.
            # emotions_kcount = compute_sentiment_keywords(topic, begin_ts, end_ts, k_limit, w_limit, during)
            # emotions_weibo, emotions_geo_count = compute_sentiment_weibo(topic, begin_ts, end_ts, k_limit, w_limit, during)
        emotions_geo_count = compute_sentiment_weibo(topic, start_ts, over_ts, k_limit, w_limit, during)
        sentiment_results['geo_count'] = emotions_geo_count
        sentiment_results = json.dumps(sentiment_results)
        save_rt_results_es('sentiment_results', topic, sentiment_results, during)

def propagateCronNewsTopic(topic, mongo_collection, start_ts, over_ts, sort_field=SORT_FIELD,
                           during=Fifteenminutes, n_limit=TOP_NEWS_LIMIT, k_limit=TOP_KEYWORDS_LIMIT):
    if topic and topic != '':
        start_ts = int(start_ts)
        over_ts = int(over_ts)
        over_ts = ts2HourlyTime(over_ts, during)
        interval = (over_ts - start_ts) / during
        for i in range(interval, 0, -1):
            begin_ts = over_ts - during * i
            end_ts = begin_ts + during
            news = []
            news_count = []
            news_kcount = []
            query_dict = {
                'timestamp': {'$gte': begin_ts, '$lt': end_ts}
            }
            fields_dict = get_filter_dict()
            results_list = mongo_collection.find(query_dict, fields_dict)
            # Partition results: a record with both a source and a transmit name
            # is a forward; one with only a source name is an original post.
            origin_forward_dict = {'origin': [], 'forward': []}
            for weibo_result in results_list:
                if weibo_result['source_from_name'] and weibo_result['transmit_name']:
                    origin_forward_dict['forward'].append(weibo_result)
                elif weibo_result['source_from_name']:
                    origin_forward_dict['origin'].append(weibo_result)
                else:
                    continue
            for k, v_list in origin_forward_dict.iteritems():
                mtype = mtype_kv_news[k]
                count, kcount, top_ns = top_news_keywords(v_list, news_top=n_limit, keywords_top=k_limit)
                news = [end_ts, top_ns]
                news_count = [end_ts, count]
                news_kcount = [end_ts, kcount]
                save_ws_news_results(topic, mtype, news, during, n_limit)
                save_pc_news_results(topic, mtype, news_count, during)
                save_kc_news_results(topic, mtype, news_kcount, during, k_limit)

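# A minimal runnable illustration of the origin/forward partition used above
# (field names follow the records in mongo_collection; the values are made up):
records = [
    {'source_from_name': 'Daily A', 'transmit_name': 'Portal B'},  # forwarded
    {'source_from_name': 'Daily A', 'transmit_name': ''},          # original
    {'source_from_name': '', 'transmit_name': ''},                 # dropped
]
partition = {'origin': [], 'forward': []}
for r in records:
    if r['source_from_name'] and r['transmit_name']:
        partition['forward'].append(r)
    elif r['source_from_name']:
        partition['origin'].append(r)
print len(partition['origin']), len(partition['forward'])  # -> 1 1
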
def sentiment_field(domain, xapian_search_weibo, start_ts, over_ts, sort_field='reposts_count',
                    save_fields=RESP_ITER_KEYS, during=Hour, w_limit=TOP_WEIBOS_LIMIT, k_limit=TOP_KEYWORDS_LIMIT):
    # domain_uids: module-level list of the uids that belong to `domain`
    if domain_uids != []:
        start_ts = int(start_ts)
        over_ts = int(over_ts)
        over_ts = ts2HourlyTime(over_ts, during)
        interval = (over_ts - start_ts) / during
        for i in range(interval, 0, -1):
            emotions_count = {}
            emotions_kcount = {}
            emotions_weibo = {}
            begin_ts = over_ts - during * i
            end_ts = begin_ts + during
            print begin_ts, end_ts, 'domain %s starts calculate' % domain
            query_dict = {
                'timestamp': {'$gt': begin_ts, '$lt': end_ts},
                '$or': []
            }
            for uid in domain_uids:
                query_dict['$or'].append({'user': uid})
            for k, v in emotions_kv.iteritems():
                query_dict['sentiment'] = v
                scount = xapian_search_weibo.search(query=query_dict, count_only=True)
                mset = xapian_search_weibo.search(query=query_dict, sort_by=[sort_field],
                                                  max_offset=w_limit, mset_direct=True)
                kcount = top_keywords(gen_mset_iter(xapian_search_weibo, mset, fields=['terms']), top=k_limit)
                top_ws = top_weibos(gen_mset_iter(xapian_search_weibo, mset, fields=save_fields), top=w_limit)
                emotions_count[v] = [end_ts, scount]
                emotions_kcount[v] = [end_ts, kcount]
                emotions_weibo[v] = [end_ts, top_ws]
                print k, v, ', emotions count: ', emotions_count, ', emotion keywords length: ', len(kcount), ', emotion weibos length: ', len(top_ws)
            print domain, '%s %s saved emotions counts, keywords and weibos' % (begin_ts, end_ts)
            save_count_results(DOMAIN_LIST.index(domain), emotions_count, during)
            save_kcount_results(DOMAIN_LIST.index(domain), emotions_kcount, during, TOP_KEYWORDS_LIMIT)
            save_weibos_results(DOMAIN_LIST.index(domain), emotions_weibo, during, TOP_WEIBOS_LIMIT)

def propagateCronTopic(topic, xapian_search_weibo, start_ts, over_ts, sort_field=SORT_FIELD,
                       save_fields=RESP_ITER_KEYS, during=Fifteenminutes,
                       w_limit=TOP_WEIBOS_LIMIT, k_limit=TOP_KEYWORDS_LIMIT):
    if topic and topic != '':
        start_ts = int(start_ts)
        over_ts = int(over_ts)
        over_ts = ts2HourlyTime(over_ts, during)
        interval = (over_ts - start_ts) / during
        for i in range(interval, 0, -1):
            begin_ts = over_ts - during * i
            end_ts = begin_ts + during
            print begin_ts, end_ts, 'topic %s starts calculate' % topic.encode('utf-8')
            mtype_count = {}
            mtype_kcount = {}  # mtype_kcount = {mtype: [terms]}
            mtype_weibo = {}   # mtype_weibo = {mtype: weibo}
            query_dict = {'timestamp': {'$gt': begin_ts, '$lt': end_ts}}
            for k, v in mtype_kv.iteritems():
                query_dict['message_type'] = v
                count, results = xapian_search_weibo.search(query=query_dict, fields=fields_list)
                mset = xapian_search_weibo.search(query=query_dict, sort_by=[sort_field],
                                                  max_offset=w_limit, mset_direct=True)
                kcount = top_keywords(gen_mset_iter(xapian_search_weibo, mset, fields=['terms']), top=k_limit)
                top_ws = top_weibos(results, top=w_limit)
                mtype_count[v] = [end_ts, count]
                mtype_kcount[v] = [end_ts, kcount]
                mtype_weibo[v] = [end_ts, top_ws]
            save_pc_results(topic, mtype_count, during)
            save_kc_results(topic, mtype_kcount, during, k_limit)
            save_ws_results(topic, mtype_weibo, during, w_limit)

def propagateCronTopic(topic, start_ts, over_ts, sort_field=SORT_FIELD, save_fields=RESP_ITER_KEYS,
                       during=Fifteenminutes, w_limit=TOP_WEIBOS_LIMIT, k_limit=TOP_KEYWORDS_LIMIT):
    if topic and topic != '':
        start_ts = int(start_ts)
        over_ts = int(over_ts)
        over_ts = ts2HourlyTime(over_ts, during)
        interval = (over_ts - start_ts) / during
        time_results = []
        for i in range(interval, 0, -1):  # compute once per 15-minute window
            begin_ts = over_ts - during * i
            end_ts = begin_ts + during
            # count per message type within this window
            mtype_count = compute_mtype_count(topic, begin_ts, end_ts, during)
            time_results.append([end_ts, mtype_count])
        save_results_es('time_results', topic, time_results, during)

def cul_key_weibo_time_count(topic, news_topics, start_ts, over_ts, during):
    key_weibo_time_count = {}
    # news_topics maps cluster id -> keyword list, e.g.
    # {u'd2e97cf7-fc43-4982-8405-2d215b3e1fea': [u'知识', u'广州', u'选手']}
    for clusterid, keywords in news_topics.iteritems():
        time_dict = {}  # one time series per cluster
        start_ts = int(start_ts)
        over_ts = int(over_ts)
        over_ts = ts2HourlyTime(over_ts, during)
        interval = (over_ts - start_ts) / during
        for i in range(interval, 0, -1):  # one window per `during` seconds (900 for Fifteenminutes)
            begin_ts = over_ts - during * i
            end_ts = begin_ts + during
            must_list = []
            must_list.append({'range': {'timestamp': {'gte': begin_ts, 'lt': end_ts}}})
            temp = []
            for word in keywords:
                sentence = {"wildcard": {"keywords_string": "*" + word + "*"}}
                temp.append(sentence)
            must_list.append({'bool': {'should': temp}})
            query_body = {"query": {"bool": {"must": must_list}}}
            key_weibo = weibo_es.search(index=topic, doc_type=weibo_index_type, body=query_body)
            key_weibo_count = key_weibo['hits']['total']  # cluster size within this window
            time_dict[end_ts] = key_weibo_count
        key_weibo_time_count[clusterid] = time_dict
    return key_weibo_time_count

def sentimentTopic(topic, start_ts, over_ts, sort_field=SORT_FIELD, save_fields=RESP_ITER_KEYS,
                   during=Fifteenminutes, w_limit=TOP_WEIBOS_LIMIT, k_limit=TOP_KEYWORDS_LIMIT):
    if topic and topic != '':
        start_ts = int(start_ts)
        over_ts = int(over_ts)
        over_ts = ts2HourlyTime(over_ts, during)
        interval = (over_ts - start_ts) / during
        for i in range(interval, 0, -1):  # one window per 900 seconds
            begin_ts = over_ts - during * i
            end_ts = begin_ts + during
            print begin_ts, end_ts
            # count of each sentiment class within this window
            emotions_count = compute_sentiment_count(topic, begin_ts, end_ts, during)
            # emotions_kcount = compute_sentiment_keywords(topic, begin_ts, end_ts, k_limit, w_limit, during)  # disabled: top-k keywords per sentiment
            # top weibos per sentiment class
            emotions_weibo = compute_sentiment_weibo(topic, begin_ts, end_ts, k_limit, w_limit, during)

def cityCronTopicNews(topic, mongo_collection, start_ts, over_ts, during=Fifteenminutes, n_limit=TOP_NEWS_LIMIT):
    if topic and topic != '':
        start_ts = int(start_ts)
        over_ts = int(over_ts)
        over_ts = ts2HourlyTime(over_ts, during)
        interval = (over_ts - start_ts) / during
        topics = topic.strip().split(',')
        for i in range(interval, 0, -1):
            ccount_dict = {}
            for k, v in mtype_kv_news.iteritems():
                ccount_dict[k] = {}
            begin_ts = over_ts - during * i
            end_ts = begin_ts + during
            first_timestamp = end_ts
            first_item = {}
            news = []
            query_dict = {
                'timestamp': {'$gt': begin_ts, '$lt': end_ts},
            }
            fields_dict = get_filter_dict()
            results_list = mongo_collection.find(query_dict, fields_dict).sort([(SORT_FIELD, 1)])
            for weibo_result in results_list:
                if weibo_result['timestamp'] <= first_timestamp:
                    first_timestamp = weibo_result['timestamp']
                    first_item = weibo_result
                if weibo_result['source_from_name'] and weibo_result['transmit_name']:
                    source = media2city(weibo_result['source_from_name'])
                    if source:
                        try:
                            ccount_dict['forward'][source] += 1
                        except KeyError:
                            ccount_dict['forward'][source] = 1
                elif weibo_result['source_from_name']:
                    source = media2city(weibo_result['source_from_name'])
                    if source:
                        try:
                            ccount_dict['origin'][source] += 1
                        except KeyError:
                            ccount_dict['origin'][source] = 1
                else:
                    continue
                weibo_result['source_from_area'] = source  # attach the resolved area field
                news.append(weibo_result)
            for k, v in mtype_kv_news.iteritems():
                results = [end_ts, ccount_dict[k]]
                save_rt_results(topic, v, results, during, first_item)
            sorted_news = sorted(news, key=lambda k: k[SORT_FIELD], reverse=True)
            sorted_news = sorted_news[:n_limit]
            save_ns_results(topic, end_ts, during, n_limit, sorted_news)

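# Design note: the try/except KeyError counting above is a common Python 2
# idiom; collections.defaultdict expresses the same accumulation more
# directly. A small equivalent sketch:
from collections import defaultdict

city_counter = defaultdict(int)
for name in ['Beijing', 'Guangzhou', 'Beijing']:
    city_counter[name] += 1   # no KeyError handling needed
print dict(city_counter)      # -> {'Beijing': 2, 'Guangzhou': 1} (order may vary)
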
def cityCronTopic(topic, xapian_search_weibo, start_ts, over_ts, during=Fifteenminutes, n_limit=TOP_WEIBOS_LIMIT):
    if topic and topic != '':
        start_ts = int(start_ts)
        over_ts = int(over_ts)
        over_ts = ts2HourlyTime(over_ts, during)
        interval = (over_ts - start_ts) / during
        topics = topic.strip().split(',')
        for i in range(interval, 0, -1):
            mtype_ccount = {}  # mtype is the message_type; ccount is {city: count}
            begin_ts = over_ts - during * i
            end_ts = begin_ts + during
            weibos = []
            query_dict = {
                'timestamp': {'$gt': begin_ts, '$lt': end_ts},
            }
            for k, v in mtype_kv.iteritems():
                ccount = {}
                first_timestamp = end_ts
                first_item = {}
                query_dict['message_type'] = v
                # weibo_results: match set for this window, topic and message_type
                count, weibo_results = xapian_search_weibo.search(query=query_dict, fields=fields_list)
                for weibo_result in weibo_results():
                    if weibo_result['timestamp'] <= first_timestamp:
                        first_timestamp = weibo_result['timestamp']
                        first_item = weibo_result
                    try:
                        if len(weibo_result['geo'].split('.')) == 4:
                            # four dot-separated fields: treat the geo value as an IP address
                            city = IP2city(weibo_result['geo'])
                        else:
                            city = geo2city(weibo_result['geo'])
                        if city:
                            try:
                                ccount[city] += 1
                            except KeyError:
                                ccount[city] = 1
                        else:
                            continue
                    except:
                        continue
                    if v == 1 or v == 3:  # keep only originals and forwards
                        weibos.append(weibo_result)
                mtype_ccount[v] = [end_ts, ccount]
            save_rt_results(topic, mtype_ccount, during, first_item)
            sorted_weibos = sorted(weibos, key=lambda k: k[SORT_FIELD], reverse=True)
            sorted_weibos = sorted_weibos[:n_limit]
            save_ws_results(topic, end_ts, during, n_limit, sorted_weibos)

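# The geo field can hold either a dotted IPv4 string or a structured
# location string; the split('.') check above dispatches between the two.
# A hypothetical illustration (IP2city and geo2city are the resolvers used
# above, defined elsewhere in the project):
def resolve_city(geo):
    if len(geo.split('.')) == 4:
        return IP2city(geo)   # e.g. '123.125.71.38' -> city via IP lookup
    return geo2city(geo)      # e.g. u'广东 广州' -> city via gazetteer
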
def sentimentRealTimeTopic(query, start_ts, end_ts, save_fields=RESP_ITER_KEYS, during=Fifteenminutes,
                           calc='all', w_limit=TOP_WEIBOS_LIMIT, k_limit=TOP_KEYWORDS_LIMIT, sort_field=SORT_FIELD):
    if query and query != '':
        start_ts = int(start_ts)
        over_ts = int(end_ts)
        over_ts = ts2HourlyTime(over_ts, during)
        interval = (over_ts - start_ts) / during
        for i in range(interval, 0, -1):
            emotions_count = {}
            emotions_kcount = {}
            emotions_weibo = {}
            begin_ts = over_ts - during * i
            # pick the daily xapian index that covers this window
            xapian_datestr = datetime.date.fromtimestamp(begin_ts).isoformat()
            xapian_search_weibo = getXapianWeiboByDate(xapian_datestr.replace('-', ''))
            if not xapian_search_weibo:
                return
            end_ts = begin_ts + during
            print begin_ts, end_ts, 'topic realtime %s starts calculate' % query.encode('utf-8')
            query_dict = {
                'timestamp': {'$gt': begin_ts, '$lt': end_ts},
                '$or': []
            }
            for term in query.strip().split(','):
                if term:
                    query_dict['$or'].append({'text': [term]})
            if calc == 'all':
                for k, v in emotions_kv.iteritems():
                    query_dict['sentiment'] = v
                    scount = xapian_search_weibo.search(query=query_dict, count_only=True)
                    mset = xapian_search_weibo.search(query=query_dict, sort_by=[sort_field],
                                                      max_offset=w_limit, mset_direct=True)
                    kcount = top_keywords(gen_mset_iter(xapian_search_weibo, mset, fields=['terms']), top=k_limit)
                    top_ws = top_weibos(gen_mset_iter(xapian_search_weibo, mset, fields=save_fields), top=w_limit)
                    emotions_count[v] = [end_ts, scount]
                    emotions_kcount[v] = [end_ts, kcount]
                    emotions_weibo[v] = [end_ts, top_ws]
                    print k, v, ', emotions count: ', emotions_count, ', keywords length: ', len(kcount), ', weibos count: ', len(top_ws)
                print 'save emotions count, keywords and weibo'
                save_rt_results('count', query, emotions_count, during)
                save_rt_results('kcount', query, emotions_kcount, during, klimit=k_limit)
                save_rt_results('weibos', query, emotions_weibo, during, wlimit=w_limit)
            elif calc == 'count':
                for k, v in emotions_kv.iteritems():
                    query_dict['sentiment'] = v
                    scount = xapian_search_weibo.search(query=query_dict, count_only=True)
                    emotions_count[v] = [end_ts, scount]
                save_rt_results('count', query, emotions_count, during)
            elif calc == 'kcount':
                for k, v in emotions_kv.iteritems():
                    query_dict['sentiment'] = v
                    count, get_results = xapian_search_weibo.search(query=query_dict, fields=RESP_ITER_KEYS,
                                                                    sort_by=[SORT_FIELD], max_offset=w_limit)
                    kcount = top_keywords(get_results, top=k_limit)
                    emotions_kcount[v] = [end_ts, kcount]
                save_rt_results('kcount', query, emotions_kcount, during, TOP_KEYWORDS_LIMIT)
            elif calc == 'weibos':
                for k, v in emotions_kv.iteritems():
                    query_dict['sentiment'] = v
                    count, get_results = xapian_search_weibo.search(query=query_dict, fields=RESP_ITER_KEYS,
                                                                    sort_by=[sort_field], max_offset=w_limit)
                    top_ws = top_weibos(get_results, top=w_limit)
                    emotions_weibo[v] = [end_ts, top_ws]
                save_rt_results('weibos', query, emotions_weibo, during, TOP_WEIBOS_LIMIT)

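# Example invocation (hypothetical timestamps and query terms): recompute
# only the per-sentiment counts for a comma-separated query over one hour,
# in 15-minute slices.
# sentimentRealTimeTopic(u'雾霾,PM2.5', 1400000000, 1400003600, calc='count')
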
def sentimentCronTopic(topic, weibos_list, start_ts, over_ts, sort_field=SORT_FIELD, save_fields=RESP_ITER_KEYS,
                       during=Fifteenminutes, w_limit=TOP_WEIBOS_LIMIT, k_limit=TOP_KEYWORDS_LIMIT):
    import sys
    sys.path.append('../triple_classifier/')
    from triple_sentiment_classifier import triple_classifier
    start_ts = int(start_ts)
    over_ts = int(over_ts)
    over_ts = ts2HourlyTime(over_ts, during)
    interval = (over_ts - start_ts) / during
    logFile.write('start_ts: ' + str(start_ts) + '\r\n')
    logFile.write('over_ts: ' + str(over_ts) + '\r\n')
    logFile.write('during: ' + str(during) + '\r\n')
    logFile.write('interval: ' + str(interval) + '\r\n')
    for i in range(interval, 0, -1):
        begin_ts = over_ts - during * i
        end_ts = begin_ts + during
        emotions_count = {}
        emotions_kcount = {}
        emotions_weibo = {}
        weiboIDs = {}
        zero = 0
        for k, v in emotions_kv.iteritems():
            emotions_count[v] = [end_ts, 0]
            emotions_kcount[v] = [end_ts, '']
            emotions_weibo[v] = [end_ts, []]
            weiboIDs[v] = [end_ts, []]
        slide = get_weibos_slide(weibos_list, begin_ts, end_ts)
        string = ['', '', '', '']
        emo0 = 0
        emo1 = 0  # was initialized to 1, which skewed the counter
        for weibo in slide:
            sentiment, emo = triple_classifier(weibo)
            if sentiment != 0 and emo == 0:
                emo0 += 1
            elif sentiment != 0 and emo == 1:
                emo1 += 1
            weibo['sentiment'] = sentiment
            string[sentiment] = string[sentiment] + weibo['text']
            if sentiment != 0:
                emotions_count[sentiment][1] += 1
                emotions_weibo[sentiment][1].append(weibo)
            else:
                zero += 1
        for k, v in emotions_kv.iteritems():
            # keep the w_limit weibos with the highest sort_field values:
            # sort ascending, then take the tail of the list
            sorted_weibos = sorted(emotions_weibo[v][1], key=lambda item: item[sort_field])
            emotions_weibo[v][1] = sorted_weibos[len(sorted_weibos) - w_limit:]
            for item in emotions_weibo[v][1]:
                weiboIDs[v][1].append(item['key'])
            wordd = {}
            if string[v] != '':
                words = GetKeyWords(string[v].encode('utf-8'), 5, True)
                word_list = words.split('#')
                for word in word_list:
                    token = word.split(r'/')
                    if len(token) == 3 and token[0] not in STOPWORDS:
                        wordd[token[0]] = token[2]
            emotions_kcount[v][1] = wordd
        print emo0, emo1
        print zero, emotions_count[1][1], emotions_count[2][1], emotions_count[3][1]
        save_rt_results('count', topic, emotions_count, during)
        save_rt_results('kcount', topic, emotions_kcount, during, k_limit, w_limit)
        save_rt_results('weibos', topic, weiboIDs, during, k_limit, w_limit)
        j = interval - i
        logFile.write('finish ' + str(j) + ' slide' + '\r\n')

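# The tail slice after an ascending sort above is equivalent to taking the
# top-N by key. A tiny worked example:
items = [{'reposts_count': 5}, {'reposts_count': 42}, {'reposts_count': 17}]
top2 = sorted(items, key=lambda item: item['reposts_count'])[-2:]
print top2  # -> the items with reposts_count 17 and 42, ascending
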
def cityTopic(topic, start_ts, over_ts, during=Fifteenminutes, n_limit=TOP_WEIBOS_LIMIT):
    if topic and topic != '':
        start_ts = int(start_ts)
        over_ts = int(over_ts)
        over_ts = ts2HourlyTime(over_ts, during)
        interval = (over_ts - start_ts) / during
        for i in range(interval, 0, -1):
            mtype_ccount = {}  # mtype is the message_type; ccount is {province: {city: count}}
            begin_ts = over_ts - during * i
            end_ts = begin_ts + during
            print begin_ts, end_ts, topic
            first_item = {}
            for k, v in mtype_kv.iteritems():  # v: forward / comment / original
                province_dict = {}
                query_body = {  # fetch this window's weibos for one message_type
                    'query': {
                        'bool': {
                            'must': [
                                {'term': {'message_type': v}},
                                {'range': {'timestamp': {'gte': begin_ts, 'lt': end_ts}}}
                            ]
                        }
                    },
                    'sort': {SORT_FIELD: {'order': 'desc'}},
                    'size': n_limit
                }
                mtype_weibo = weibo_es.search(index=topic, doc_type=weibo_index_type,
                                              body=query_body)['hits']['hits']
                if len(mtype_weibo) == 0:
                    continue
                first_item = mtype_weibo[0]['_source']
                # accumulate per-province/city counts for this message type
                for weibo in mtype_weibo:
                    try:
                        geo = weibo['_source']['geo'].encode('utf8')
                    except:
                        continue
                    province, city = split_city(geo)
                    if province != 'unknown':
                        if province not in province_dict:
                            province_dict[province] = {'total': 0}
                        province_dict[province][city] = province_dict[province].get(city, 0) + 1
                        province_dict[province]['total'] += 1
                        save_ws_results(topic, end_ts, during, n_limit, province, city, weibo)
                ccount = province_dict
                # mtype_ccount = {message_type: [end_ts, {province: {city: count, 'total': count}}]}
                mtype_ccount[v] = [end_ts, ccount]
            save_rt_results(topic, mtype_ccount, during, first_item)

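# Shape of the accumulated geo counts for one message type, with
# hypothetical values (province and city names come from split_city):
example_province_dict = {
    u'广东': {u'广州': 3, u'深圳': 1, 'total': 4},
    u'北京': {u'北京': 2, 'total': 2},
}
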
def cityTopic(topic, start_ts, over_ts, during=Fifteenminutes, n_limit=TOP_WEIBOS_LIMIT):
    if topic and topic != '':
        start_ts = int(start_ts)
        over_ts = int(over_ts)
        over_ts = ts2HourlyTime(over_ts, during)
        interval = (over_ts - start_ts) / during
        item_exist = es_event.get(index=event_analysis_name, doc_type=event_type, id=topic)['_source']
        try:
            geo_result = json.loads(item_exist['geo_results'])
        except:
            geo_result = {}
        for i in range(interval, 0, -1):
            begin_ts = over_ts - during * i
            end_ts = begin_ts + during
            first_item = {}
            for k, v in mtype_kv.iteritems():  # v: forward / comment / original
                query_body = {  # fetch this window's weibos for one message_type
                    'query': {
                        'bool': {
                            'must': [
                                {'term': {'message_type': v}},
                                {'range': {'timestamp': {'gte': begin_ts, 'lt': end_ts}}}
                            ]
                        }
                    },
                    'sort': {SORT_FIELD: {'order': 'desc'}},
                    'size': n_limit
                }
                mtype_weibo = es_event.search(index=topic, doc_type=event_text_type,
                                              body=query_body)['hits']['hits']
                if len(mtype_weibo) == 0:
                    continue
                first_item = mtype_weibo[0]['_source']
                # accumulate per-province/city counts for this message type,
                # on top of whatever was already stored in geo_result
                for weibo in mtype_weibo:
                    try:
                        geo = weibo['_source']['geo'].encode('utf8')
                    except:
                        continue
                    province, city = split_city(geo)
                    if province != 'unknown':
                        mtype_geo = geo_result.setdefault(v, {})
                        province_geo = mtype_geo.setdefault(province, {'total': 0})
                        province_geo[city] = province_geo.get(city, 0) + 1
                        province_geo['total'] += 1
        save_rt_results_es(topic, geo_result)
        return geo_result

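# geo_result is persistent: it is seeded from the event's stored
# 'geo_results' JSON and accumulated in place, so successive runs keep
# adding to the same counters. Hypothetical shape after one run, keyed by
# message_type, then province, then city:
example_geo_result = {
    1: {u'广东': {u'广州': 3, u'深圳': 1, 'total': 4}},
    3: {u'北京': {u'北京': 2, 'total': 2}},
}
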