def counts(start_ts, end_ts, topic, en_name, keywords):
    # Count matching weibos and distinct uids via a cardinality aggregation.
    query_body = {
        'query': {
            'match_all': {}  # 'term': {'en_name': topic}
        },
        'aggs': {
            'diff_uids': {
                'cardinality': {'field': 'uid'}
            }
        },
        'size': 999999999
    }
    result = es_event.search(index=en_name, doc_type=event_text_type, body=query_body)
    weibo_counts = result['hits']['total']
    uid_counts = result['aggregations']['diff_uids']['value']
    print weibo_counts, uid_counts
    #task_id = str(start_ts)+'_'+str(end_ts)+'_'+en_name+'_'+submit_user
    #print es_event.update(index=event_analysis_name,doc_type=event_type,id=task_id,body={'doc':{'weibo_counts':weibo_counts,'uid_counts':uid_counts}})
    return weibo_counts, uid_counts
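# Illustrative call for counts(); es_event and event_text_type are assumed to
# be configured by this module's imports, and the index name / timestamps are
# placeholders:
#
#   weibo_counts, uid_counts = counts(1462060800, 1462147200, 'topic',
#                                     'event_index_name', 'kw1&kw2')
#   # weibo_counts: matching weibos; uid_counts: distinct posting uids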
def get_topic_weibo(topic, en_name, start_ts, end_ts, keywords, mid):
    query_body = {'query': {'match_all': {}}, 'sort': 'timestamp', 'size': 1}
    try:
        task_exist = es_event.search(index=en_name, doc_type=event_type, body=query_body)['hits']['hits']
    except:
        get_mappings(en_name)
    find_flow_texts_scan(start_ts, end_ts, topic, en_name, keywords, mid)
def getEsIndexName(topic_name):
    query_body = {'query': {'match': {'name': topic_name}}}
    try:
        res = es_event.search(index='topics', body=query_body)['hits']['hits']
        return res[0]['_source']['index_name']
    except:
        return -1
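# Illustrative lookup; returns -1 when the topic is not registered in the
# 'topics' index (the topic string below is a placeholder):
#
#   index_name = getEsIndexName(u'example topic')
#   if index_name == -1:
#       print 'topic not found in the topics index'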
def get_users(topic, begin_ts, end_ts):
    # Collect the uids active in the window, rank them by BCI influence and
    # store the top 100 with their verified type.
    uid_list = set()
    query_body = {
        'query': {
            'bool': {
                'must': [
                    {'term': {'en_name': topic}},
                    {'range': {'timestamp': {'gte': begin_ts, 'lt': end_ts}}}
                ]
            }
        },
        'size': 999999999
    }
    result = es_event.search(index=event_text, doc_type=event_text_type, fields=['uid'], body=query_body)['hits']['hits']
    for i in result:
        uid_list.add(i['fields']['uid'][0])
    print len(uid_list)
    if RUN_TYPE == 0:
        post = datetime2ts(RUN_TEST_TIME)
        post = ts2datetimestr(post)
    else:
        post = ts2datetimestr(time.time())
    print bci_day_pre + post, bci_day_type, es_user_portrait
    user_result = es_bci.mget(index=bci_day_pre + post, doc_type=bci_day_type, body={'ids': list(uid_list)})['docs']
    user_influence_dict = {}
    for i in user_result:
        if i['found']:
            i = i['_source']
            user_influence_dict[i['user']] = i['user_index']
    user = sorted(user_influence_dict.iteritems(), key=lambda x: x[1], reverse=True)[:100]
    user_dict = {}
    for i in user:
        try:
            result = es_user_profile.get(index=profile_index_name, doc_type=profile_index_type, id=i[0])
            u_type = result['_source']['verified_type']
            if u_type in auth_list:
                u_type = auth_type
            else:
                u_type = user_type
            user_dict[i[0]] = {'user_type': u_type, 'influ': i[1]}
        except:
            user_dict[i[0]] = {'user_type': user_type, 'influ': i[1]}
    try:
        es_event.update(index=event_analysis_name, doc_type=event_type, id=topic, body={'doc': {'user_results': json.dumps(user_dict)}})
    except Exception, e:
        es_event.index(index=event_analysis_name, doc_type=event_type, id=topic, body={'user_results': json.dumps(user_dict)})
def compute_mtype_count(topic, begin_ts, end_ts, during):
    # Aggregate weibo counts per message_type inside the window.
    query_body = {
        'query': {
            'bool': {
                'must': [
                    {'range': {'timestamp': {'gte': begin_ts, 'lt': end_ts}}},
                    {'term': {'en_name': topic}}
                ]
            }
        },
        'aggs': {
            'all_interests': {
                'terms': {'field': 'message_type', 'size': MTYPE_COUNT}
            }
        }
    }
    weibo_mtype_count = es_event.search(index=event_text, doc_type=event_text_type, body=query_body)\
        ['aggregations']['all_interests']['buckets']
    print es_event, event_text, event_text_type
    print 'weibo_mtype_count:::::::::::::::::', weibo_mtype_count
    print begin_ts, end_ts, len(weibo_mtype_count)
    iter_mtype_dict = {}
    for mtype_item in weibo_mtype_count:
        mtype = mtype_item['key']
        mtype_count = mtype_item['doc_count']
        try:
            iter_mtype_dict[mtype] += mtype_count
        except:
            iter_mtype_dict[mtype] = mtype_count
    return iter_mtype_dict
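# Sketch of the returned shape; the concrete message_type keys are an
# assumption based on the mtype_kv mapping used elsewhere in this module:
#
#   mtype_dict = compute_mtype_count('topic_en_name', begin_ts, end_ts, 900)
#   # e.g. {1: 120, 2: 45, 3: 300}  (message_type -> weibo count)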
def get_task():
    query_body = {
        'query': {'term': {'compute_status': 0}},
        'sort': {'submit_ts': {'order': 'asc'}}
    }
    result = es_event.search(index=event_task_name, doc_type=event_task_type, body=query_body)
    return result['hits']['hits']
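# Minimal polling loop built on get_task(); dispatch() and the 60s interval
# are assumptions for illustration, not part of this module:
#
#   while True:
#       for task in get_task():
#           dispatch(task['_source'])  # hypothetical per-task handler
#       time.sleep(60)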
def test(topic, start_ts, end_ts):
    print start_ts, end_ts
    query_body = {
        'query': {
            'filtered': {
                'filter': {
                    'range': {'timestamp': {'gte': start_ts, 'lt': end_ts}}
                }
            }
        }
    }
    weibo = es_event.search(index=topic, doc_type=event_text_type, body=query_body)['hits']['hits']  # list of hit dicts
    print weibo
def compute_real_info(topic, begin_ts, end_ts, relation):
    info_dict = {}
    # Take the most-retweeted original news-style weibo ('【*】*') in the window.
    query_body = {
        'query': {
            'bool': {
                'must': [
                    {'term': {'en_name': topic}},
                    {'term': {'message_type': 1}},
                    {'wildcard': {'text': '【*】*'}},
                    {'range': {'timestamp': {'gte': begin_ts, 'lt': end_ts}}}
                ]
            }
        },
        'size': 1,
        'sort': {'retweeted': {'order': 'desc'}}
    }
    result = es_event.search(index=topic, doc_type=event_text_type, body=query_body)['hits']['hits']
    # Extract the event's people, organizations, places and times.
    print result[0]['_source']['text']
    basics = get_news_main(result[0]['_source']['text'])
    print basics
    info_dict['real_auth'] = basics['organization']
    info_dict['real_geo'] = basics['place']
    info_dict['real_time'] = basics['time']
    info_dict['real_person'] = basics['people']
    # Store the relations.
    if 'join' in relation.split('&'):
        rel_list = []
        if info_dict['real_auth'] != 'NULL':
            resu = create_person(org_node, org_primary, info_dict['real_auth'], org_index_name)
            if resu != 'Node Wrong':
                rel_list.append([[2, topic], 'join', [0, info_dict['real_auth']]])
        if info_dict['real_person'] != 'NULL':
            resu = create_person(people_node, people_primary, info_dict['real_person'], node_index_name)
            if resu != 'Node Wrong':
                rel_list.append([[2, topic], 'join', [1, info_dict['real_person']]])
        try:
            nodes_rels(rel_list)
        except:
            pass
    query_body = {
        'query': {
            'bool': {
                'must': [
                    {'term': {'en_name': topic}},
                    {'range': {'timestamp': {'gte': begin_ts, 'lt': end_ts}}}
                ]
            }
        },
        'size': 10000
    }
    result = es_event.search(index=topic, doc_type=event_text_type, fields=['text'], body=query_body)['hits']['hits']
    text_list = []
    for i in result:
        text_list.append(i['fields']['text'][0])
    # Event type.
    try:
        event = es_event.get(index=event_task_name, doc_type=event_task_type, id=topic)['_source']
        info_dict['event_type'] = event['event_type']
    except:
        info_dict['event_type'] = cut_weibo(text_list)
    info_dict['topics'] = json.dumps(get_topic_word(text_list, 10))
    keywords = get_keyword(''.join(text_list), 2)
    info_dict['keywords'] = '&'.join([i[0] for i in keywords])
    info_dict['keywords_list'] = json.dumps(keywords)
    hashtag = get_hashtag(''.join(text_list))
    info_dict['hashtag_dict'] = json.dumps(hashtag)
    info_dict['hashtag'] = '&'.join(list(hashtag.keys()))
    try:
        es_event.update(index=event_analysis_name, doc_type=event_type, id=topic, body={'doc': info_dict})
    except Exception, e:
        es_event.index(index=event_analysis_name, doc_type=event_type, id=topic, body=info_dict)
def get_users(topic, begin_ts, end_ts, relation):
    # NOTE: redefines get_users() above; at import time this later definition
    # is the one in effect. It additionally writes 'discuss' relations.
    uid_list = set()
    query_body = {
        'query': {
            'bool': {
                'must': [
                    {'term': {'en_name': topic}},
                    # {'wildcard': {'text': '【*】*'}},
                    {'range': {'timestamp': {'gte': begin_ts, 'lt': end_ts}}}
                ]
            }
        },
        'size': 999999999
    }
    result = es_event.search(index=topic, doc_type=event_text_type, fields=['uid'], body=query_body)['hits']['hits']
    for i in result:
        uid_list.add(i['fields']['uid'][0])
    print len(uid_list)
    if RUN_TYPE == 0:
        post = datetime2ts(RUN_TEST_TIME)
        post = ts2datetimestr(post)
    else:
        post = ts2datetimestr(time.time())
    print bci_day_pre + post, bci_day_type, es_user_portrait
    user_result = es_bci.mget(index=bci_day_pre + post, doc_type=bci_day_type, body={'ids': list(uid_list)})['docs']
    user_influence_dict = {}
    for i in user_result:
        if i['found']:
            i = i['_source']
            user_influence_dict[i['user']] = i['user_index']
    user = sorted(user_influence_dict.iteritems(), key=lambda x: x[1], reverse=True)[:100]
    not_in_user_list = event_user_portrait([i[0] for i in user])
    user_dict = {}
    p_list = []
    a_list = []
    for i in user:
        try:
            result = es_user_profile.get(index=profile_index_name, doc_type=profile_index_type, id=i[0])
            print result
            u_type = result['_source']['verified_type']
            print u_type
            if u_type in org_list:
                u_type = auth_type
                a_list.append(i[0])
            else:
                u_type = user_type
                p_list.append(i[0])
            user_dict[i[0]] = {'user_type': u_type, 'influ': i[1]}
        except:
            user_dict[i[0]] = {'user_type': user_type, 'influ': i[1]}
            p_list.append(i[0])
    print len(a_list), len(p_list)
    if 'discuss' in relation.split('&'):
        rel_list = []
        for i in p_list:
            resu = create_person(people_node, people_primary, i, node_index_name)
            if resu != 'Node Wrong':
                rel_list.append([[2, topic], 'discuss', [1, i]])
        for i in a_list:
            resu = create_person(org_node, org_primary, i, org_index_name)
            if resu != 'Node Wrong':
                rel_list.append([[2, topic], 'discuss', [0, i]])
        try:
            nodes_rels(rel_list)
        except:
            pass
    try:
        es_event.update(index=event_analysis_name, doc_type=event_type, id=topic, body={'doc': {'user_results': json.dumps(user_dict)}})
    except Exception, e:
        es_event.index(index=event_analysis_name, doc_type=event_type, id=topic, body={'user_results': json.dumps(user_dict)})
def continue_compute():
    # Fetch tasks whose window has not ended yet, oldest submission first.
    ts = time.time()
    query_body = {
        'query': {'range': {'end_ts': {'gte': ts}}},
        'sort': {'submit_ts': {'order': 'asc'}},
        'size': 100000
    }
    result = es_event.search(index=event_task_name, doc_type=event_task_type, body=query_body)
    return result['hits']['hits']
def get_real(info_dict, topic, begin_ts, end_ts, relation):
    # Prefer the most-retweeted original news-style weibo ('【*】*'); if none
    # matches, fall back to any news-style weibo in the window.
    query_body = {
        'query': {
            'bool': {
                'must': [
                    {'term': {'en_name': topic}},
                    {'term': {'message_type': 1}},
                    {'wildcard': {'text': '【*】*'}},
                    {'range': {'timestamp': {'gte': begin_ts, 'lt': end_ts}}}
                ]
            }
        },
        'size': 1,
        'sort': {'retweeted': {'order': 'desc'}}
    }
    result = es_event.search(index=topic, doc_type=event_text_type, body=query_body)['hits']['hits']
    if len(result) == 0:
        query_body = {
            'query': {
                'bool': {
                    'must': [
                        {'term': {'en_name': topic}},
                        {'wildcard': {'text': '【*】*'}},
                        {'range': {'timestamp': {'gte': begin_ts, 'lt': end_ts}}}
                    ]
                }
            },
            'size': 1,
            'sort': {'retweeted': {'order': 'desc'}}
        }
        result = es_event.search(index=topic, doc_type=event_text_type, body=query_body)['hits']['hits']
    # Extract the event's people, organizations, places and times.
    if len(result) != 0:
        print result[0]['_source']['text']
        basics = get_news_main(result[0]['_source']['text'])
        print basics
        info_dict['real_auth'] = basics['organization']
        info_dict['real_geo'] = basics['place']
        info_dict['real_time'] = basics['time']
        info_dict['real_person'] = basics['people']
    else:
        info_dict['real_auth'] = info_dict['real_geo'] = info_dict['real_time'] = info_dict['real_person'] = 'NULL'
    # Store the relations.
    if 'join' in relation.split('&'):
        rel_list = []
        if info_dict['real_auth'] != 'NULL':
            resu = create_person(org_node, org_primary, info_dict['real_auth'], org_index_name)
            if resu != 'Node Wrong':
                rel_list.append([[2, topic], 'join', [0, info_dict['real_auth']]])
        if info_dict['real_person'] != 'NULL':
            resu = create_person(people_node, people_primary, info_dict['real_person'], node_index_name)
            if resu != 'Node Wrong':
                rel_list.append([[2, topic], 'join', [1, info_dict['real_person']]])
        try:
            nodes_rels(rel_list)
        except:
            pass
    return info_dict
def compute_sentiment_count(topic, begin_ts, end_ts, during):
    # Aggregate weibo counts per sentiment label inside the window.
    query_body = {
        'query': {
            'bool': {
                'must': [
                    {'range': {'timestamp': {'gte': begin_ts, 'lt': end_ts}}},
                    {'term': {'en_name': topic}}
                ]
            }
        },
        'aggs': {
            'all_interests': {
                'terms': {'field': 'sentiment', 'size': SENTIMENT_TYPE_COUNT}
            }
        }
    }
    weibo_sentiment_count = es_event.search(index=topic, doc_type=event_text_type, body=query_body)\
        ['aggregations']['all_interests']['buckets']
    iter_sentiment_dict = {}
    for sentiment_item in weibo_sentiment_count:
        sentiment = sentiment_item['key']
        sentiment_count = sentiment_item['doc_count']
        try:
            iter_sentiment_dict[sentiment] += sentiment_count
        except:
            iter_sentiment_dict[sentiment] = sentiment_count
    return iter_sentiment_dict
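# Sketch of the returned shape; the sentiment keys depend on how the
# 'sentiment' field is indexed, so the values below are placeholders:
#
#   sentiment_dict = compute_sentiment_count('topic_en_name', begin_ts, end_ts, 900)
#   # e.g. {'1': 230, '2': 57, '3': 12}  (sentiment label -> weibo count)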
def cityTopic(topic, start_ts, over_ts, during=Fifteenminutes, n_limit=TOP_WEIBOS_LIMIT):
    # Count weibos per message type / province / city over fixed windows and
    # merge the counts into the event's stored geo_results.
    if topic and topic != '':
        start_ts = int(start_ts)
        over_ts = int(over_ts)
        over_ts = ts2HourlyTime(over_ts, during)
        interval = (over_ts - start_ts) / during
        item_exist = es_event.get(index=event_analysis_name, doc_type=event_type, id=topic)['_source']
        try:
            geo_result = json.loads(item_exist['geo_results'])
        except:
            geo_result = {}
        for i in range(interval, 0, -1):
            begin_ts = over_ts - during * i
            end_ts = begin_ts + during
            first_item = {}
            for k, v in mtype_kv.iteritems():  # v: message type (retweet / comment / original)
                query_body = {  # weibos of this message type inside the window
                    'query': {
                        'bool': {
                            'must': [
                                {'term': {'message_type': v}},
                                {'range': {'timestamp': {'gte': begin_ts, 'lt': end_ts}}}
                            ]
                        }
                    },
                    'sort': {SORT_FIELD: {'order': 'desc'}},
                    'size': n_limit
                }
                mtype_weibo = es_event.search(index=topic, doc_type=event_text_type, body=query_body)['hits']['hits']
                if len(mtype_weibo) == 0:
                    continue
                first_item = mtype_weibo[0]['_source']
                # Count, per message type, how many weibos come from each place.
                for weibo in mtype_weibo:
                    try:
                        geo = weibo['_source']['geo'].encode('utf8')
                    except:
                        continue
                    province, city = split_city(geo)
                    if province != 'unknown':
                        mtype_dict = geo_result.setdefault(v, {})
                        province_dict = mtype_dict.setdefault(province, {'total': 0})
                        province_dict[city] = province_dict.get(city, 0) + 1
                        province_dict['total'] += 1
        save_rt_results_es(topic, geo_result)
        return geo_result
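# Sketch of the geo_result structure cityTopic() accumulates and stores,
# with placeholder province/city names:
#
#   {
#       1: {                                          # message_type
#           u'北京': {'total': 12, u'海淀': 7, u'朝阳': 5},
#           u'上海': {'total': 3, u'浦东': 3}
#       },
#       3: {...}
#   }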
def compute_real_info(topic, begin_ts, end_ts, relation, submit_user, submit_ts):
    # NOTE: redefines compute_real_info() above; at import time this later
    # definition is the one in effect. It reuses the real_* fields stored on
    # the task document when present and falls back to get_real() otherwise.
    info_dict = {}
    query_body = {
        'query': {
            'bool': {
                'must': [
                    {'term': {'en_name': topic}},
                    {'range': {'timestamp': {'gte': begin_ts, 'lt': end_ts}}}
                ]
            }
        },
        'size': 10000
    }
    result = es_event.search(index=topic, doc_type=event_text_type, fields=['text'], body=query_body)['hits']['hits']
    text_list = []
    for i in result:
        text_list.append(i['fields']['text'][0])
    # Event type.
    try:
        event = es_event.get(index=event_task_name, doc_type=event_task_type, id=topic)['_source']
        info_dict['event_type'] = event['event_type']
    except:
        info_dict['event_type'] = cut_weibo(text_list)
    try:
        event = es_event.get(index=event_task_name, doc_type=event_task_type, id=topic)['_source']
        info_dict['real_auth'] = event['real_auth']
        info_dict['real_geo'] = event['real_geo']
        info_dict['real_time'] = event['real_time']
        info_dict['real_person'] = event['real_person']
    except:
        info_dict = get_real(info_dict, topic, begin_ts, end_ts, relation)
    info_dict['topics'] = json.dumps(get_topic_word(text_list, 10))
    keywords = get_keyword(''.join(text_list), 2)
    info_dict['keywords'] = '&'.join([i[0] for i in keywords])
    info_dict['keywords_list'] = json.dumps(keywords)
    hashtag = get_hashtag(''.join(text_list))
    info_dict['hashtag_dict'] = json.dumps(hashtag)
    info_dict['hashtag'] = '&'.join(list(hashtag.keys()))
    try:
        es_event.update(index=event_analysis_name, doc_type=event_type, id=topic, body={'doc': info_dict})
    except Exception, e:
        es_event.index(index=event_analysis_name, doc_type=event_type, id=topic, body=info_dict)
def excel_read():
    data = xlrd.open_workbook('events.xlsx')
    table = data.sheets()[0]  # open the first sheet
    nrows = table.nrows  # number of rows in the sheet
    for i in range(nrows):
        if i == 0:  # skip the header row
            continue
        keywords_list = table.row_values(i)[1].split(' ')
        keywords = '&'.join(keywords_list)
        event_type = table.row_values(i)[2]
        print event_type
        condition = []
        for w in keywords_list:
            condition.append({'term': {'keywords': w}})
            print w
        condition.append({'term': {'compute_status': 1}})
        es_query = {'query': {'bool': {'must': condition}}}
        res = es_event.search(index=event_task_name, doc_type=event_task_type,
                              body=es_query, request_timeout=999999,
                              params={'search_type': 'query_and_fetch'})
        print res['hits']['hits']
        if len(res['hits']['hits']) == 1:
            en_id = res['hits']['hits'][0]['_id']
            es_event.update(index=event_task_name, doc_type=event_task_type, id=en_id,
                            body={'doc': {'event_type': event_type}})
            es_event.update(index=event_analysis_name, doc_type='text', id=en_id,
                            body={'doc': {'event_type': event_type}})
        elif len(res['hits']['hits']) > 1:
            en_id = res['hits']['hits'][0]['_id']
            es_event.update(index=event_task_name, doc_type=event_task_type, id=en_id,
                            body={'doc': {'event_type': event_type}})
            try:
                task_exist = es_event.get(index=event_analysis_name, doc_type='text', id=en_id)['_source']
            except:
                task_exist = {}
            if task_exist:
                es_event.update(index=event_analysis_name, doc_type='text', id=en_id,
                                body={'doc': {'event_type': event_type}})
            else:
                print 'event_result not exist ' + en_id
            print 'multiple matches for row', i
    print 'END'
def compute_sentiment_weibo(topic, begin_ts, end_ts, k_limit, w_limit, during):
    # For each sentiment, take the top retweeted weibos in the window and
    # count them per province / city.
    sentiments = SENTIMENT_FIRST + SENTIMENT_SECOND
    geo_count = {}
    for sentiment in sentiments:
        province_dict = {}
        query_body = {
            'query': {
                'bool': {
                    'must': [
                        {'term': {'sentiment': sentiment}},  # one topic, one sentiment, inside the window
                        {'range': {'timestamp': {'gte': begin_ts, 'lt': end_ts}}}
                    ]
                }
            },
            'sort': {'retweeted': {'order': 'desc'}},
            'size': w_limit
        }
        sentiment_weibo = es_event.search(index=topic, doc_type=event_text_type, body=query_body)['hits']['hits']
        if len(sentiment_weibo) > 0:
            for weibo in sentiment_weibo:
                if not weibo['_source']['geo']:
                    continue
                geo = weibo['_source']['geo'].encode('utf8')
                province, city = split_city(geo)
                if province != 'unknown':
                    try:
                        province_dict[province]['total'] += 1
                    except:
                        province_dict[province] = {'total': 1}
                    try:
                        province_dict[province][city] += 1
                    except:
                        province_dict[province][city] = 1
            geo_count[sentiment] = [end_ts, province_dict]
        else:
            continue
    return geo_count
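# Sketch of the geo_count shape returned above, with placeholder values:
#
#   {sentiment: [end_ts, {province: {'total': n, city: n, ...}}]}
#   # e.g. {'1': [1462061700, {u'广东': {'total': 2, u'深圳': 2}}]}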
def compute_sentiment_keywords(topic, begin_ts, end_ts, k_limit, w_limit, during):
    # For each sentiment, aggregate the top keywords inside the window.
    all_keyword_dict = {}
    sen_with_keyword = {}
    sentiments = SENTIMENT_FIRST + SENTIMENT_SECOND
    for sentiment in sentiments:
        query_body = {
            'query': {
                'bool': {
                    'must': [
                        {'term': {'sentiment': sentiment}},  # one topic, one sentiment, inside the window
                        {'range': {'timestamp': {'gte': begin_ts, 'lt': end_ts}}}
                    ]
                }
            },
            'aggs': {
                'all_interests': {
                    'terms': {'field': 'keywords_string', 'size': k_limit}
                }
            }
        }
        show_keywords_dict = es_event.search(index=topic, doc_type=event_text_type, body=query_body)\
            ['aggregations']['all_interests']['buckets']
        keyword_dict = {}
        for keyword in show_keywords_dict:
            key = keyword['key']
            count = keyword['doc_count']
            try:
                keyword_dict[key] += count
            except:
                keyword_dict[key] = count
        sen_with_keyword[sentiment] = sorted(keyword_dict.items(), key=lambda x: x[1], reverse=True)[:k_limit]
    # TODO: still need to slice by 15-minute windows before saving.
    all_keyword_dict[end_ts] = sen_with_keyword
    results = all_keyword_dict  # {end_ts: {sentiment: [[keyword, count], ...]}}
    save_rt_results('kcount', topic, results, during, k_limit, w_limit)