def get_user_in_event(event_id): result = es_event.get(index=event_analysis_name, doc_type= event_type, id=event_id)["_source"] # trend_pusher trend_pusher = json.loads(result["trend_pusher"]) # trend_maker trend_maker = json.loads(result["trend_maker"]) # pagerank pagerank = json.loads(result["pagerank"]) print len(trend_pusher), len(trend_maker), len(pagerank) f = open("event_user_list.txt", "a") for item in trend_pusher: f.write(str(item["uid"])) f.write("\n") for item in trend_maker: f.write(str(item["uid"])) f.write("\n") for item in pagerank: f.write(str(item)) f.write("\n") f.close()
def save_ws_results_es(topic, ts, during, n_limit, province,city,weibos): #mappings_event_geo_province_weibos() #index_name = index_event_geo_province_weibos #index_type = type_event_geo_province_weibos #mappings_event_analysis_results(topic) index_name = index_event_analysis_results index_type = type_event_analysis_results item = {} item['en_name'] = topic item['end_ts'] = ts item['range'] = during item['limit'] = n_limit item['province'] = province item['city'] = city item['weibo'] = json.dumps(weibos) id = topic + '_' + ts try: item_exist = es_event.get(index=index_name,doc_type=index_type,id=id)['_source'] es_event.update(index=index_name,doc_type=index_type,id=id,body={'doc':item}) except Exception,e: es_event.index(index=index_name,doc_type=index_type,id=id,body=item)
def save_results_es(calc, topic, results, during, klimit=TOP_KEYWORDS_LIMIT, wlimit=TOP_WEIBOS_LIMIT): if calc == 'time_results': id = topic #results = json.dumps(results) try: item_exist = es_event.get(index=event_analysis_name, doc_type=event_type, id=id)['_source'] try: time_results = json.loads(item_exist['time_results']) except: time_results = [] time_results.append(results) es_event.update( index=event_analysis_name, doc_type=event_type, id=id, body={'doc': { 'time_results': json.dumps(time_results) }}) except Exception, e: es_event.index(index=event_analysis_name, doc_type=event_type, id=id, body={'time_results': json.dumps(results)})
def get_user_in_event(event_id): result = es_event.get(index=event_analysis_name, doc_type=event_type, id=event_id)["_source"] # trend_pusher trend_pusher = json.loads(result["trend_pusher"]) trend_list = [] for item in trend_pusher: trend_list.append(item["uid"]) # trend_maker trend_maker = json.loads(result["trend_maker"]) maker_list = [] for item in trend_maker: maker_list.append(item["uid"]) # pagerank pagerank = json.loads(result["pagerank"]) print len(trend_pusher), len(trend_maker), len(pagerank) create_rel_uid2event(trend_list, event_id, 'ipusher') create_rel_uid2event(maker_list, event_id, 'maker') create_rel_uid2event(pagerank, event_id, 'join')
def immediate_compute(task_id):
    """Fetch the task document by id and run the analysis pipeline on it.

    Returns None if the fetch or the computation fails for any reason
    (deliberate best-effort semantics).
    """
    try:
        task_doc = es_event.get(index=event_task_name,
                                doc_type=event_task_type, id=task_id)
        compute_task(task_doc)
    except:
        return None
def exist(task_id):
    """Return True if a non-empty task document with *task_id* exists in
    the event task index, False otherwise."""
    try:
        doc = es_event.get(index=event_task_name, doc_type=event_task_type,
                           id=task_id)['_source']
    except:
        # es.get raises when the id is absent -> treat as not existing.
        return False
    return bool(doc)
def uid_diff(): s_re = scan(es_event,index='user_portrait_0312',doc_type='user') uid_list = set() while True: try: scan_re = s_re.next() uid_list.add(scan_re['_id']) except: print len(uid_list) break result = es_event.get(index='event_result',doc_type='text',id='bei-jing-fang-jia-zheng-ce-1480176000')['_source'] event_uid = set(json.loads(result['user_results']).keys()) print len(event_uid) print uid_list - event_uid print event_uid - uid_list
def save_rt_results_es(calc, topic, results, during, klimit=TOP_KEYWORDS_LIMIT, wlimit=TOP_WEIBOS_LIMIT): #mappings_event_analysis_results(topic) index_name = event_analysis_name #index_event_analysis_results index_type = event_type #type_event_analysis_results if calc == 'sentiment_results': id = topic try: item_exist = es_event.get(index=index_name, doc_type=index_type, id=id)['_source'] try: sentiment_results = json.loads(item_exist['sentiment_results']) except: sentiment_results = [] sentiment_results.append(results) es_event.update(index=index_name, doc_type=index_type, id=id, body={ 'doc': { 'sentiment_results': json.dumps(sentiment_results) } }) except Exception, e: es_event.index(index=index_name, doc_type=index_type, id=id, body={'sentiment_results': json.dumps(results)})
def compute_real_info(topic,begin_ts,end_ts,relation):
    # NOTE(review): a later 6-parameter definition of compute_real_info in
    # this module shadows this one at import time, and compute_task calls
    # the 6-argument version. Confirm before relying on this variant.
    """Extract real-world event info (organization, place, time, people,
    event type, topics, keywords, hashtags) and upsert it into the event
    analysis index; optionally record 'join' graph relations."""
    info_dict = {}
    # Representative post: the most-retweeted original weibo in the window
    # whose text looks like a headline (wrapped in 【...】 brackets).
    query_body = {
        'query':{
            'bool':{
                'must':[
                    {'term':{'en_name':topic}},
                    {'term':{'message_type':1}},
                    {'wildcard':{'text':'【*】*'}},
                    {'range':{
                        'timestamp':{'gte': begin_ts, 'lt':end_ts}
                        }
                    }]
                }
            },
        'size':1,
        'sort':{'retweeted':{'order':'desc'}}
        }
    result = es_event.search(index=topic,doc_type=event_text_type,body=query_body)['hits']['hits']
    # Extract the event's people, organizations, places and times from that
    # representative text. Raises IndexError if no post matched -- presumably
    # callers guarantee at least one headline-style original post; verify.
    print result[0]['_source']['text']
    basics = get_news_main(result[0]['_source']['text'])
    print basics
    info_dict['real_auth'] = basics['organization']
    info_dict['real_geo'] = basics['place']
    info_dict['real_time'] = basics['time']
    info_dict['real_person'] = basics['people']
    # Store 'join' relations: event node -> extracted organization / person
    # nodes (only when the extractor returned something other than 'NULL').
    if('join' in relation.split('&')):
        rel_list = []
        if info_dict['real_auth'] !='NULL':
            resu = create_person(org_node,org_primary,info_dict['real_auth'],org_index_name)
            if resu != 'Node Wrong':
                rel_list.append([[2,topic],'join',[0,info_dict['real_auth']]])
        if info_dict['real_person'] !='NULL':
            resu = create_person(people_node,people_primary,info_dict['real_person'],node_index_name)
            if resu != 'Node Wrong':
                rel_list.append([[2,topic],'join',[1,info_dict['real_person']]])
        try:
            nodes_rels(rel_list)
        except:
            # Best-effort: graph write failures are silently ignored.
            pass
    # Fetch up to 10000 weibo texts in the window for content analysis.
    query_body = {
        'query':{
            'bool':{
                'must':[
                    {'term':{'en_name':topic}},
                    {'range':{
                        'timestamp':{'gte': begin_ts, 'lt':end_ts}
                        }
                    }]
                }
            },
        'size':10000
        }
    result = es_event.search(index=topic,doc_type=event_text_type,fields=['text'],body=query_body)['hits']['hits']
    text_list = []
    for i in result:
        text_list.append(i['fields']['text'][0])
    # Event type: prefer the label stored on the task doc; otherwise
    # classify from the collected texts.
    try:
        event = es_event.get(index=event_task_name,doc_type=event_task_type,id=topic)['_source']
        info_dict['event_type'] = event['event_type']
    except:
        info_dict['event_type'] = cut_weibo(text_list)
    info_dict['topics'] = json.dumps(get_topic_word(text_list,10))
    keywords = get_keyword(''.join(text_list),2)
    info_dict['keywords'] = '&'.join([i[0] for i in keywords])
    info_dict['keywords_list'] = json.dumps(keywords)
    hashtag = get_hashtag(''.join(text_list))
    info_dict['hashtag_dict'] = json.dumps(hashtag)
    info_dict['hashtag'] = '&'.join(list(hashtag.keys()))
    # Upsert into the analysis index: update if the doc exists, else create.
    try:
        es_event.update(index=event_analysis_name,doc_type=event_type,id=topic,body={'doc':info_dict})
    except Exception,e:
        es_event.index(index=event_analysis_name,doc_type=event_type,id=topic,body=info_dict)
def compute_task(task): print task # task=['雾霾','type','1480003100','1480176000','1483500427743'] task_id = task['_id'] task = task['_source'] topic = task['name']#task[0]#['name'] #en_name = task['en_name'] RUN_TYPE = 1 if RUN_TYPE == 0: start_ts = 1480003200#task['start_ts'] begin_ts = 1480003200 end_ts = 1480176000#task['end_ts'] else: start_ts = task['start_ts'] begin_ts = task['start_ts'] end_ts = task['end_ts'] try: start_ts = task['compute_ts'] task['compute_ts'] = time.time() except: task['compute_ts'] = time.time() if end_ts > time.time(): end_ts = time.time() submit_ts = task['submit_ts']#int(task[4]) #可选的计算关系realtion 用&连接的字符串 relation = task['relation_compute']#task[5] keywords = task['keywords'].split('&') #关键词或者mid #compute_status = task['status'] # mid = task['mid'] # task_id = 'event-'+str(start_ts)+'-'+str(end_ts)+'-'+str(submit_ts) en_name = task_id t1=time.time() re_mid = re.compile('^\d{16}$') try: mid = re.match(re_mid,task_id).group() except: mid = '' exist_flag = exist(task_id) get_topic_weibo(topic,task_id,start_ts,end_ts,keywords,mid) print exist_flag if exist_flag: #start compute #try: resu = create_person(event_node,event_primary,en_name,event_index_name) if resu == 'Node Wrong': return 'Node Wrong' weibo_counts,uid_counts=counts(start_ts,end_ts,topic,en_name,keywords) es_event.update(index=event_task_name,doc_type=event_task_type,id=task_id,body={'doc':{'compute_status':-1}}) # es_event.index(index=event_analysis_name,doc_type=event_type,id=task_id,body={'name':topic,'start_ts':start_ts,'end_ts':end_ts,'submit_ts':submit_ts,'compute_status':0,'en_name':task_id,'relation_compute':relation}) task['compute_status']=-1 task['weibo_counts']=weibo_counts task['uid_counts']=uid_counts try: flag = es_event.get(index=event_analysis_name,doc_type=event_type,id=task_id)['_source'] w_counts = flag['weibo_counts']+weibo_counts u_counts = flag['uid_counts']+uid_counts 
es_event.update(index=event_analysis_name,doc_type=event_type,id=task_id,body={'doc':{'compute_status':-1,'weibo_counts':w_counts,'uid_counts':u_counts}}) except: es_event.index(index=event_analysis_name,doc_type=event_type,id=task_id,body=task) es_event.update(index=event_analysis_name,doc_type=event_type,id=task_id,body={'doc':{'hashtag_dict':'','topics':'','geo_results':'','real_geo':'','real_auth':'','sentiment_results':'','time_results':'','hashtag':'','real_time':'','user_results':'','real_person':'','keywords_list:''}}) print 'finish change status' if es_event.get(index=event_analysis_name,doc_type=event_type,id=task_id)['_source']['weibo_counts'] == 0: return 1 #geo cityTopic(en_name, start_ts, end_ts) es_event.update(index=event_analysis_name,doc_type=event_type,id=task_id,body={'doc':{'compute_status':-2}}) print 'finish geo analyze' #language compute_real_info(en_name, begin_ts, end_ts,relation,task['submit_user'],task['submit_ts']) es_event.update(index=event_analysis_name,doc_type=event_type,id=task_id,body={'doc':{'compute_status':-3}}) print 'finish language analyze' #time propagateCronTopic(en_name, start_ts, end_ts) es_event.update(index=event_analysis_name,doc_type=event_type,id=task_id,body={'doc':{'compute_status':-4}}) print 'finish time analyze' #sentiment sentimentTopic(en_name, start_ts=start_ts, over_ts=end_ts) print 'finish sentiment analyze' #finish compute print es_event.update(index=event_analysis_name,doc_type=event_type,id=task_id,body={'doc':{'compute_status':1,'finish_ts':int(time.time())}}) print 'finish change status done' print time.time() if('contain' in relation.split('&')): #计算关系 related_event_ids = event_input(keywords,en_name) rel_list = [] for i in related_event_ids: create_person(event_node,event_primary,i,event_index_name) rel_list.append([[2,en_name],'contain',[2,i]]) nodes_rels(rel_list) es_event.update(index=event_task_name,doc_type=event_task_type,id=task_id,body={'doc':{'compute_status':1}}) t2=time.time()-t1 print 
task_id,t2 # except: # raise # break #get_attr(en_name, start_ts, end_ts) # else: # pass return 1
def immediate_compute(task_id):
    """Fetch the task document by id and run the analysis pipeline,
    letting any error propagate to the caller.

    NOTE(review): this shadows the earlier immediate_compute defined
    above, which swallowed all exceptions.
    """
    task_doc = es_event.get(index=event_task_name,
                            doc_type=event_task_type, id=task_id)
    compute_task(task_doc)
def cityTopic(topic,start_ts,over_ts,during=Fifteenminutes, n_limit=TOP_WEIBOS_LIMIT):
    """Accumulate per-message-type, per-province, per-city weibo counts for
    an event over [start_ts, over_ts) in steps of *during* seconds, merging
    into any 'geo_results' already stored on the analysis doc."""
    if topic and topic != '':
        start_ts = int(start_ts)
        over_ts = int(over_ts)
        # Align the end of the window to a 'during' boundary.
        over_ts = ts2HourlyTime(over_ts, during)
        interval = (over_ts - start_ts) / during
        item_exist = es_event.get(index=event_analysis_name,doc_type=event_type,id=topic)['_source']
        try:
            geo_result = json.loads(item_exist['geo_results'])
        except:
            geo_result = {}
        # Walk the windows oldest-first (i counts down from interval to 1).
        for i in range(interval, 0, -1):
            mtype_ccount = {}
            begin_ts = over_ts - during * i
            end_ts = begin_ts + during
            weibos = []
            first_item = {}
            # mtype_kv maps labels to message_type codes (retweet/comment/original).
            for k,v in mtype_kv.iteritems():
                # Top n_limit weibos of this message type in the window.
                query_body = {
                    'query':{
                        'bool':{
                            'must':[
                                {'term':{'message_type':v}},
                                {'range':{
                                    'timestamp':{'gte': begin_ts, 'lt':end_ts}
                                    }
                                }]
                            }
                        },
                    'sort':{SORT_FIELD:{"order":"desc"}},
                    'size':n_limit
                    }
                mtype_weibo = es_event.search(index=topic,doc_type=event_text_type,body=query_body)['hits']['hits']
                if len(mtype_weibo) == 0:
                    continue
                first_item = mtype_weibo[0]['_source']
                # Count each weibo's location under geo_result[v][province][city],
                # with a running per-province 'total'.
                for weibo in mtype_weibo:
                    try:
                        geo = weibo['_source']['geo'].encode('utf8')
                    except:
                        # Missing / non-string geo field: skip this weibo.
                        continue
                    province,city = split_city(geo)
                    if province != 'unknown':
                        # EAFP cascade: each fallback creates one more level
                        # of the nested dict.
                        # NOTE(review): the deeper fallbacks overwrite the
                        # province dict ({city:1,'total':1}) or rebind
                        # geo_result entirely, discarding counts accumulated
                        # for other cities/provinces on that path -- confirm
                        # whether this loss is acceptable.
                        try:
                            geo_result[v][province][city]+=1
                            geo_result[v][province]['total']+=1
                        except:
                            try:
                                geo_result[v][province][city]=1
                                geo_result[v][province]['total']+=1
                            except:
                                try:
                                    geo_result[v][province]={city:1,'total':1}
                                except:
                                    try:
                                        geo_result[v]={province:{city:1,'total':1}}
                                    except:
                                        geo_result={v:{province:{city:1,'total':1}}}
        # NOTE(review): save_rt_results_es defined above takes
        # (calc, topic, results, during, ...); this 2-argument call looks
        # inconsistent -- verify which save_rt_results_es is intended.
        save_rt_results_es(topic, geo_result)
    return geo_result
def excel_read(): data = xlrd.open_workbook('events.xlsx') table = data.sheets()[0] # 打开第一张表 nrows = table.nrows # 获取表的行数 for i in range(nrows): if i == 0: # 跳过第一行 continue now_ts = int(time.time()) keywords_list = table.row_values(i)[1].split(' ') keywords = '&'.join(keywords_list) event_type = table.row_values(i)[2] print event_type condition = [] for w in keywords_list: condition.append({'term': {'keywords': w}}) print w condition.append({'term': {'compute_status': 1}}) es_query = {'query': {'bool': {'must': condition}}} res = es_event.search(index=event_task_name, doc_type=event_task_type, \ body=es_query, request_timeout=999999,params={"search_type":"query_and_fetch"}) print res['hits']['hits'] if len(res['hits']['hits']) == 1: en_id = res['hits']['hits'][0]['_id'] es_event.update(index=event_task_name, doc_type=event_task_type, id=en_id, body={'doc': { 'event_type': event_type }}) es_event.update(index=event_analysis_name, doc_type='text', id=en_id, body={'doc': { 'event_type': event_type }}) elif len(res['hits']['hits']) >= 1: en_id = res['hits']['hits'][0]['_id'] es_event.update(index=event_task_name, doc_type=event_task_type, id=en_id, \ body={'doc':{'event_type':event_type}}) try: task_exist = es_event.get(index=event_analysis_name, doc_type='text', id=task_id)['_source'] except: task_exist = {} if task_exist: es_event.update(index=event_analysis_name, doc_type='text', id=en_id, body={'doc': { 'event_type': event_type }}) else: print 'event_result not exist' + en_id print "查询到多个结果!", i print 'END'
def compute_real_info(topic, begin_ts, end_ts, relation, submit_user, submit_ts): info_dict = {} query_body = { 'query': { 'bool': { 'must': [{ 'term': { 'en_name': topic } }, { 'range': { 'timestamp': { 'gte': begin_ts, 'lt': end_ts } } }] } }, 'size': 10000 } result = es_event.search(index=topic, doc_type=event_text_type, fields=['text'], body=query_body)['hits']['hits'] text_list = [] for i in result: text_list.append(i['fields']['text'][0]) #事件类型 try: event = es_event.get(index=event_task_name, doc_type=event_task_type, id=topic)['_source'] info_dict['event_type'] = event['event_type'] except: info_dict['event_type'] = cut_weibo(text_list) try: event = es_event.get(index=event_task_name, doc_type=event_task_type, id=topic)['_source'] info_dict['real_auth'] = event['real_auth'] info_dict['real_geo'] = event['real_geo'] info_dict['real_time'] = event['real_time'] info_dict['real_person'] = event['real_person'] except: info_dict = get_real(info_dict, topic, begin_ts, end_ts, relation) info_dict['topics'] = json.dumps(get_topic_word(text_list, 10)) keywords = get_keyword(''.join(text_list), 2) info_dict['keywords'] = '&'.join([i[0] for i in keywords]) info_dict['keywords_list'] = json.dumps(keywords) hashtag = get_hashtag(''.join(text_list)) info_dict['hashtag_dict'] = json.dumps(hashtag) info_dict['hashtag'] = '&'.join(list(hashtag.keys())) try: es_event.update(index=event_analysis_name, doc_type=event_type, id=topic, body={'doc': info_dict}) except Exception, e: es_event.index(index=event_analysis_name, doc_type=event_type, id=topic, body=info_dict)
}, "first_compute": { "type": "long" }, "immediate_compute": { "type": "long" } } } } } if not es.indices.exists(index=event_analysis_name): print es.indices.create(index=event_analysis_name, body=index_info, ignore=400) return '1' if __name__ == "__main__": #mappings_event_analysis_results() a = es.get(index='event_result', doc_type='text', id='xiang-gang-qian-zong-du-qian-ze-liang-you-er-ren-1482126431' )['_source']['time_results'] print json.loads(a) print type(json.loads(a))