def save_group_results(results):
    status = False
    flag = es.indices.exists(index=index_name)
    if not flag:
        es.indices.create(index=index_name, ignore=400)
    es.index(index=index_name, doc_type=index_type, body=results, id=results['task_name'])
    status = True  # the document is indexed under its task_name
    return status
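# A hedged usage sketch for save_group_results: the only field the function
# requires is 'task_name', which becomes the document id; the rest of the
# payload shape below is a placeholder assumption.
def _demo_save_group_results():
    group_results = {
        'task_name': 'demo_group',                # hypothetical task name
        'uid_list': json.dumps(['123', '456'])    # hypothetical member uids
    }
    return save_group_results(group_results)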
def add_task(user_name, type="keyword", range="all", pre='flow_text_', during='1',
             start_time='2013-09-07', end_time='2013-09-07', keyword='hello,world',
             sort_norm='bci', sort_scope='in_limit_keyword', time=7, isall=False, number=100):
    time_now = int(TIME.time())
    task_id = user_name + "-" + str(time_now)
    keyword_list = [item for item in keyword.split(',') if item]
    body_json = {
        'submit_user': user_name,
        'keyword': json.dumps(keyword_list),
        'keyword_string': "&".join(keyword_list),
        'submit_time': ts2datetime(time_now),
        'create_time': time_now,
        'end_time': datetime2ts(end_time),
        'search_type': type,
        'status': 0,
        'range': range,
        'user_ts': user_name + '-' + str(time_now),
        'pre': pre,
        'during': during,
        'start_time': datetime2ts(start_time),
        'sort_norm': sort_norm,
        'sort_scope': sort_scope,
        'time': time,
        'isall': isall,
        'number': number
    }
    es.index(index=USER_RANK_KEYWORD_TASK_INDEX, doc_type=USER_RANK_KEYWORD_TASK_TYPE,
             id=task_id, body=body_json)
    return body_json["user_ts"]
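# A minimal usage sketch for add_task, assuming `es` and the
# USER_RANK_KEYWORD_TASK_* constants are configured at module level; all
# argument values below are placeholders.
def _demo_add_task():
    task_key = add_task('admin',
                        type='keyword',
                        keyword='hello,world',
                        start_time='2013-09-07',
                        end_time='2013-09-08',
                        number=50)
    # task_key looks like 'admin-1378540800'; scan_offline_task() later picks
    # the task up by its status=0 flag and queues it into Redis.
    return task_key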
def key_words_search(search_type, pre, during, start_time, keyword, search_key='',
                     sort_norm='', sort_scope='', time=1, isall=False):
    # mark the task as running and clear any stale result
    query = {"query": {"bool": {"must": [{"term": {"user_rank_task.user_ts": search_key}}],
                                "must_not": [], "should": []}},
             "from": 0, "size": 10, "sort": [], "facets": {}}
    result = es_9200.search(index=USER_RANK_KEYWORD_TASK_INDEX,
                            doc_type=USER_RANK_KEYWORD_TASK_TYPE, body=query)['hits']['hits']
    search_id = result[0]['_id']
    item = result[0]['_source']
    item['status'] = -1
    item['result'] = json.dumps([])
    es_9200.index(index=USER_RANK_KEYWORD_TASK_INDEX, doc_type=USER_RANK_KEYWORD_TASK_TYPE,
                  id=search_id, body=item)
    keywords = keyword.split(",")
    should = []
    for key in keywords:
        if search_type == "hashtag":
            should.append({"prefix": {"text.text": "#" + key + "#"}})
        else:
            should.append({"prefix": {"text.text": key}})
    # skip forward to the first daily index that actually exists
    date = start_time
    index_name = pre + start_time
    while not es_9206.indices.exists(index=index_name):
        new_time = datetime2ts(date) + DAY
        date = ts2datetime(new_time)
        index_name = pre + date
        during -= 1
    uid_set = set()
    for i in range(during):
        print index_name
        query = {"query": {"bool": {"must": [], "must_not": [], "should": should}},
                 "size": MAX_ITEMS, "sort": [], "facets": {}, "fields": ['uid']}
        try:
            temp = es_9206.search(index=index_name, doc_type='text', body=query)
            result = temp['hits']['hits']
            print "Fetch " + str(len(result))
            for item in result:
                uid_set.add(item['fields']['uid'][0].encode("utf-8"))
        except Exception, e:
            print e
            raise Exception('user_list failed!')
        new_time = datetime2ts(date) + DAY
        date = ts2datetime(new_time)
        index_name = pre + date
    result_list = list(uid_set)
    if not isall:
        uid_list = in_sort_filter(time, sort_norm, sort_scope, None, result_list, True)
    else:
        uid_list = all_sort_filter(result_list, sort_norm, time, True)
    results = make_up_user_info(uid_list, isall, time, sort_norm)
    # write the final result back and mark the task as finished
    query = {"query": {"bool": {"must": [{"term": {"user_rank_task.user_ts": search_key}}],
                                "must_not": [], "should": []}},
             "from": 0, "size": 10, "sort": [], "facets": {}}
    result = es_9200.search(index=USER_RANK_KEYWORD_TASK_INDEX,
                            doc_type=USER_RANK_KEYWORD_TASK_TYPE, body=query)['hits']['hits']
    search_id = result[0]['_id']
    item = result[0]['_source']
    item['status'] = 1
    item['result'] = json.dumps(results)
    es_9200.index(index=USER_RANK_KEYWORD_TASK_INDEX, doc_type=USER_RANK_KEYWORD_TASK_TYPE,
                  id=search_id, body=item)
    return results
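# A standalone sketch of the daily-index naming scheme the search walks over,
# assuming DAY = 86400 and the ts2datetime/datetime2ts helpers from this module.
def daily_index_names(pre, start_date, days):
    names = []
    ts = datetime2ts(start_date)
    for i in range(days):
        names.append(pre + ts2datetime(ts))  # e.g. 'flow_text_2013-09-07'
        ts += DAY
    return names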
def save_count_results(all_uids_count, es_num):
    index_name = "user_portrait_network_count"
    index_type = "network"
    item = {}
    date = ts2datetime(time.time())
    item['period_' + str(es_num)] = all_uids_count
    try:
        # existence check: get() raises if there is no document for today
        item_exist = es_user_portrait.get(index=index_name, doc_type=index_type, id=date)['_source']
        es_user_portrait.update(index=index_name, doc_type=index_type, id=date, body={"doc": item})
    except:
        item['start_ts'] = date
        es_user_portrait.index(index=index_name, doc_type=index_type, id=date, body=item)
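# The try/except round-trip above can be collapsed into one call with the
# update API's "upsert" body, which elasticsearch-py supports: "doc" is applied
# when the document exists, "upsert" is indexed when it does not, so start_ts
# is written only once. A sketch under the same module-level assumptions:
def save_count_results_upsert(all_uids_count, es_num):
    date = ts2datetime(time.time())
    period_field = 'period_' + str(es_num)
    es_user_portrait.update(index="user_portrait_network_count", doc_type="network", id=date,
                            body={"doc": {period_field: all_uids_count},
                                  "upsert": {period_field: all_uids_count, 'start_ts': date}})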
def add_task(user_name, type="keyword", range="all", pre='flow_text_', during='1',
             start_time='2013-09-07', end_time='2013-09-07', keyword='hello,world',
             sort_norm='bci', sort_scope='in_limit_keyword', time=1, isall=False):
    time_now = TIME.time()
    body_json = {
        'submit_user': user_name,
        'keyword': keyword,
        'submit_time': str(ts2date(time_now)),
        'end_time': end_time,
        'search_type': type,
        'status': 0,
        'range': range,
        'user_ts': user_name + str(time_now),
        'pre': pre,
        'during': during,
        'start_time': start_time,
        'sort_norm': sort_norm,
        'sort_scope': sort_scope,
        'time': time,
        'isall': isall
    }
    try:
        es.index(index=USER_RANK_KEYWORD_TASK_INDEX, doc_type=USER_RANK_KEYWORD_TASK_TYPE,
                 body=body_json)
        return body_json["user_ts"]
    except Exception, e1:
        print e1
def scan_offline_task():
    query = {"query": {"bool": {"must": [{"term": {"status": 0}}]}}, "size": 1000}
    results = es_user_portrait.search(index=USER_RANK_KEYWORD_TASK_INDEX,
                                      doc_type=USER_RANK_KEYWORD_TASK_TYPE,
                                      body=query)['hits']['hits']
    if results:
        for item in results:
            task_id = item['_id']
            iter_item = item['_source']
            search_type = iter_item['search_type']
            pre = iter_item['pre']
            during = iter_item['during']
            start_time = iter_item['start_time']
            keyword = json.loads(iter_item['keyword'])
            search_key = iter_item['user_ts']
            number = iter_item['number']
            sort_norm = iter_item['sort_norm']
            sort_scope = iter_item['sort_scope']
            time = iter_item['time']
            isall = iter_item['isall']
            redis_task.lpush("task_user_rank",
                             json.dumps([task_id, search_type, pre, during, start_time,
                                         keyword, search_key, sort_norm, sort_scope,
                                         time, isall, number]))
            iter_item['status'] = -1
            es_user_portrait.index(index=USER_RANK_KEYWORD_TASK_INDEX,
                                   doc_type=USER_RANK_KEYWORD_TASK_TYPE,
                                   id=task_id, body=iter_item)
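# A hedged worker-side sketch for the queue filled above, assuming redis_task
# is the same Redis client and key_words_search has the twelve-argument
# signature used elsewhere in this module.
def run_offline_worker():
    while True:
        task = redis_task.brpop("task_user_rank", timeout=30)  # (key, payload) or None
        if not task:
            continue
        (task_id, search_type, pre, during, start_time, keyword, search_key,
         sort_norm, sort_scope, time, isall, number) = json.loads(task[1])
        key_words_search(task_id, search_type, pre, during, start_time, keyword,
                         search_key, sort_norm, sort_scope, time, isall, number)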
def create_sensing_task(task):
    item = dict()
    item['task_name'] = task['task_name']
    item['social_sensors'] = task['social_sensors']
    es.index(index=index_sensing, doc_type=type_sensing, body=item)
def key_words_search(task_id, search_type, pre, during, start_time, keyword_list,
                     search_key='', sort_norm='', sort_scope='', time=7, isall=False, number=100):
    number = int(number)
    should = []
    for key in keyword_list:
        if search_type == "hashtag":
            should.append({"prefix": {"text": "#" + key + "#"}})
        else:
            should.append({"wildcard": {"text": "*" + key + "*"}})
    index_list = []
    date = ts2datetime(start_time)
    index_name = pre + date
    while during:
        if es_flow_text.indices.exists(index=index_name):
            index_list.append(index_name)
        start_time = start_time + DAY
        date = ts2datetime(start_time)
        index_name = pre + date
        during -= 1
    print index_list
    uid_set = set()
    text_results = []
    sorted_text_results = []
    query_body = {
        "query": {"bool": {"must": should}},
        "sort": {"user_fansnum": {"order": "desc"}},
        "size": 5000
    }
    results = es_flow_text.search(index=index_list, doc_type='text', body=query_body,
                                  _source=False,
                                  fields=["uid", "user_fansnum", "text", "message_type",
                                          "sentiment", "timestamp", "geo", "retweeted",
                                          "comment"])["hits"]["hits"]
    id_index = 0
    index_list = []
    un_uid_list = []
    for item in results:
        if item['fields']['uid'][0] not in uid_set:
            uid_set.add(item['fields']['uid'][0])
            un_uid_list.append(item['fields']['uid'][0])
            index_list.append(id_index)
        id_index += 1
    #get_all_filed(sort_norm, time)
    uid_list = []
    print "un_uid_list: ", len(un_uid_list)
    portrait_list = []
    count = 0
    in_index = 0
    if not isall and un_uid_list:  # restrict to users already in the portrait library
        portrait_results = es_user_portrait.mget(index=USER_INDEX_NAME, doc_type=USER_INDEX_TYPE,
                                                 body={"ids": un_uid_list}, _source=False,
                                                 fields=['uname'])["docs"]
        for item in portrait_results:
            if item["found"]:
                portrait_list.append(item['_id'])
                nick_name = item['fields']['uname'][0]
                if nick_name == 'unknown':
                    nick_name = item['_id']
                index = index_list[in_index]
                weibo_url = weiboinfo2url(results[index]['fields']['uid'][0], results[index]['_id'])
                text_results.append([results[index]['fields']['uid'][0],
                                     results[index]['fields']['user_fansnum'][0],
                                     results[index]['fields']['text'][0],
                                     results[index]['fields']['message_type'][0],
                                     results[index]['fields']['sentiment'][0],
                                     ts2date(results[index]['fields']['timestamp'][0]),
                                     results[index]['fields']['geo'][0],
                                     results[index]['fields']['retweeted'][0],
                                     results[index]['fields']['comment'][0],
                                     nick_name, weibo_url])
                count += 1
                if count == number:
                    break
                print "portrait_len, ", len(portrait_list)
            in_index += 1
        if portrait_list:
            uid_list = in_sort_filter(time, sort_norm, sort_scope, None, portrait_list,
                                      True, number)  # sort
            for iter_uid in uid_list:
                iter_index = portrait_list.index(iter_uid)
                sorted_text_results.append(text_results[iter_index])
    elif un_uid_list:
        profile_result = es_user_profile.mget(index="weibo_user", doc_type="user",
                                              body={"ids": un_uid_list},
                                              fields=['nick_name'])["docs"]
        for i in range(len(profile_result)):
            index = index_list[i]
            try:
                nick_name = profile_result[i]['fields']['nick_name'][0]
            except:
                nick_name = un_uid_list[i]
            item = results[index]
            weibo_url = weiboinfo2url(item['fields']['uid'][0], results[index]['_id'])
            text_results.append([item['fields']['uid'][0], item['fields']['user_fansnum'][0],
                                 item['fields']['text'][0], item['fields']['message_type'][0],
                                 item['fields']['sentiment'][0],
                                 ts2date(item['fields']['timestamp'][0]),
                                 results[index]['fields']['geo'][0],
                                 results[index]['fields']['retweeted'][0],
                                 results[index]['fields']['comment'][0],
                                 nick_name, weibo_url])
            if i == number:
                break
        uid_list = all_sort_filter(un_uid_list[:number], sort_norm, time, True, number)
        sorted_text_results = []
        f = open("small.txt", "wb")
        for iter_uid in uid_list:
            iter_index = un_uid_list.index(iter_uid)
            f.write(str(iter_uid) + "\n")
            sorted_text_results.append(text_results[iter_index])
        f.close()
    print "filter_uid_list: ", len(uid_list)
    if uid_list:
        results = make_up_user_info(uid_list, isall, time, sort_norm)
    else:
        results = []
    print "results: ", len(results)
    # update the task status
    task_detail = es_user_portrait.get(index=USER_RANK_KEYWORD_TASK_INDEX,
                                       doc_type=USER_RANK_KEYWORD_TASK_TYPE, id=task_id)
    item = task_detail['_source']
    item['status'] = 1
    item['result'] = json.dumps(results)
    item['text_results'] = json.dumps(sorted_text_results)
    item['number'] = len(results)
    es_user_portrait.index(index=USER_RANK_KEYWORD_TASK_INDEX,
                           doc_type=USER_RANK_KEYWORD_TASK_TYPE, id=task_id, body=item)
    return "1"
def social_sensing(task_detail):
    # task fields: task name, sensors, stop time, previous status, creator, timestamp
    task_name = task_detail[0]
    social_sensors = task_detail[1]
    stop_time = task_detail[2]
    forward_warning_status = task_detail[3]
    create_by = task_detail[4]
    ts = int(task_detail[5])
    new = int(task_detail[6])
    print ts2date(ts)

    # PART 1
    forward_result = get_forward_numerical_info(task_name, ts, create_by)
    # original/retweeted weibo mid lists from the previous time range
    forward_origin_weibo_list = query_mid_list(ts - time_interval, social_sensors, forward_time_range)
    forward_retweeted_weibo_list = query_mid_list(ts - time_interval, social_sensors, forward_time_range, 3)
    # original weibo mid list in the current window
    current_mid_list = query_mid_list(ts, social_sensors, time_interval)
    current_retweeted_mid_list = query_mid_list(ts, social_sensors, time_interval, 3)
    all_mid_list = []
    all_mid_list.extend(current_mid_list)
    all_mid_list.extend(current_retweeted_mid_list)
    all_mid_list.extend(forward_origin_weibo_list)
    all_mid_list.extend(forward_retweeted_weibo_list)
    all_origin_list = []
    all_origin_list.extend(current_mid_list)
    all_origin_list.extend(forward_origin_weibo_list)
    all_retweeted_list = []
    all_retweeted_list.extend(current_retweeted_mid_list)
    all_retweeted_list.extend(forward_retweeted_weibo_list)  # mid/root-mid of retweeted weibo
    print "all mid list: ", len(all_mid_list)
    #print "all_origin_list", all_origin_list
    #print "all_retweeted_list", all_retweeted_list

    # retweet/comment counts in the current window, aggregated by message_type
    statistics_count = query_related_weibo(ts, all_mid_list, time_interval)
    if all_origin_list:
        origin_weibo_detail = query_hot_weibo(ts, all_origin_list, time_interval)  # details of original weibo
    else:
        origin_weibo_detail = {}
    if all_retweeted_list:
        retweeted_weibo_detail = query_hot_weibo(ts, all_retweeted_list, time_interval)  # details of retweeted weibo
    else:
        retweeted_weibo_detail = {}
    current_total_count = statistics_count['total_count']  # total weibo count in the current window
    current_retweeted_count = statistics_count['retweeted']
    current_comment_count = statistics_count['comment']

    # PART 2
    # sentiment distribution (positive/neutral/sad/anger) in the current window
    # sentiment_dict = {"0": "neutral", "1": "positive", "2": "sad", "3": "anger"}
    sentiment_count = {"0": 0, "1": 0, "2": 0, "3": 0}
    search_results = aggregation_sentiment_related_weibo(ts, all_mid_list, time_interval)
    sentiment_count = search_results
    print "sentiment_count: ", sentiment_count
    negetive_key = ["2", "3", "4", "5", "6"]
    negetive_count = 0
    for key in negetive_key:
        negetive_count += sentiment_count[key]

    # aggregate important users in the current window
    important_uid_list = []
    datetime = ts2datetime(ts - time_interval)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    if exist_es:
        search_results = get_important_user(ts, all_mid_list, time_interval)
        important_uid_list = search_results
    # match the obtained uid_list against the portrait library
    if important_uid_list:
        important_results = es_user_portrait.mget(index=portrait_index_name,
                                                  doc_type=portrait_index_type,
                                                  body={"ids": important_uid_list})['docs']
    else:
        important_results = []
    filter_important_list = []  # uid_list
    if important_results:
        for item in important_results:
            if item['found']:
                #if item['_source']['importance'] > IMPORTANT_USER_THRESHOULD:
                filter_important_list.append(item['_id'])

    # event sensing
    burst_reason = signal_nothing_variation
    warning_status = signal_nothing
    finish = unfinish_signal  # "0"
    process_status = "1"
    if forward_result[0]:
        # moving-average check for whether an event occurred
        mean_count = forward_result[1]
        std_count = forward_result[2]
        mean_sentiment = forward_result[3]
        std_sentiment = forward_result[4]
        if mean_count >= MEAN_COUNT and current_total_count > mean_count + 1.96 * std_count \
                or current_total_count >= len(all_mid_list) * AVERAGE_COUNT:
            # anomaly detected
            if forward_warning_status == signal_brust:  # an event already exists; switch to tracking
                warning_status = signal_track
            else:
                warning_status = signal_brust
            burst_reason += signal_count_varition  # count anomaly
        if negetive_count > mean_sentiment + 1.96 * std_sentiment and mean_sentiment >= MEAN_COUNT \
                or negetive_count >= len(all_mid_list) * AVERAGE_COUNT:
            warning_status = signal_brust
            burst_reason += signal_sentiment_varition  # negative-sentiment anomaly; "12" means both
            if forward_warning_status == signal_brust:  # an event already exists; switch to tracking
                warning_status = signal_track
    if int(stop_time) <= ts:  # check whether the task has finished
        finish = finish_signal
        process_status = "0"

    # sensed events, from all_mid_list
    tmp_burst_reason = burst_reason
    topic_list = []
    sensitive_text_list = []
    mid_value = dict()
    duplicate_dict = dict()
    sensitive_words_dict = dict()
    sensitive_weibo_detail = {}
    # start once an event occurs
    #if warning_status:
    if 1:
        index_list = []
        important_words = []
        datetime_1 = ts2datetime(ts)
        index_name_1 = flow_text_index_name_pre + datetime_1
        exist_es = es_text.indices.exists(index=index_name_1)
        if exist_es:
            index_list.append(index_name_1)
        datetime_2 = ts2datetime(ts - DAY)
        index_name_2 = flow_text_index_name_pre + datetime_2
        exist_es = es_text.indices.exists(index=index_name_2)
        if exist_es:
            index_list.append(index_name_2)
        if index_list and all_mid_list:
            query_body = {
                "query": {
                    "filtered": {
                        "filter": {
                            "terms": {"mid": all_mid_list}
                        }
                    }
                },
                "size": 5000
            }
            search_results = es_text.search(index=index_list, doc_type="text",
                                            body=query_body)['hits']['hits']
            tmp_sensitive_warning = ""
            text_dict = dict()           # text info
            mid_value = dict()           # per-text value
            duplicate_dict = dict()      # duplicate map
            portrait_dict = dict()       # background info
            classify_text_dict = dict()  # texts for classification
            classify_uid_list = []
            duplicate_text_list = []
            sensitive_words_dict = dict()
            if search_results:
                for item in search_results:
                    iter_uid = item['_source']['uid']
                    iter_mid = item['_source']['mid']
                    iter_text = item['_source']['text'].encode('utf-8', 'ignore')
                    iter_sensitive = item['_source'].get('sensitive', 0)
                    duplicate_text_list.append({"_id": iter_mid, "title": "", "content": iter_text})
                    if iter_sensitive:
                        tmp_sensitive_warning = signal_sensitive_variation  # weibo that hit sensitive words
                        sensitive_words_dict[iter_mid] = iter_sensitive
                    keywords_dict = json.loads(item['_source']['keywords_dict'])
                    personal_keywords_dict = dict()
                    for k, v in keywords_dict.iteritems():
                        k = k.encode('utf-8', 'ignore')
                        personal_keywords_dict[k] = v
                    classify_text_dict[iter_mid] = personal_keywords_dict
                    classify_uid_list.append(iter_uid)
                # deduplicate
                if duplicate_text_list:
                    dup_results = duplicate(duplicate_text_list)
                    for item in dup_results:
                        if item['duplicate']:
                            duplicate_dict[item['_id']] = item['same_from']
                # classify
                if classify_text_dict:
                    classify_results = topic_classfiy(classify_uid_list, classify_text_dict)
                    mid_value = dict()
                    #print "classify_results: ", classify_results
                    for k, v in classify_results.iteritems():  # mid:value
                        mid_value[k] = topic_value_dict[v[0]]
            if tmp_sensitive_warning:
                warning_status = signal_brust
                burst_reason += signal_sensitive_variation
            sensitive_weibo_detail = {}
            if sensitive_words_dict:
                sensitive_mid_list = sensitive_words_dict.keys()
                sensitive_weibo_detail = query_hot_weibo(ts, sensitive_mid_list, time_interval)

    results = dict()
    results['mid_topic_value'] = json.dumps(mid_value)
    results['duplicate_dict'] = json.dumps(duplicate_dict)
    results['sensitive_words_dict'] = json.dumps(sensitive_words_dict)
    results['sensitive_weibo_detail'] = json.dumps(sensitive_weibo_detail)
    results['origin_weibo_number'] = len(all_origin_list)
    results['retweeted_weibo_number'] = len(all_retweeted_list)
    results['origin_weibo_detail'] = json.dumps(origin_weibo_detail)
    results['retweeted_weibo_detail'] = json.dumps(retweeted_weibo_detail)
    results['retweeted_weibo_count'] = current_retweeted_count
    results['comment_weibo_count'] = current_comment_count
    results['weibo_total_number'] = current_total_count
    results['sentiment_distribution'] = json.dumps(sentiment_count)
    results['important_users'] = json.dumps(filter_important_list)
    results['unfilter_users'] = json.dumps(important_uid_list)
    results['burst_reason'] = tmp_burst_reason
    results['timestamp'] = ts
    #results['clustering_topic'] = json.dumps(topic_list)
    # store this window's info in ES
    doctype = create_by + '-' + task_name
    es_user_portrait.index(index=index_sensing_task, doc_type=doctype, id=ts, body=results)

    # update the manage-social-sensing ES record
    if not new:
        temporal_result = es_user_portrait.get(index=index_manage_social_task,
                                               doc_type=task_doc_type, id=doctype)['_source']
        temporal_result['warning_status'] = warning_status
        temporal_result['burst_reason'] = tmp_burst_reason
        temporal_result['finish'] = finish
        temporal_result['processing_status'] = process_status
        history_status = json.loads(temporal_result['history_status'])
        history_status.append([ts, task_name, warning_status])
        temporal_result['history_status'] = json.dumps(history_status)
        es_user_portrait.index(index=index_manage_social_task, doc_type=task_doc_type,
                               id=doctype, body=temporal_result)
    else:
        print "test"
    return "1"
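# A standalone sketch of the moving-average burst rule used above: a window is
# anomalous when its count leaves the 1.96-sigma band around the moving mean
# (given enough history) or crosses an absolute per-mid floor. The default
# thresholds here are placeholders for the module-level MEAN_COUNT/AVERAGE_COUNT.
def is_count_burst(current_count, mean_count, std_count, n_mids,
                   mean_floor=100, average_floor=10):
    # e.g. is_count_burst(500, 200, 50, 20) -> True, since 500 > 200 + 1.96 * 50
    over_band = mean_count >= mean_floor and current_count > mean_count + 1.96 * std_count
    over_floor = current_count >= n_mids * average_floor
    return over_band or over_floor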
def sensors_keywords_detection(task_detail):
    task_name = task_detail[0]
    social_sensors = task_detail[1]
    keywords_list = task_detail[2]
    sensitive_words = task_detail[3]
    stop_time = task_detail[4]
    forward_warning_status = task_detail[5]
    ts = task_detail[7]

    forward_result = get_forward_numerical_info(task_name, ts, keywords_list)
    # 1. all keyword-related original weibo published by the sensors in the previous 12 hours
    forward_origin_weibo_list = query_mid_list(ts - time_interval, keywords_list,
                                               forward_time_range, 1, social_sensors)
    # 2. original weibo in the current window
    current_mid_list = query_mid_list(ts, keywords_list, time_interval, 1, social_sensors)
    all_mid_list = []
    all_mid_list.extend(current_mid_list)
    all_mid_list.extend(forward_origin_weibo_list)
    all_mid_list = list(set(all_mid_list))
    print len(all_mid_list)

    # 3. retweet/comment counts in the current window for the current and previous
    #    original weibo, aggregated by message_type
    statistics_count = query_related_weibo(ts, all_mid_list, time_interval,
                                           keywords_list, 1, social_sensors)
    current_total_count = statistics_count['total_count']  # total weibo count in the current window
    print "current all weibo: ", statistics_count
    current_origin_count = statistics_count['origin']
    current_retweeted_count = statistics_count['retweeted']
    current_comment_count = statistics_count['comment']

    # 4. sentiment distribution (positive/neutral/sad/anger) in the current window
    sentiment_count = {"0": 0, "1": 0, "2": 0, "3": 0}
    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts - time_interval)
    if datetime == datetime_1:
        index_name = flow_text_index_name_pre + datetime
    else:
        index_name = flow_text_index_name_pre + datetime_1
    exist_es = es_text.indices.exists(index_name)
    if exist_es:
        search_results = aggregation_sentiment_related_weibo(ts, all_mid_list,
                                                             time_interval, keywords_list, 1)
        sentiment_count = search_results
    print "sentiment_count: ", sentiment_count
    negetive_count = sentiment_count['2'] + sentiment_count['3']

    # 5. which social sensors took part in the discussion
    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [
                            {"range": {
                                "timestamp": {
                                    "gte": ts - time_interval,
                                    "lt": ts
                                }
                            }},
                            {"terms": {"uid": social_sensors}}
                        ],
                        "should": [
                            {"terms": {"root_mid": all_mid_list}},
                            {"terms": {"mid": all_mid_list}}
                        ]
                    }
                }
            }
        },
        "size": 10000
    }
    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts - time_interval)
    if datetime == datetime_1:
        index_name = flow_text_index_name_pre + datetime
    else:
        index_name = flow_text_index_name_pre + datetime_1
    search_results = es_text.search(index=index_name, doc_type=flow_text_index_type,
                                    body=query_body)['hits']['hits']
    attend_users = []
    if search_results:
        for item in search_results:
            attend_users.append(item['_source']['uid'])
    important_users = list(set(attend_users))
    print "important users", important_users

    # 6. sensitive-word detection: any sensitive word appearing in a sensor's weibo
    #    triggers a warning. NB: sensitive words are a risky setting.
    sensitive_origin_weibo_number = 0
    sensitive_retweeted_weibo_number = 0
    sensitive_comment_weibo_number = 0
    sensitive_total_weibo_number = 0
    if sensitive_words:
        query_sensitive_body = {
            "query": {
                "filtered": {
                    "filter": {
                        "bool": {
                            "must": [
                                {"range": {
                                    "timestamp": {
                                        "gte": ts - time_interval,
                                        "lt": ts
                                    }
                                }},
                                {"terms": {"keywords_string": sensitive_words}},
                                {"terms": {"uid": social_sensors}}
                            ]
                        }
                    }
                }
            },
            "aggs": {
                "all_list": {
                    "terms": {"field": "message_type"}
                }
            }
        }
        sensitive_results = es_text.search(index=index_name, doc_type=flow_text_index_type,
                                           body=query_sensitive_body)['aggregations']['all_list']["buckets"]
        if sensitive_results:
            for item in sensitive_results:
                if int(item["key"]) == 1:
                    sensitive_origin_weibo_number = item['doc_count']
                elif int(item["key"]) == 2:
                    sensitive_comment_weibo_number = item['doc_count']
                elif int(item["key"]) == 3:
                    sensitive_retweeted_weibo_number = item["doc_count"]
                else:
                    pass
            sensitive_total_weibo_number = (sensitive_origin_weibo_number +
                                            sensitive_comment_weibo_number +
                                            sensitive_retweeted_weibo_number)

    burst_reason = signal_nothing_variation
    warning_status = signal_nothing
    finish = unfinish_signal  # "0"

    if sensitive_total_weibo_number:  # sensitive weibo count anomaly
        print "======================"
        if forward_warning_status == signal_brust:  # an event already exists; switch to tracking
            warning_status = signal_track
        else:
            warning_status = signal_brust
        burst_reason = signal_sensitive_variation

    if forward_result[0]:
        # moving-average check for whether an event occurred
        mean_count = forward_result[1]
        std_count = forward_result[2]
        mean_sentiment = forward_result[3]
        std_sentiment = forward_result[4]
        if current_total_count > mean_count + 1.96 * std_count:  # anomaly detected
            print "====================================================="
            if forward_warning_status == signal_brust:  # an event already exists; switch to tracking
                warning_status = signal_track
            else:
                warning_status = signal_brust
            burst_reason += signal_count_varition  # count anomaly
        if negetive_count > mean_sentiment + 1.96 * std_sentiment:
            warning_status = signal_brust
            burst_reason += signal_sentiment_varition  # negative-sentiment anomaly; "12" means both
            if forward_warning_status == signal_brust:  # an event already exists; switch to tracking
                warning_status = signal_track

    if int(stop_time) <= ts:  # check whether the task has finished
        finish = finish_signal

    tmp_burst_reason = burst_reason
    topic_list = []

    # 7. sensed events, from all_mid_list
    if burst_reason:  # something happened
        text_list = []
        mid_set = set()
        if signal_sensitive_variation in burst_reason:
            query_sensitive_body = {
                "query": {
                    "filtered": {
                        "filter": {
                            "bool": {
                                "must": [
                                    {"range": {
                                        "timestamp": {
                                            "gte": ts - time_interval,
                                            "lt": ts
                                        }
                                    }},
                                    {"terms": {"keywords_string": sensitive_words}}
                                ]
                            }
                        }
                    }
                },
                "size": 10000
            }
            if social_sensors:
                query_sensitive_body['query']['filtered']['filter']['bool']['must'].append(
                    {"terms": {"uid": social_sensors}})
            sensitive_results = es_text.search(index=index_name, doc_type=flow_text_index_type,
                                               body=query_sensitive_body)['hits']["hits"]
            if sensitive_results:
                for item in sensitive_results:
                    iter_mid = item['_source']['mid']
                    iter_text = item['_source']['text']
                    temp_dict = dict()
                    temp_dict["mid"] = iter_mid
                    temp_dict["text"] = iter_text
                    if iter_mid not in mid_set:
                        text_list.append(temp_dict)  # cleaned text: mid, text
                        mid_set.add(iter_mid)
            burst_reason = burst_reason.replace(signal_sensitive_variation, "")
        if burst_reason and all_mid_list:
            sensing_text = es_text.mget(index=index_name, doc_type=flow_text_index_type,
                                        body={"ids": all_mid_list},
                                        fields=["mid", "text"])["docs"]
            if sensing_text:
                for item in sensing_text:
                    if item['found']:
                        iter_mid = item["fields"]["mid"][0]
                        iter_text = item["fields"]["text"][0]
                        temp_dict = dict()
                        temp_dict["mid"] = iter_mid
                        temp_dict["text"] = iter_text
                        if iter_mid not in mid_set:
                            text_list.append(temp_dict)
                            mid_set.add(iter_mid)
        if len(text_list) == 1:
            top_word = freq_word(text_list[0])
            topic_list = top_word.keys()
        elif len(text_list) == 0:
            topic_list = []
            tmp_burst_reason = ""  # no related weibo; reset
            print "***********************************"
        else:
            feature_words, input_word_dict = tfidf(text_list)  # build feature words and input data
            word_label, evaluation_results = kmeans(feature_words, text_list)  # clustering
            inputs = text_classify(text_list, word_label, feature_words)
            clustering_topic = cluster_evaluation(inputs)
            sorted_dict = sorted(clustering_topic.items(), key=lambda x: x[1], reverse=True)[0:5]
            topic_list = []
            if sorted_dict:
                for item in sorted_dict:
                    topic_list.append(word_label[item[0]])
        print "topic_list:", topic_list

    if not topic_list:
        tmp_burst_reason = signal_nothing_variation
        warning_status = signal_nothing

    results = dict()
    results['sensitive_origin_weibo_number'] = sensitive_origin_weibo_number
    results['sensitive_retweeted_weibo_number'] = sensitive_retweeted_weibo_number
    results['sensitive_comment_weibo_number'] = sensitive_comment_weibo_number
    results['sensitive_weibo_total_number'] = sensitive_total_weibo_number
    results['origin_weibo_number'] = current_origin_count
    results['retweeted_weibo_number'] = current_retweeted_count
    results['comment_weibo_number'] = current_comment_count
    results['weibo_total_number'] = current_total_count
    results['sentiment_distribution'] = json.dumps(sentiment_count)
    results['important_users'] = json.dumps(important_users)
    results['burst_reason'] = tmp_burst_reason
    results['timestamp'] = ts
    if tmp_burst_reason:
        results["clustering_topic"] = json.dumps(topic_list)
    # store this window's info in ES
    doctype = task_name
    es_user_portrait.index(index=index_sensing_task, doc_type=doctype, id=ts, body=results)

    # update the manage-social-sensing ES record
    temporal_result = es_user_portrait.get(index=index_manage_social_task,
                                           doc_type=task_doc_type, id=task_name)['_source']
    temporal_result['warning_status'] = warning_status
    temporal_result['burst_reason'] = tmp_burst_reason
    temporal_result['finish'] = finish
    history_status = json.loads(temporal_result['history_status'])
    history_status.append([ts, ' '.join(keywords_list), warning_status])
    temporal_result['history_status'] = json.dumps(history_status)
    es_user_portrait.index(index=index_manage_social_task, doc_type=task_doc_type,
                           id=task_name, body=temporal_result)
    return "1"
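# A hedged dispatch sketch for sensors_keywords_detection: task_detail is an
# index-addressed list (slot 6 is not read by the unpacking above); every value
# below is a placeholder.
def _demo_sensors_keywords_detection():
    task_detail = ['demo_task',        # 0: task_name
                   ['1234567890'],     # 1: social_sensors (uids)
                   ['keyword_a'],      # 2: keywords_list
                   ['sensitive_a'],    # 3: sensitive_words
                   '1378627200',       # 4: stop_time
                   '0',                # 5: forward_warning_status
                   None,               # 6: unused slot
                   1378540800]         # 7: ts
    return sensors_keywords_detection(task_detail)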
def social_sensing(task_detail):
    # task fields: task name, sensors, stop time, previous status, creator, timestamp
    task_name = task_detail[0]
    social_sensors = task_detail[1]
    stop_time = task_detail[2]
    forward_warning_status = task_detail[3]
    create_by = task_detail[4]
    ts = int(task_detail[5])
    new = int(task_detail[6])
    print ts2date(ts)

    # PART 1
    forward_result = get_forward_numerical_info(task_name, ts, create_by)
    # original/retweeted weibo mid lists from the previous time range
    forward_origin_weibo_list = query_mid_list(ts - time_interval, social_sensors, forward_time_range)
    forward_retweeted_weibo_list = query_mid_list(ts - time_interval, social_sensors, forward_time_range, 3)
    # original weibo mid list in the current window
    current_mid_list = query_mid_list(ts, social_sensors, time_interval)
    current_retweeted_mid_list = query_mid_list(ts, social_sensors, time_interval, 3)
    all_mid_list = []
    all_mid_list.extend(current_mid_list)
    all_mid_list.extend(current_retweeted_mid_list)
    all_mid_list.extend(forward_origin_weibo_list)
    all_mid_list.extend(forward_retweeted_weibo_list)
    all_origin_list = []
    all_origin_list.extend(current_mid_list)
    all_origin_list.extend(forward_origin_weibo_list)
    all_retweeted_list = []
    all_retweeted_list.extend(current_retweeted_mid_list)
    all_retweeted_list.extend(forward_retweeted_weibo_list)  # mid/root-mid of retweeted weibo
    print "all mid list: ", len(all_mid_list)
    #print "all_origin_list", all_origin_list
    #print "all_retweeted_list", all_retweeted_list

    # retweet/comment counts in the current window, aggregated by message_type
    statistics_count = query_related_weibo(ts, all_mid_list, time_interval)
    if all_origin_list:
        origin_weibo_detail = query_hot_weibo(ts, all_origin_list, time_interval)  # details of original weibo
    else:
        origin_weibo_detail = {}
    if all_retweeted_list:
        retweeted_weibo_detail = query_hot_weibo(ts, all_retweeted_list, time_interval)  # details of retweeted weibo
    else:
        retweeted_weibo_detail = {}
    current_total_count = statistics_count['total_count']  # total weibo count in the current window
    current_retweeted_count = statistics_count['retweeted']
    current_comment_count = statistics_count['comment']

    # PART 2
    # sentiment distribution (positive/neutral/sad/anger) in the current window
    # sentiment_dict = {"0": "neutral", "1": "positive", "2": "sad", "3": "anger"}
    sentiment_count = {"0": 0, "1": 0, "2": 0, "3": 0}
    search_results = aggregation_sentiment_related_weibo(ts, all_mid_list, time_interval)
    sentiment_count = search_results
    print "sentiment_count: ", sentiment_count
    negetive_key = ["2", "3", "4", "5", "6"]
    negetive_count = 0
    for key in negetive_key:
        negetive_count += sentiment_count[key]

    # aggregate important users in the current window
    important_uid_list = []
    datetime = ts2datetime(ts - time_interval)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    if exist_es:
        search_results = get_important_user(ts, all_mid_list, time_interval)
        important_uid_list = search_results
    # match the obtained uid_list against the portrait library
    if important_uid_list:
        important_results = es_user_portrait.mget(index=portrait_index_name,
                                                  doc_type=portrait_index_type,
                                                  body={"ids": important_uid_list})['docs']
    else:
        important_results = []
    filter_important_list = []  # uid_list
    if important_results:
        for item in important_results:
            if item['found']:
                #if item['_source']['importance'] > IMPORTANT_USER_THRESHOULD:
                filter_important_list.append(item['_id'])

    # event sensing
    burst_reason = signal_nothing_variation
    warning_status = signal_nothing
    finish = unfinish_signal  # "0"
    process_status = "1"
    if forward_result[0]:
        # moving-average check for whether an event occurred
        mean_count = forward_result[1]
        std_count = forward_result[2]
        mean_sentiment = forward_result[3]
        std_sentiment = forward_result[4]
        if mean_count >= MEAN_COUNT and current_total_count > mean_count + 1.96 * std_count \
                or current_total_count >= len(all_mid_list) * AVERAGE_COUNT:
            # anomaly detected
            if forward_warning_status == signal_brust:  # an event already exists; switch to tracking
                warning_status = signal_track
            else:
                warning_status = signal_brust
            burst_reason += signal_count_varition  # count anomaly
        if negetive_count > mean_sentiment + 1.96 * std_sentiment and mean_sentiment >= MEAN_COUNT \
                or negetive_count >= len(all_mid_list) * AVERAGE_COUNT:
            warning_status = signal_brust
            burst_reason += signal_sentiment_varition  # negative-sentiment anomaly; "12" means both
            if forward_warning_status == signal_brust:  # an event already exists; switch to tracking
                warning_status = signal_track
    if int(stop_time) <= ts:  # check whether the task has finished
        finish = finish_signal
        process_status = "0"

    # sensed events, from all_mid_list
    tmp_burst_reason = burst_reason
    topic_list = []
    sensitive_text_list = []
    sensitive_words_dict = dict()
    sensitive_weibo_detail = {}
    # start once an event occurs
    #if warning_status:
    if 1:
        index_list = []
        important_words = []
        datetime_1 = ts2datetime(ts)
        index_name_1 = flow_text_index_name_pre + datetime_1
        exist_es = es_text.indices.exists(index=index_name_1)
        if exist_es:
            index_list.append(index_name_1)
        datetime_2 = ts2datetime(ts - DAY)
        index_name_2 = flow_text_index_name_pre + datetime_2
        exist_es = es_text.indices.exists(index=index_name_2)
        if exist_es:
            index_list.append(index_name_2)
        if index_list and all_mid_list:
            query_body = {
                "query": {
                    "filtered": {
                        "filter": {
                            "terms": {"mid": all_mid_list}
                        }
                    }
                },
                "size": 2000
            }
            search_results = es_text.search(index=index_list, doc_type="text",
                                            body=query_body)['hits']['hits']
            text_list = []
            tmp_sensitive_warning = ""
            sensitive_words_dict = dict()
            if search_results:
                for item in search_results:
                    iter_mid = item['_source']['mid']
                    iter_text = item['_source']['text']
                    iter_sensitive = item['_source'].get('sensitive', 0)
                    if iter_sensitive:
                        tmp_sensitive_warning = signal_sensitive_variation  # weibo that hit sensitive words
                        sensitive_words_dict[iter_mid] = iter_sensitive
                    temp_dict = dict()
                    temp_dict["mid"] = iter_mid
                    temp_dict["text"] = iter_text
                    text_list.append(temp_dict)
            if tmp_sensitive_warning:
                warning_status = signal_brust
                burst_reason += signal_sensitive_variation
            sensitive_weibo_detail = {}
            if sensitive_words_dict:
                sensitive_mid_list = sensitive_words_dict.keys()
                sensitive_weibo_detail = query_hot_weibo(ts, sensitive_mid_list, time_interval)
            """
            if len(text_list) == 1:
                top_word = freq_word(text_list[0])
                topic_list = [top_word.keys()]
            elif len(text_list) == 0:
                topic_list = []
                tmp_burst_reason = ""  # no related weibo; reset
                print "no relate weibo text"
            else:
                feature_words, input_word_dict = tfidf(text_list)  # build feature words and input data
                word_label, evaluation_results = kmeans(feature_words, text_list)  # clustering
                inputs = text_classify(text_list, word_label, feature_words)
                clustering_topic = cluster_evaluation(inputs)
                print "clustering weibo topic"
                sorted_dict = sorted(clustering_topic.items(), key=lambda x: x[1], reverse=True)
                topic_list = []
                if sorted_dict:
                    for item in sorted_dict:
                        if item[0] != "other":
                            topic_list.append(word_label[item[0]])
                print "topic list: ", len(topic_list)
            """

    results = dict()
    if sensitive_weibo_detail:
        print "sensitive_weibo_detail: ", sensitive_weibo_detail
    results['sensitive_words_dict'] = json.dumps(sensitive_words_dict)
    results['sensitive_weibo_detail'] = json.dumps(sensitive_weibo_detail)
    results['origin_weibo_number'] = len(all_origin_list)
    results['retweeted_weibo_number'] = len(all_retweeted_list)
    results['origin_weibo_detail'] = json.dumps(origin_weibo_detail)
    results['retweeted_weibo_detail'] = json.dumps(retweeted_weibo_detail)
    results['retweeted_weibo_count'] = current_retweeted_count
    results['comment_weibo_count'] = current_comment_count
    results['weibo_total_number'] = current_total_count
    results['sentiment_distribution'] = json.dumps(sentiment_count)
    results['important_users'] = json.dumps(filter_important_list)
    results['unfilter_users'] = json.dumps(important_uid_list)
    results['burst_reason'] = tmp_burst_reason
    results['timestamp'] = ts
    #results['clustering_topic'] = json.dumps(topic_list)
    # store this window's info in ES
    doctype = create_by + '-' + task_name
    es_user_portrait.index(index=index_sensing_task, doc_type=doctype, id=ts, body=results)

    # update the manage-social-sensing ES record
    if not new:
        temporal_result = es_user_portrait.get(index=index_manage_social_task,
                                               doc_type=task_doc_type, id=doctype)['_source']
        temporal_result['warning_status'] = warning_status
        temporal_result['burst_reason'] = tmp_burst_reason
        temporal_result['finish'] = finish
        temporal_result['processing_status'] = process_status
        history_status = json.loads(temporal_result['history_status'])
        history_status.append([ts, task_name, warning_status])
        temporal_result['history_status'] = json.dumps(history_status)
        es_user_portrait.index(index=index_manage_social_task, doc_type=task_doc_type,
                               id=doctype, body=temporal_result)
    else:
        print "test"
    return "1"
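# A small helper sketch for the filtered-terms-over-mids query repeated across
# the social_sensing variants, in the same 1.x-style "filtered" syntax they use.
def mid_filter_query(mid_list, size=2000):
    return {
        "query": {
            "filtered": {
                "filter": {
                    "terms": {"mid": mid_list}
                }
            }
        },
        "size": size
    }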
def key_words_search(task_id, search_type, pre, during, start_time, keyword_list,
                     search_key='', sort_norm='', sort_scope='', time=1, isall=False, number=100):
    number = int(number)
    should = []
    for key in keyword_list:
        if search_type == "hashtag":
            should.append({"prefix": {"text": "#" + key + "#"}})
        else:
            should.append({"wildcard": {"text": "*" + key + "*"}})
    index_list = []
    date = ts2datetime(start_time)
    index_name = pre + date
    while during:
        if es_flow_text.indices.exists(index=index_name):
            index_list.append(index_name)
        start_time = start_time + DAY
        date = ts2datetime(start_time)
        index_name = pre + date
        during -= 1
    print index_list
    uid_set = set()
    text_results = []
    query_body = {
        "query": {"bool": {"must": should}},
        "sort": {"user_fansnum": {"order": "desc"}},
        "size": 5000
    }
    results = es_flow_text.search(index=index_list, doc_type='text', body=query_body,
                                  _source=False,
                                  fields=["uid", "user_fansnum", "text", "message_type",
                                          "sentiment", "timestamp", "geo", "retweeted",
                                          "comment"])["hits"]["hits"]
    id_index = 0
    index_list = []
    un_uid_list = []
    for item in results:
        if item['fields']['uid'][0] not in uid_set:
            uid_set.add(item['fields']['uid'][0])
            un_uid_list.append(item['fields']['uid'][0])
            index_list.append(id_index)
        id_index += 1
    uid_list = []
    print "un_uid_list: ", len(un_uid_list)
    portrait_list = []
    count = 0
    in_index = 0
    if not isall and un_uid_list:  # restrict to users already in the portrait library
        portrait_results = es_user_portrait.mget(index=USER_INDEX_NAME, doc_type=USER_INDEX_TYPE,
                                                 body={"ids": un_uid_list}, _source=False,
                                                 fields=['uname'])["docs"]
        for item in portrait_results:
            if item["found"]:
                portrait_list.append(item['_id'])
                nick_name = item['fields']['uname'][0]
                if nick_name == 'unknown':
                    nick_name = item['_id']
                index = index_list[in_index]
                weibo_url = weiboinfo2url(results[index]['fields']['uid'][0], results[index]['_id'])
                text_results.append([results[index]['fields']['uid'][0],
                                     results[index]['fields']['user_fansnum'][0],
                                     results[index]['fields']['text'][0],
                                     results[index]['fields']['message_type'][0],
                                     results[index]['fields']['sentiment'][0],
                                     ts2date(results[index]['fields']['timestamp'][0]),
                                     results[index]['fields']['geo'][0],
                                     results[index]['fields']['retweeted'][0],
                                     results[index]['fields']['comment'][0],
                                     nick_name, weibo_url])
                count += 1
                if count == number:
                    break
                print "portrait_len, ", len(portrait_list)
            in_index += 1
        if portrait_list:
            uid_list = in_sort_filter(time, sort_norm, sort_scope, None, portrait_list,
                                      True, number)  # sort
    elif un_uid_list:
        profile_result = es_user_profile.mget(index="weibo_user", doc_type="user",
                                              body={"ids": un_uid_list},
                                              fields=['nick_name'])["docs"]
        for i in range(len(profile_result)):
            index = index_list[i]
            try:
                nick_name = profile_result[i]['fields']['nick_name'][0]
            except:
                nick_name = un_uid_list[i]
            item = results[index]
            weibo_url = weiboinfo2url(item['fields']['uid'][0], results[index]['_id'])
            text_results.append([item['fields']['uid'][0], item['fields']['user_fansnum'][0],
                                 item['fields']['text'][0], item['fields']['message_type'][0],
                                 item['fields']['sentiment'][0],
                                 ts2date(item['fields']['timestamp'][0]),
                                 results[index]['fields']['geo'][0],
                                 results[index]['fields']['retweeted'][0],
                                 results[index]['fields']['comment'][0],
                                 nick_name, weibo_url])
            if i == number:
                break
        uid_list = all_sort_filter(un_uid_list[:number], sort_norm, time, True, number)
    print "filter_uid_list: ", len(uid_list)
    if uid_list:
        results = make_up_user_info(uid_list, isall, time, sort_norm)
    else:
        results = []
    print "results: ", len(results)
    # update the task status
    task_detail = es_user_portrait.get(index=USER_RANK_KEYWORD_TASK_INDEX,
                                       doc_type=USER_RANK_KEYWORD_TASK_TYPE, id=task_id)
    item = task_detail['_source']
    item['status'] = 1
    item['result'] = json.dumps(results)
    item['text_results'] = json.dumps(text_results)
    item['number'] = len(results)
    es_user_portrait.index(index=USER_RANK_KEYWORD_TASK_INDEX,
                           doc_type=USER_RANK_KEYWORD_TASK_TYPE, id=task_id, body=item)
    return "1"
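# A standalone sketch of the mget found/missing split used above, assuming the
# es_user_portrait client and USER_INDEX_* constants from this module; the uid
# values passed in are whatever the caller collected.
def split_by_portrait(uid_list):
    docs = es_user_portrait.mget(index=USER_INDEX_NAME, doc_type=USER_INDEX_TYPE,
                                 body={"ids": uid_list}, _source=False,
                                 fields=['uname'])["docs"]
    in_portrait = [d['_id'] for d in docs if d.get('found')]
    out_portrait = [d['_id'] for d in docs if not d.get('found')]
    return in_portrait, out_portrait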
def specific_keywords_burst_dection(task_detail):
    task_name = task_detail[0]
    social_sensors = task_detail[1]
    keywords_list = task_detail[2]
    sensitive_words = task_detail[3]
    stop_time = task_detail[4]
    forward_warning_status = task_detail[5]
    ts = int(task_detail[7])

    forward_result = get_forward_numerical_info(task_name, ts, keywords_list)
    # original weibo mid list from the previous time range
    forward_origin_weibo_list = query_mid_list(ts - time_interval, keywords_list, forward_time_range)
    # original weibo mid list in the current window
    current_mid_list = query_mid_list(ts, keywords_list, time_interval)
    all_mid_list = []
    all_mid_list.extend(current_mid_list)
    all_mid_list.extend(forward_origin_weibo_list)
    print "all mid list: ", len(all_mid_list)

    # retweet/comment counts in the current window for the current and previous
    # original weibo, aggregated by message_type
    statistics_count = query_related_weibo(ts, all_mid_list, time_interval, keywords_list)
    current_total_count = statistics_count['total_count']  # total weibo count in the current window
    print "current all weibo: ", statistics_count
    current_origin_count = statistics_count['origin']
    current_retweeted_count = statistics_count['retweeted']
    current_comment_count = statistics_count['comment']

    # sensitive-weibo monitoring: given sensors and sensitive words, any mention of
    # a sensitive word in a sensor's weibo is treated as a warning
    # sentiment distribution (positive/neutral/sad/anger) in the current window
    # sentiment_dict = {"0": "neutral", "1": "positive", "2": "sad", "3": "anger"}
    sentiment_count = {"0": 0, "1": 0, "2": 0, "3": 0}
    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts - time_interval)
    if datetime != datetime_1:
        index_name = flow_text_index_name_pre + datetime_1
    else:
        index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    if exist_es:
        search_results = aggregation_sentiment_related_weibo(ts, all_mid_list,
                                                             time_interval, keywords_list)
        sentiment_count = search_results
    print "sentiment_count: ", sentiment_count
    negetive_count = sentiment_count['2'] + sentiment_count['3']

    # aggregate important users in the current window
    important_uid_list = []
    if exist_es:
        #search_results = es_text.search(index=index_name, doc_type=flow_text_index_type, body=aggregation_sensor_keywords(ts-time_interval, ts, [], "root_uid", size=IMPORTANT_USER_NUMBER))['aggregations']['all_keywords']['buckets']
        search_results = query_hot_weibo(ts, all_mid_list, time_interval, keywords_list,
                                         aggregation_field="root_uid", size=100)
        important_uid_list = search_results.keys()
    if datetime != datetime_1:
        index_name_1 = flow_text_index_name_pre + datetime_1
        if es_text.indices.exists(index_name_1):
            #search_results_1 = es_text.search(index=index_name_1, doc_type=flow_text_index_type, body=aggregation_sensor_keywords(ts-time_interval, ts, [], "root_uid", size=IMPORTANT_USER_NUMBER))['aggregations']['all_keywords']['buckets']
            search_results_1 = query_hot_weibo(ts, all_mid_list, time_interval, keywords_list,
                                               aggregation_field="root_uid", size=100)
            if search_results_1:
                important_uid_list.extend(search_results_1.keys())
    # match the obtained uid_list against the portrait library
    if important_uid_list:
        important_results = es_user_portrait.mget(index=portrait_index_name,
                                                  doc_type=portrait_index_type,
                                                  body={"ids": important_uid_list})['docs']
    else:
        important_results = {}
    filter_important_list = []  # uid_list
    if important_results:
        for item in important_results:
            if item['found']:
                if item['_source']['importance'] > IMPORTANT_USER_THRESHOULD:
                    filter_important_list.append(item['_id'])
    print filter_important_list

    # 6. sensitive-word detection: any sensitive word appearing in a sensor's weibo
    #    triggers a warning
    sensitive_origin_weibo_number = 0
    sensitive_retweeted_weibo_number = 0
    sensitive_comment_weibo_number = 0
    sensitive_total_weibo_number = 0
    if sensitive_words:
        query_sensitive_body = {
            "query": {
                "filtered": {
                    "filter": {
                        "bool": {
                            "must": [
                                {"range": {
                                    "timestamp": {
                                        "gte": ts - time_interval,
                                        "lt": ts
                                    }
                                }},
                                {"terms": {"keywords_string": sensitive_words}}
                            ]
                        }
                    }
                }
            },
            "aggs": {
                "all_list": {
                    "terms": {"field": "message_type"}
                }
            }
        }
        if social_sensors:
            query_sensitive_body['query']['filtered']['filter']['bool']['must'].append(
                {"terms": {"uid": social_sensors}})
        sensitive_results = es_text.search(index=index_name, doc_type=flow_text_index_type,
                                           body=query_sensitive_body)['aggregations']['all_list']["buckets"]
        if sensitive_results:
            for item in sensitive_results:
                if int(item["key"]) == 1:
                    sensitive_origin_weibo_number = item['doc_count']
                elif int(item["key"]) == 2:
                    sensitive_comment_weibo_number = item['doc_count']
                elif int(item["key"]) == 3:
                    sensitive_retweeted_weibo_number = item["doc_count"]
                else:
                    pass
            sensitive_total_weibo_number = (sensitive_origin_weibo_number +
                                            sensitive_comment_weibo_number +
                                            sensitive_retweeted_weibo_number)

    burst_reason = signal_nothing_variation
    warning_status = signal_nothing
    finish = unfinish_signal  # "0"
    process_status = "1"

    if sensitive_total_weibo_number > WARNING_SENSITIVE_COUNT:  # sensitive weibo count anomaly
        print "======================"
        if forward_warning_status == signal_brust:  # an event already exists; switch to tracking
            warning_status = signal_track
        else:
            warning_status = signal_brust
        burst_reason = signal_sensitive_variation

    if forward_result[0]:
        # moving-average check for whether an event occurred
        mean_count = forward_result[1]
        std_count = forward_result[2]
        mean_sentiment = forward_result[3]
        std_sentiment = forward_result[4]
        if current_total_count > mean_count + 1.96 * std_count:  # anomaly detected
            print "====================================================="
            if forward_warning_status == signal_brust:  # an event already exists; switch to tracking
                warning_status = signal_track
            else:
                warning_status = signal_brust
            burst_reason += signal_count_varition  # count anomaly
        if negetive_count > mean_sentiment + 1.96 * std_sentiment:
            warning_status = signal_brust
            burst_reason += signal_sentiment_varition  # negative-sentiment anomaly; "12" means both
            if forward_warning_status == signal_brust:  # an event already exists; switch to tracking
                warning_status = signal_track

    if int(stop_time) <= ts:  # check whether the task has finished
        finish = finish_signal
        process_status = "0"

    # 7. sensed events, from all_mid_list
    tmp_burst_reason = burst_reason
    topic_list = []
    # if sensitive weibo appeared, aggregate the sensitive weibo and strip that
    # signal; otherwise aggregate ordinary weibo
    if burst_reason:  # something happened
        text_list = []
        mid_set = set()
        if signal_sensitive_variation in burst_reason:
            query_sensitive_body = {
                "query": {
                    "filtered": {
                        "filter": {
                            "bool": {
                                "must": [
                                    {"range": {
                                        "timestamp": {
                                            "gte": ts - time_interval,
                                            "lt": ts
                                        }
                                    }},
                                    {"terms": {"keywords_string": sensitive_words}}
                                ]
                            }
                        }
                    }
                },
                "size": 10000
            }
            if social_sensors:
                query_sensitive_body['query']['filtered']['filter']['bool']['must'].append(
                    {"terms": {"uid": social_sensors}})
            sensitive_results = es_text.search(index=index_name, doc_type=flow_text_index_type,
                                               body=query_sensitive_body)['hits']['hits']
            if sensitive_results:
                for item in sensitive_results:
                    iter_mid = item['_source']['mid']
                    iter_text = item['_source']['text']
                    temp_dict = dict()
                    temp_dict["mid"] = iter_mid
                    temp_dict["text"] = iter_text
                    if iter_mid not in mid_set:
                        text_list.append(temp_dict)  # cleaned text: mid, text
                        mid_set.add(iter_mid)
            burst_reason = burst_reason.replace(signal_sensitive_variation, "")
        current_origin_mid_list = query_mid_list(ts, keywords_list, time_interval, 1)
        print "current_origin_mid_list:", len(current_origin_mid_list)
        if burst_reason and current_mid_list:
            origin_sensing_text = es_text.mget(index=index_name, doc_type=flow_text_index_type,
                                               body={"ids": current_origin_mid_list},
                                               fields=["mid", "text"])["docs"]
            if origin_sensing_text:
                for item in origin_sensing_text:
                    if item["found"]:
                        iter_mid = item["fields"]["mid"][0]
                        iter_text = item["fields"]["text"][0]
                        temp_dict = dict()
                        temp_dict["mid"] = iter_mid
                        temp_dict["text"] = iter_text
                        if iter_mid not in mid_set:
                            text_list.append(temp_dict)  # cleaned text: mid, text
                            mid_set.add(iter_mid)
        if len(text_list) == 1:
            top_word = freq_word(text_list[0])
            topic_list = [top_word.keys()]
        elif len(text_list) == 0:
            topic_list = []
            tmp_burst_reason = ""  # no related weibo; reset
            print "***********************************"
        else:
            feature_words, input_word_dict = tfidf(text_list)  # build feature words and input data
            word_label, evaluation_results = kmeans(feature_words, text_list)  # clustering
            inputs = text_classify(text_list, word_label, feature_words)
            clustering_topic = cluster_evaluation(inputs)
            sorted_dict = sorted(clustering_topic.items(), key=lambda x: x[1], reverse=True)
            topic_list = []
            if sorted_dict:
                for item in sorted_dict:
                    topic_list.append(word_label[item[0]])
        print "topic_list, ", topic_list

    if not topic_list:
        warning_status = signal_nothing
        tmp_burst_reason = signal_nothing_variation

    results = dict()
    results['origin_weibo_number'] = current_origin_count
    results['retweeted_weibo_number'] = current_retweeted_count
    results['comment_weibo_number'] = current_comment_count
    results['weibo_total_number'] = current_total_count
    results['sensitive_origin_weibo_number'] = sensitive_origin_weibo_number
    results['sensitive_retweeted_weibo_number'] = sensitive_retweeted_weibo_number
    results['sensitive_comment_weibo_number'] = sensitive_comment_weibo_number
    results['sensitive_weibo_total_number'] = sensitive_total_weibo_number
    results['sentiment_distribution'] = json.dumps(sentiment_count)
    results['important_users'] = json.dumps(filter_important_list)
    results['burst_reason'] = tmp_burst_reason
    results['timestamp'] = ts
    if tmp_burst_reason:
        results['clustering_topic'] = json.dumps(topic_list)
    # store this window's info in ES
    doctype = task_name
    es_user_portrait.index(index=index_sensing_task, doc_type=doctype, id=ts, body=results)

    # update the manage-social-sensing ES record
    temporal_result = es_user_portrait.get(index=index_manage_social_task,
                                           doc_type=task_doc_type, id=task_name)['_source']
    temporal_result['warning_status'] = warning_status
    temporal_result['burst_reason'] = tmp_burst_reason
    temporal_result['finish'] = finish
    temporal_result['processing_status'] = process_status
    history_status = json.loads(temporal_result['history_status'])
    history_status.append([ts, ' '.join(keywords_list), warning_status])
    temporal_result['history_status'] = json.dumps(history_status)
    es_user_portrait.index(index=index_manage_social_task, doc_type=task_doc_type,
                           id=task_name, body=temporal_result)
    return "1"
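# A minimal sketch of the history_status round-trip used when updating the
# manage-social-task document: the field is stored as a JSON string, so each
# cycle decodes it, appends [ts, label, warning_status], and re-encodes it.
def append_history_status(temporal_result, ts, label, warning_status):
    history_status = json.loads(temporal_result['history_status'])
    history_status.append([ts, label, warning_status])
    temporal_result['history_status'] = json.dumps(history_status)
    return temporal_result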
def social_sensing(task_detail):
    # task_detail: task name, sensors, stop time, previous status, creator, timestamp
    task_name = task_detail[0]
    social_sensors = task_detail[1]
    stop_time = task_detail[2]
    forward_warning_status = task_detail[3]
    create_by = task_detail[4]
    ts = int(task_detail[5])

    # PART 1
    forward_result = get_forward_numerical_info(task_name, ts, create_by)
    # original/retweeted weibo mid lists from the preceding time window
    forward_origin_weibo_list = query_mid_list(ts - time_interval, social_sensors, forward_time_range)
    forward_retweeted_weibo_list = query_mid_list(ts - time_interval, social_sensors, forward_time_range, 3)
    # original weibo mid list in the current window
    current_mid_list = query_mid_list(ts, social_sensors, time_interval)
    current_retweeted_mid_list = query_mid_list(ts, social_sensors, time_interval, 3)
    all_mid_list = []
    all_mid_list.extend(current_mid_list)
    all_mid_list.extend(current_retweeted_mid_list)
    all_mid_list.extend(forward_origin_weibo_list)
    all_mid_list.extend(forward_retweeted_weibo_list)
    all_origin_list = []
    all_origin_list.extend(current_mid_list)
    all_origin_list.extend(forward_origin_weibo_list)
    all_retweeted_list = []
    all_retweeted_list.extend(current_retweeted_mid_list)
    all_retweeted_list.extend(forward_retweeted_weibo_list)  # mid/root-mid of retweeted weibo
    print "all mid list: ", len(all_mid_list)
    print "all_origin_list", all_origin_list
    print "all_retweeted_list", all_retweeted_list

    # count retweets/comments on these weibo in the current window, aggregated by message_type
    statistics_count = query_related_weibo(ts, all_mid_list, time_interval)
    if all_origin_list:
        origin_weibo_detail = query_hot_weibo(ts, all_origin_list, time_interval)  # details of original weibo
    else:
        origin_weibo_detail = {}
    if all_retweeted_list:
        retweeted_weibo_detail = query_hot_weibo(ts, all_retweeted_list, time_interval)  # details of retweeted weibo
    else:
        retweeted_weibo_detail = {}
    current_total_count = statistics_count['total_count']  # total number of weibo in the current window
    current_retweeted_count = statistics_count['retweeted']
    current_comment_count = statistics_count['comment']

    # PART 2
    # aggregate the distribution of neutral/positive/sad/angry sentiment in the current window
    # sentiment_dict = {"0": "neutral", "1": "positive", "2": "sad", "3": "anger"}
    sentiment_count = {"0": 0, "1": 0, "2": 0, "3": 0}
    search_results = aggregation_sentiment_related_weibo(ts, all_mid_list, time_interval)
    sentiment_count = search_results
    print "sentiment_count: ", sentiment_count
    negetive_key = ["2", "3", "4", "5", "6"]
    negetive_count = 0
    for key in negetive_key:
        negetive_count += sentiment_count[key]

    # aggregate the important users in the current window
    important_uid_list = []
    datetime = ts2datetime(ts - time_interval)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    if exist_es:
        search_results = get_important_user(ts, all_mid_list, time_interval)
        important_uid_list = search_results.keys()
    # match important users from the portrait library against the obtained uid_list
    if important_uid_list:
        important_results = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type, body={"ids": important_uid_list})['docs']
    else:
        important_results = {}
    filter_important_list = []  # uid_list
    if important_results:
        for item in important_results:
            if item['found']:
                #if item['_source']['importance'] > IMPORTANT_USER_THRESHOULD:
                filter_important_list.append(item['_id'])
    print filter_important_list

    # sensing decision
    burst_reason = signal_nothing_variation
    warning_status = signal_nothing
    finish = unfinish_signal  # "0"
    process_status = "1"
    if forward_result[0]:
        # use the moving average to decide whether an event occurred
        mean_count = forward_result[1]
        std_count = forward_result[2]
        mean_sentiment = forward_result[3]
        std_sentiment = forward_result[4]
        if mean_count >= MEAN_COUNT and current_total_count > mean_count + 1.96 * std_count or current_total_count >= len(social_sensors) * 0.2 * AVERAGE_COUNT:
            # anomaly detected
            print "====================================================="
            if forward_warning_status == signal_brust:  # an event already exists, switch to tracking
                warning_status = signal_track
            else:
                warning_status = signal_brust
            burst_reason += signal_count_varition  # count anomaly
        if negetive_count > mean_sentiment + 1.96 * std_sentiment and mean_sentiment >= MEAN_COUNT or negetive_count >= len(social_sensors) * 0.2 * AVERAGE_COUNT:
            warning_status = signal_brust
            burst_reason += signal_sentiment_varition  # negative-sentiment anomaly; "12" means both are anomalous
            if forward_warning_status == signal_brust:  # an event already exists, switch to tracking
                warning_status = signal_track

    if int(stop_time) <= ts:  # check whether the task has finished
        finish = finish_signal
        process_status = "0"

    # sensed events, all_mid_list
    tmp_burst_reason = burst_reason
    topic_list = []
    # start only when an event has been detected
    if warning_status:
        index_list = []
        important_words = []
        datetime_1 = ts2datetime(ts)
        index_name_1 = flow_text_index_name_pre + datetime_1
        exist_es = es_text.indices.exists(index=index_name_1)
        if exist_es:
            index_list.append(index_name_1)
        datetime_2 = ts2datetime(ts - DAY)
        index_name_2 = flow_text_index_name_pre + datetime_2
        exist_es = es_text.indices.exists(index=index_name_2)
        if exist_es:
            index_list.append(index_name_2)
        if index_list and all_mid_list:
            query_body = {
                "query": {
                    "filtered": {
                        "filter": {
                            "terms": {"mid": all_mid_list}
                        }
                    }
                },
                "size": 2000
            }
            search_results = es_text.search(index=index_list, doc_type="text", body=query_body)['hits']['hits']
            text_list = []
            if search_results:
                for item in search_results:
                    iter_mid = item['_source']['mid']
                    iter_text = item['_source']['text']
                    temp_dict = dict()
                    temp_dict["mid"] = iter_mid
                    temp_dict["text"] = iter_text
                    text_list.append(temp_dict)
                for item in text_list:
                    print item['text']
                if len(text_list) == 1:
                    top_word = freq_word(text_list[0])
                    topic_list = [top_word.keys()]
                elif len(text_list) == 0:
                    topic_list = []
                    tmp_burst_reason = ""  # no related weibo, reset to zero
                    print "***********************************"
                else:
                    feature_words, input_word_dict = tfidf(text_list)  # build feature words and input data
                    word_label, evaluation_results = kmeans(feature_words, text_list)  # clustering
                    inputs = text_classify(text_list, word_label, feature_words)
                    clustering_topic = cluster_evaluation(inputs)
                    print "==============================================================="
                    print "==============================================================="
                    sorted_dict = sorted(clustering_topic.items(), key=lambda x: x[1], reverse=True)
                    topic_list = []
                    if sorted_dict:
                        for item in sorted_dict:
                            topic_list.append(word_label[item[0]])
                    print "topic_list, ", topic_list
    #if not topic_list:
    #    warning_status = signal_nothing
    #    tmp_burst_reason = signal_nothing_variation

    results = dict()
    results['origin_weibo_number'] = len(all_origin_list)
    results['retweeted_weibo_number'] = len(all_retweeted_list)
    results['origin_weibo_detail'] = json.dumps(origin_weibo_detail)
    results['retweeted_weibo_detail'] = json.dumps(retweeted_weibo_detail)
    results['retweeted_weibo_count'] = current_retweeted_count
    results['comment_weibo_count'] = current_comment_count
    results['weibo_total_number'] = current_total_count
    results['sentiment_distribution'] = json.dumps(sentiment_count)
    results['important_users'] = json.dumps(filter_important_list)
    results['unfilter_users'] = json.dumps(important_uid_list)
    results['burst_reason'] = tmp_burst_reason
    results['timestamp'] = ts
    if tmp_burst_reason:
        results['clustering_topic'] = json.dumps(topic_list)
    # store this window's information in ES
    doctype = create_by + '-' + task_name
    es_user_portrait.index(index=index_sensing_task, doc_type=doctype, id=ts, body=results)

    # update the manage social sensing record in ES
    temporal_result = es_user_portrait.get(index=index_manage_social_task, doc_type=task_doc_type, id=doctype)['_source']
    temporal_result['warning_status'] = warning_status
    temporal_result['burst_reason'] = tmp_burst_reason
    temporal_result['finish'] = finish
    temporal_result['processing_status'] = process_status
    history_status = json.loads(temporal_result['history_status'])
    history_status.append([ts, task_name, warning_status])
    temporal_result['history_status'] = json.dumps(history_status)
    es_user_portrait.index(index=index_manage_social_task, doc_type=task_doc_type, id=doctype, body=temporal_result)
    return "1"
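# --- Illustrative sketch (not part of the original module): the moving-average
# burst rule used above, in isolation. An alert fires when the current count
# leaves the mean + 1.96 * std band (roughly a one-sided 95% bound under a
# normality assumption), provided the mean itself is large enough to matter.
# `min_mean` stands in for the module-level MEAN_COUNT constant; the inline
# rule above additionally ORs in an absolute floor derived from AVERAGE_COUNT,
# which is omitted here for clarity.
def is_count_burst(current_count, mean_count, std_count, min_mean=100):
    # guard: ignore near-empty windows where the band is meaningless
    if mean_count < min_mean:
        return False
    return current_count > mean_count + 1.96 * std_count

# Example: with mean 200 and std 30 the threshold is 258.8, so 300 is a burst.
# print is_count_burst(300, 200, 30)  # -> True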
    result_list = list(uid_set)
    if not isall:
        uid_list = in_sort_filter(time, sort_norm, sort_scope, None, result_list, True)
    else:
        uid_list = all_sort_filter(result_list, sort_norm, time, True)
    results = make_up_user_info(uid_list, isall, time, sort_norm)
    query = {"query": {"bool": {"must": [{"term": {"user_rank_task.user_ts": search_key}}], "must_not": [], "should": []}}, "from": 0, "size": 10, "sort": [], "facets": {}}
    result = es_9200.search(index=USER_RANK_KEYWORD_TASK_INDEX, doc_type=USER_RANK_KEYWORD_TASK_TYPE, body=query)['hits']['hits']
    search_id = result[0]['_id']
    item = result[0]['_source']
    item['status'] = 1
    item['result'] = json.dumps(results)
    es_9200.index(index=USER_RANK_KEYWORD_TASK_INDEX, doc_type=USER_RANK_KEYWORD_TASK_TYPE, id=search_id, body=item)
    return results

def scan_offlice_task():
    query = {"query": {"bool": {"must": [{"term": {"user_rank_task.status": "0"}}], "must_not": [], "should": []}}, "from": 0, "size": 10, "sort": [], "facets": {}}
    results = es_9200.search(index=USER_RANK_KEYWORD_TASK_INDEX, doc_type=USER_RANK_KEYWORD_TASK_TYPE, body=query)['hits']
    if results['total'] > 0:
        for item in results['hits']:
            search_type = item['_source']['search_type']
            pre = item['_source']['pre']
            during = item['_source']['during']
            start_time = item['_source']['start_time']
            keyword = item['_source']['keyword']
            search_key = item['_source']['user_ts']
def social_sensing(task_detail):
    # task_detail: task name, sensors, stop time, creator, timestamp
    task_name = task_detail[0]
    social_sensors = task_detail[1]
    stop_time = task_detail[2]
    create_by = task_detail[3]
    ts = int(task_detail[4])
    print ts2date(ts)

    # PART 1
    #forward_result = get_forward_numerical_info(task_name, ts, create_by)
    # original/retweeted weibo mid lists from the preceding time window
    forward_origin_weibo_list = query_mid_list(ts - time_interval, social_sensors, forward_time_range)
    forward_retweeted_weibo_list = query_mid_list(ts - time_interval, social_sensors, forward_time_range, 3)
    # original weibo mid list in the current window
    current_mid_list = query_mid_list(ts, social_sensors, time_interval)
    current_retweeted_mid_list = query_mid_list(ts, social_sensors, time_interval, 3)
    all_mid_list = []
    all_mid_list.extend(current_mid_list)
    all_mid_list.extend(current_retweeted_mid_list)
    all_mid_list.extend(forward_origin_weibo_list)
    all_mid_list.extend(forward_retweeted_weibo_list)
    all_origin_list = []
    all_origin_list.extend(current_mid_list)
    all_origin_list.extend(forward_origin_weibo_list)
    all_retweeted_list = []
    all_retweeted_list.extend(current_retweeted_mid_list)
    all_retweeted_list.extend(forward_retweeted_weibo_list)  # mid/root-mid of retweeted weibo
    print "all mid list: ", len(all_mid_list)
    #print "all_origin_list", all_origin_list
    #print "all_retweeted_list", all_retweeted_list

    # count retweets/comments on these weibo in the current window, aggregated by message_type
    statistics_count = query_related_weibo(ts, all_mid_list, time_interval)
    if all_origin_list:
        origin_weibo_detail = query_hot_weibo(ts, all_origin_list, time_interval)  # details of original weibo
    else:
        origin_weibo_detail = {}
    if all_retweeted_list:
        retweeted_weibo_detail = query_hot_weibo(ts, all_retweeted_list, time_interval)  # details of retweeted weibo
    else:
        retweeted_weibo_detail = {}
    current_total_count = statistics_count['total_count']  # total number of weibo in the current window
    current_retweeted_count = statistics_count['retweeted']
    current_comment_count = statistics_count['comment']

    # aggregate the important users in the current window
    important_uid_list = []
    datetime = ts2datetime(ts - time_interval)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    if exist_es:
        search_results = get_important_user(ts, all_mid_list, time_interval)
        important_uid_list = search_results
    # match important users from the portrait library against the obtained uid_list
    if important_uid_list:
        important_results = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type, body={"ids": important_uid_list})['docs']
    else:
        important_results = []
    filter_important_list = []  # uid_list
    if important_results:
        for item in important_results:
            if item['found']:
                #if item['_source']['importance'] > IMPORTANT_USER_THRESHOULD:
                filter_important_list.append(item['_id'])
    print "filter_important_list", filter_important_list
    print "important_results", important_uid_list

    # sensing decision
    finish = unfinish_signal  # "0"
    process_status = "1"
    if int(stop_time) <= ts:  # check whether the task has finished
        finish = finish_signal
        process_status = "0"

    # sensed events, all_mid_list
    sensitive_text_list = []
    # initialized up front: the results block below reads these even when no
    # flow-text index or mids exist for this window
    mid_value = dict()       # per-mid topic value
    duplicate_dict = dict()  # duplicate mapping
    sensitive_words_dict = dict()
    # start once an event is detected
    if 1:
        index_list = []
        important_words = []
        datetime_1 = ts2datetime(ts)
        index_name_1 = flow_text_index_name_pre + datetime_1
        exist_es = es_text.indices.exists(index=index_name_1)
        if exist_es:
            index_list.append(index_name_1)
        datetime_2 = ts2datetime(ts - DAY)
        index_name_2 = flow_text_index_name_pre + datetime_2
        exist_es = es_text.indices.exists(index=index_name_2)
        if exist_es:
            index_list.append(index_name_2)
        if index_list and all_mid_list:
            query_body = {
                "query": {
                    "filtered": {
                        "filter": {
                            "terms": {"mid": all_mid_list}
                        }
                    }
                },
                "size": 5000
            }
            search_results = es_text.search(index=index_list, doc_type="text", body=query_body)['hits']['hits']
            tmp_sensitive_warning = ""
            text_dict = dict()      # text information
            portrait_dict = dict()  # background information
            classify_text_dict = dict()  # texts for classification
            classify_uid_list = []
            duplicate_text_list = []
            if search_results:
                for item in search_results:
                    iter_uid = item['_source']['uid']
                    iter_mid = item['_source']['mid']
                    iter_text = item['_source']['text'].encode('utf-8', 'ignore')
                    iter_sensitive = item['_source'].get('sensitive', 0)
                    duplicate_text_list.append({"_id": iter_mid, "title": "", "content": iter_text})
                    if iter_sensitive:
                        tmp_sensitive_warning = signal_sensitive_variation  # weibo containing sensitive words
                        sensitive_words_dict[iter_mid] = iter_sensitive
                    keywords_dict = json.loads(item['_source']['keywords_dict'])
                    personal_keywords_dict = dict()
                    for k, v in keywords_dict.iteritems():
                        k = k.encode('utf-8', 'ignore')
                        personal_keywords_dict[k] = v
                    classify_text_dict[iter_mid] = personal_keywords_dict
                    classify_uid_list.append(iter_uid)
                # deduplicate
                if duplicate_text_list:
                    dup_results = duplicate(duplicate_text_list)
                    for item in dup_results:
                        if item['duplicate']:
                            duplicate_dict[item['_id']] = item['same_from']
                # classify
                if classify_text_dict:
                    classify_results = topic_classfiy(classify_uid_list, classify_text_dict)
                    mid_value = dict()
                    #print "classify_results: ", classify_results
                    for k, v in classify_results.iteritems():  # mid:value
                        mid_value[k] = topic_value_dict[v[0]]

    sensitive_weibo_detail = {}
    if sensitive_words_dict:
        sensitive_mid_list = sensitive_words_dict.keys()
        sensitive_weibo_detail = query_hot_weibo(ts, sensitive_mid_list, time_interval)

    results = dict()
    results['mid_topic_value'] = json.dumps(mid_value)
    results['duplicate_dict'] = json.dumps(duplicate_dict)
    results['sensitive_words_dict'] = json.dumps(sensitive_words_dict)
    results['sensitive_weibo_detail'] = json.dumps(sensitive_weibo_detail)
    results['origin_weibo_number'] = len(all_origin_list)
    results['retweeted_weibo_number'] = len(all_retweeted_list)
    results['origin_weibo_detail'] = json.dumps(origin_weibo_detail)
    results['retweeted_weibo_detail'] = json.dumps(retweeted_weibo_detail)
    results['retweeted_weibo_count'] = current_retweeted_count
    results['comment_weibo_count'] = current_comment_count
    results['weibo_total_number'] = current_total_count
    results['important_users'] = json.dumps(filter_important_list)
    results['unfilter_users'] = json.dumps(important_uid_list)
    results['timestamp'] = ts
    #results['clustering_topic'] = json.dumps(topic_list)
    # store this window's information in ES
    doctype = create_by + '-' + task_name
    es_user_portrait.index(index=index_sensing_task, doc_type=doctype, id=ts, body=results)

    # update the manage social sensing record in ES
    temporal_result = es_user_portrait.get(index=index_manage_social_task, doc_type=task_doc_type, id=doctype)['_source']
    temporal_result['finish'] = finish
    temporal_result['processing_status'] = process_status
    history_status = json.loads(temporal_result['history_status'])
    history_status.append(ts)
    temporal_result['history_status'] = json.dumps(history_status)
    es_user_portrait.index(index=index_manage_social_task, doc_type=task_doc_type, id=doctype, body=temporal_result)
    return "1"
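# --- Illustrative sketch (assumption, not original code): both social_sensing
# variants build the list of daily flow-text indices inline; this is the same
# logic as a helper. It relies on the module's ts2datetime and an Elasticsearch
# client such as es_text; `day` is seconds per day (the module's DAY constant).
def build_flow_text_index_list(es_client, prefix, ts, day=24 * 3600):
    index_list = []
    for delta in (0, day):
        name = prefix + ts2datetime(ts - delta)  # e.g. flow_text_2013-09-07
        if es_client.indices.exists(index=name):
            index_list.append(name)
    return index_list

# Usage sketch:
# index_list = build_flow_text_index_list(es_text, flow_text_index_name_pre, ts)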
"from": 0, "size": 10, "sort": [], "facets": {} } if True: result = es_9200.search(index=USER_RANK_KEYWORD_TASK_INDEX, doc_type=USER_RANK_KEYWORD_TASK_TYPE, body=query)['hits']['hits'] search_id = result[0]['_id'] item = result[0]['_source'] item['status'] = 1 item['result'] = json.dumps(results) es_9200.index(index=USER_RANK_KEYWORD_TASK_INDEX, doc_type=USER_RANK_KEYWORD_TASK_TYPE, id=search_id, body=item) return results def scan_offlice_task(): query = { "query": { "bool": { "must": [{ "term": { "user_rank_task.status": "0" } }], "must_not": [],
def social_sensing(task_detail):
    # task_detail: task name, sensors, stop time, previous status, creator, timestamp, new flag
    task_name = task_detail[0]
    social_sensors = task_detail[1]
    stop_time = task_detail[2]
    forward_warning_status = task_detail[3]
    create_by = task_detail[4]
    ts = int(task_detail[5])
    new = int(task_detail[6])
    print ts2date(ts)

    # PART 1
    forward_result = get_forward_numerical_info(task_name, ts, create_by)
    # original/retweeted weibo mid lists from the preceding time window
    forward_origin_weibo_list = query_mid_list(ts - time_interval, social_sensors, forward_time_range)
    forward_retweeted_weibo_list = query_mid_list(ts - time_interval, social_sensors, forward_time_range, 3)
    # original weibo mid list in the current window
    current_mid_list = query_mid_list(ts, social_sensors, time_interval)
    current_retweeted_mid_list = query_mid_list(ts, social_sensors, time_interval, 3)
    all_mid_list = []
    all_mid_list.extend(current_mid_list)
    all_mid_list.extend(current_retweeted_mid_list)
    all_mid_list.extend(forward_origin_weibo_list)
    all_mid_list.extend(forward_retweeted_weibo_list)
    all_origin_list = []
    all_origin_list.extend(current_mid_list)
    all_origin_list.extend(forward_origin_weibo_list)
    all_retweeted_list = []
    all_retweeted_list.extend(current_retweeted_mid_list)
    all_retweeted_list.extend(forward_retweeted_weibo_list)  # mid/root-mid of retweeted weibo
    print "all mid list: ", len(all_mid_list)
    # print "all_origin_list", all_origin_list
    # print "all_retweeted_list", all_retweeted_list

    # count retweets/comments on these weibo in the current window, aggregated by message_type
    statistics_count = query_related_weibo(ts, all_mid_list, time_interval)
    if all_origin_list:
        origin_weibo_detail = query_hot_weibo(ts, all_origin_list, time_interval)  # details of original weibo
    else:
        origin_weibo_detail = {}
    if all_retweeted_list:
        retweeted_weibo_detail = query_hot_weibo(ts, all_retweeted_list, time_interval)  # details of retweeted weibo
    else:
        retweeted_weibo_detail = {}
    current_total_count = statistics_count["total_count"]  # total number of weibo in the current window
    current_retweeted_count = statistics_count["retweeted"]
    current_comment_count = statistics_count["comment"]

    # PART 2
    # aggregate the distribution of neutral/positive/sad/angry sentiment in the current window
    # sentiment_dict = {"0": "neutral", "1": "positive", "2": "sad", "3": "anger"}
    sentiment_count = {"0": 0, "1": 0, "2": 0, "3": 0}
    search_results = aggregation_sentiment_related_weibo(ts, all_mid_list, time_interval)
    sentiment_count = search_results
    print "sentiment_count: ", sentiment_count
    negetive_key = ["2", "3", "4", "5", "6"]
    negetive_count = 0
    for key in negetive_key:
        negetive_count += sentiment_count[key]

    # aggregate the important users in the current window
    important_uid_list = []
    datetime = ts2datetime(ts - time_interval)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    if exist_es:
        search_results = get_important_user(ts, all_mid_list, time_interval)
        important_uid_list = search_results
    # match important users from the portrait library against the obtained uid_list
    if important_uid_list:
        important_results = es_user_portrait.mget(
            index=portrait_index_name, doc_type=portrait_index_type, body={"ids": important_uid_list}
        )["docs"]
    else:
        important_results = []
    filter_important_list = []  # uid_list
    if important_results:
        for item in important_results:
            if item["found"]:
                # if item['_source']['importance'] > IMPORTANT_USER_THRESHOULD:
                filter_important_list.append(item["_id"])

    # sensing decision
    burst_reason = signal_nothing_variation
    warning_status = signal_nothing
    finish = unfinish_signal  # "0"
    process_status = "1"
    if forward_result[0]:
        # use the moving average to decide whether an event occurred
        mean_count = forward_result[1]
        std_count = forward_result[2]
        mean_sentiment = forward_result[3]
        std_sentiment = forward_result[4]
        if (
            mean_count >= MEAN_COUNT
            and current_total_count > mean_count + 1.96 * std_count
            or current_total_count >= len(all_mid_list) * AVERAGE_COUNT
        ):
            # anomaly detected
            if forward_warning_status == signal_brust:  # an event already exists, switch to tracking
                warning_status = signal_track
            else:
                warning_status = signal_brust
            burst_reason += signal_count_varition  # count anomaly
        if (
            negetive_count > mean_sentiment + 1.96 * std_sentiment
            and mean_sentiment >= MEAN_COUNT
            or negetive_count >= len(all_mid_list) * AVERAGE_COUNT
        ):
            warning_status = signal_brust
            burst_reason += signal_sentiment_varition  # negative-sentiment anomaly; "12" means both are anomalous
            if forward_warning_status == signal_brust:  # an event already exists, switch to tracking
                warning_status = signal_track

    if int(stop_time) <= ts:  # check whether the task has finished
        finish = finish_signal
        process_status = "0"

    # sensed events, all_mid_list
    tmp_burst_reason = burst_reason
    topic_list = []
    sensitive_text_list = []
    # initialized up front: the results block below reads these even when no
    # flow-text index or mids exist for this window
    mid_value = dict()       # per-mid topic value
    duplicate_dict = dict()  # duplicate mapping
    sensitive_words_dict = dict()
    # start once an event is detected
    # if warning_status:
    if 1:
        index_list = []
        important_words = []
        datetime_1 = ts2datetime(ts)
        index_name_1 = flow_text_index_name_pre + datetime_1
        exist_es = es_text.indices.exists(index=index_name_1)
        if exist_es:
            index_list.append(index_name_1)
        datetime_2 = ts2datetime(ts - DAY)
        index_name_2 = flow_text_index_name_pre + datetime_2
        exist_es = es_text.indices.exists(index=index_name_2)
        if exist_es:
            index_list.append(index_name_2)
        if index_list and all_mid_list:
            query_body = {"query": {"filtered": {"filter": {"terms": {"mid": all_mid_list}}}}, "size": 5000}
            search_results = es_text.search(index=index_list, doc_type="text", body=query_body)["hits"]["hits"]
            tmp_sensitive_warning = ""
            text_dict = dict()      # text information
            portrait_dict = dict()  # background information
            classify_text_dict = dict()  # texts for classification
            classify_uid_list = []
            duplicate_text_list = []
            if search_results:
                for item in search_results:
                    iter_uid = item["_source"]["uid"]
                    iter_mid = item["_source"]["mid"]
                    iter_text = item["_source"]["text"].encode("utf-8", "ignore")
                    iter_sensitive = item["_source"].get("sensitive", 0)
                    duplicate_text_list.append({"_id": iter_mid, "title": "", "content": iter_text})
                    if iter_sensitive:
                        tmp_sensitive_warning = signal_sensitive_variation  # weibo containing sensitive words
                        sensitive_words_dict[iter_mid] = iter_sensitive
                    keywords_dict = json.loads(item["_source"]["keywords_dict"])
                    personal_keywords_dict = dict()
                    for k, v in keywords_dict.iteritems():
                        k = k.encode("utf-8", "ignore")
                        personal_keywords_dict[k] = v
                    classify_text_dict[iter_mid] = personal_keywords_dict
                    classify_uid_list.append(iter_uid)
                # deduplicate
                if duplicate_text_list:
                    dup_results = duplicate(duplicate_text_list)
                    for item in dup_results:
                        if item["duplicate"]:
                            duplicate_dict[item["_id"]] = item["same_from"]
                # classify
                if classify_text_dict:
                    classify_results = topic_classfiy(classify_uid_list, classify_text_dict)
                    mid_value = dict()
                    # print "classify_results: ", classify_results
                    for k, v in classify_results.iteritems():  # mid:value
                        mid_value[k] = topic_value_dict[v[0]]
            if tmp_sensitive_warning:
                warning_status = signal_brust
                burst_reason += signal_sensitive_variation

    sensitive_weibo_detail = {}
    if sensitive_words_dict:
        sensitive_mid_list = sensitive_words_dict.keys()
        sensitive_weibo_detail = query_hot_weibo(ts, sensitive_mid_list, time_interval)

    results = dict()
    results["mid_topic_value"] = json.dumps(mid_value)
    results["duplicate_dict"] = json.dumps(duplicate_dict)
    results["sensitive_words_dict"] = json.dumps(sensitive_words_dict)
    results["sensitive_weibo_detail"] = json.dumps(sensitive_weibo_detail)
    results["origin_weibo_number"] = len(all_origin_list)
    results["retweeted_weibo_number"] = len(all_retweeted_list)
    results["origin_weibo_detail"] = json.dumps(origin_weibo_detail)
    results["retweeted_weibo_detail"] = json.dumps(retweeted_weibo_detail)
    results["retweeted_weibo_count"] = current_retweeted_count
    results["comment_weibo_count"] = current_comment_count
    results["weibo_total_number"] = current_total_count
    results["sentiment_distribution"] = json.dumps(sentiment_count)
    results["important_users"] = json.dumps(filter_important_list)
    results["unfilter_users"] = json.dumps(important_uid_list)
    results["burst_reason"] = tmp_burst_reason
    results["timestamp"] = ts
    # results['clustering_topic'] = json.dumps(topic_list)
    # store this window's information in ES
    doctype = create_by + "-" + task_name
    es_user_portrait.index(index=index_sensing_task, doc_type=doctype, id=ts, body=results)

    # update the manage social sensing record in ES
    if not new:
        temporal_result = es_user_portrait.get(index=index_manage_social_task, doc_type=task_doc_type, id=doctype)[
            "_source"
        ]
        temporal_result["warning_status"] = warning_status
        temporal_result["burst_reason"] = tmp_burst_reason
        temporal_result["finish"] = finish
        temporal_result["processing_status"] = process_status
        history_status = json.loads(temporal_result["history_status"])
        history_status.append([ts, task_name, warning_status])
        temporal_result["history_status"] = json.dumps(history_status)
        es_user_portrait.index(index=index_manage_social_task, doc_type=task_doc_type, id=doctype, body=temporal_result)
    else:
        print "test"
    return "1"
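# --- Usage sketch (hypothetical values, not original code): this
# social_sensing variant expects a 7-element positional task_detail list; the
# trailing flag marks a newly created task, for which the manage-record update
# is skipped.
# task_detail = [task_name, social_sensors, stop_time,
#                forward_warning_status, create_by, ts, new]
# social_sensing(["demo_task", ["12345", "67890"], "1457020800",
#                 "0", "admin", 1456934400, 0])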
def key_words_search(search_type, pre, during, start_time, keyword, search_key='', sort_norm='', sort_scope='', time=1, isall=False):
    query = {
        "query": {
            "bool": {
                "must": [{
                    "term": {
                        "user_rank_task.user_ts": search_key
                    }
                }],
                "must_not": [],
                "should": []
            }
        },
        "from": 0,
        "size": 10,
        "sort": [],
        "facets": {}
    }
    result = es_9200.search(index=USER_RANK_KEYWORD_TASK_INDEX, doc_type=USER_RANK_KEYWORD_TASK_TYPE, body=query)['hits']['hits']
    search_id = result[0]['_id']
    item = result[0]['_source']
    item['status'] = -1  # mark the task as running
    #item['result'] = json.dumps(results)  # 'results' is not computed until later; not available here
    es_9200.index(index=USER_RANK_KEYWORD_TASK_INDEX, doc_type=USER_RANK_KEYWORD_TASK_TYPE, id=search_id, body=item)
    keywords = keyword.split(",")
    should = []
    for key in keywords:
        if search_type == "hashtag":
            should.append({"prefix": {"text.text": "#" + key + "#"}})
        else:
            should.append({"prefix": {"text.text": key}})
    date = start_time
    index_name = pre + start_time
    while not es_9206.indices.exists(index=index_name):
        # roll forward one day until an existing daily index is found
        new_time = datetime2ts(date) + DAY
        date = ts2datetime(new_time)
        index_name = pre + date
        during -= 1
    uid_set = set()
    for i in range(during):
        print index_name
        query = {
            "query": {
                "bool": {
                    "must": [],
                    "must_not": [],
                    "should": should
                }
            },
            "size": MAX_ITEMS,
            "sort": [],
            "facets": {},
            "fields": ['uid']
        }
        try:
            temp = es_9206.search(index=index_name, doc_type='text', body=query)
            result = temp['hits']['hits']
            print "Fetch " + str(len(result))
            for item in result:
                uid_set.add(item['fields']['uid'][0].encode("utf-8"))
        except Exception, e:
            print e
            raise Exception('user_list failed!')
        new_time = datetime2ts(date) + DAY
        date = ts2datetime(new_time)
        index_name = pre + date
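# --- Illustrative sketch (not original code): the date-rolling above probes
# forward one day at a time until a daily index exists, but the inline while
# loop can spin forever if no index is ever found. A bounded variant, reusing
# the module's ts2datetime/datetime2ts helpers and DAY constant:
def first_existing_index(es_client, pre, date, max_probe=30):
    for _ in range(max_probe):
        name = pre + date
        if es_client.indices.exists(index=name):
            return name, date
        date = ts2datetime(datetime2ts(date) + DAY)  # step to the next day
    return None, date

# Usage sketch:
# index_name, date = first_existing_index(es_9206, pre, start_time)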
def sensors_keywords_detection(task_detail):
    task_name = task_detail[0]
    social_sensors = task_detail[1]
    keywords_list = task_detail[2]
    sensitive_words = task_detail[3]
    stop_time = task_detail[4]
    forward_warning_status = task_detail[5]
    ts = task_detail[7]

    forward_result = get_forward_numerical_info(task_name, ts, keywords_list)
    # 1. aggregate all keyword-related original weibo posted by the sensors in the previous 12 hours
    forward_origin_weibo_list = query_mid_list(ts - time_interval, keywords_list, forward_time_range, 1, social_sensors)
    # 2. aggregate original weibo in the current window
    current_mid_list = query_mid_list(ts, keywords_list, time_interval, 1, social_sensors)
    all_mid_list = []
    all_mid_list.extend(current_mid_list)
    all_mid_list.extend(forward_origin_weibo_list)
    all_mid_list = list(set(all_mid_list))
    print len(all_mid_list)

    # 3. count retweets/comments in the current window on the current and previous original weibo, aggregated by message_type
    statistics_count = query_related_weibo(ts, all_mid_list, time_interval, keywords_list, 1, social_sensors)
    current_total_count = statistics_count['total_count']  # total number of weibo in the current window
    print "current all weibo: ", statistics_count
    current_origin_count = statistics_count['origin']
    current_retweeted_count = statistics_count['retweeted']
    current_comment_count = statistics_count['comment']

    # 4. aggregate the distribution of neutral/positive/sad/angry sentiment in the current window
    sentiment_count = {"0": 0, "1": 0, "2": 0, "3": 0}
    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts - time_interval)
    if datetime == datetime_1:
        index_name = flow_text_index_name_pre + datetime
    else:
        index_name = flow_text_index_name_pre + datetime_1
    exist_es = es_text.indices.exists(index_name)
    if exist_es:
        search_results = aggregation_sentiment_related_weibo(ts, all_mid_list, time_interval, keywords_list, 1)
        sentiment_count = search_results
    print "sentiment_count: ", sentiment_count
    negetive_count = sentiment_count['2'] + sentiment_count['3']

    # 5. which social sensors joined the discussion
    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [{
                            "range": {
                                "timestamp": {
                                    "gte": ts - time_interval,
                                    "lt": ts
                                }
                            }
                        }, {
                            "terms": {
                                "uid": social_sensors
                            }
                        }],
                        "should": [{
                            "terms": {
                                "root_mid": all_mid_list
                            }
                        }, {
                            "terms": {
                                "mid": all_mid_list
                            }
                        }]
                    }
                }
            }
        },
        "size": 10000
    }
    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts - time_interval)
    if datetime == datetime_1:
        index_name = flow_text_index_name_pre + datetime
    else:
        index_name = flow_text_index_name_pre + datetime_1
    search_results = es_text.search(index=index_name, doc_type=flow_text_index_type, body=query_body)['hits']['hits']
    attend_users = []
    if search_results:
        for item in search_results:
            attend_users.append(item['_source']['uid'])
    important_users = list(set(attend_users))
    print "important users", important_users

    # 6. sensitive-word detection: a sensor weibo containing a sensitive word triggers a warning
    #    -- NB: sensitive words are a risky setting
    sensitive_origin_weibo_number = 0
    sensitive_retweeted_weibo_number = 0
    sensitive_comment_weibo_number = 0
    sensitive_total_weibo_number = 0
    if sensitive_words:
        query_sensitive_body = {
            "query": {
                "filtered": {
                    "filter": {
                        "bool": {
                            "must": [{
                                "range": {
                                    "timestamp": {
                                        "gte": ts - time_interval,
                                        "lt": ts
                                    }
                                }
                            }, {
                                "terms": {
                                    "keywords_string": sensitive_words
                                }
                            }, {
                                "terms": {
                                    "uid": social_sensors
                                }
                            }]
                        }
                    }
                }
            },
            "aggs": {
                "all_list": {
                    "terms": {
                        "field": "message_type"
                    }
                }
            }
        }
        sensitive_results = es_text.search(index=index_name, doc_type=flow_text_index_type, body=query_sensitive_body)['aggregations']['all_list']["buckets"]
        if sensitive_results:
            for item in sensitive_results:
                if int(item["key"]) == 1:
                    sensitive_origin_weibo_number = item['doc_count']
                elif int(item["key"]) == 2:
                    sensitive_comment_weibo_number = item['doc_count']
                elif int(item["key"]) == 3:
                    sensitive_retweeted_weibo_number = item["doc_count"]
                else:
                    pass
        sensitive_total_weibo_number = sensitive_origin_weibo_number + sensitive_comment_weibo_number + sensitive_retweeted_weibo_number

    burst_reason = signal_nothing_variation
    warning_status = signal_nothing
    finish = unfinish_signal  # "0"
    process_status = "1"

    if sensitive_total_weibo_number:  # anomalous number of sensitive weibo
        print "======================"
        if forward_warning_status == signal_brust:  # an event already exists, switch to tracking
            warning_status = signal_track
        else:
            warning_status = signal_brust
        burst_reason = signal_sensitive_variation

    if forward_result[0]:
        # use the moving average to decide whether an event occurred
        mean_count = forward_result[1]
        std_count = forward_result[2]
        mean_sentiment = forward_result[3]
        std_sentiment = forward_result[4]
        if current_total_count > mean_count + 1.96 * std_count:
            # anomaly detected
            print "====================================================="
            if forward_warning_status == signal_brust:  # an event already exists, switch to tracking
                warning_status = signal_track
            else:
                warning_status = signal_brust
            burst_reason += signal_count_varition  # count anomaly
        if negetive_count > mean_sentiment + 1.96 * std_sentiment:
            warning_status = signal_brust
            burst_reason += signal_sentiment_varition  # negative-sentiment anomaly; "12" means both are anomalous
            if forward_warning_status == signal_brust:  # an event already exists, switch to tracking
                warning_status = signal_track

    if int(stop_time) <= ts:  # check whether the task has finished
        finish = finish_signal
        process_status = '0'

    tmp_burst_reason = burst_reason
    topic_list = []
    # 7. sensed events, all_mid_list
    if burst_reason:  # something happened
        text_list = []
        mid_set = set()
        if signal_sensitive_variation in burst_reason:
            query_sensitive_body = {
                "query": {
                    "filtered": {
                        "filter": {
                            "bool": {
                                "must": [{
                                    "range": {
                                        "timestamp": {
                                            "gte": ts - time_interval,
                                            "lt": ts
                                        }
                                    }
                                }, {
                                    "terms": {
                                        "keywords_string": sensitive_words
                                    }
                                }]
                            }
                        }
                    }
                },
                "size": 10000
            }
            if social_sensors:
                query_sensitive_body['query']['filtered']['filter']['bool']['must'].append({"terms": {"uid": social_sensors}})
            sensitive_results = es_text.search(index=index_name, doc_type=flow_text_index_type, body=query_sensitive_body)['hits']["hits"]
            if sensitive_results:
                for item in sensitive_results:
                    iter_mid = item['_source']['mid']
                    iter_text = item['_source']['text']
                    temp_dict = dict()
                    temp_dict["mid"] = iter_mid
                    temp_dict["text"] = iter_text
                    if iter_mid not in mid_set:
                        text_list.append(temp_dict)  # normalized text: mid, text
                        mid_set.add(iter_mid)
            burst_reason = burst_reason.replace(signal_sensitive_variation, "")  # replace() returns a new string
        if burst_reason and all_mid_list:
            sensing_text = es_text.mget(index=index_name, doc_type=flow_text_index_type, body={"ids": all_mid_list}, fields=["mid", "text"])["docs"]
            if sensing_text:
                for item in sensing_text:
                    if item['found']:
                        iter_mid = item["fields"]["mid"][0]
                        iter_text = item["fields"]["text"][0]
                        temp_dict = dict()
                        temp_dict["mid"] = iter_mid
                        temp_dict["text"] = iter_text
                        if iter_mid not in mid_set:
                            text_list.append(temp_dict)
                            mid_set.add(iter_mid)
        if len(text_list) == 1:
            top_word = freq_word(text_list[0])
            topic_list = [top_word.keys()]
        elif len(text_list) == 0:
            topic_list = []
            tmp_burst_reason = ""  # no related weibo, reset to zero
            print "***********************************"
        else:
            feature_words, input_word_dict = tfidf(text_list)  # build feature words and input data
            word_label, evaluation_results = kmeans(feature_words, text_list)  # clustering
            inputs = text_classify(text_list, word_label, feature_words)
            clustering_topic = cluster_evaluation(inputs)
            sorted_dict = sorted(clustering_topic.items(), key=lambda x: x[1], reverse=True)[0:5]
            topic_list = []
            if sorted_dict:
                for item in sorted_dict:
                    topic_list.append(word_label[item[0]])
        print "topic_list:", topic_list

    if not topic_list:
        tmp_burst_reason = signal_nothing_variation
        warning_status = signal_nothing

    results = dict()
    results['sensitive_origin_weibo_number'] = sensitive_origin_weibo_number
    results['sensitive_retweeted_weibo_number'] = sensitive_retweeted_weibo_number
    results['sensitive_comment_weibo_number'] = sensitive_comment_weibo_number
    results['sensitive_weibo_total_number'] = sensitive_total_weibo_number
    results['origin_weibo_number'] = current_origin_count
    results['retweeted_weibo_number'] = current_retweeted_count
    results['comment_weibo_number'] = current_comment_count
    results['weibo_total_number'] = current_total_count
    results['sentiment_distribution'] = json.dumps(sentiment_count)
    results['important_users'] = json.dumps(important_users)
    results['burst_reason'] = tmp_burst_reason
    results['timestamp'] = ts
    if tmp_burst_reason:
        results["clustering_topic"] = json.dumps(topic_list[:5])
    # store this window's information in ES
    doctype = task_name
    es_user_portrait.index(index=index_sensing_task, doc_type=doctype, id=ts, body=results)

    # update the manage social sensing record in ES
    temporal_result = es_user_portrait.get(index=index_manage_social_task, doc_type=task_doc_type, id=task_name)['_source']
    temporal_result['warning_status'] = warning_status
    temporal_result['burst_reason'] = tmp_burst_reason
    temporal_result['finish'] = finish
    temporal_result['processing_status'] = process_status
    history_status = json.loads(temporal_result['history_status'])
    history_status.append([ts, ' '.join(keywords_list), warning_status])
    temporal_result['history_status'] = json.dumps(history_status)
    es_user_portrait.index(index=index_manage_social_task, doc_type=task_doc_type, id=task_name, body=temporal_result)
    return "1"
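# --- Illustrative sketch (assumption, not original code): unpacking the
# message_type aggregation consumed in step 6 above; per the branches there,
# 1 = origin, 2 = comment, 3 = retweet.
def unpack_message_type_buckets(buckets):
    counts = {1: 0, 2: 0, 3: 0}
    for bucket in buckets:
        key = int(bucket["key"])
        if key in counts:
            counts[key] = bucket["doc_count"]
    return counts

# Example:
# unpack_message_type_buckets([{"key": 1, "doc_count": 4},
#                              {"key": 3, "doc_count": 7}])
# -> {1: 4, 2: 0, 3: 7}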
# -*- coding:utf-8 -*-
import json
import sys

reload(sys)
sys.path.append('../../')
from global_utils import R_SOCIAL_SENSING as r
from global_utils import es_user_portrait as es
from parameter import INDEX_MANAGE_SOCIAL_SENSING as index_name
from parameter import DOC_TYPE_MANAGE_SOCIAL_SENSING as task_doc_type
from time_utils import ts2datetime, datetime2ts, ts2date

# one-off maintenance script: mark the "两会" sensing task as stopped and finished
task_name = "两会".decode('utf-8')
task_detail = es.get(index="manage_sensing_task", doc_type="task", id=task_name)['_source']
#task_detail['create_at'] = 1456934400
#task_detail['keywords'] = json.dumps(["两会", "人大", "政协"])
#task_detail['sensitive_words'] = json.dumps([])
#task_detail['task_type'] = "2"
task_detail['stop_time'] = '1457020800'
task_detail['finish'] = '1'
task_detail['processing_status'] = "0"
es.index(index="manage_sensing_task", doc_type="task", id=task_name, body=task_detail)
print task_detail
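# --- Usage sketch (assumption, not original code): confirm the update took
# effect by re-reading the task document.
# updated = es.get(index="manage_sensing_task", doc_type="task", id=task_name)['_source']
# assert updated['finish'] == '1' and updated['processing_status'] == '0'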