def query_hot_weibo(ts, origin_mid_list, time_segment, keywords_list, aggregation_field="root_mid", size=100):
    query_all_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [
                            {"range": {
                                "timestamp": {
                                    "gte": ts - time_segment,
                                    "lt": ts
                                }
                            }}
                        ],
                        "should": [
                            {"terms": {"keywords_string": keywords_list}}
                        ]
                    }
                }
            }
        },
        "aggs": {
            "all_count": {
                "terms": {"field": aggregation_field, "size": size}
            }
        }
    }
    if origin_mid_list:
        query_all_body["query"]["filtered"]["filter"]["bool"]["should"].append({"terms": {"root_mid": origin_mid_list}})
        query_all_body["query"]["filtered"]["filter"]["bool"]["should"].append({"terms": {"mid": origin_mid_list}})
    datetime = ts2datetime(ts)
    # test
    #datetime = "2013-09-07"
    hot_mid_dict = dict()
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    if origin_mid_list and exist_es:
        results = es_text.search(index=index_name, doc_type=flow_text_index_type, body=query_all_body)['aggregations']['all_count']['buckets']
        if results:
            for item in results:
                hot_mid_dict[item['key']] = item['doc_count']
    datetime_1 = ts2datetime(ts - time_segment)
    index_name_1 = flow_text_index_name_pre + datetime_1
    exist_es_1 = es_text.indices.exists(index_name_1)
    if datetime_1 != datetime and exist_es_1:
        results_1 = es_text.search(index=index_name_1, doc_type=flow_text_index_type, body=query_all_body)['aggregations']['all_count']['buckets']
        if results_1:
            for item in results_1:
                hot_mid_dict[item['key']] = item['doc_count']
    return hot_mid_dict
def query_hot_mid(ts, keywords_list, text_type, size=100):
    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [
                            {"range": {
                                "timestamp": {
                                    "gte": ts - time_interval,
                                    "lt": ts
                                }
                            }},
                            {"terms": {"keywords_string": keywords_list}},
                            {"term": {"message_type": "0"}}
                        ]
                    }
                }
            }
        },
        "aggs": {
            "all_interests": {
                "terms": {"field": "root_mid", "size": size}
            }
        }
    }
    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts - time_interval)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    index_name_1 = flow_text_index_name_pre + datetime_1
    exist_bool_1 = es_text.indices.exists(index_name_1)
    print datetime, datetime_1
    if datetime == datetime_1 and exist_es:
        search_results = es_text.search(index=index_name, doc_type=flow_text_index_type, body=query_body)["aggregations"]["all_interests"]["buckets"]
    elif datetime != datetime_1 and exist_bool_1:
        search_results = es_text.search(index=index_name_1, doc_type=flow_text_index_type, body=query_body)["aggregations"]["all_interests"]["buckets"]
    else:
        search_results = []
    hot_mid_list = []
    if search_results:
        for item in search_results:
            print item
            temp = []
            temp.append(item['key'])
            temp.append(item['doc_count'])
            hot_mid_list.append(temp)
    #print hot_mid_list
    return hot_mid_list
def get_sensitive_value(date_time, field_name, uid_list):
    flow_text_index_name = flow_text_index_name_pre + ts2datetime(date_time)
    query_body = {
        'query': {
            'filtered': {
                'filter': {
                    'bool': {
                        'must': [{
                            'terms': {
                                'uid': uid_list
                            }
                        }]
                    }
                }
            }
        },
        'aggs': {
            'index_field_sum': {
                'terms': {
                    'field': field_name
                }
            }
        }
    }
    try:
        result = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type,
                                     body=query_body)['aggregations']['index_field_sum']['buckets']
        index_value_list = []
        for item in result:
            index_value_list.append(item['doc_count'])
    except Exception, e:
        print '敏感度查询错误::', e
        index_value_list = []
    return index_value_list
def search_posts(xnr_user_no, from_ts, to_ts, extend_keywords_size=0):
    keywords = load_keywords(xnr_user_no, extend_keywords_size)
    query_body = load_query_body(keywords)
    index_list = load_index(flow_text_index_name_pre, from_ts, to_ts)
    search_results = es_flow_text.search(index=index_list, doc_type=flow_text_index_type, body=query_body)['hits']['hits']
    return [item['_source'] for item in search_results]
def search_sensored_weibo(uid_list, ts):
    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [
                            {"range": {
                                "timestamp": {
                                    "gte": ts - time_interval,
                                    "lt": ts
                                }
                            }},
                            {"terms": {"uid": uid_list}}
                        ]
                    }
                }
            }
        },
        "size": 200000
    }
    mid_set = set()
    index_name = "flow_text_" + ts2datetime(ts - time_interval)
    results = es_text.search(index=index_name, doc_type="text", body=query_body)["hits"]["hits"]
    for item in results:
        if item["_source"]["message_type"] == 1:
            mid_set.add(item['_source']["mid"])
        else:
            try:
                mid_set.add(item['_source']["root_mid"])
            except Exception, r:
                print Exception, r
    return mid_set
def get_influence_relative(uid, influence):
    if S_TYPE == 'test':
        datetime = S_DATE_BCI
    else:
        datetime = ts2datetime(time.time() - DAY)
    new_datetime = datetime[0:4] + datetime[5:7] + datetime[8:10]
    weibo_bci_index_name = weibo_bci_index_name_pre + new_datetime
    query_body = {
        'query': {
            'match_all': {}
        },
        'sort': {
            'user_index': {
                'order': 'desc'
            }
        }
    }
    results = es_flow_text.search(index=weibo_bci_index_name, doc_type=weibo_bci_index_type, body=query_body)['hits']['hits']
    user_index_max = results[0]['_source']['user_index']
    influence_relative = influence / user_index_max
    return influence_relative
def get_community_keyword(uid_list, date_time): query_body = { 'query': { 'filtered': { 'filter': { 'bool': { 'must': [{ 'terms': { 'uid': uid_list } }] } } } }, 'aggs': { 'keywords': { 'terms': { 'field': 'keywords_string', 'size': 1000 } } } } flow_text_index_name_list = get_flow_text_index_list(date_time) flow_text_exist = es_flow_text.search(index = flow_text_index_name_list,doc_type = flow_text_index_type,\ body = query_body)['aggregations']['keywords']['buckets'] word_dict = dict() word_dict_new = dict() keywords_string = '' for item in flow_text_exist: word = item['key'] count = item['doc_count'] word_dict[word] = count keywords_string += '&' keywords_string += item['key'] k_dict = extract_keywords(keywords_string) for item_item in k_dict: keyword = item_item.word # print 'keyword::',keyword,type(keyword) if word_dict.has_key(keyword): word_dict_new[keyword] = word_dict[keyword] else: word_dict_new[keyword] = 1 keyword_dict = sorted(word_dict_new.items(), key=lambda d: d[1], reverse=True) #print 'keyword_dict',keyword_dict,keyword_dict[0],type(keyword_dict[0]) try: keyword_name = keyword_dict[0][0] + '_' + keyword_dict[1][0] except: keyword_name = 'X' return json.dumps(keyword_dict), keyword_name
def get_sensitive_user(timestamp, uid):
    score = 0
    query_body = {'query': {'term': {'uid': uid}}, 'size': 50}
    index_name = flow_text_index_name_pre + ts2datetime(timestamp)
    search_results = es_flow_text.search(index=index_name, doc_type=flow_text_index_type, body=query_body)['hits']['hits']
    for result in search_results:
        text = result['_source']['text'].encode('utf-8')
        node = createWordTree()
        sensitive_words_dict = searchWord(text, node)
        if sensitive_words_dict:
            sensitive_words_list = []
            for k, v in sensitive_words_dict.iteritems():
                tmp_stage = r_sensitive.hget("sensitive_words", k)
                if tmp_stage:
                    score += v * sensitive_score_dict[str(tmp_stage)]
    return score
def day_post_num_compute(uids_list, datetime):
    flow_text_index_name = flow_text_index_name_pre + datetime
    query_body = {
        'query': {
            'filtered': {
                'filter': {
                    'terms': {
                        'uid': uids_list
                    }
                }
            }
        },
        'aggs': {
            'all_uids': {
                'terms': {
                    'field': 'uid',
                    'size': MAX_SEARCH_SIZE
                }
            }
        }
    }
    es_uid_counts = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type,
                                        body=query_body)['aggregations']['all_uids']['buckets']
    uid_count_list = []  # 所有uid的日发帖量组成list
    for uid_count in es_uid_counts:
        uid_count_list.append(uid_count['doc_count'])
    uid_count_list_np = np.array(uid_count_list)
    day_post_median = np.median(uid_count_list_np)  ## 日发帖量的中位数
    return day_post_median
def get_user_keywords(uid, today_datetime):
    flow_text_index_list = get_flow_text_index_list(today_datetime)
    query_body = {
        '_source': ['keywords_string'],
        'query': {
            'filtered': {
                'filter': {
                    'bool': {
                        'must': [
                            {'term': {'uid': uid}}
                        ]
                    }
                }
            }
        },
        'size': MAX_SEARCH_SIZE
    }
    results = es_flow_text.search(index=flow_text_index_list, doc_type=flow_text_index_type, body=query_body)['hits']['hits']
    # print results
    keywords_list = []
    for item in results:
        keywords_list.extend(item['_source']['keywords_string'].split('&'))
    temp_keywords = list(set(keywords_list))
    keywords = '&'.join(temp_keywords)
    # print keywords
    return keywords
def query_hot_weibo(ts, origin_mid_list, time_segment):
    query_all_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [
                            {"range": {
                                "timestamp": {
                                    "gte": ts - time_segment,
                                    "lt": ts
                                }
                            }},
                            {"terms": {"root_mid": origin_mid_list}}
                        ]
                    }
                }
            }
        },
        "aggs": {
            "all_mid": {
                "terms": {"field": "root_mid", "size": 400},
                "aggs": {
                    "message_type": {
                        "terms": {
                            "field": "message_type"
                        }
                    }
                }
            }
        }
    }
    return_results = dict()
    datetime_1 = ts2datetime(ts)
    datetime_2 = ts2datetime(ts - 24 * 3600)
    index_name_1 = flow_text_index_name_pre + datetime_1
    index_name_2 = flow_text_index_name_pre + datetime_2
    index_list = []
    exist_es_1 = es_text.indices.exists(index_name_1)
    exist_es_2 = es_text.indices.exists(index_name_2)
    if exist_es_1:
        index_list.append(index_name_1)
    if exist_es_2:
        index_list.append(index_name_2)
    if index_list:
        results = es_text.search(index=index_list, doc_type=flow_text_index_type, body=query_all_body)['aggregations']['all_mid']['buckets']
        if results:
            for item in results:
                temp_dict = dict()
                temp_dict[item['key']] = item['doc_count']
                detail = item['message_type']['buckets']
                detail_dict = dict()
                for iter_item in detail:
                    detail_dict[iter_item['key']] = iter_item['doc_count']
                temp_dict['retweeted'] = detail_dict.get(3, 0)
                temp_dict['comment'] = detail_dict.get(2, 0)
                return_results[item['key']] = temp_dict
    return return_results
def get_important_user(ts, origin_mid_list, time_segment):
    query_all_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [{"range": {"timestamp": {"gte": ts - time_segment, "lt": ts}}}],
                        "should": [{"terms": {"root_mid": origin_mid_list}}, {"terms": {"mid": origin_mid_list}}],
                    }
                }
            }
        },
        "sort": {"user_fansnum": {"order": "desc"}},
        "size": 1000,
    }
    datetime = ts2datetime(ts - time_segment)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    results = []
    if origin_mid_list and exist_es:
        search_results = es_text.search(
            index=index_name, doc_type=flow_text_index_type, body=query_all_body, _source=False
        )["hits"]["hits"]
        if search_results:
            for item in search_results:
                results.append(item["_id"])
    return results
def query_mid_list(ts, social_sensors, time_segment, message_type=1):
    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [
                            {"range": {
                                "timestamp": {
                                    "gte": ts - time_segment,
                                    "lt": ts
                                }
                            }},
                            {"terms": {"uid": social_sensors}},
                            {"term": {"message_type": message_type}}
                        ]
                    }
                }
            }
        },
        "sort": {"sentiment": {"order": "desc"}},
        "size": 10000
    }
    datetime_1 = ts2datetime(ts)
    datetime_2 = ts2datetime(ts - 24 * 3600)
    index_name_1 = flow_text_index_name_pre + datetime_1
    index_name_2 = flow_text_index_name_pre + datetime_2
    index_list = []
    exist_es_1 = es_text.indices.exists(index_name_1)
    exist_es_2 = es_text.indices.exists(index_name_2)
    if exist_es_1:
        index_list.append(index_name_1)
    if exist_es_2:
        index_list.append(index_name_2)
    if index_list:
        search_results = es_text.search(index=index_list, doc_type=flow_text_index_type, body=query_body)["hits"]["hits"]
    else:
        search_results = []
    origin_mid_list = set()
    if search_results:
        for item in search_results:
            if message_type == 1:
                origin_mid_list.add(item["_id"])
            else:
                origin_mid_list.add(item['_source']['root_mid'])
    return list(origin_mid_list)
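# Several helpers above and below repeat the same two-day index selection inline.
# A minimal sketch of that pattern, for reference only (it assumes this module's
# globals es_text and flow_text_index_name_pre plus the ts2datetime helper used
# throughout; the name _existing_day_indices is illustrative and not called elsewhere):
def _existing_day_indices(ts, days=2):
    # Collect the daily flow_text indices that actually exist for the last `days` days.
    index_list = []
    for i in range(days):
        index_name = flow_text_index_name_pre + ts2datetime(ts - i * 24 * 3600)
        if es_text.indices.exists(index_name):
            index_list.append(index_name)
    return index_list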
def search_posts(uids, from_ts, to_ts): query_body = { 'query': { "filtered": { "filter": { "bool": { "must": [{ "terms": { "uid": uids } }, { "query": { "range": { "timestamp": { "gte": from_ts, "lte": to_ts } } } }], } } } }, 'size': MAX_SEARCH_SIZE, "sort": { "timestamp": { "order": "desc" } }, } search_results = es_flow_text.search(index=flow_text_index_name_pre + '*', doc_type=flow_text_index_type, body=query_body)['hits']['hits'] return [item['_source'] for item in search_results]
def detect_by_keywords(keywords, datetime_list):
    keyword_list = keywords
    group_uid_list = set()
    if datetime_list == []:
        return []
    query_item = 'keywords_string'
    nest_query_list = []
    for keyword in keyword_list:
        nest_query_list.append({'wildcard': {query_item: '*' + keyword + '*'}})
    keyword_query_list = [{'bool': {'should': nest_query_list}}]
    count = MAX_DETECT_COUNT
    for datetime in datetime_list:
        flow_text_index_name = flow_text_index_name_pre + datetime
        es_results = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type,
                                         body={'query': {'bool': {'must': keyword_query_list}},
                                               'size': count,
                                               'sort': [{'user_fansnum': {'order': 'desc'}}]})['hits']['hits']
        for item in es_results:
            group_uid_list.add(item['_source']['uid'])
    return list(group_uid_list)
def read_flow_text(uid_list): ''' 读取用户微博(返回结果没有微博情绪标签): 输入数据:uid_list(字符串型列表) 输出数据:word_dict(用户分词结果字典),weibo_list(用户微博列表) word_dict示例:{uid1:{'w1':f1,'w2':f2...}...} weibo_list示例:[[uid1,text1,ts1],[uid2,text2,ts2],...](每一条记录对应三个值:uid、text、timestamp) ''' word_dict = dict() #词频字典 weibo_list = [] #微博列表 online_pattern_dict = {} # {uid:[online_pattern1, ..],...} now_ts = time.time() #run_type if RUN_TYPE == 1: now_date_ts = datetime2ts(ts2datetime(now_ts)) else: now_date_ts = datetime2ts(RUN_TEST_TIME) start_date_ts = now_date_ts - DAY * WEEK for i in range(0, WEEK): iter_date_ts = start_date_ts + DAY * i flow_text_index_date = ts2datetime(iter_date_ts) flow_text_index_name = flow_text_index_name_pre + flow_text_index_date print flow_text_index_name flow_text_exist = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type,\ body={'query':{'filtered':{'filter':{'terms':{'uid': uid_list}}}}, 'size': MAX_VALUE}, _source=False, fields=['text','uid','keywords_dict','timestamp'])['hits']['hits'] for flow_text_item in flow_text_exist: uid = flow_text_item['fields']['uid'][0].encode('utf-8') text = flow_text_item['fields']['text'][0].encode('utf-8') ts = flow_text_item['fields']['timestamp'][0] keywords_dict = json.loads( flow_text_item['fields']['keywords_dict'][0]) keywords_dict = json.dumps(keywords_dict, encoding="UTF-8", ensure_ascii=False) keywords_dict = eval(keywords_dict) if word_dict.has_key(uid): item_dict = Counter(word_dict[uid]) keywords_dict = Counter(keywords_dict) item_dict = dict(item_dict + keywords_dict) word_dict[uid] = item_dict else: word_dict[uid] = keywords_dict weibo_list.append([uid, text, ts]) #test online pattern online_pattern = u'weibo.com' try: user_online_pattern_dict = online_pattern_dict[uid] except: online_pattern_dict[uid] = {} try: online_pattern_dict[uid][online_pattern] += 1 except: online_pattern_dict[uid][online_pattern] = 1 return word_dict, weibo_list, online_pattern_dict, start_date_ts
def read_flow_text(uid_list): ''' 读取用户微博(返回结果没有微博情绪标签): 输入数据:uid_list(字符串型列表) 输出数据:word_dict(用户分词结果字典),weibo_list(用户微博列表) word_dict示例:{uid1:{'w1':f1,'w2':f2...}...} weibo_list示例:[[uid1,text1,ts1],[uid2,text2,ts2],...](每一条记录对应三个值:uid、text、timestamp) ''' word_dict = dict()#词频字典 weibo_list = []#微博列表 online_pattern_dict = {} # {uid:[online_pattern1, ..],...} now_ts = time.time() #run_type if RUN_TYPE == 1: now_date_ts = datetime2ts(ts2datetime(now_ts)) else: now_date_ts = datetime2ts(RUN_TEST_TIME) start_date_ts = now_date_ts - DAY * WEEK for i in range(0,WEEK): iter_date_ts = start_date_ts + DAY * i flow_text_index_date = ts2datetime(iter_date_ts) flow_text_index_name = flow_text_index_name_pre + flow_text_index_date print flow_text_index_name try: flow_text_exist = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type,\ body={'query':{'filtered':{'filter':{'terms':{'uid': uid_list}}}}, 'size': MAX_VALUE}, _source=False, fields=['text','uid','keywords_dict','timestamp'])['hits']['hits'] except: flow_text_exist = [] for flow_text_item in flow_text_exist: uid = flow_text_item['fields']['uid'][0].encode('utf-8') text = flow_text_item['fields']['text'][0].encode('utf-8') ts = flow_text_item['fields']['timestamp'][0] keywords_dict = json.loads(flow_text_item['fields']['keywords_dict'][0]) keywords_dict = json.dumps(keywords_dict, encoding="UTF-8", ensure_ascii=False) keywords_dict = eval(keywords_dict) if word_dict.has_key(uid): item_dict = Counter(word_dict[uid]) keywords_dict = Counter(keywords_dict) item_dict = dict(item_dict + keywords_dict) word_dict[uid] = item_dict else: word_dict[uid] = keywords_dict weibo_list.append([uid,text,ts]) #test online pattern online_pattern = u'weibo.com' try: user_online_pattern_dict = online_pattern_dict[uid] except: online_pattern_dict[uid] = {} try: online_pattern_dict[uid][online_pattern] += 1 except: online_pattern_dict[uid][online_pattern] = 1 return word_dict,weibo_list, online_pattern_dict, start_date_ts
def aggregation_sentiment_related_weibo(ts, origin_mid_list, time_segment, message_type=1, uid_list=[]): if message_type == 1: query_all_body = { "query": { "filtered": { "filter": { "bool": { "must": [ {"range": {"timestamp": {"gte": ts - time_segment, "lt": ts}}}, {"terms": {"root_mid": origin_mid_list}}, ] } } } }, "aggs": {"all_sentiments": {"terms": {"field": "sentiment"}}}, } else: query_all_body = { "query": { "filtered": { "filter": { "bool": { "must": [ {"range": {"timestamp": {"gte": ts - time_segment, "lt": ts}}}, {"terms": {"root_mid": origin_mid_list}}, {"terms": {"directed_uid": uid_list}}, ] } } } }, "aggs": {"all_sentiments": {"terms": {"field": "sentiment"}}}, } results = {"0": 0, "1": 0, "2": 0, "3": 0, "4": 0, "5": 0, "6": 0} datetime_1 = ts2datetime(ts) datetime_2 = ts2datetime(ts - 24 * 3600) index_name_1 = flow_text_index_name_pre + datetime_1 index_name_2 = flow_text_index_name_pre + datetime_2 index_list = [] exist_es_1 = es_text.indices.exists(index_name_1) exist_es_2 = es_text.indices.exists(index_name_2) if exist_es_1: index_list.append(index_name_1) if exist_es_2: index_list.append(index_name_2) if index_list: search_results = es_text.search(index=index_list, doc_type=flow_text_index_type, body=query_all_body)[ "aggregations" ]["all_sentiments"]["buckets"] if search_results: for item in search_results: key = item["key"] count = item["doc_count"] results[key] = count print "results: ", results, sum(results.values()) return results
def compute_full_keywords(): now_time = int(time.time()) date_time = ts2datetime(now_time) flow_text_index_name = flow_text_index_name_pre + date_time query_body = { 'aggs': { 'keywords': { 'terms': { 'field': 'keywords_string', 'size': 1000 } } } } flow_text_exist=es_flow_text.search(index=flow_text_index_name,doc_type=flow_text_index_type,\ body=query_body)['aggregations']['keywords']['buckets'] word_dict = dict() word_dict_new = dict() keywords_string = '' for item in flow_text_exist: word = item['key'] count = item['doc_count'] word_dict[word] = count keywords_string += '&' keywords_string += item['key'] k_dict = extract_keywords(keywords_string) for item_item in k_dict: keyword = item_item.word # print 'keyword::',keyword,type(keyword) if word_dict.has_key(keyword): word_dict_new[keyword] = word_dict[keyword] else: word_dict_new[keyword] = 1 # print 'count:',word_dict_new[keyword] keywords_task_detail = dict() keywords_task_detail['date_time'] = date_time keywords_task_detail['timestamp'] = datetime2ts(date_time) keywords_task_detail['keyword_value_string'] = json.dumps(word_dict_new) keywords_task_id = date_time try: es_xnr.index(index=weibo_full_keyword_index_name, doc_type=weibo_full_keyword_index_type, body=keywords_task_detail, id=keywords_task_id) mark = True except: mark = False return mark
def xnr_keywords_compute(xnr_user_no): #查询关注列表 followers_list=lookup_weiboxnr_concernedusers(xnr_user_no) lookup_condition_list=[] # print 'xnr_user_no, followers_list:', xnr_user_no, followers_list lookup_condition_list.append({'filtered':{'filter':{'bool':{'must':{'terms':{'uid':followers_list}}}}}}) #根据日期确定查询表 if S_TYPE == 'test': date_time = test_date else: now_time=int(time.time()) - DAY date_time=ts2datetime(now_time) flow_text_index_name=flow_text_index_name_pre+date_time #按日期统计 #print lookup_condition_list for item_condition in lookup_condition_list: query_body={ 'query':item_condition, 'aggs':{ 'keywords':{ 'terms':{ 'field':'keywords_string', 'size': 1000 } } } } flow_text_exist=es_flow_text.search(index=flow_text_index_name,doc_type=flow_text_index_type,\ body=query_body)['aggregations']['keywords']['buckets'] word_dict = dict() word_dict_new = dict() keywords_string = '' for item in flow_text_exist: word = item['key'] count = item['doc_count'] word_dict[word] = count keywords_string += '&' keywords_string += item['key'] k_dict = extract_keywords(keywords_string) for item_item in k_dict: keyword = item_item.word # print 'keyword::',keyword,type(keyword) if word_dict.has_key(keyword): word_dict_new[keyword] = word_dict[keyword] else: word_dict_new[keyword] = 1 # print 'count:',word_dict_new[keyword] return word_dict_new
def aggregation_root_weibo_retweet(mid_list,ts): size = len(mid_list) if size == 0: size = 10 query_body = { "query":{ "filtered":{ "filter":{ "bool":{ "must":[ {"range": {"timestamp":{ "gte":ts-time_interval, "lt":ts }} }, {"terms":{"root_mid":mid_list}} ] } } } }, "aggs":{ "all_count":{ "terms":{"field":"root_mid", "size":size}, "aggs":{ "message_type":{ "terms":{ "field":"message_type" } } } } } } datetime = ts2datetime(ts-time_interval) index_name = "flow_text_"+datetime results = es_text.search(index=index_name, doc_type="text", body=query_body)['aggregations']['all_count']['buckets'] return_results = dict() if results: for item in results: temp_dict = dict() temp_dict[item['key']] = item['doc_count'] detail = item['message_type']['buckets'] detail_dict = dict() for iter_item in detail: detail_dict[iter_item['key']] = iter_item['doc_count'] temp_dict['retweeted'] = detail_dict.get(3, 0) temp_dict['comment'] = detail_dict.get(2, 0) return_results[item['key']] = temp_dict['retweeted'] be_retweetd_list = return_results.keys() no_retweeted = set(mid_list) - set(be_retweetd_list) for mid in no_retweeted: return_results[mid] = 0 return return_results
def get_hashtag(today_datetime): weibo_flow_text_index_name = get_day_flow_text_index_list(today_datetime) query_body = { 'query': { 'filtered': { 'filter': { 'bool': { 'must': [{ 'range': { 'sensitive': { 'gte': 1 } } }] } } } }, 'aggs': { 'all_hashtag': { 'terms': { 'field': 'hashtag' }, 'aggs': { 'sum_sensitive': { 'sum': { 'field': 'sensitive' } } } } }, 'size': EVENT_OFFLINE_COUNT } weibo_text_exist=es_flow_text.search(index=weibo_flow_text_index_name,doc_type=flow_text_index_type,\ body=query_body)['aggregations']['all_hashtag']['buckets'] hashtag_list = [] for item in weibo_text_exist: event_dict = dict() if item['key']: # print item['key'] event_dict['event_name'] = item['key'] event_dict['event_count'] = item['doc_count'] event_dict['event_sensitive'] = item['sum_sensitive']['value'] hashtag_list.append(event_dict) else: pass hashtag_list.sort(key=lambda k: (k.get('event_sensitive', 0), k.get('event_count', 0)), reverse=True) # print hashtag_list return hashtag_list
def query_related_weibo(ts, origin_mid_list, time_segment): query_all_body = { "query": { "filtered": { "filter": { "bool": { "must": [ {"range": { "timestamp":{ "gte": ts - time_segment, "lt": ts } }}, {"terms":{"root_mid":origin_mid_list}} ] } } } }, "aggs":{ "all_count":{ "terms":{"field": "message_type"} } } } return_results = {"origin": 0, "retweeted": 0, "comment": 0} datetime_1 = ts2datetime(ts) datetime_2 = ts2datetime(ts-24*3600) index_name_1 = flow_text_index_name_pre + datetime_1 index_name_2 = flow_text_index_name_pre + datetime_2 index_list = [] exist_es_1 = es_text.indices.exists(index_name_1) exist_es_2 = es_text.indices.exists(index_name_2) if exist_es_1: index_list.append(index_name_1) if exist_es_2: index_list.append(index_name_2) if index_list: results = es_text.search(index=index_list, doc_type=flow_text_index_type,body=query_all_body)['aggregations']['all_count']['buckets'] if results: for item in results: if int(item['key']) == 1: return_results['origin'] = item['doc_count'] elif int(item['key']) == 3: return_results['retweeted'] = item['doc_count'] elif int(item['key']) == 2: return_results['comment'] = item['doc_count'] else: pass return_results['total_count'] = sum(return_results.values()) print "return_results: ", return_results return return_results
def create_speech_warning(xnr_user_no,start_time,end_time): #查询关注列表 lookup_type='followers_list' followers_list=lookup_xnr_fans_followers(xnr_user_no,lookup_type) result = [] query_body={ 'query':{ 'filtered':{ 'filter':{ 'bool':{ 'must':[ {'range':{'sensitive':{'gte':1}}}, {'range':{'timestamp':{'gte':start_time,'lte':end_time}}} ]} } } }, 'size':MAX_HOT_POST_SIZE, 'sort':{'sensitive':{'order':'desc'}} } flow_text_index_name=get_day_flow_text_index_list(end_time) results=es_flow_text.search(index=flow_text_index_name,doc_type=flow_text_index_type,body=query_body)['hits']['hits'] warning_type = 'speech' r_result = remove_repeat(results,warning_type) for item in r_result: item['nick_name']=get_user_nickname(item['uid']) followers_mark = set_intersection(item['uid'],followers_list) if followers_mark != 0: item['content_type']='follow' else: item['content_type']='unfollow' item['validity']=0 item['xnr_user_no']=xnr_user_no task_id=xnr_user_no+'_'+item['mid'] #写入数据库 today_date=ts2datetime(end_time) weibo_speech_warning_index_name=weibo_speech_warning_index_name_pre+today_date if not es_xnr.indices.exists(index=weibo_speech_warning_index_name): weibo_speech_warning_mappings(weibo_speech_warning_index_name) try: es_xnr.index(index=weibo_speech_warning_index_name,doc_type=weibo_speech_warning_index_type,body=item,id=task_id) mark=True except: mark=False result.append(mark) print 'speech_result::',result return result
def query_related_weibo(ts, origin_mid_list, time_segment):
    query_all_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [
                            {"range": {
                                "timestamp": {
                                    "gte": ts - time_segment,
                                    "lt": ts
                                }
                            }},
                            {"terms": {"root_mid": origin_mid_list}}
                        ]
                    }
                }
            }
        },
        "aggs": {
            "all_count": {
                "terms": {"field": "message_type"}
            }
        }
    }
    return_results = {"origin": 0, "retweeted": 0, "comment": 0}
    datetime_1 = ts2datetime(ts)
    datetime_2 = ts2datetime(ts - 24 * 3600)
    index_name_1 = flow_text_index_name_pre + datetime_1
    index_name_2 = flow_text_index_name_pre + datetime_2
    index_list = []
    exist_es_1 = es_text.indices.exists(index_name_1)
    exist_es_2 = es_text.indices.exists(index_name_2)
    if exist_es_1:
        index_list.append(index_name_1)
    if exist_es_2:
        index_list.append(index_name_2)
    if index_list:
        results = es_text.search(index=index_list, doc_type=flow_text_index_type, body=query_all_body)['aggregations']['all_count']['buckets']
        if results:
            for item in results:
                if int(item['key']) == 1:
                    return_results['origin'] = item['doc_count']
                elif int(item['key']) == 3:
                    return_results['retweeted'] = item['doc_count']
                elif int(item['key']) == 2:
                    return_results['comment'] = item['doc_count']
                else:
                    pass
    return_results['total_count'] = sum(return_results.values())
    return return_results
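# A condensed, illustrative sketch of how the sensing tasks further down combine
# the helpers above (query_mid_list, query_related_weibo and query_hot_weibo as
# defined in this module); sensing_snapshot itself is hypothetical and assumes
# the module-level time_interval window:
def sensing_snapshot(ts, social_sensors):
    origin_mids = query_mid_list(ts, social_sensors, time_interval)        # message_type=1
    retweeted_mids = query_mid_list(ts, social_sensors, time_interval, 3)  # message_type=3
    all_mids = origin_mids + retweeted_mids
    statistics_count = query_related_weibo(ts, all_mids, time_interval)    # origin/retweeted/comment/total_count
    hot_weibo_detail = query_hot_weibo(ts, all_mids, time_interval)        # per-root_mid heat detail
    return statistics_count, hot_weibo_detail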
def query_mid_list(ts, social_sensors, time_segment, message_type=1): query_body = { "query": { "filtered": { "filter": { "bool": { "must":[ {"range": { "timestamp": { "gte": ts - time_segment, "lt": ts } }}, {"terms":{"uid": social_sensors}}, {"term":{"message_type": message_type}} ] } } } }, "sort": {"sentiment": {"order": "desc"}}, "size": 10000 } index_list = ["flow_text_gangdu"] # 被感知的数据库,后期根据情况修改 search_results = es_flow_text.search(index=index_list, doc_type=type_flow_text_index, body=query_body)["hits"]["hits"] mid_dict = dict() origin_mid_list = set() if search_results: for item in search_results: if message_type == 1: origin_mid_list.add(item["_id"]) else: origin_mid_list.add(item['_source']['root_mid']) mid_dict[item['_source']['root_mid']] = item["_id"] # 源头微博和当前转发微博的mid if message_type != 1: # 保证获取的源头微博能在index_list索引中内找到,否则丢弃,即:如果是很早之前发的帖子就丢弃,保证时效性 filter_list = [] filter_mid_dict = dict() for iter_index in index_list: if origin_mid_list: exist_es = es_flow_text.mget(index=iter_index, doc_type=type_flow_text_index, body={"ids":list(origin_mid_list)})["docs"] for item in exist_es: if item["found"]: filter_list.append(item["_id"]) filter_mid_dict[item["_id"]] = mid_dict[item["_id"]] origin_mid_list = filter_list mid_dict = filter_mid_dict return list(origin_mid_list), mid_dict
def sort_retweet_sensitive_weibo(sensitive_mid_list): #sensitive_mid_list = get_top_mid() query_body = { "query": { "filtered": { "filter": { "bool": { "must": [{ "terms": { "root_mid": sensitive_mid_list } }, { "term": { "message_type": 3 } }] } } } }, "aggs": { "all_count": { "terms": { "field": "root_uid", "size": 20 } } } } if RUN_TYPE: index_name = "flow_text_" + ts2datetime(time.time()) else: index_name = "flow_text_2016-05-20" uid_list = [] results = [] number = [] search_results = es_flow_text.search( index=index_name, doc_type="text", body=query_body)["aggregations"]["all_count"]["buckets"] for item in search_results: uid_list.append(item['key']) number.append(item['doc_count']) if uid_list: results = get_user_profile(uid_list, ['nick_name']) for i in range(len(uid_list)): results[i].append(number[i]) return results
def get_important_user(ts, origin_mid_list, time_segment): query_all_body = { "query": { "filtered": { "filter": { "bool": { "must": [{ "range": { "timestamp": { "gte": ts - time_segment, "lt": ts } } }], "should": [{ "terms": { "root_mid": origin_mid_list } }, { "terms": { "mid": origin_mid_list } }] } } } }, "sort": { "user_fansnum": { "order": "desc" } }, "size": 1000 } if RUN_TYPE == 0: query_all_body["sort"] = {"timestamp": {"order": "desc"}} datetime = ts2datetime(ts - time_segment) index_name = flow_text_index_name_pre + datetime exist_es = es_text.indices.exists(index_name) results = [] if origin_mid_list and exist_es: search_results = es_text.search(index=index_name, doc_type=flow_text_index_type, body=query_all_body, fields=["uid"], _source=False)["hits"]["hits"] if search_results: for item in search_results: results.append(item['fields']['uid'][0]) return results
def get_tweets_from_flow(monitor_keywords_list,sort_item_new): nest_query_list = [] for monitor_keyword in monitor_keywords_list: nest_query_list.append({'wildcard':{'keywords_string':'*'+monitor_keyword+'*'}}) query_body = { 'query':{ 'bool':{ 'should':nest_query_list } }, 'sort':[{sort_item_new:{'order':'desc'}},{'timestamp':{'order':'desc'}}], 'size':TOP_WEIBOS_LIMIT } if S_TYPE == 'test': now_ts = datetime2ts(S_DATE) else: now_ts = int(time.time()) datetime = ts2datetime(now_ts-24*3600) index_name = flow_text_index_name_pre + datetime es_results = es_flow_text.search(index=index_name,doc_type=flow_text_index_type,body=query_body)['hits']['hits'] if not es_results: es_results = es_flow_text.search(index=index_name,doc_type=flow_text_index_type,\ body={'query':{'match_all':{}},'size':TOP_WEIBOS_LIMIT,\ 'sort':{sort_item_new:{'order':'desc'}}})['hits']['hits'] results_all = [] for result in es_results: result = result['_source'] uid = result['uid'] nick_name,photo_url = uid2nick_name_photo(uid) result['nick_name'] = nick_name result['photo_url'] = photo_url results_all.append(result) return results_all
def get_weibo_content(index_list, mid_list):
    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "terms": {"mid": mid_list}
                }
            }
        },
        "size": 99999
    }
    search_results = es_text.search(index=index_list, doc_type="text", body=query_body)['hits']['hits']
    return search_results
def temporal_keywords(ts1, ts2): keywords_set = set() date = ts2datetime(time.time()) date = "2013-09-07" index_date = flow_text_index_name_pre + date #search_results = es_text.search(index=index_date, doc_type=flow_text_index_type, body=aggregation_range(ts1, ts2))['aggregations']['all_interests']['buckets'] search_results = es_text.search(index=index_date, doc_type=flow_text_index_type, body=aggregation_sentiment(ts1, ts2, ["舟曲", "泥石流"], "keywords_string", 20))['aggregations']['all_sentiment']['buckets'] # print keywords for item in search_results: print item["key"].encode("utf-8", "ignore"), item["doc_count"], "\n" #keywords_set.add(item["key"].encode("utf-8", "ignore")) return keywords_set
def query_mid_list(ts, social_sensors, time_segment, message_type=1): query_body = { "query": { "filtered": { "filter": { "bool": { "must":[ {"range": { "timestamp": { "gte": ts - time_segment, "lt": ts } }}, {"terms":{"uid": social_sensors}}, {"term":{"message_type": message_type}} ] } } } }, "sort": {"sentiment": {"order": "desc"}}, "size": 10000 } datetime_1 = ts2datetime(ts) datetime_2 = ts2datetime(ts-24*3600) index_name_1 = flow_text_index_name_pre + datetime_1 index_name_2 = flow_text_index_name_pre + datetime_2 index_list = [] exist_es_1 = es_text.indices.exists(index_name_1) exist_es_2 = es_text.indices.exists(index_name_2) if exist_es_1: index_list.append(index_name_1) if exist_es_2: index_list.append(index_name_2) if index_list: search_results = es_text.search(index=index_list, doc_type=flow_text_index_type, body=query_body)["hits"]["hits"] else: search_results = [] origin_mid_list = set() if search_results: for item in search_results: if message_type == 1: origin_mid_list.add(item["_id"]) else: origin_mid_list.add(item['_source']['root_mid']) return list(origin_mid_list)
def find_flow_texts(start_ts, end_ts, topic, en_name): #多个wildcard/时间戳的range # if RUN_TYPE == 1: # today = datetime.date.today() # else: # today = datetime.date(2016,05,23) query_body = {'query': {'wildcard': {'text': '*' + topic + '*'}}} index_names = get_day_zero(start_ts, end_ts) result = [] index_list = [] for index_name in index_names: index_list.append(flow_text_index_name_pre + index_name) result = es_flow_text.search(index=index_list, doc_type=flow_text_index_type, body=query_body)['hits']['hits'] print flow_text_index_name_pre + index_name, es_flow_text, len(result) if result: save_es(en_name, result)
def uid_list_2_uid_keywords_dict(uids_list,datetime_list,label='other'): uid_weibo_keywords_dict = dict() keywords_dict_all_users = dict() uid_weibo = [] # [[uid1,text1,ts1],[uid2,text2,ts2],...] for datetime in datetime_list: flow_text_index_name = flow_text_index_name_pre + datetime query_body = { 'query':{ 'filtered':{ 'filter':{ 'terms':{ 'uid':uids_list } } } }, 'size':MAX_SEARCH_SIZE } es_weibo_results = es_flow_text.search(index=flow_text_index_name,doc_type=flow_text_index_type,\ body=query_body)['hits']['hits'] print len(es_weibo_results) for i in range(len(es_weibo_results)): uid = es_weibo_results[i]['_source']['uid'] keywords_dict = es_weibo_results[i]['_source']['keywords_dict'] keywords_dict = json.loads(keywords_dict) if label == 'character': text = es_weibo_results[i]['_source']['text'] timestamp = es_weibo_results[i]['_source']['timestamp'] uid_weibo.append([uid,text,timestamp]) ## 合并相同id的关键词字典 if uid in uid_weibo_keywords_dict.keys(): uid_weibo_keywords_dict[uid] = dict(Counter(uid_weibo_keywords_dict[uid])+Counter(keywords_dict)) else: uid_weibo_keywords_dict[uid] = keywords_dict ## 合并所有用户的关键词字典 keywords_dict_all_users = dict(Counter(keywords_dict_all_users)+Counter(keywords_dict)) if label == 'character': return uid_weibo_keywords_dict,keywords_dict_all_users,uid_weibo else: return uid_weibo_keywords_dict,keywords_dict_all_users
def create_speech_warning(xnr_user_no,today_datetime): #查询关注列表 lookup_type='followers_list' followers_list=lookup_xnr_fans_followers(xnr_user_no,lookup_type) query_body={ 'query':{ 'filtered':{ 'filter':{ 'bool':{'must':{'range':{'sensitive':{'gte':1}}}} } } }, 'size':MAX_SEARCH_SIZE, 'sort':{'sensitive':{'order':'desc'}} } flow_text_index_name=get_day_flow_text_index_list(today_datetime) results=es_flow_text.search(index=flow_text_index_name,doc_type=flow_text_index_type,body=query_body)['hits']['hits'] result=[] for item in results: item['_source']['nick_name']=get_user_nickname(item['_source']['uid']) if item['_source']['uid'] in followers_list: item['_source']['content_type']='follow' else: item['_source']['content_type']='unfollow' item['_source']['validity']=0 item['_source']['xnr_user_no']=xnr_user_no task_id=xnr_user_no+'_'+item['_source']['mid'] #写入数据库 today_date=ts2datetime(today_datetime) weibo_speech_warning_index_name=weibo_speech_warning_index_name_pre+today_date try: es_xnr.index(index=weibo_speech_warning_index_name,doc_type=weibo_speech_warning_index_type,body=item['_source'],id=task_id) mark=True except: mark=False result.append(mark) return result
def lookup_weibo_date_warming(keywords, today_datetime): keyword_query_list = [] for keyword in keywords: # keyword = keyword.encode('utf-8') print 'keyword:', keyword, type(keyword) keyword_query_list.append({'wildcard': {'text': '*' + keyword + '*'}}) # keyword_query_list.append({'wildcard':{'text':{'wildcard':'*'+keyword.encode('utf-8')+'*'}}}) flow_text_index_name = get_day_flow_text_index_list(today_datetime) # keyword_query_list.append({'range':{'sensitive':{'gte':1}}}) query_body = { 'query': { 'bool': { # 'must':[{'range':{'sensitive':{'gte':1}}}], 'should': keyword_query_list } }, 'size': MAX_WARMING_SIZE, 'sort': { 'sensitive': { 'order': 'desc' } } } if es_flow_text.indices.exists(index=flow_text_index_name): #try: temp_result = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type, body=query_body)['hits']['hits'] date_result = [] print keyword_query_list for item in temp_result: # print 'item-text:', item['_source']['text'], type(item['_source']['text']) item['_source']['nick_name'] = get_user_nickname( item['_source']['uid']) date_result.append(item['_source']) #except: # date_result=[] else: pass return date_result
def get_warning_content(nodes, content_type, trace_datetime): query_body = { 'query': { 'filtered': { 'filter': { 'bool': { 'must': [{ 'terms': { 'uid': nodes } }, { 'range': { content_type: { 'gt': 0 } } }] } } } }, 'size': 50, 'sort': { content_type: { 'order': 'desc' } } } flow_text_index_name = flow_text_index_name_pre + ts2datetime( trace_datetime) print 'flow_text_index_name::', flow_text_index_name print 'content_type::', content_type try: es_content = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type, body=query_body)['hits']['hits'] warning_content = [] for item in es_content: warning_content.append(item['_source']) except: warning_content = [] return warning_content
def key_words_search( search_type , pre , during , start_time , keyword , search_key = '' , sort_norm = '', sort_scope = '' ,time = 1 , isall = False): query = {"query":{"bool":{"must":[{"term":{"user_rank_task.user_ts":search_key}}],"must_not":[],"should":[]}},"from":0,"size":10,"sort":[],"facets":{}} result = es_9200.search(index = USER_RANK_KEYWORD_TASK_INDEX , doc_type = USER_RANK_KEYWORD_TASK_TYPE , body = query)['hits']['hits'] search_id = result[0]['_id'] item = result[0]['_source'] item['status'] = -1 # 任务 item['result'] = json.dumps(results) es_9200.index(index = USER_RANK_KEYWORD_TASK_INDEX , doc_type=USER_RANK_KEYWORD_TASK_TYPE , id=search_id, body=item) keywords = keyword.split(",") should = [] for key in keywords: if search_type == "hashtag": should.append({"prefix":{"text.text": "#" + key + "#"}}) else: should.append({"prefix":{"text.text":key}}) date = start_time index_name = pre + start_time while not es_9206.indices.exists(index= index_name) : new_time = datetime2ts(date) + DAY date = ts2datetime(new_time) index_name = pre + date during -= 1 uid_set = set() for i in range(during): print index_name query = {"query":{"bool":{"must":[],"must_not":[],"should":should}},"size":MAX_ITEMS,"sort":[],"facets":{},"fields":['uid']} try : temp = es_9206.search(index = index_name , doc_type = 'text' , body = query) result = temp['hits']['hits'] print "Fetch " + str(len(result)) for item in result : uid_set.add(item['fields']['uid'][0].encode("utf-8") ) except Exception,e: print e raise Exception('user_list failed!') new_time = datetime2ts(date) + DAY date = ts2datetime(new_time) index_name = pre + date i += 1
def main(): uid_list = [] count = 0 with open('uid_list_0520.txt', 'rb') as f: for item in f: uid_list.append(item.strip()) print "uid_list: ", len(uid_list) print uid_list[:3] query_body = { "query":{ "filtered":{ "filter":{ "terms":{"uid":uid_list} } } }, "size":100000 } with open('uid_text_0523.txt', 'wb') as f_txt: #ts = datetime2ts(ts2datetime(time.time()-24*3600)) ts = datetime2ts(ts2datetime(time.time())) #today while 1: date = ts2datetime(ts) index_name = "flow_text_"+str(date) print index_name exist_bool = es_flow_text.indices.exists(index=index_name) if not exist_bool: break search_results = es_flow_text.search(index=index_name, doc_type="text", body=query_body)["hits"]["hits"] print len(search_results) if search_results: for item in search_results: f_txt.write(json.dumps(item['_source'])+"\n") count += 1 ts = ts-24*3600 break print count
def get_important_user(ts, origin_mid_list, time_segment):
    query_all_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [
                            {"range": {
                                "timestamp": {
                                    "gte": ts - time_segment,
                                    "lt": ts
                                }
                            }}
                        ],
                        "should": [
                            {"terms": {"root_mid": origin_mid_list}},
                            {"terms": {"mid": origin_mid_list}}
                        ]
                    }
                }
            }
        },
        "aggs": {
            "all_count": {
                "terms": {"field": "root_uid", "size": 1000}
            }
        }
    }
    datetime = ts2datetime(ts - time_segment)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    results = dict()
    if origin_mid_list and exist_es:
        search_results = es_text.search(index=index_name, doc_type=flow_text_index_type, body=query_all_body)['aggregations']['all_count']['buckets']
        if search_results:
            for item in search_results:
                results[item['key']] = item['doc_count']
    return results
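# Hypothetical follow-up showing how the {root_uid: doc_count} aggregation
# returned above could be ranked; top_important_users and top_n are illustrative
# additions, not part of the original module:
def top_important_users(ts, origin_mid_list, time_segment, top_n=100):
    uid_count = get_important_user(ts, origin_mid_list, time_segment)
    ranked = sorted(uid_count.items(), key=lambda item: item[1], reverse=True)
    return [uid for uid, count in ranked[:top_n]]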
def compute_network_task(network_task_information): results = {} #step1: get task information start_date = network_task_information['start_date'] start_ts = datetime2ts(start_date) end_date = network_task_information['end_date'] end_ts = datetime2ts(end_date) iter_date_ts = start_ts to_date_ts = end_ts iter_query_date_list = [] # ['2013-09-01', '2013-09-02'] while iter_date_ts <= to_date_ts: iter_date = ts2datetime(iter_date_ts) iter_query_date_list.append(iter_date) iter_date_ts += DAY #step2: get iter search flow_text_index_name #step2.1: get search keywords list query_must_list = [] keyword_nest_body_list = [] keywords_string = network_task_information['query_keywords'] keywords_list = keywords_string.split('&') for keywords_item in keywords_list: keyword_nest_body_list.append({'wildcard': {'text': '*' + keywords_item + '*'}}) query_must_list.append({'bool': {'should': keyword_nest_body_list}}) query_must_list.append({'term': {'message_type': '3'}}) #step2.2: iter search by date results = [] for iter_date in iter_query_date_list: flow_text_index_name = flow_text_index_name_pre + iter_date query_body = { 'query':{ 'bool':{ 'must':query_must_list } } } flow_text_result = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type,\ body=query_body)['hits']['hits'] results.extend(flow_text_result) return results
def get_group_keywords(uid_list):
    now_ts = time.time()
    now_ts = datetime2ts('2013-09-03')
    former_ts = now_ts - DAY
    flow_index_1 = flow_text_index_name_pre + ts2datetime(now_ts)
    flow_index_2 = flow_text_index_name_pre + ts2datetime(former_ts)
    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "terms": {
                        "uid": uid_list
                    }
                }
            }
        },
        "size": 10000
    }
    text_list = []  # 为分词前的文本
    word_dict = dict()  # 分词后的word dict
    text_results = es_flow_text.search(index=[flow_index_1, flow_index_2], doc_type=flow_text_index_type, body=query_body)["hits"]["hits"]
    if text_results:
        for item in text_results:
            iter_text = item['_source']['text'].encode('utf-8', "ignore")
            iter_text = re_cut(iter_text)
            text_list.append(iter_text)
    if text_list:
        for iter_text in text_list:
            cut_text = sw.participle(iter_text)
            cut_word_list = [term for term, cx in cut_text if cx in cx_dict]
            for w in cut_word_list:
                if word_dict.has_key(w):
                    word_dict[w] += 1
                else:
                    word_dict[w] = 1
    return word_dict
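# Hypothetical usage of the word-count dict built above; top_group_keywords and
# top_n are illustrative additions, not part of the original module:
def top_group_keywords(uid_list, top_n=50):
    word_dict = get_group_keywords(uid_list)
    return sorted(word_dict.items(), key=lambda item: item[1], reverse=True)[:top_n]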
def social_sensing(task_detail): # 任务名 传感器 终止时间 之前状态 创建者 时间 task_name = task_detail[0] social_sensors = task_detail[1] stop_time = task_detail[2] create_by = task_detail[3] ts = int(task_detail[4]) print ts2date(ts) # PART 1 #forward_result = get_forward_numerical_info(task_name, ts, create_by) # 之前时间阶段内的原创微博list/retweeted forward_origin_weibo_list = query_mid_list(ts-time_interval, social_sensors, forward_time_range) forward_retweeted_weibo_list = query_mid_list(ts-time_interval, social_sensors, forward_time_range, 3) # 当前阶段内原创微博list current_mid_list = query_mid_list(ts, social_sensors, time_interval) current_retweeted_mid_list = query_mid_list(ts, social_sensors, time_interval, 3) all_mid_list = [] all_mid_list.extend(current_mid_list) all_mid_list.extend(current_retweeted_mid_list) all_mid_list.extend(forward_origin_weibo_list) all_mid_list.extend(forward_retweeted_weibo_list) all_origin_list = [] all_origin_list.extend(current_mid_list) all_origin_list.extend(forward_origin_weibo_list) all_retweeted_list = [] all_retweeted_list.extend(current_retweeted_mid_list) all_retweeted_list.extend(forward_retweeted_weibo_list)#被转发微博的mid/root-mid print "all mid list: ", len(all_mid_list) #print "all_origin_list", all_origin_list #print "all_retweeted_list", all_retweeted_list # 查询微博在当前时间内的转发和评论数, 聚合按照message_type statistics_count = query_related_weibo(ts, all_mid_list, time_interval) if all_origin_list: origin_weibo_detail = query_hot_weibo(ts, all_origin_list, time_interval) # 原创微博详情 else: origin_weibo_detail = {} if all_retweeted_list: retweeted_weibo_detail = query_hot_weibo(ts, all_retweeted_list, time_interval) # 转发微博详情 else: retweeted_weibo_detail = {} current_total_count = statistics_count['total_count'] # 当前阶段内所有微博总数 current_retweeted_count = statistics_count['retweeted'] current_comment_count = statistics_count['comment'] # 聚合当前时间内重要的人 important_uid_list = [] datetime = ts2datetime(ts-time_interval) index_name = flow_text_index_name_pre + datetime exist_es = es_text.indices.exists(index_name) if exist_es: search_results = get_important_user(ts, all_mid_list, time_interval) important_uid_list = search_results # 根据获得uid_list,从人物库中匹配重要人物 if important_uid_list: important_results = es_user_portrait.mget(index=portrait_index_name,doc_type=portrait_index_type, body={"ids": important_uid_list})['docs'] else: important_results = [] filter_important_list = [] # uid_list if important_results: for item in important_results: if item['found']: #if item['_source']['importance'] > IMPORTANT_USER_THRESHOULD: filter_important_list.append(item['_id']) print "filter_important_list", filter_important_list print "important_results", important_uid_list #判断感知 finish = unfinish_signal # "0" process_status = "1" if int(stop_time) <= ts: # 检查任务是否已经完成 finish = finish_signal process_status = "0" # 感知到的事, all_mid_list sensitive_text_list = [] # 有事件发生时开始 if 1: index_list = [] important_words = [] datetime_1 = ts2datetime(ts) index_name_1 = flow_text_index_name_pre + datetime_1 exist_es = es_text.indices.exists(index=index_name_1) if exist_es: index_list.append(index_name_1) datetime_2 = ts2datetime(ts-DAY) index_name_2 = flow_text_index_name_pre + datetime_2 exist_es = es_text.indices.exists(index=index_name_2) if exist_es: index_list.append(index_name_2) if index_list and all_mid_list: query_body = { "query":{ "filtered":{ "filter":{ "terms":{"mid": all_mid_list} } } }, "size": 5000 } search_results = es_text.search(index=index_list, doc_type="text", body=query_body)['hits']['hits'] tmp_sensitive_warning = "" 
text_dict = dict() # 文本信息 mid_value = dict() # 文本赋值 duplicate_dict = dict() # 重合字典 portrait_dict = dict() # 背景信息 classify_text_dict = dict() # 分类文本 classify_uid_list = [] duplicate_text_list = [] sensitive_words_dict = dict() if search_results: for item in search_results: iter_uid = item['_source']['uid'] iter_mid = item['_source']['mid'] iter_text = item['_source']['text'].encode('utf-8', 'ignore') iter_sensitive = item['_source'].get('sensitive', 0) duplicate_text_list.append({"_id":iter_mid, "title": "", "content":iter_text}) if iter_sensitive: tmp_sensitive_warning = signal_sensitive_variation #涉及到敏感词的微博 sensitive_words_dict[iter_mid] = iter_sensitive keywords_dict = json.loads(item['_source']['keywords_dict']) personal_keywords_dict = dict() for k, v in keywords_dict.iteritems(): k = k.encode('utf-8', 'ignore') personal_keywords_dict[k] = v classify_text_dict[iter_mid] = personal_keywords_dict classify_uid_list.append(iter_uid) # 去重 if duplicate_text_list: dup_results = duplicate(duplicate_text_list) for item in dup_results: if item['duplicate']: duplicate_dict[item['_id']] = item['same_from'] # 分类 if classify_text_dict: classify_results = topic_classfiy(classify_uid_list, classify_text_dict) mid_value = dict() #print "classify_results: ", classify_results for k,v in classify_results.iteritems(): # mid:value mid_value[k] = topic_value_dict[v[0]] sensitive_weibo_detail = {} if sensitive_words_dict: sensitive_mid_list = sensitive_words_dict.keys() sensitive_weibo_detail = query_hot_weibo(ts, sensitive_mid_list, time_interval) results = dict() results['mid_topic_value'] = json.dumps(mid_value) results['duplicate_dict'] = json.dumps(duplicate_dict) results['sensitive_words_dict'] = json.dumps(sensitive_words_dict) results['sensitive_weibo_detail'] = json.dumps(sensitive_weibo_detail) results['origin_weibo_number'] = len(all_origin_list) results['retweeted_weibo_number'] = len(all_retweeted_list) results['origin_weibo_detail'] = json.dumps(origin_weibo_detail) results['retweeted_weibo_detail'] = json.dumps(retweeted_weibo_detail) results['retweeted_weibo_count'] = current_retweeted_count results['comment_weibo_count'] = current_comment_count results['weibo_total_number'] = current_total_count results['important_users'] = json.dumps(filter_important_list) results['unfilter_users'] = json.dumps(important_uid_list) results['timestamp'] = ts #results['clustering_topic'] = json.dumps(topic_list) # es存储当前时段的信息 doctype = create_by + '-' + task_name es_user_portrait.index(index=index_sensing_task, doc_type=doctype, id=ts, body=results) # 更新manage social sensing的es信息 temporal_result = es_user_portrait.get(index=index_manage_social_task, doc_type=task_doc_type, id=doctype)['_source'] temporal_result['finish'] = finish temporal_result['processing_status'] = process_status history_status = json.loads(temporal_result['history_status']) history_status.append(ts) temporal_result['history_status'] = json.dumps(history_status) es_user_portrait.index(index=index_manage_social_task, doc_type=task_doc_type, id=doctype, body=temporal_result) return "1"
def social_sensing(task_detail): # 任务名 传感器 终止时间 之前状态 创建者 时间 task_name = task_detail[0] social_sensors = task_detail[1] stop_time = task_detail[2] forward_warning_status = task_detail[3] create_by = task_detail[4] ts = int(task_detail[5]) new = int(task_detail[6]) print ts2date(ts) # PART 1 forward_result = get_forward_numerical_info(task_name, ts, create_by) # 之前时间阶段内的原创微博list/retweeted forward_origin_weibo_list = query_mid_list(ts - time_interval, social_sensors, forward_time_range) forward_retweeted_weibo_list = query_mid_list(ts - time_interval, social_sensors, forward_time_range, 3) # 当前阶段内原创微博list current_mid_list = query_mid_list(ts, social_sensors, time_interval) current_retweeted_mid_list = query_mid_list(ts, social_sensors, time_interval, 3) all_mid_list = [] all_mid_list.extend(current_mid_list) all_mid_list.extend(current_retweeted_mid_list) all_mid_list.extend(forward_origin_weibo_list) all_mid_list.extend(forward_retweeted_weibo_list) all_origin_list = [] all_origin_list.extend(current_mid_list) all_origin_list.extend(forward_origin_weibo_list) all_retweeted_list = [] all_retweeted_list.extend(current_retweeted_mid_list) all_retweeted_list.extend(forward_retweeted_weibo_list) # 被转发微博的mid/root-mid print "all mid list: ", len(all_mid_list) # print "all_origin_list", all_origin_list # print "all_retweeted_list", all_retweeted_list # 查询微博在当前时间内的转发和评论数, 聚合按照message_type statistics_count = query_related_weibo(ts, all_mid_list, time_interval) if all_origin_list: origin_weibo_detail = query_hot_weibo(ts, all_origin_list, time_interval) # 原创微博详情 else: origin_weibo_detail = {} if all_retweeted_list: retweeted_weibo_detail = query_hot_weibo(ts, all_retweeted_list, time_interval) # 转发微博详情 else: retweeted_weibo_detail = {} current_total_count = statistics_count["total_count"] # 当前阶段内所有微博总数 current_retweeted_count = statistics_count["retweeted"] current_comment_count = statistics_count["comment"] # PART 2 # 聚合当前时间内积极、中性、悲伤、愤怒情绪分布 # sentiment_dict = {"0": "neutral", "1":"positive", "2":"sad", "3": "anger"} sentiment_count = {"0": 0, "1": 0, "2": 0, "3": 0} search_results = aggregation_sentiment_related_weibo(ts, all_mid_list, time_interval) sentiment_count = search_results print "sentiment_count: ", sentiment_count negetive_key = ["2", "3", "4", "5", "6"] negetive_count = 0 for key in negetive_key: negetive_count += sentiment_count[key] # 聚合当前时间内重要的人 important_uid_list = [] datetime = ts2datetime(ts - time_interval) index_name = flow_text_index_name_pre + datetime exist_es = es_text.indices.exists(index_name) if exist_es: search_results = get_important_user(ts, all_mid_list, time_interval) important_uid_list = search_results # 根据获得uid_list,从人物库中匹配重要人物 if important_uid_list: important_results = es_user_portrait.mget( index=portrait_index_name, doc_type=portrait_index_type, body={"ids": important_uid_list} )["docs"] else: important_results = [] filter_important_list = [] # uid_list if important_results: for item in important_results: if item["found"]: # if item['_source']['importance'] > IMPORTANT_USER_THRESHOULD: filter_important_list.append(item["_id"]) # 判断感知 burst_reason = signal_nothing_variation warning_status = signal_nothing finish = unfinish_signal # "0" process_status = "1" if forward_result[0]: # 根据移动平均判断是否有时间发生 mean_count = forward_result[1] std_count = forward_result[2] mean_sentiment = forward_result[3] std_sentiment = forward_result[4] if ( mean_count >= MEAN_COUNT and current_total_count > mean_count + 1.96 * std_count or current_total_count >= len(all_mid_list) * AVERAGE_COUNT 
): # 异常点发生 if forward_warning_status == signal_brust: # 已有事件发生,改为事件追踪 warning_status = signal_track else: warning_status = signal_brust burst_reason += signal_count_varition # 数量异常 if ( negetive_count > mean_sentiment + 1.96 * std_sentiment and mean_sentiment >= MEAN_COUNT or negetive_count >= len(all_mid_list) * AVERAGE_COUNT ): warning_status = signal_brust burst_reason += signal_sentiment_varition # 负面情感异常, "12"表示两者均异常 if forward_warning_status == signal_brust: # 已有事件发生,改为事件追踪 warning_status = signal_track if int(stop_time) <= ts: # 检查任务是否已经完成 finish = finish_signal process_status = "0" # 感知到的事, all_mid_list tmp_burst_reason = burst_reason topic_list = [] sensitive_text_list = [] # 有事件发生时开始 # if warning_status: if 1: index_list = [] important_words = [] datetime_1 = ts2datetime(ts) index_name_1 = flow_text_index_name_pre + datetime_1 exist_es = es_text.indices.exists(index=index_name_1) if exist_es: index_list.append(index_name_1) datetime_2 = ts2datetime(ts - DAY) index_name_2 = flow_text_index_name_pre + datetime_2 exist_es = es_text.indices.exists(index=index_name_2) if exist_es: index_list.append(index_name_2) if index_list and all_mid_list: query_body = {"query": {"filtered": {"filter": {"terms": {"mid": all_mid_list}}}}, "size": 5000} search_results = es_text.search(index=index_list, doc_type="text", body=query_body)["hits"]["hits"] tmp_sensitive_warning = "" text_dict = dict() # 文本信息 mid_value = dict() # 文本赋值 duplicate_dict = dict() # 重合字典 portrait_dict = dict() # 背景信息 classify_text_dict = dict() # 分类文本 classify_uid_list = [] duplicate_text_list = [] sensitive_words_dict = dict() if search_results: for item in search_results: iter_uid = item["_source"]["uid"] iter_mid = item["_source"]["mid"] iter_text = item["_source"]["text"].encode("utf-8", "ignore") iter_sensitive = item["_source"].get("sensitive", 0) duplicate_text_list.append({"_id": iter_mid, "title": "", "content": iter_text}) if iter_sensitive: tmp_sensitive_warning = signal_sensitive_variation # 涉及到敏感词的微博 sensitive_words_dict[iter_mid] = iter_sensitive keywords_dict = json.loads(item["_source"]["keywords_dict"]) personal_keywords_dict = dict() for k, v in keywords_dict.iteritems(): k = k.encode("utf-8", "ignore") personal_keywords_dict[k] = v classify_text_dict[iter_mid] = personal_keywords_dict classify_uid_list.append(iter_uid) # 去重 if duplicate_text_list: dup_results = duplicate(duplicate_text_list) for item in dup_results: if item["duplicate"]: duplicate_dict[item["_id"]] = item["same_from"] # 分类 if classify_text_dict: classify_results = topic_classfiy(classify_uid_list, classify_text_dict) mid_value = dict() # print "classify_results: ", classify_results for k, v in classify_results.iteritems(): # mid:value mid_value[k] = topic_value_dict[v[0]] if tmp_sensitive_warning: warning_status = signal_brust burst_reason += signal_sensitive_variation sensitive_weibo_detail = {} if sensitive_words_dict: sensitive_mid_list = sensitive_words_dict.keys() sensitivie_weibo_detail = query_hot_weibo(ts, sensitive_mid_list, time_interval) results = dict() results["mid_topic_value"] = json.dumps(mid_value) results["duplicate_dict"] = json.dumps(duplicate_dict) results["sensitive_words_dict"] = json.dumps(sensitive_words_dict) results["sensitive_weibo_detail"] = json.dumps(sensitive_weibo_detail) results["origin_weibo_number"] = len(all_origin_list) results["retweeted_weibo_number"] = len(all_retweeted_list) results["origin_weibo_detail"] = json.dumps(origin_weibo_detail) results["retweeted_weibo_detail"] = 
json.dumps(retweeted_weibo_detail) results["retweeted_weibo_count"] = current_retweeted_count results["comment_weibo_count"] = current_comment_count results["weibo_total_number"] = current_total_count results["sentiment_distribution"] = json.dumps(sentiment_count) results["important_users"] = json.dumps(filter_important_list) results["unfilter_users"] = json.dumps(important_uid_list) results["burst_reason"] = tmp_burst_reason results["timestamp"] = ts # results['clustering_topic'] = json.dumps(topic_list) # es存储当前时段的信息 doctype = create_by + "-" + task_name es_user_portrait.index(index=index_sensing_task, doc_type=doctype, id=ts, body=results) # 更新manage social sensing的es信息 if not new: temporal_result = es_user_portrait.get(index=index_manage_social_task, doc_type=task_doc_type, id=doctype)[ "_source" ] temporal_result["warning_status"] = warning_status temporal_result["burst_reason"] = tmp_burst_reason temporal_result["finish"] = finish temporal_result["processing_status"] = process_status history_status = json.loads(temporal_result["history_status"]) history_status.append([ts, task_name, warning_status]) temporal_result["history_status"] = json.dumps(history_status) es_user_portrait.index(index=index_manage_social_task, doc_type=task_doc_type, id=doctype, body=temporal_result) else: print "test" return "1"
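# Illustrative usage sketch for social_sensing (not part of the original module).
# It shows the positional layout of task_detail that the function unpacks; all concrete
# values below are placeholders, and the ES connections / signal_* constants are assumed
# to be configured elsewhere in this module.
def _example_social_sensing_call():
    sample_ts = datetime2ts("2013-09-07")  # assumed test date, matching the test dates used elsewhere
    task_detail = [
        "demo_task",               # 0: task_name
        ["1234567890"],            # 1: social_sensors, placeholder sensor uid list
        str(sample_ts + 7 * DAY),  # 2: stop_time
        "0",                       # 3: forward_warning_status (placeholder "nothing" signal)
        "demo_user",               # 4: create_by
        sample_ts,                 # 5: ts, timestamp of the current sensing round
        0,                         # 6: new; 0 means an existing task whose manage-index doc is updated
    ]
    return social_sensing(task_detail)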
def key_words_search(task_id, search_type , pre , during , start_time , keyword_list , search_key = '' , sort_norm = '', sort_scope = '' ,time = 7 , isall = False, number = 100): number = int(number) should = [] for key in keyword_list: if search_type == "hashtag": should.append({"prefix":{"text": "#" + key + "#"}}) else: should.append({"wildcard":{"text": "*" +key + "*"}}) index_list = [] date = ts2datetime(start_time) index_name = pre + date while during: if es_flow_text.indices.exists(index=index_name): index_list.append(index_name) start_time = start_time + DAY date = ts2datetime(start_time) index_name = pre + date during -= 1 print index_list uid_set = set() text_results = [] sorted_text_results = [] query_body = { "query":{ "bool":{ "must":should } }, "sort":{"user_fansnum":{"order":"desc"}}, "size":5000 } results = es_flow_text.search(index = index_list , doc_type = 'text' , body = query_body, _source=False, fields=["uid", "user_fansnum","text", "message_type", "sentiment","timestamp", "geo", "retweeted", "comment"])["hits"]["hits"] id_index = 0 index_list = [] un_uid_list = [] for item in results : if item['fields']['uid'][0] not in uid_set: uid_set.add(item['fields']['uid'][0]) un_uid_list.append(item['fields']['uid'][0]) index_list.append(id_index) id_index += 1 #get_all_filed(sort_norm , time) uid_list = [] print "un_uid_list: ", len(un_uid_list) portrait_list = [] count = 0 in_index = 0 if not isall and un_uid_list : # 库内 portrait_results = es_user_portrait.mget(index=USER_INDEX_NAME, doc_type=USER_INDEX_TYPE, body={"ids":un_uid_list}, _source=False, fields=['uname'])["docs"] for item in portrait_results: if item["found"]: portrait_list.append(item['_id']) nick_name = item['fields']['uname'][0] if nick_name == 'unknown': nick_name = item['_id'] index = index_list[in_index] weibo_url = weiboinfo2url(results[index]['fields']['uid'][0], results[index]['_id']) text_results.extend([results[index]['fields']['uid'][0], results[index]['fields']['user_fansnum'][0], results[index]['fields']['text'][0], results[index]['fields']['message_type'][0], results[index]['fields']['sentiment'][0], ts2date(results[index]['fields']['timestamp'][0]), results[index]['fields']['geo'][0], results[index]['fields']['retweeted'][0], results[index]['fields']['comment'][0], nick_name, weibo_url]) count += 1 if count == number: break print "portrait_len, ", len(portrait_list) in_index += 1 if portrait_list: uid_list = in_sort_filter(time,sort_norm ,sort_scope ,None , portrait_list , True, number) # sort for iter_uid in uid_list: iter_index = portrait_list.index(iter_uid) sorted_text_results.append(text_results[i]) elif un_uid_list: profile_result = es_user_profile.mget(index="weibo_user", doc_type="user", body={"ids":un_uid_list}, fields=['nick_name'])["docs"] for i in range(len(profile_result)): index = index_list[i] try: nick_name = profile_result[i]['fields']['nick_name'][0] except: nick_name = un_uid_list[i] item = results[index] weibo_url = weiboinfo2url(item['fields']['uid'][0], results[index]['_id']) text_results.append([item['fields']['uid'][0], item['fields']['user_fansnum'][0], item['fields']['text'][0], item['fields']['message_type'][0], item['fields']['sentiment'][0], ts2date(item['fields']['timestamp'][0]), results[index]['fields']['geo'][0], results[index]['fields']['retweeted'][0], results[index]['fields']['comment'][0], nick_name, weibo_url]) if i == number: break uid_list = all_sort_filter(un_uid_list[:number] , sort_norm , time ,True, number) sorted_text_results = [] f = open("small.txt", 
"wb") for iter_uid in uid_list: iter_index = un_uid_list.index(iter_uid) f.write(str(iter_uid)+"\n") sorted_text_results.append(text_results[iter_index]) f.close() print "filter_uid_list: ", len(uid_list) if uid_list: results = make_up_user_info(uid_list,isall,time,sort_norm) else: results = [] print "results: ", len(results) # 修改状态 task_detail = es_user_portrait.get(index=USER_RANK_KEYWORD_TASK_INDEX , doc_type=USER_RANK_KEYWORD_TASK_TYPE, id=task_id) item = task_detail['_source'] item['status'] = 1 item['result'] = json.dumps(results) item['text_results'] = json.dumps(sorted_text_results) item['number'] = len(results) es_user_portrait.index(index = USER_RANK_KEYWORD_TASK_INDEX , doc_type=USER_RANK_KEYWORD_TASK_TYPE , id=task_id, body=item) return "1"
weibo_list = [] #微博列表
now_ts = time.time()
#run_type
if RUN_TYPE == 1:
    now_date_ts = datetime2ts(ts2datetime(now_ts))
else:
    now_date_ts = datetime2ts(RUN_TEST_TIME)
start_date_ts = now_date_ts - DAY * WEEK
for i in range(0, WEEK):
    iter_date_ts = start_date_ts + DAY * i
    flow_text_index_date = ts2datetime(iter_date_ts)
    flow_text_index_name = flow_text_index_name_pre + flow_text_index_date
    print flow_text_index_name
    try:
        flow_text_exist = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type,\
                body={'query':{'filtered':{'filter':{'terms':{'uid': uid_list}}}}, 'size': MAX_VALUE}, _source=False, fields=['text','uid','keywords_dict','timestamp'])['hits']['hits']
    except:
        flow_text_exist = []
    for flow_text_item in flow_text_exist:
        uid = flow_text_item['fields']['uid'][0].encode('utf-8')
        text = flow_text_item['fields']['text'][0].encode('utf-8')
        ts = flow_text_item['fields']['timestamp'][0]
        keywords_dict = json.loads(flow_text_item['fields']['keywords_dict'][0])
        keywords_dict = json.dumps(keywords_dict, encoding="UTF-8", ensure_ascii=False)
        keywords_dict = eval(keywords_dict)
        if word_dict.has_key(uid):
            item_dict = Counter(word_dict[uid])
            keywords_dict = Counter(keywords_dict)
            item_dict = dict(item_dict + keywords_dict)
def specific_keywords_burst_dection(task_detail): task_name = task_detail[0] social_sensors = task_detail[1] keywords_list = task_detail[2] sensitive_words = task_detail[3] stop_time = task_detail[4] forward_warning_status = task_detail[5] ts = int(task_detail[7]) forward_result = get_forward_numerical_info(task_name, ts, keywords_list) # 之前时间阶段内的原创微博list forward_origin_weibo_list = query_mid_list(ts-time_interval, keywords_list, forward_time_range) # 当前阶段内原创微博list current_mid_list = query_mid_list(ts, keywords_list, time_interval) all_mid_list = [] all_mid_list.extend(current_mid_list) all_mid_list.extend(forward_origin_weibo_list) print "all mid list: ", len(all_mid_list) # 查询当前的原创微博和之前12个小时的原创微博在当前时间内的转发和评论数, 聚合按照message_type statistics_count = query_related_weibo(ts, all_mid_list, time_interval, keywords_list) current_total_count = statistics_count['total_count'] # 当前阶段内所有微博总数 print "current all weibo: ", statistics_count current_origin_count = statistics_count['origin'] current_retweeted_count = statistics_count['retweeted'] current_comment_count = statistics_count['comment'] # 针对敏感微博的监测,给定传感器和敏感词的前提下,只要传感器的微博里提及到敏感词即会认为是预警 # 聚合当前时间内积极、中性、悲伤、愤怒情绪分布 # sentiment_dict = {"0": "neutral", "1":"positive", "2":"sad", "3": "anger"} sentiment_count = {"0": 0, "1": 0, "2": 0, "3": 0} datetime = ts2datetime(ts) datetime_1 = ts2datetime(ts-time_interval) if datetime != datetime_1: index_name = flow_text_index_name_pre + datetime_1 else: index_name = flow_text_index_name_pre + datetime exist_es = es_text.indices.exists(index_name) if exist_es: search_results = aggregation_sentiment_related_weibo(ts, all_mid_list, time_interval, keywords_list) sentiment_count = search_results print "sentiment_count: ", sentiment_count negetive_count = sentiment_count['2'] + sentiment_count['3'] # 聚合当前时间内重要的人 important_uid_list = [] if exist_es: #search_results = es_text.search(index=index_name, doc_type=flow_text_index_type, body=aggregation_sensor_keywords(ts-time_interval, ts, [], "root_uid", size=IMPORTANT_USER_NUMBER))['aggregations']['all_keywords']['buckets'] search_results = query_hot_weibo(ts, all_mid_list, time_interval, keywords_list, aggregation_field="root_uid", size=100) important_uid_list = search_results.keys() if datetime != datetime_1: index_name_1 = flow_text_index_name_pre + datetime_1 if es_text.indices.exists(index_name_1): #search_results_1 = es_text.search(index=index_name_1, doc_type=flow_text_index_type, body=aggregation_sensor_keywords(ts-time_interval, ts, [], "root_uid", size=IMPORTANT_USER_NUMBER))['aggregations']['all_keywords']['buckets'] search_results_1 = query_hot_weibo(ts, all_mid_list, time_interval, keywords_list, aggregation_field="root_uid", size=100) if search_results_1: for item in search_results_1: important_uid_list.append(item['key']) # 根据获得uid_list,从人物库中匹配重要人物 if important_uid_list: important_results = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type, body={"ids": important_uid_list})['docs'] else: important_results = {} filter_important_list = [] # uid_list if important_results: for item in important_results: if item['found']: if item['_source']['importance'] > IMPORTANT_USER_THRESHOULD: filter_important_list.append(item['_id']) print filter_important_list # 6. 
敏感词识别,如果传感器的微博中出现这么一个敏感词,那么就会预警------PS.敏感词是一个 sensitive_origin_weibo_number = 0 sensitive_retweeted_weibo_number = 0 sensitive_comment_weibo_number = 0 sensitive_total_weibo_number = 0 if sensitive_words: query_sensitive_body = { "query":{ "filtered":{ "filter":{ "bool":{ "must":[ {"range":{ "timestamp":{ "gte": ts - time_interval, "lt": ts }} }, {"terms": {"keywords_string": sensitive_words}} ] } } } }, "aggs":{ "all_list":{ "terms":{"field": "message_type"} } } } if social_sensors: query_sensitive_body['query']['filtered']['filter']['bool']['must'].append({"terms":{"uid": social_sensors}}) sensitive_results = es_text.search(index=index_name, doc_type=flow_text_index_type, body=query_sensitive_body)['aggregations']['all_list']["buckets"] if sensitive_results: for item in sensitive_results: if int(item["key"]) == 1: sensitive_origin_weibo_number = item['doc_count'] elif int(item["key"]) == 2: sensitive_comment_weibo_number = item['doc_count'] elif int(item["key"]) == 3: sensitive_retweeted_weibo_number = item["doc_count"] else: pass sensitive_total_weibo_number = sensitive_origin_weibo_number + sensitive_comment_weibo_number + sensitive_retweeted_weibo_number burst_reason = signal_nothing_variation warning_status = signal_nothing finish = unfinish_signal # "0" process_status = "1" if sensitive_total_weibo_number > WARNING_SENSITIVE_COUNT: # 敏感微博的数量异常 print "======================" if forward_warning_status == signal_brust: # 已有事件发生,改为事件追踪 warning_status = signal_track else: warning_status = signal_brust burst_reason = signal_sensitive_variation if forward_result[0]: # 根据移动平均判断是否有时间发生 mean_count = forward_result[1] std_count = forward_result[2] mean_sentiment = forward_result[3] std_sentiment = forward_result[4] if current_total_count > mean_count+1.96*std_count: # 异常点发生 print "=====================================================" if forward_warning_status == signal_brust: # 已有事件发生,改为事件追踪 warning_status = signal_track else: warning_status = signal_brust burst_reason += signal_count_varition # 数量异常 if negetive_count > mean_sentiment+1.96*std_sentiment: warning_status = signal_brust burst_reason += signal_sentiment_varition # 负面情感异常, "12"表示两者均异常 if forward_warning_status == signal_brust: # 已有事件发生,改为事件追踪 warning_status = signal_track if int(stop_time) <= ts: # 检查任务是否已经完成 finish = finish_signal process_status = "0" # 7. 
感知到的事, all_mid_list tmp_burst_reason = burst_reason topic_list = [] # 判断是否有敏感微博出现:有,则聚合敏感微博,replace;没有,聚合普通微博 if burst_reason: # 有事情发生 text_list = [] mid_set = set() if signal_sensitive_variation in burst_reason: query_sensitive_body = { "query":{ "filtered":{ "filter":{ "bool":{ "must":[ {"range":{ "timestamp":{ "gte": ts - time_interval, "lt": ts }} }, {"terms": {"keywords_string": sensitive_words}} ] } } } }, "size": 10000 } if social_sensors: query_sensitive_body['query']['filtered']['filter']['bool']['must'].append({"terms":{"uid": social_sensors}}) sensitive_results = es_text.search(index=index_name, doc_type=flow_text_index_type, body=query_sensitive_body)['hits']['hits'] if sensitive_results: for item in sensitive_results: iter_mid = item['_source']['mid'] iter_text = item['_source']['text'] temp_dict = dict() temp_dict["mid"] = iter_mid temp_dict["text"] = iter_text if iter_mid not in mid_set: text_list.append(temp_dict) # 整理后的文本,mid,text mid_set.add(iter_mid) burst_reason.replace(signal_sensitive_variation, "") current_origin_mid_list = query_mid_list(ts, keywords_list, time_interval, 1) print "current_origin_mid_list:", len(current_origin_mid_list) if burst_reason and current_mid_list: origin_sensing_text = es_text.mget(index=index_name, doc_type=flow_text_index_type, body={"ids": current_origin_mid_list}, fields=["mid", "text"])["docs"] if origin_sensing_text: for item in origin_sensing_text: if item["found"]: iter_mid = item["fields"]["mid"][0] iter_text = item["fields"]["text"][0] temp_dict = dict() temp_dict["mid"] = iter_mid temp_dict["text"] = iter_text if iter_mid not in mid_set: text_list.append(temp_dict) # 整理后的文本,mid,text mid_set.add(iter_mid) if len(text_list) == 1: top_word = freq_word(text_list[0]) topic_list = [top_word.keys()] elif len(text_list) == 0: topic_list = [] tmp_burst_reason = "" #没有相关微博,归零 print "***********************************" else: feature_words, input_word_dict = tfidf(text_list) #生成特征词和输入数据 word_label, evaluation_results = kmeans(feature_words, text_list) #聚类 inputs = text_classify(text_list, word_label, feature_words) clustering_topic = cluster_evaluation(inputs) print "========================================================================================" print "=========================================================================================" sorted_dict = sorted(clustering_topic.items(), key=lambda x:x[1], reverse=True) topic_list = [] if sorted_dict: for item in sorted_dict: topic_list.append(word_label[item[0]]) print "topic_list, ", topic_list if not topic_list: warning_status = signal_nothing tmp_burst_reason = signal_nothing_variation results = dict() results['origin_weibo_number'] = current_origin_count results['retweeted_weibo_number'] = current_retweeted_count results['comment_weibo_number'] = current_comment_count results['weibo_total_number'] = current_total_count results['sensitive_origin_weibo_number'] = sensitive_origin_weibo_number results['sensitive_retweeted_weibo_number'] = sensitive_retweeted_weibo_number results['sensitive_comment_weibo_number'] = sensitive_comment_weibo_number results['sensitive_weibo_total_number'] = sensitive_total_weibo_number results['sentiment_distribution'] = json.dumps(sentiment_count) results['important_users'] = json.dumps(filter_important_list) results['burst_reason'] = tmp_burst_reason results['timestamp'] = ts if tmp_burst_reason: results['clustering_topic'] = json.dumps(topic_list) # es存储当前时段的信息 doctype = task_name es_user_portrait.index(index=index_sensing_task, doc_type=doctype, 
id=ts, body=results) # 更新manage social sensing的es信息 temporal_result = es_user_portrait.get(index=index_manage_social_task, doc_type=task_doc_type, id=task_name)['_source'] temporal_result['warning_status'] = warning_status temporal_result['burst_reason'] = tmp_burst_reason temporal_result['finish'] = finish temporal_result['processing_status'] = process_status history_status = json.loads(temporal_result['history_status']) history_status.append([ts, ' '.join(keywords_list), warning_status]) temporal_result['history_status'] = json.dumps(history_status) es_user_portrait.index(index=index_manage_social_task, doc_type=task_doc_type, id=task_name, body=temporal_result) return "1"
def sensors_keywords_detection(task_detail): task_name = task_detail[0] social_sensors = task_detail[1] keywords_list = task_detail[2] sensitive_words = task_detail[3] stop_time = task_detail[4] forward_warning_status = task_detail[5] ts = task_detail[7] forward_result = get_forward_numerical_info(task_name, ts, keywords_list) # 1. 聚合前12个小时内传感人物发布的所有与关键词相关的原创微博 forward_origin_weibo_list = query_mid_list(ts-time_interval, keywords_list, forward_time_range, 1, social_sensors) # 2. 聚合当前阶段内的原创微博 current_mid_list = query_mid_list(ts, keywords_list, time_interval, 1, social_sensors) all_mid_list = [] all_mid_list.extend(current_mid_list) all_mid_list.extend(forward_origin_weibo_list) all_mid_list = list(set(all_mid_list)) print len(all_mid_list) # 3. 查询当前的原创微博和之前12个小时的原创微博在当前时间内的转发和评论数, 聚合按照message_type statistics_count = query_related_weibo(ts, all_mid_list, time_interval, keywords_list, 1, social_sensors) current_total_count = statistics_count['total_count'] # 当前阶段内所有微博总数 print "current all weibo: ", statistics_count current_origin_count = statistics_count['origin'] current_retweeted_count = statistics_count['retweeted'] current_comment_count = statistics_count['comment'] # 4. 聚合当前时间内积极、中性、悲伤、愤怒情绪分布 sentiment_count = {"0": 0, "1": 0, "2": 0, "3": 0} datetime = ts2datetime(ts) datetime_1 = ts2datetime(ts-time_interval) if datetime == datetime_1: index_name = flow_text_index_name_pre + datetime else: index_name = flow_text_index_name_pre + datetime_1 exist_es = es_text.indices.exists(index_name) if exist_es: search_results = aggregation_sentiment_related_weibo(ts, all_mid_list, time_interval, keywords_list, 1) sentiment_count = search_results print "sentiment_count: ", sentiment_count negetive_count = sentiment_count['2'] + sentiment_count['3'] # 5. 那些社会传感器参与事件讨论 query_body = { "query":{ "filtered":{ "filter":{ "bool":{ "must":[ {"range":{ "timestamp":{ "gte": ts - time_interval, "lt": ts } }}, {"terms":{"uid": social_sensors}} ], "should":[ {"terms": {"root_mid": all_mid_list}}, {"terms": {"mid": all_mid_list}} ] } } } }, "size": 10000 } datetime = ts2datetime(ts) datetime_1 = ts2datetime(ts - time_interval) if datetime == datetime_1: index_name = flow_text_index_name_pre + datetime else: index_name = flow_text_index_name_pre + datetime_1 search_results = es_text.search(index=index_name, doc_type=flow_text_index_type, body=query_body)['hits']['hits'] attend_users = [] if search_results: for item in search_results: attend_users.append(item['_source']['uid']) important_users = list(set(attend_users)) print "important users", important_users # 6. 
敏感词识别,如果传感器的微博中出现这么一个敏感词,那么就会预警------PS.敏感词是一个危险的设置 sensitive_origin_weibo_number = 0 sensitive_retweeted_weibo_number = 0 sensitive_comment_weibo_number = 0 sensitive_total_weibo_number = 0 if sensitive_words: query_sensitive_body = { "query":{ "filtered":{ "filter":{ "bool":{ "must":[ {"range":{ "timestamp":{ "gte": ts - time_interval, "lt": ts }} }, {"terms": {"keywords_string": sensitive_words}}, {"terms": {"uid": social_sensors}} ] } } } }, "aggs":{ "all_list":{ "terms":{"field": "message_type"} } } } sensitive_results = es_text.search(index=index_name, doc_type=flow_text_index_type, body=query_sensitive_body)['aggregations']['all_list']["buckets"] if sensitive_results: for item in sensitive_results: if int(item["key"]) == 1: sensitive_origin_weibo_number = item['doc_count'] elif int(item["key"]) == 2: sensitive_comment_weibo_number = item['doc_count'] elif int(item["key"]) == 3: sensitive_retweeted_weibo_number = item["doc_count"] else: pass sensitive_total_weibo_number = sensitive_origin_weibo_number + sensitive_comment_weibo_number + sensitive_retweeted_weibo_number burst_reason = signal_nothing_variation warning_status = signal_nothing finish = unfinish_signal # "0" if sensitive_total_weibo_number: # 敏感微博的数量异常 print "======================" if forward_warning_status == signal_brust: # 已有事件发生,改为事件追踪 warning_status = signal_track else: warning_status = signal_brust burst_reason = signal_sensitive_variation if forward_result[0]: # 根据移动平均判断是否有时间发生 mean_count = forward_result[1] std_count = forward_result[2] mean_sentiment = forward_result[3] std_sentiment = forward_result[4] if current_total_count > mean_count+1.96*std_count: # 异常点发生 print "=====================================================" if forward_warning_status == signal_brust: # 已有事件发生,改为事件追踪 warning_status = signal_track else: warning_status = signal_brust burst_reason += signal_count_varition # 数量异常 if negetive_count > mean_sentiment+1.96*std_sentiment: warning_status = signal_brust burst_reason += signal_sentiment_varition # 负面情感异常, "12"表示两者均异常 if forward_warning_status == signal_brust: # 已有事件发生,改为事件追踪 warning_status = signal_track if int(stop_time) <= ts: # 检查任务是否已经完成 finish = finish_signal tmp_burst_reason = burst_reason topic_list = [] # 7. 
感知到的事, all_mid_list if burst_reason: # 有事情发生 text_list = [] mid_set = set() if signal_sensitive_variation in burst_reason: query_sensitive_body = { "query":{ "filtered":{ "filter":{ "bool":{ "must":[ {"range":{ "timestamp":{ "gte": ts - time_interval, "lt": ts }} }, {"terms": {"keywords_string": sensitive_words}} ] } } } }, "size": 10000 } if social_sensors: query_sensitive_body['query']['filtered']['filter']['bool']['must'].append({"terms":{"uid": social_sensors}}) sensitive_results = es_text.search(index=index_name, doc_type=flow_text_index_type, body=query_sensitive_body)['hits']["hits"] if sensitive_results: for item in sensitive_results: iter_mid = item['_source']['mid'] iter_text = item['_source']['text'] temp_dict = dict() temp_dict["mid"] = iter_mid temp_dict["text"] = iter_text if iter_mid not in mid_set: text_list.append(temp_dict) # 整理后的文本,mid,text mid_set.add(iter_mid) burst_reason.replace(signal_sensitive_variation, "") if burst_reason and all_mid_list: sensing_text = es_text.mget(index=index_name, doc_type=flow_text_index_type, body={"ids": all_mid_list}, fields=["mid", "text"])["docs"] if sensing_text: for item in sensing_text: if item['found']: iter_mid = item["fields"]["mid"][0] iter_text = item["fields"]["text"][0] temp_dict = dict() temp_dict["mid"] = iter_mid temp_dict["text"] = iter_text if iter_mid not in mid_set: text_list.append(temp_dict) mid_set.add(iter_mid) if len(text_list) == 1: top_word = freq_word(text_list[0]) topic_list = top_word.keys() elif len(text_list) == 0: topic_list = [] tmp_burst_reason = "" #没有相关微博,归零 print "***********************************" else: feature_words, input_word_dict = tfidf(text_list) #生成特征词和输入数据 word_label, evaluation_results = kmeans(feature_words, text_list) #聚类 inputs = text_classify(text_list, word_label, feature_words) clustering_topic = cluster_evaluation(inputs) sorted_dict = sorted(clustering_topic.items(), key=lambda x:x[1], reverse=True)[0:5] topic_list = [] if sorted_dict: for item in sorted_dict: topic_list.append(word_label[item[0]]) print "topic_list:", topic_list if not topic_list: tmp_burst_reason = signal_nothing_variation warning_status = signal_nothing results = dict() results['sensitive_origin_weibo_number'] = sensitive_origin_weibo_number results['sensitive_retweeted_weibo_number'] = sensitive_retweeted_weibo_number results['sensitive_comment_weibo_number'] = sensitive_comment_weibo_number results['sensitive_weibo_total_number'] = sensitive_total_weibo_number results['origin_weibo_number'] = current_origin_count results['retweeted_weibo_number'] = current_retweeted_count results['comment_weibo_number'] = current_comment_count results['weibo_total_number'] = current_total_count results['sentiment_distribution'] = json.dumps(sentiment_count) results['important_users'] = json.dumps(important_users) results['burst_reason'] = tmp_burst_reason results['timestamp'] = ts if tmp_burst_reason: results["clustering_topic"] = json.dumps(topic_list) # es存储当前时段的信息 doctype = task_name es_user_portrait.index(index=index_sensing_task, doc_type=doctype, id=ts, body=results) # 更新manage social sensing的es信息 temporal_result = es_user_portrait.get(index=index_manage_social_task, doc_type=task_doc_type, id=task_name)['_source'] temporal_result['warning_status'] = warning_status temporal_result['burst_reason'] = tmp_burst_reason temporal_result['finish'] = finish history_status = json.loads(temporal_result['history_status']) history_status.append([ts, ' '.join(keywords_list), warning_status]) temporal_result['history_status'] = 
json.dumps(history_status) es_user_portrait.index(index=index_manage_social_task, doc_type=task_doc_type, id=task_name, body=temporal_result) return "1"
def get_hotspot_recommentation(): results = [] #step1: get media uid from file media_user = get_media_user() #step2: get media user weibo keywords last day query_body = { 'query':{ 'filtered':{ 'filter':{ 'terms': {'uid': media_user} } } }, 'aggs':{ 'all_interests':{ 'terms':{ 'field': 'keywords_string', 'size': RECOMMEND_MAX_KEYWORDS } } } } #run type if RUN_TYPE == 1: date = ts2datetime(time.time() - DAY) sort_type = 'retweeted' else: date = ts2datetime(datetime2ts(RUN_TEST_TIME) - DAY) sort_type = 'timestamp' flow_text_index_name = flow_text_index_name_pre + date user_keywords_result = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type,\ body=query_body)['aggregations']['all_interests']['buckets'] keywords_list = [item['key'] for item in user_keywords_result] #step3: get same weibo list sort by retweet_count #step4: filter out user out_user_count = 0 all_out_user = [] sort_evaluate_max = SENTIMENT_SORT_EVALUATE_MAX while out_user_count < RECOMMEND_IN_OUT_SIZE: query_body = { 'query':{ 'filtered':{ 'filter':{ 'bool':{ 'must':[ {'range': {sort_type: {'lt': sort_evaluate_max}}}, {'terms': {'keywords_string': keywords_list}} ] } } } }, 'sort': [{sort_type : {'order':'desc'}}], 'size': RECOMMEND_IN_WEIBO_MAX } weibo_user = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type, body=query_body)['hits']['hits'] weibo_user_list = [item['_source']['uid'] for item in weibo_user] #filter out if weibo_user_list: weibo_user_list = list(set(weibo_user_list)) out_weibo_user_list = filter_out(weibo_user_list) all_out_user.extend(out_weibo_user_list) all_out_user = list(set(all_out_user)) out_user_count = len(all_out_user) sort_evaluate_max = weibo_user[-1]['_source'][sort_type] results = all_out_user return results
def social_sensing(task_detail): # 任务名 传感器 终止时间 之前状态 创建者 时间 task_name = task_detail[0] social_sensors = task_detail[1] stop_time = task_detail[2] forward_warning_status = task_detail[3] create_by = task_detail[4] ts = int(task_detail[5]) # PART 1 forward_result = get_forward_numerical_info(task_name, ts, create_by) # 之前时间阶段内的原创微博list/retweeted forward_origin_weibo_list = query_mid_list(ts-time_interval, social_sensors, forward_time_range) forward_retweeted_weibo_list = query_mid_list(ts-time_interval, social_sensors, forward_time_range, 3) # 当前阶段内原创微博list current_mid_list = query_mid_list(ts, social_sensors, time_interval) current_retweeted_mid_list = query_mid_list(ts, social_sensors, time_interval, 3) all_mid_list = [] all_mid_list.extend(current_mid_list) all_mid_list.extend(current_retweeted_mid_list) all_mid_list.extend(forward_origin_weibo_list) all_mid_list.extend(forward_retweeted_weibo_list) all_origin_list = [] all_origin_list.extend(current_mid_list) all_origin_list.extend(forward_origin_weibo_list) all_retweeted_list = [] all_retweeted_list.extend(current_retweeted_mid_list) all_retweeted_list.extend(forward_retweeted_weibo_list)#被转发微博的mid/root-mid print "all mid list: ", len(all_mid_list) print "all_origin_list", all_origin_list print "all_retweeted_list", all_retweeted_list # 查询微博在当前时间内的转发和评论数, 聚合按照message_type statistics_count = query_related_weibo(ts, all_mid_list, time_interval) if all_origin_list: origin_weibo_detail = query_hot_weibo(ts, all_origin_list, time_interval) # 原创微博详情 else: origin_weibo_detail = {} if all_retweeted_list: retweeted_weibo_detail = query_hot_weibo(ts, all_retweeted_list, time_interval) # 转发微博详情 else: retweeted_weibo_detail = {} current_total_count = statistics_count['total_count'] # 当前阶段内所有微博总数 current_retweeted_count = statistics_count['retweeted'] current_comment_count = statistics_count['comment'] # PART 2 # 聚合当前时间内积极、中性、悲伤、愤怒情绪分布 # sentiment_dict = {"0": "neutral", "1":"positive", "2":"sad", "3": "anger"} sentiment_count = {"0": 0, "1": 0, "2": 0, "3": 0} search_results = aggregation_sentiment_related_weibo(ts, all_mid_list, time_interval) sentiment_count = search_results print "sentiment_count: ", sentiment_count negetive_key = ["2", "3", "4", "5", "6"] negetive_count = 0 for key in negetive_key: negetive_count += sentiment_count[key] # 聚合当前时间内重要的人 important_uid_list = [] datetime = ts2datetime(ts-time_interval) index_name = flow_text_index_name_pre + datetime exist_es = es_text.indices.exists(index_name) if exist_es: search_results = get_important_user(ts, all_mid_list, time_interval) important_uid_list = search_results.keys() # 根据获得uid_list,从人物库中匹配重要人物 if important_uid_list: important_results = es_user_portrait.mget(index=portrait_index_name,doc_type=portrait_index_type, body={"ids": important_uid_list})['docs'] else: important_results = {} filter_important_list = [] # uid_list if important_results: for item in important_results: if item['found']: #if item['_source']['importance'] > IMPORTANT_USER_THRESHOULD: filter_important_list.append(item['_id']) print filter_important_list #判断感知 burst_reason = signal_nothing_variation warning_status = signal_nothing finish = unfinish_signal # "0" process_status = "1" if forward_result[0]: # 根据移动平均判断是否有时间发生 mean_count = forward_result[1] std_count = forward_result[2] mean_sentiment = forward_result[3] std_sentiment = forward_result[4] if mean_count >= MEAN_COUNT and current_total_count > mean_count+1.96*std_count or current_total_count >= len(social_sensors)*0.2*AVERAGE_COUNT: # 异常点发生 print 
"=====================================================" if forward_warning_status == signal_brust: # 已有事件发生,改为事件追踪 warning_status = signal_track else: warning_status = signal_brust burst_reason += signal_count_varition # 数量异常 if negetive_count > mean_sentiment+1.96*std_sentiment and mean_sentiment >= MEAN_COUNT or negetive_count >= len(social_sensors)*0.2*AVERAGE_COUNT: warning_status = signal_brust burst_reason += signal_sentiment_varition # 负面情感异常, "12"表示两者均异常 if forward_warning_status == signal_brust: # 已有事件发生,改为事件追踪 warning_status = signal_track if int(stop_time) <= ts: # 检查任务是否已经完成 finish = finish_signal process_status = "0" # 感知到的事, all_mid_list tmp_burst_reason = burst_reason topic_list = [] # 有事件发生时开始 if warning_status: index_list = [] important_words = [] datetime_1 = ts2datetime(ts) index_name_1 = flow_text_index_name_pre + datetime_1 exist_es = es_text.indices.exists(index=index_name_1) if exist_es: index_list.append(index_name_1) datetime_2 = ts2datetime(ts-DAY) index_name_2 = flow_text_index_name_pre + datetime_2 exist_es = es_text.indices.exists(index=index_name_2) if exist_es: index_list.append(index_name_2) if index_list and all_mid_list: query_body = { "query":{ "filtered":{ "filter":{ "terms":{"mid": all_mid_list} } } }, "size": 2000 } search_results = es_text.search(index=index_list, doc_type="text", body=query_body)['hits']['hits'] text_list = [] if search_results: for item in search_results: iter_mid = item['_source']['mid'] iter_text = item['_source']['text'] temp_dict = dict() temp_dict["mid"] = iter_mid temp_dict["text"] = iter_text text_list.append(temp_dict) for item in text_list: print item['text'] if len(text_list) == 1: top_word = freq_word(text_list[0]) topic_list = [top_word.keys()] elif len(text_list) == 0: topic_list = [] tmp_burst_reason = "" #没有相关微博,归零 print "***********************************" else: feature_words, input_word_dict = tfidf(text_list) #生成特征词和输入数据 word_label, evaluation_results = kmeans(feature_words, text_list) #聚类 inputs = text_classify(text_list, word_label, feature_words) clustering_topic = cluster_evaluation(inputs) print "===============================================================" print "===============================================================" sorted_dict = sorted(clustering_topic.items(), key=lambda x:x[1], reverse=True) topic_list = [] if sorted_dict: for item in sorted_dict: topic_list.append(word_label[item[0]]) print "topic_list, ", topic_list #if not topic_list: # warning_status = signal_nothing # tmp_burst_reason = signal_nothing_variation results = dict() results['origin_weibo_number'] = len(all_origin_list) results['retweeted_weibo_number'] = len(all_retweeted_list) results['origin_weibo_detail'] = json.dumps(origin_weibo_detail) results['retweeted_weibo_detail'] = json.dumps(retweeted_weibo_detail) results['retweeted_weibo_count'] = current_retweeted_count results['comment_weibo_count'] = current_comment_count results['weibo_total_number'] = current_total_count results['sentiment_distribution'] = json.dumps(sentiment_count) results['important_users'] = json.dumps(filter_important_list) results['unfilter_users'] = json.dumps(important_uid_list) results['burst_reason'] = tmp_burst_reason results['timestamp'] = ts if tmp_burst_reason: results['clustering_topic'] = json.dumps(topic_list) # es存储当前时段的信息 doctype = create_by + '-' + task_name es_user_portrait.index(index=index_sensing_task, doc_type=doctype, id=ts, body=results) # 更新manage social sensing的es信息 temporal_result = 
es_user_portrait.get(index=index_manage_social_task, doc_type=task_doc_type, id=doctype)['_source'] temporal_result['warning_status'] = warning_status temporal_result['burst_reason'] = tmp_burst_reason temporal_result['finish'] = finish temporal_result['processing_status'] = process_status history_status = json.loads(temporal_result['history_status']) history_status.append([ts, task_name, warning_status]) temporal_result['history_status'] = json.dumps(history_status) es_user_portrait.index(index=index_manage_social_task, doc_type=task_doc_type, id=doctype, body=temporal_result) return "1"
def filter_event(all_union_user, event_condition_list):
    user_result = []
    new_range_dict_list = []
    #step1: adjust the date condition for date
    new_event_condition_list = []
    for event_condition_item in event_condition_list:
        if 'range' in event_condition_item:
            range_dict = event_condition_item['range']
            from_ts = range_dict['timestamp']['gte']
            to_ts = range_dict['timestamp']['lt']
            from_date_ts = datetime2ts(ts2datetime(from_ts))
            to_date_ts = datetime2ts(ts2datetime(to_ts))
            new_range_dict_list = []
            if from_date_ts != to_date_ts:
                iter_date_ts = from_date_ts
                while iter_date_ts < to_date_ts:
                    iter_next_date_ts = iter_date_ts + DAY
                    new_range_dict_list.append({'range':{'timestamp':{'gte':iter_date_ts, 'lt':iter_next_date_ts}}})
                    iter_date_ts = iter_next_date_ts
                if new_range_dict_list[0]['range']['timestamp']['gte'] < from_ts:
                    new_range_dict_list[0]['range']['timestamp']['gte'] = from_ts
                if new_range_dict_list[-1]['range']['timestamp']['lt'] > to_ts:
                    new_range_dict_list[-1]['range']['timestamp']['lt'] = to_ts
            else:
                new_range_dict_list = [{'range':{'timestamp':{'gte':from_ts, 'lt':to_ts}}}]
        else:
            new_event_condition_list.append(event_condition_item)
    #step2: iter to search users who published weibo matching keywords_string
    #step2.1: split users into bulk actions
    #step2.2: iter to search users meeting the condition weibo for different days
    user_count = len(all_union_user)
    iter_count = 0
    hit_user_set = set()
    while iter_count < user_count:
        iter_user_list = [union_item[0] for union_item in all_union_user[iter_count:iter_count + DETECT_ITER_COUNT / 10]]
        iter_user_event_condition_list = [{'terms':{'uid': iter_user_list}}]
        iter_user_event_condition_list.extend(new_event_condition_list)
        #iter date to search different flow_text es
        for range_item in new_range_dict_list:
            iter_date_event_condition_list = [item for item in iter_user_event_condition_list]
            iter_date_event_condition_list.append(range_item)
            range_from_ts = range_item['range']['timestamp']['gte']
            range_from_date = ts2datetime(range_from_ts)
            flow_index_name = flow_text_index_name_pre + range_from_date
            try:
                flow_text_exist = es_flow_text.search(index=flow_index_name, doc_type=flow_text_index_type, \
                        body={'query':{'bool':{'must':iter_date_event_condition_list}}, 'size':100}, _source=False, fields=['uid'])['hits']['hits']
            except:
                flow_text_exist = []
            #get hit user set
            for flow_text_item in flow_text_exist:
                uid = flow_text_item['fields']['uid'][0]
                hit_user_set.add(uid)
        iter_count += DETECT_ITER_COUNT / 10
    #identify the hit user list ranked by score
    rank_hit_user = []
    for user_item in all_union_user:
        uid = user_item[0]
        if uid in hit_user_set:  # compare the uid itself; the original intersected set(uid), i.e. its characters
            rank_hit_user.append(uid)
    return rank_hit_user
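# Illustrative event_condition_list for filter_event (not part of the original module).
# Range conditions are re-split per day so each daily flow_text index can be queried;
# the timestamps and keyword condition below are placeholders, and all_union_user is
# assumed to be a list of [uid, score]-style items as the function indexes it.
def _example_filter_event_call(all_union_user):
    start_ts = datetime2ts("2013-09-01")  # assumed start date
    event_condition_list = [
        {"range": {"timestamp": {"gte": start_ts, "lt": start_ts + 2 * DAY}}},  # spans two daily indices
        {"terms": {"keywords_string": [u"暴雨"]}},                              # placeholder content condition
    ]
    return filter_event(all_union_user, event_condition_list)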
def query_mid_list(ts, keywords_list, time_segment, query_type=0, social_sensors=[]): # 第一步,聚合前六个小时相关微博mid, 首先获得原创微博 #ts = time.time() #ts = 1377964800+3600 query_body = { "query": { "filtered": { "filter": { "bool": { "must": [ {"range": { "timestamp": { "gte": ts - time_segment, "lt": ts } }}, {"terms": {"keywords_string": keywords_list}} #{"term": {"message_type": 1}} # origin weibo ] } } } }, "sort": {"sentiment": {"order": "desc"}}, "size": 10000 } if social_sensors: query_body['query']['filtered']['filter']['bool']['must'].append({"terms": {"uid": social_sensors}}) if query_type == 1: query_body['query']['filtered']['filter']['bool']['must'].append({"term": {"message_type": 1}}) datetime = ts2datetime(ts) # test #datetime = "2013-09-07" index_name = flow_text_index_name_pre + datetime exist_es = es_text.indices.exists(index_name) if exist_es: search_results = es_text.search(index=index_name, doc_type=flow_text_index_type, body=query_body, fields=["root_mid"])["hits"]["hits"] else: search_results = [] origin_mid_list = set() # all related weibo mid list if search_results: for item in search_results: #if item.get("fields", ""): # origin_mid_list.add(item["fields"]["root_mid"][0]) #else: origin_mid_list.add(item["_id"]) datetime_1 = ts2datetime(ts-time_segment) index_name_1 = flow_text_index_name_pre + datetime_1 exist_bool = es_text.indices.exists(index_name_1) if datetime != datetime_1 and exist_bool: search_results_1 = es_text.search(index=index_name_1, doc_type=flow_text_index_type, body=query_body, fields=['root_mid'])["hits"]["hits"] if search_results_1: for item in search_results_1: #if item.get("fields", ""): # origin_mid_list.add(item["fields"]["root_mid"][0]) #else: origin_mid_list.add(item["_id"]) return list(origin_mid_list)
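# Illustrative call sketch for query_mid_list (not part of the original module).
# query_type=1 restricts the match to origin weibo (message_type 1); the keyword,
# sensor uid and timestamp values are placeholders.
def _example_query_mid_list_call():
    sample_ts = datetime2ts("2013-09-07")  # assumed test date
    return query_mid_list(
        sample_ts,
        [u"地震"],                       # keywords_list placeholder
        time_interval,                   # aggregate over the configured time segment
        query_type=1,                    # only origin weibo
        social_sensors=["1234567890"],   # placeholder sensor uid list
    )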
def compute_sentiment_task(sentiment_task_information): results = {} #step1: get task information start_date = sentiment_task_information['start_date'] start_ts = datetime2ts(start_date) end_date = sentiment_task_information['end_date'] end_ts = datetime2ts(end_date) iter_date_ts = start_ts to_date_ts = end_ts iter_query_date_list = [] # ['2013-09-01', '2013-09-02'] while iter_date_ts <= to_date_ts: iter_date = ts2datetime(iter_date_ts) iter_query_date_list.append(iter_date) iter_date_ts += DAY #step2: get iter search flow_text_index_name #step2.1: get search keywords list query_must_list = [] keyword_nest_body_list = [] keywords_string = sentiment_task_information['query_keywords'] keywords_list = keywords_string.split('&') for keywords_item in keywords_list: keyword_nest_body_list.append({'wildcard': {'text': '*' + keywords_item + '*'}}) query_must_list.append({'bool': {'should': keyword_nest_body_list}}) all_sentiment_dict = {} all_keyword_dict = {} iter_query_list = query_must_list #step2.2: iter search by date segment = sentiment_task_information['segment'] segment_ts = str2segment[segment] # segment_ts = 900/3600/3600*24 for iter_date in iter_query_date_list: flow_text_index_name = flow_text_index_name_pre + iter_date iter_start_ts = datetime2ts(iter_date) for i in range(0, DAY/segment_ts): query_start_ts = iter_start_ts + i * segment_ts iter_query_list.append({'range':{'timestamp':{'gte': query_start_ts, 'lt':query_start_ts + Fifteen}}}) query_body = { 'query':{ 'bool':{ 'must':iter_query_list } }, 'aggs':{ 'all_interests':{ 'terms':{ 'field': 'sentiment', 'size': SENTIMENT_TYPE_COUNT } } } } flow_text_result = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type,\ body=query_body)['aggregations']['all_interests']['buckets'] iter_query_list = iter_query_list[:-1] iter_sentiment_dict = {} for flow_text_item in flow_text_result: sentiment = flow_text_item['key'] sentiment_count = flow_text_item['doc_count'] if sentiment in SENTIMENT_SECOND: sentiment = '7' try: iter_sentiment_dict[sentiment] += sentiment_count except: iter_sentiment_dict[sentiment] = sentiment_count #add 0 to iter_sentiment_dict for sentiment in SENTIMENT_FIRST: try: count = iter_sentiment_dict[sentiment] except: iter_sentiment_dict[sentiment] = 0 all_sentiment_dict[query_start_ts] = iter_sentiment_dict sort_sentiment_dict = sorted(all_sentiment_dict.items(), key=lambda x:x[0]) trend_results = {} for sentiment in SENTIMENT_FIRST: trend_results[sentiment] = [[item[0], item[1][sentiment]] for item in sort_sentiment_dict] results = trend_results return results
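# Illustrative sentiment_task_information for compute_sentiment_task (not part of the
# original module). The 'segment' value must be a key of str2segment; the key shown
# here, like the dates and keywords, is only a placeholder.
def _example_compute_sentiment_task():
    sentiment_task_information = {
        "start_date": "2013-09-01",
        "end_date": "2013-09-03",
        "query_keywords": u"雾霾&空气",   # '&'-separated keywords, each matched as a wildcard
        "segment": "3600",               # placeholder; must exist in str2segment (e.g. a one-hour bucket)
    }
    return compute_sentiment_task(sentiment_task_information)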
def get_flow_information(uid_list): # 前七天的数据, 不能用于每天更新 lenth = len(uid_list) results = {} iter_results = {} result_dict = {} if RUN_TYPE: now_ts = time.time() now_date = ts2datetime(now_ts) # date: 2013-09-01 else: now_date = "2013-09-08" ts = datetime2ts(now_date) start_ts = ts - 8 * 3600 * 24 for i in range(1, 8): ts = start_ts + i * 3600 * 24 date = ts2datetime(ts) print "date:", date uid_day_geo = {} sensitive_uid_day_geo = {} flow_index_name = flow_text_index_name_pre + str(date) # hashtag print uid_list hashtag_results = redis_cluster.hmget("hashtag_" + str(ts), uid_list) sensitive_hashtag = redis_cluster.hmget("sensitive_hashtag_" + str(ts), uid_list) # sensitive_words sensitive_results = redis_cluster.hmget("sensitive_" + str(ts), uid_list) # ip if WORK_TYPE == 0: ip_index_name = ip_index_pre + str(date) sensitive_ip_index_name = sen_ip_index_pre + str(date) # activity_index_name = act_index_pre + str(date) # sensitive_activity_index_name = sen_act_index_pre + str(date) exist_bool = es_cluster.indices.exists(index=ip_index_name) sensitive_exist_bool = es_cluster.indices.exists(index=sensitive_ip_index_name) # activity_exist_bool = es_cluster.indices.exists(index=activity_index_name) # sensitive_activity_exist_bool = es_cluster.indices.exists(index=sensitive_activity_index_name) if exist_bool: ip_results = es_cluster.mget(index=ip_index_name, doc_type="ip", body={"ids": uid_list})["docs"] else: ip_results = [dict()] * lenth if sensitive_exist_bool: sensitive_ip_results = es_cluster.mget( index=sensitive_ip_index_name, doc_type="sensitive_ip", body={"ids": uid_list} )["docs"] else: sensitive_ip_results = [dict()] * lenth """ if activity_exist_bool: activity_results = es_cluster.mget(index=activity_index_name, doc_type="activity", body={"ids":uid_list})["docs"] else: activity_results = [dict()]*lenth if sensitive_activity_exist_bool: sensitive_activity_results = es_cluster.mget(index=sensitive_activity_index_name, doc_type="sensitive_activity", body={"ids":uid_list})["docs"] else: sensitive_activity_results = [dict()]*lenth """ else: ip_results = redis_ip.hmget("ip_" + str(ts), uid_list) sensitive_ip_results = redis_ip.hmget("sensitive_ip_" + str(ts), uid_list) # activity_results = redis_activity.hmget('activity_'+str(date), uid_list) # sensitive_activity_results = redis_activity.hmget('sensitive_activity_'+str(date), uid_list) for j in range(0, len(uid_list)): uid = uid_list[j] if uid not in iter_results: iter_results[uid] = { "hashtag": {}, "sensitive_hashtag": {}, "geo": {}, "sensitive_geo": {}, "geo_track": [], "keywords": {}, "sensitive_words": {}, "sensitive_geo_track": [], "ip": [], "sensitive_ip": [], } # sensitive words if sensitive_results[j]: sensitive_words_results = json.loads(sensitive_results[j]) for sensitive_word in sensitive_words_results: try: iter_results[uid]["sensitive_words"][sensitive_word] += sensitive_words_results[sensitive_word] except: iter_results[uid]["sensitive_words"][sensitive_word] = sensitive_words_results[sensitive_word] # print "sensitive_words:", iter_results[uid]["sensitive_words"] if hashtag_results[j]: hashtag_dict = json.loads(hashtag_results[j]) for hashtag in hashtag_dict: try: iter_results[uid]["hashtag"][hashtag] += hashtag_dict[hashtag] except: iter_results[uid]["hashtag"][hashtag] = hashtag_dict[hashtag] # print "hashtag: ", iter_results[uid]['hashtag'] if sensitive_hashtag[j]: sensitive_hashtag_dict = json.loads(sensitive_hashtag[j]) for hashtag in sensitive_hashtag_dict: try: iter_results[uid]["sensitive_hashtag"][hashtag] += 
sensitive_hashtag_dict[hashtag] except: iter_results[uid]["sensitive_hashtag"][hashtag] = sensitive_hashtag_dict[hashtag] # print "sensitive_hashtag:", iter_results[uid]['sensitive_hashtag'] uid_day_geo[uid] = {} sensitive_uid_day_geo[uid] = {} if WORK_TYPE == 0: # es if ip_results[j]: if ip_results[j]["found"]: detail_item = ip_results[j]["_source"] ip_dict = json.loads(detail_item["ip_dict"]) else: ip_dict = {} else: ip_dict = {} else: if ip_results[j]: ip_dict = json.loads(ip_results[j]) else: ip_dict = {} if ip_dict: # iter_results[uid]['ip'].append(ip_dict) geo_dict = ip2geo(ip_dict) for geo, count in geo_dict.iteritems(): try: iter_results[uid]["geo"][geo] += count except: iter_results[uid]["geo"][geo] = count try: uid_day_geo[uid][geo] += count except: uid_day_geo[uid][geo] = count # iter_results[uid]['ip'].append(ip_dict) iter_results[uid]["geo_track"].append(uid_day_geo[uid]) # print "ip:", iter_results[uid]['ip'], iter_results[uid]['geo_track'] if WORK_TYPE == 0: if sensitive_ip_results[j]: if sensitive_ip_results[j]["found"]: detail_item = sensitive_ip_results[j]["_source"] sensitive_ip_dict = json.loads(detail_item["sensitive_ip_dict"]) else: sensitive_ip_dict = dict() else: sensitive_ip_dict = dict() else: if sensitive_ip_results[j]: sensitive_ip_dict = json.loads(sensitive_ip_results[j]) else: sensitive_ip_dict = dict() if sensitive_ip_dict: sensitive_geo_dict = ip2geo(sensitive_ip_dict) # iter_results[uid]['sensitive_ip'].append(sensitive_ip_dict) for geo, count in sensitive_geo_dict.iteritems(): try: iter_results[uid]["sensitive_geo"][geo] += count except: iter_results[uid]["sensitive_geo"][geo] = count try: sensitive_uid_day_geo[uid][geo] += count except: sensitive_uid_day_geo[uid][geo] = count # iter_results[uid]['sensitive_ip'].append(sensitive_ip_dict) iter_results[uid]["sensitive_geo_track"].append(sensitive_uid_day_geo[uid]) # print "sensitive_ip:", iter_results[uid]['sensitive_ip'], iter_results[uid]['sensitive_geo_track'] # compute keywords flow_text_exist = es_flow_text.indices.exists(index=flow_index_name) if flow_text_exist: text_results = es_flow_text.search( index=flow_index_name, doc_type=flow_text_index_type, body={"query": {"filtered": {"filter": {"terms": {"uid": uid_list}}}}, "size": MAX_VALUE}, _source=False, fields=["uid", "keywords_dict"], )["hits"]["hits"] else: text_results = {} for item in text_results: uid = item["fields"]["uid"][0] uid_keywords_dict = json.loads(item["fields"]["keywords_dict"][0]) for keywords in uid_keywords_dict: try: iter_results[uid]["keywords"][keywords] += uid_keywords_dict[keywords] except: iter_results[uid]["keywords"][keywords] = uid_keywords_dict[keywords] # print "keywords:", iter_results[uid]['keywords'] for uid in uid_list: results[uid] = {} # hashtag hashtag_dict = iter_results[uid]["hashtag"] results[uid]["hashtag_dict"] = json.dumps(hashtag_dict) results[uid]["hashtag_string"] = "&".join(hashtag_dict.keys()) # sensitive hashtag sensitive_hashtag_dict = iter_results[uid]["sensitive_hashtag"] results[uid]["sensitive_hashtag_dict"] = json.dumps(sensitive_hashtag_dict) results[uid]["sensitive_hashtag_string"] = "&".join(sensitive_hashtag_dict.keys()) # sensitive_words sensitive_word_dict = iter_results[uid]["sensitive_words"] results[uid]["sensitive_words_dict"] = json.dumps(sensitive_word_dict) results[uid]["sensitive_words_string"] = "&".join(sensitive_word_dict.keys()) sensitive_score = 0 for k, v in sensitive_word_dict.iteritems(): tmp = r_sensitive.hget("sensitive_words", k) if tmp: tmp_stage = json.loads(tmp) 
sensitive_score += sensitive_score_dict[str(tmp_stage[0])] * v results[uid]["sensitive"] = sensitive_score # geo geo_dict = iter_results[uid]["geo"] geo_track_list = iter_results[uid]["geo_track"] results[uid]["activity_geo_dict"] = json.dumps(geo_track_list) geo_dict_keys = geo_dict.keys() results[uid]["activity_geo"] = "&".join(["&".join(item.split("\t")) for item in geo_dict_keys]) results[uid]["activity_geo_aggs"] = "&".join([item.split("\t")[-1] for item in geo_dict_keys]) sensitive_geo_dict = iter_results[uid]["sensitive_geo"] sensitive_geo_track_list = iter_results[uid]["sensitive_geo_track"] results[uid]["sensitive_activity_geo_dict"] = json.dumps(sensitive_geo_track_list) sensitive_geo_dict_keys = sensitive_geo_dict.keys() results[uid]["sensitive_activity_geo"] = "&".join( ["&".join(item.split("\t")) for item in sensitive_geo_dict_keys] ) results[uid]["sensitive_activity_geo_aggs"] = "&".join( [item.split("\t")[-1] for item in sensitive_geo_dict_keys] ) keywords_dict = iter_results[uid]["keywords"] keywords_top50 = sorted(keywords_dict.items(), key=lambda x: x[1], reverse=True)[:50] keywords_top50_string = "&".join([keyword_item[0] for keyword_item in keywords_top50]) results[uid]["keywords_dict"] = json.dumps(keywords_top50) results[uid]["keywords_string"] = keywords_top50_string return results
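# Illustrative usage of get_flow_information (not part of the original module): the
# returned dict is keyed by uid and carries the serialized hashtag / sensitive-word /
# geo / keyword fields built above. The uids are placeholders.
def _example_get_flow_information():
    uid_list = ["1234567890", "2345678901"]  # placeholder uids
    flow_results = get_flow_information(uid_list)
    for uid, field_dict in flow_results.iteritems():
        print uid, field_dict["keywords_string"], field_dict["sensitive"]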
def get_flow_information(uid_list): results = {} #results = {uid:{'hashtag_dict':{},'hashtag':'', 'keywords_dict':{}, 'keywords_string':'', 'activity_geo':'', 'activity_geo_dict':dict}} iter_results = {} # iter_results = {uid:{'hashtag': hashtag_dict, 'geo':geo_dict, 'keywords':keywords_dict}} now_ts = time.time() now_date_ts = datetime2ts(ts2datetime(now_ts)) #test now_date_ts = test_ts for i in range(7,0,-1): ts = now_date_ts - DAY*i iter_date = ts2datetime(ts) flow_text_index_name = flow_text_index_name_pre + iter_date uid_day_geo = {} #compute hashtag and geo hashtag_results = r_cluster_3.hmget('hashtag_'+str(ts), uid_list) ip_results = r_cluster.hmget('new_ip_'+str(ts), uid_list) #compute sensitive_words sensitive_results = r_cluster_3.hmget('sensitive_'+str(ts), uid_list) count = 0 for uid in uid_list: #init iter_results[uid] if uid not in iter_results: iter_results[uid] = {'hashtag':{}, 'geo':{},'geo_track':[],'keywords':{}, 'sensitive':{}} #compute hashtag hashtag_item = hashtag_results[count] if hashtag_item: uid_hashtag_dict = json.loads(hashtag_item) else: uid_hashtag_dict = {} for hashtag in uid_hashtag_dict: try: iter_results[uid]['hashtag'][hashtag] += uid_hashtag_dict[hashtag] except: iter_results[uid]['hashtag'][hashtag] = uid_hashtag_dict[hashtag] #compute sensitive sensitive_item = sensitive_results[count] if sensitive_item: uid_sensitive_dict = json.loads(sensitive_item) else: uid_sensitive_dict = {} for sensitive_word in uid_sensitive_dict: try: iter_results[uid]['sensitive'][sensitive_word] += uid_sensitive_dict[sensitive_word] except: iter_results[uid]['sensitive'][sensitive_word] = uid_sensitive_dict[sensitive_word] #compute geo uid_day_geo[uid] = {} ip_item = ip_results[count] if ip_item: uid_ip_dict = json.loads(ip_item) else: uid_ip_dict = {} for ip in uid_ip_dict: ip_count = len(uid_ip_dict[ip].split('&')) geo = ip2city(ip) if geo: #print 'geo:', geo try: iter_results[uid]['geo'][geo] += ip_count except: iter_results[uid]['geo'][geo] = ip_count try: uid_day_geo[uid][geo] += ip_count except: uid_day_geo[uid][geo] = ip_count iter_results[uid]['geo_track'].append(uid_day_geo[uid]) count += 1 #compute keywords: try: text_results = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type, \ body={'query':{'filtered':{'filter':{'terms':{'uid': uid_list}}}}, 'size':MAX_VALUE}, _source=True, fields=['uid', 'keywords_dict'])['hits']['hits'] except: text_results = {} for item in text_results: #print 'keywords item:', item uid = item['fields']['uid'][0] uid_keywords_dict = json.loads(item['fields']['keywords_dict'][0]) for keywords in uid_keywords_dict: try: iter_results[uid]['keywords'][keywords] += uid_keywords_dict[keywords] except: iter_results[uid]['keywords'][keywords] = uid_keywords_dict[keywords] #get keywords top for uid in uid_list: results[uid] = {} hashtag_dict = iter_results[uid]['hashtag'] results[uid]['hashtag_dict'] = json.dumps(hashtag_dict) results[uid]['hashtag'] = '&'.join(hashtag_dict.keys()) #sensitive words sensitive_word_dict = iter_results[uid]['sensitive'] results[uid]['sensitive_dict'] = json.dumps(sensitive_word_dict) results[uid]['sensitive_string'] = '&'.join(sensitive_word_dict.keys()) sensitive_score = 0 for item in sensitive_word_dict: k = item v = sensitive_word_dict[k] tmp_stage = r_sensitive.hget('sensitive_words', k) if tmp_stage: sensitive_score += v * sensitive_score_dict[str(tmp_stage)] results[uid]['sensitive'] = sensitive_score #print 'sensitive_dict:', results[uid]['sensitive_dict'] #print 'sensitive_string:', 
results[uid]['sensitive_string'] #print 'sensitive:', results[uid]['sensitive'] #geo geo_dict = iter_results[uid]['geo'] geo_track_list = iter_results[uid]['geo_track'] results[uid]['activity_geo_dict'] = json.dumps(geo_track_list) geo_dict_keys = geo_dict.keys() #print 'geo_dict_keys:', geo_dict_keys results[uid]['activity_geo'] = '&'.join(['&'.join(item.split('\t')) for item in geo_dict_keys]) #print 'activity_geo:', results[uid]['activity_geo'] keywords_dict = iter_results[uid]['keywords'] keywords_top50 = sorted(keywords_dict.items(), key=lambda x:x[1], reverse=True)[:50] keywords_top50_string = '&'.join([keyword_item[0] for keyword_item in keywords_top50]) results[uid]['keywords'] = json.dumps(keywords_top50) results[uid]['keywords_string'] = keywords_top50_string return results
def query_related_weibo(ts, origin_mid_list, time_segment, keywords_list, query_type=0, social_sensors=[]): query_all_body = { "query": { "filtered": { "filter": { "bool": { "must": [ {"range": { "timestamp":{ "gte": ts - time_segment, "lt": ts } }} ] } } } }, "aggs":{ "all_count":{ "terms":{"field": "message_type"} } } } datetime = ts2datetime(ts) # test #datetime = "2013-09-07" index_name = flow_text_index_name_pre + datetime exist_es = es_text.indices.exists(index_name) return_results = {"origin": 0, "retweeted": 0, "comment": 0} if origin_mid_list and exist_es: query_all_body["query"]["filtered"]["filter"]["bool"]["should"] = [] query_all_body["query"]["filtered"]["filter"]["bool"]["should"].append({"terms": {"root_mid": origin_mid_list}}) query_all_body["query"]["filtered"]["filter"]["bool"]["should"].append({"terms": {"mid": origin_mid_list}}) results = es_text.search(index=index_name, doc_type=flow_text_index_type, body=query_all_body)['aggregations']['all_count']['buckets'] if results: for item in results: if int(item['key']) == 1: return_results['origin'] = item['doc_count'] elif int(item['key']) == 3: return_results['retweeted'] = item['doc_count'] elif int(item['key']) == 2: return_results['comment'] = item['doc_count'] else: pass datetime_1 = ts2datetime(ts-time_segment) index_name_1 = flow_text_index_name_pre + datetime_1 exist_bool = es_text.indices.exists(index_name_1) if datetime != datetime_1 and exist_bool: repost_count_1 = 0 if origin_mid_list: query_all_body["query"]["filtered"]["filter"]["bool"]["should"] = [] query_all_body["query"]["filtered"]["filter"]["bool"]["should"].append({"terms": {"root_mid": origin_mid_list}}) query_all_body["query"]["filtered"]["filter"]["bool"]["should"].append({"terms": {"mid": origin_mid_list}}) results_1 = es_text.search(index=index_name_1, doc_type=flow_text_index_type, body=query_all_body)['aggregations']['all_count']['buckets'] if results_1: for item in results_1: if int(item['key']) == 1: return_results['origin'] += item['doc_count'] elif int(item['key']) == 3: return_results['retweeted'] += item['doc_count'] elif int(item['key']) == 2: return_results['comment'] += item['doc_count'] else: pass return_results['total_count'] = sum(return_results.values()) print "return_results: ", return_results return return_results
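# Illustrative usage of query_related_weibo (not part of the original module): the
# returned dict aggregates origin / retweeted / comment counts plus total_count for
# the given mids within the time window. The mid and keyword values are placeholders.
def _example_query_related_weibo():
    sample_ts = datetime2ts("2013-09-07")  # assumed test date
    counts = query_related_weibo(sample_ts, ["3456789012345678"], time_interval, [u"地震"])
    print counts["total_count"], counts["origin"], counts["retweeted"], counts["comment"]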
def aggregation_sentiment_related_weibo(ts, origin_mid_list, time_segment, keywords_list, query_type=0): query_all_body = { "query": { "filtered": { "filter": { "bool": { "must": [ {"range": { "timestamp":{ "gte": ts - time_segment, "lt": ts } }} ] } } } }, "aggs":{ "all_sentiments":{ "terms":{ "field": "sentiment"} } } } datetime = ts2datetime(ts) results =dict() results['0'] = 0 results['1'] = 0 results['2'] = 0 results['3'] = 0 # test #datetime = "2013-09-07" index_name = flow_text_index_name_pre + datetime exist_es = es_text.indices.exists(index_name) if origin_mid_list and exist_es: query_all_body["query"]["filtered"]["filter"]["bool"]["should"] = [] query_all_body["query"]["filtered"]["filter"]["bool"]["should"].append({"terms": {"root_mid": origin_mid_list}}) query_all_body["query"]["filtered"]["filter"]["bool"]["should"].append({"terms": {"mid": origin_mid_list}}) search_results = es_text.search(index=index_name, doc_type=flow_text_index_type, body=query_all_body)['aggregations']['all_sentiments']['buckets'] if search_results: for item in search_results: key = item['key'] count = item['doc_count'] results[key] = count print results print "total_sentiments: ", sum(results.values()) datetime_1 = ts2datetime(ts-time_segment) index_name_1 = flow_text_index_name_pre + datetime_1 exist_bool = es_text.indices.exists(index_name_1) if datetime != datetime_1 and exist_bool: repost_count_1 = 0 if origin_mid_list: query_all_body["query"]["filtered"]["filter"]["bool"]["should"] = [] query_all_body["query"]["filtered"]["filter"]["bool"]["should"].append({"terms": {"root_mid": origin_mid_list}}) query_all_body["query"]["filtered"]["filter"]["bool"]["should"].append({"terms": {"mid": origin_mid_list}}) search_results = es_text.search(index=index_name_1, doc_type=flow_text_index_type, body=query_all_body)['aggregations']['all_sentiments']['buckets'] if search_results: for item in search_results: key = item['key'] count = item['doc_count'] results[key] += count return results
def attribute_filter_pattern(user_portrait_result, pattern_list):
    #step1: split the date range condition into per-day ranges
    new_pattern_list = []
    new_range_dict_list = []
    for pattern_item in pattern_list:
        if 'range' in pattern_item:
            range_dict = pattern_item['range']['timestamp']
            from_ts = range_dict['gte']
            to_ts = range_dict['lt']
            from_date_ts = datetime2ts(ts2datetime(from_ts))
            to_date_ts = datetime2ts(ts2datetime(to_ts))
            if from_date_ts != to_date_ts:
                iter_date_ts = from_date_ts
                while iter_date_ts <= to_date_ts:
                    iter_next_date_ts = iter_date_ts + DAY
                    new_range_dict_list.append({'range': {'timestamp': {'gte': iter_date_ts, 'lt': iter_next_date_ts}}})
                    iter_date_ts = iter_next_date_ts
                # clip the first and last ranges to the original bounds
                if new_range_dict_list[0]['range']['timestamp']['gte'] < from_ts:
                    new_range_dict_list[0]['range']['timestamp']['gte'] = from_ts
                if new_range_dict_list[-1]['range']['timestamp']['lt'] > to_ts:
                    new_range_dict_list[-1]['range']['timestamp']['lt'] = to_ts
            else:
                new_range_dict_list = [{'range': {'timestamp': {'gte': from_ts, 'lt': to_ts}}}]
        else:
            new_pattern_list.append(pattern_item)
    #step2: iterate to find users who publish weibo matching the pattern list
    #step2.1: split users into bulk batches
    #step2.2: for each batch, search every per-day range
    user_count = len(user_portrait_result)
    iter_count = 0
    hit_user_set = set()
    while iter_count < user_count:
        iter_user_list = [portrait_item['_id'] for portrait_item in user_portrait_result[iter_count: iter_count + DETECT_ITER_COUNT]]
        # uid condition for this batch plus the non-range pattern conditions
        iter_user_pattern_condition_list = [{'terms': {'uid': iter_user_list}}]
        iter_user_pattern_condition_list.extend(new_pattern_list)
        # iterate dates to search the different flow_text indices
        for range_item in new_range_dict_list:
            iter_date_pattern_condition_list = [item for item in iter_user_pattern_condition_list]
            iter_date_pattern_condition_list.append(range_item)
            range_from_ts = range_item['range']['timestamp']['gte']
            range_from_date = ts2datetime(range_from_ts)
            flow_index_name = flow_text_index_name_pre + range_from_date
            try:
                flow_text_exist = es_flow_text.search(index=flow_index_name, doc_type=flow_text_index_type, \
                        body={'query': {'bool': {'must': iter_date_pattern_condition_list}}, 'size': MAX_VALUE}, \
                        _source=False, fields=['uid'])['hits']['hits']
            except:
                flow_text_exist = []
            # collect the hit user set
            for flow_text_item in flow_text_exist:
                uid = flow_text_item['fields']['uid'][0]
                hit_user_set.add(uid)
        iter_count += DETECT_ITER_COUNT
    # keep the original portrait order (ranked by score) for users that were hit
    rank_hit_user = []
    for user_item in user_portrait_result:
        uid = user_item['_id']
        if uid in hit_user_set:
            rank_hit_user.append(uid)
    return rank_hit_user
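# A minimal usage sketch for attribute_filter_pattern (not part of the original
# module): user_portrait_result is assumed to be a list of ES hits, each carrying
# the uid in '_id'; the keyword and two-day time range below are hypothetical.
def _demo_attribute_filter_pattern(user_portrait_result):
    import time
    now_ts = int(time.time())
    pattern_list = [
        {'terms': {'keywords_string': ['hypothetical_keyword']}},
        {'range': {'timestamp': {'gte': now_ts - 2 * DAY, 'lt': now_ts}}},
    ]
    return attribute_filter_pattern(user_portrait_result, pattern_list)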
def pattern_filter_attribute(pattern_list, filter_dict):
    #step1: split the date range condition into sub-ranges
    new_pattern_list = []
    new_range_dict_list = []
    for pattern_item in pattern_list:
        if 'range' in pattern_item:
            range_dict = pattern_item['range']
            from_ts = range_dict['timestamp']['gte']
            to_ts = range_dict['timestamp']['lt']
            from_date_ts = datetime2ts(ts2datetime(from_ts))
            to_date_ts = datetime2ts(ts2datetime(to_ts))
            if from_date_ts != to_date_ts:
                iter_date_ts = from_date_ts
                while iter_date_ts <= to_date_ts:
                    iter_next_date_ts = iter_date_ts + DAY / 48
                    new_range_dict_list.append({'range': {'timestamp': {'gte': iter_date_ts, 'lt': iter_next_date_ts}}})
                    iter_date_ts = iter_next_date_ts
                # clip the first and last ranges to the original bounds
                if new_range_dict_list[0]['range']['timestamp']['gte'] < from_ts:
                    new_range_dict_list[0]['range']['timestamp']['gte'] = from_ts
                if new_range_dict_list[-1]['range']['timestamp']['lt'] > to_ts:
                    new_range_dict_list[-1]['range']['timestamp']['lt'] = to_ts
            else:
                new_range_dict_list = [{'range': {'timestamp': {'gte': from_ts, 'lt': to_ts}}}]
        else:
            new_pattern_list.append(pattern_item)
    #step2.1: iterate to find users who meet the pattern conditions
    #step2.2: keep only users who are in user_portrait and meet filter_dict
    all_hit_user = {}
    for range_item in new_range_dict_list:
        iter_date_pattern_condition_list = [item for item in new_pattern_list]
        iter_date_pattern_condition_list.append(range_item)
        range_from_ts = range_item['range']['timestamp']['gte']
        range_from_date = ts2datetime(range_from_ts)
        flow_index_name = flow_text_index_name_pre + range_from_date
        try:
            flow_text_exist = es_flow_text.search(index=flow_index_name, doc_type=flow_text_index_type, \
                    body={'query': {'bool': {'must': iter_date_pattern_condition_list}}, 'size': MAX_VALUE}, \
                    _source=False, fields=['uid'])['hits']['hits']
        except:
            flow_text_exist = []
        # users whose weibo match the pattern in this time range
        pattern_user_set = set([flow_text_item['fields']['uid'][0] for flow_text_item in flow_text_exist])
        pattern_user_list = list(pattern_user_set)
        # filter by the user_portrait filter dict in bulk batches
        pattern_user_count = len(pattern_user_list)
        iter_count = 0
        # base portrait conditions built from filter_dict (importance and influence ranges)
        portrait_filter_condition_list = []
        portrait_filter_condition_list.append({'range': {'importance': {'gte': filter_dict['importance']['gte'], 'lt': filter_dict['importance']['lt']}}})
        portrait_filter_condition_list.append({'range': {'influence': {'gte': filter_dict['influence']['gte'], 'lt': filter_dict['influence']['lt']}}})
        while iter_count < pattern_user_count:
            iter_user_list = pattern_user_list[iter_count: iter_count + DETECT_ITER_COUNT]
            # uid should clause for this batch (matching on the portrait 'uid' field is assumed)
            nest_body_list = []
            for iter_user in iter_user_list:
                nest_body_list.append({'term': {'uid': iter_user}})
            iter_portrait_condition_list = [item for item in portrait_filter_condition_list]
            iter_portrait_condition_list.append({'bool': {'should': nest_body_list}})
            # search users in user_portrait
            try:
                in_portrait_result = es_user_portrait.search(index=portrait_index_name, doc_type=portrait_index_type, \
                        body={'query': {'bool': {'must': iter_portrait_condition_list}}, 'size': MAX_VALUE}, \
                        _source=False, fields=['influence', 'importance'])['hits']['hits']
            except:
                in_portrait_result = []
            # add to the overall hit user dict
            for in_portrait_item in in_portrait_result:
                all_hit_user[in_portrait_item['_id']] = [in_portrait_item['fields']['influence'][0], in_portrait_item['fields']['importance'][0]]
            iter_count += DETECT_ITER_COUNT
    # sort all hit users by influence and keep the top `count`
    count = filter_dict['count']
    sort_all_hit_user = sorted(all_hit_user.items(), key=lambda x: x[1][0], reverse=True)[:count]
    # detected user list ranked by influence
    rank_user_list = [sort_item[0] for sort_item in sort_all_hit_user]
    return rank_user_list
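# A minimal usage sketch for pattern_filter_attribute (not part of the original
# module): filter_dict is assumed to carry 'importance'/'influence' ranges and a
# result 'count', matching how the function reads it above; the keyword, bounds
# and one-day time range are hypothetical.
def _demo_pattern_filter_attribute():
    import time
    now_ts = int(time.time())
    pattern_list = [
        {'terms': {'keywords_string': ['hypothetical_keyword']}},
        {'range': {'timestamp': {'gte': now_ts - DAY, 'lt': now_ts}}},
    ]
    filter_dict = {
        'importance': {'gte': 0, 'lt': 10000},
        'influence': {'gte': 0, 'lt': 10000},
        'count': 100,
    }
    return pattern_filter_attribute(pattern_list, filter_dict)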