def query_hot_mid(ts, keywords_list, text_type,size=100):
    """Aggregate the most frequent ``root_mid`` values among keyword hits.

    Filters documents with ``message_type`` "0" whose ``keywords_string``
    matches any entry of ``keywords_list`` and whose ``timestamp`` lies in
    ``[ts - time_interval, ts)``, then runs a terms aggregation on
    ``root_mid``.

    :param ts: exclusive upper bound of the time window
    :param keywords_list: values matched against ``keywords_string``
    :param text_type: not used by this function
    :param size: number of aggregation buckets requested
    :return: list of ``[root_mid, doc_count]`` pairs; empty when the index
        selected for the window does not exist
    """
    window_start = ts - time_interval
    filter_clauses = [
        {"range": {"timestamp": {"gte": window_start, "lt": ts}}},
        {"terms": {"keywords_string": keywords_list}},
        {"term": {"message_type": "0"}},
    ]
    query_body = {
        "query": {"filtered": {"filter": {"bool": {"must": filter_clauses}}}},
        "aggs": {"all_interests": {"terms": {"field": "root_mid", "size": size}}},
    }

    current_day = ts2datetime(ts)
    previous_day = ts2datetime(window_start)
    current_index = flow_text_index_name_pre + current_day
    current_exists = es_text.indices.exists(current_index)
    previous_index = flow_text_index_name_pre + previous_day
    previous_exists = es_text.indices.exists(previous_index)
    print("%s %s" % (current_day, previous_day))

    # Exactly one index is queried: the current day's when the window stays
    # inside one day, otherwise the index of the day the window starts in.
    if current_day == previous_day:
        target_index = current_index if current_exists else None
    else:
        target_index = previous_index if previous_exists else None

    if target_index is None:
        buckets = []
    else:
        response = es_text.search(index=target_index, doc_type=flow_text_index_type, body=query_body)
        buckets = response["aggregations"]["all_interests"]["buckets"]

    hot_mid_list = []
    for bucket in buckets:
        print(bucket)
        hot_mid_list.append([bucket['key'], bucket['doc_count']])
    return hot_mid_list
def query_mid_list(ts, keywords_list, time_segment, social_sensors=[]):
    """Collect the ids of documents matching the keywords in a time window.

    The filter requires ``timestamp`` in ``[ts - time_segment, ts)`` and
    ``keywords_string`` matching any of ``keywords_list``; when
    ``social_sensors`` is non-empty the ``uid`` must also be one of them.
    The index for the day of ``ts`` is searched when it exists, and the index
    for the day of ``ts - time_segment`` is searched as well when it is a
    different day and exists.

    :param ts: exclusive upper bound of the time window
    :param keywords_list: values matched against ``keywords_string``
    :param time_segment: length of the window
    :param social_sensors: optional uids to restrict the search to
    :return: list of distinct ``_id`` values of the hits
    """
    window_start = ts - time_segment
    must_clauses = [
        {"range": {"timestamp": {"gte": window_start, "lt": ts}}},
        {"terms": {"keywords_string": keywords_list}},
    ]
    if social_sensors:
        must_clauses.append({"terms": {"uid": social_sensors}})
    query_body = {
        "query": {"filtered": {"filter": {"bool": {"must": must_clauses}}}},
        "size": 10000,
    }

    def _hit_ids(index):
        # Run the shared query against one daily index and return the hit ids.
        hits = es_text.search(index=index, doc_type=flow_text_index_type,
                              body=query_body, fields=["root_mid"])["hits"]["hits"]
        return [hit["_id"] for hit in hits]

    collected = set()  # all related weibo mids, de-duplicated

    current_day = ts2datetime(ts)
    current_index = flow_text_index_name_pre + current_day
    if es_text.indices.exists(current_index):
        collected.update(_hit_ids(current_index))

    previous_day = ts2datetime(window_start)
    previous_index = flow_text_index_name_pre + previous_day
    previous_exists = es_text.indices.exists(previous_index)
    if current_day != previous_day and previous_exists:
        collected.update(_hit_ids(previous_index))

    return list(collected)
def statistics_influence_people(uid, date, style):
    """Describe the users influenced by ``uid``'s posts on ``date``.

    Looks up the user's document in the influence index first; when that
    lookup fails an empty dict is returned. Otherwise the user's original
    posts (``message_type`` 1) and retweets (``message_type`` 3) of that day
    are fetched and handed to ``influenced_user_detail``.

    :param uid: user id
    :param date: day string; dashes are stripped for the influence index name
        and the value is used unchanged for the flow-text index name
    :param style: ``0`` (or anything ``int()`` maps to 0) selects the
        retweet view, any other value the comment view
    :return: the result of ``influenced_user_detail`` or ``{}``
    """
    influence_index = pre_index + str(date).replace('-', '')
    text_index = pre_text_index + date

    # The document itself is not used; the lookup only gates the computation.
    try:
        es_cluster.get(index=influence_index, doc_type=influence_doctype, id=uid)["_source"]
    except Exception:
        return {}

    def _search_user_messages(message_type):
        # Each search gets its own body so the two filters can never mix.
        body = {
            "query": {
                "filtered": {
                    "filter": {
                        "bool": {
                            "must": [
                                {"term": {"message_type": message_type}},
                                {"term": {"uid": uid}},
                            ]
                        }
                    }
                }
            },
            "size": 1000,
        }
        return es.search(index=text_index, doc_type=flow_text_index_type, body=body)["hits"]["hits"]

    # mids of the user's original posts
    origin_mid = [hit['_id'] for hit in _search_user_messages(1)]
    # root mids of the posts the user retweeted
    retweeted_mid = [
        hit['_source']['root_mid']
        for hit in _search_user_messages(3)
        if hit['_source'].get('root_mid', '')
    ]

    # 3 selects the retweet statistics, 2 the comment statistics.
    relation_type = 3 if int(style) == 0 else 2
    return influenced_user_detail(uid, date, origin_mid, retweeted_mid, relation_type)
def search_group_sentiment_weibo(task_name, start_ts, sentiment):
    """List one day of a group's posts that carry the requested sentiment.

    :param task_name: id of the group document holding ``uid_list``
    :param start_ts: lower bound of the window; the window is
        ``[start_ts, start_ts + DAY)`` and the index is the one named after
        the day of ``start_ts``
    :param sentiment: sentiment label; the string ``'2'`` expands to every
        label in ``SENTIMENT_SECOND``
    :return: list of dicts with ``uid``, ``uname``, ``ip``, ``geo``, ``text``,
        ``timestamp`` and ``sentiment`` ordered by ascending timestamp, or the
        string ``'task name invalid'`` / ``'task uid list null'`` when the
        group cannot be resolved
    """
    # step1: resolve the uid list of the group
    try:
        group_result = es_group_result.get(index=group_index_name, doc_type=group_index_type,
                                           id=task_name, _source=False, fields=['uid_list'])
    except Exception:
        group_result = {}
    if group_result == {}:
        return 'task name invalid'
    try:
        uid_list = group_result['fields']['uid_list']
    except Exception:
        uid_list = []
    if uid_list == []:
        return 'task uid list null'

    # step2: map uid -> uname for the users present in the portrait index
    uid2uname = {}
    try:
        user_portrait_result = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type,
                                                     body={'ids': uid_list}, _source=False, fields=['uname'])['docs']
    except Exception:
        user_portrait_result = []
    for item in user_portrait_result:
        if item.get('found'):
            uid2uname[item['_id']] = item['fields']['uname'][0]

    # step3: build the query for the day of start_ts
    flow_text_index_name = flow_text_index_name_pre + str(ts2datetime(start_ts))
    if sentiment != '2':
        sentiment_clause = {'term': {'sentiment': sentiment}}
    else:
        sentiment_clause = {'terms': {'sentiment': SENTIMENT_SECOND}}
    query_body = [
        {'terms': {'uid': uid_list}},
        sentiment_clause,
        {'range': {'timestamp': {'gte': start_ts, 'lt': start_ts + DAY}}},
    ]
    try:
        flow_text_result = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type,
                                               body={'query': {'bool': {'must': query_body}},
                                                     'sort': [{'timestamp': {'order': 'asc'}}],
                                                     'size': MAX_VALUE})['hits']['hits']
    except Exception:
        flow_text_result = []

    # step4: shape the hits
    weibo_list = []
    for flow_text_item in flow_text_result:
        source = flow_text_item['_source']
        weibo = {}
        weibo['uid'] = source['uid']
        # Users missing from the portrait lookup (or a failed lookup) must not
        # abort the listing with a KeyError.
        weibo['uname'] = uid2uname.get(weibo['uid'], 'unknown')
        weibo['ip'] = source['ip']
        try:
            weibo['geo'] = '\t'.join(source['geo'].split('&'))
        except (KeyError, AttributeError, TypeError):
            weibo['geo'] = ''
        weibo['text'] = source['text']
        weibo['timestamp'] = source['timestamp']
        weibo['sentiment'] = source['sentiment']
        weibo_list.append(weibo)
    return weibo_list
def get_user_ip(uid):
    """Return the ``ip`` field of the first flow-text hit found for ``uid``.

    The search spans the seven index names built from ``now - DAY * i`` for
    ``i`` = 7..1, where ``now`` comes from ``RUN_TEST_TIME`` when ``RUN_TYPE``
    is 0 and from the current date otherwise.

    :param uid: user id to filter on
    :return: value of ``_source.ip`` of the first hit
    :raises IndexError: when the search returns no hits
    """
    now_timestamp = datetime2ts(ts2datetime(time.time()))
    if RUN_TYPE == 0:
        now_timestamp = datetime2ts(RUN_TEST_TIME)

    index_names = [
        flow_text_index_name_pre + ts2datetime(now_timestamp - DAY * offset)
        for offset in range(7, 0, -1)
    ]
    request_body = {
        'query': {'filtered': {'filter': {'term': {'uid': uid}}}},
        'size': 10,
    }
    response = es_flow_text.search(index=index_names, doc_type=flow_text_index_type, body=request_body)
    first_hit = response['hits']['hits'][0]
    return first_hit["_source"]["ip"]
def get_repost_weibo(mid, weibo_timestamp):
    """Return the reposts of a post found in the index of the post's own day.

    Matches documents whose ``root_mid`` equals ``mid``, whose ``timestamp``
    is at or after ``weibo_timestamp`` and whose ``message_type`` is 2. Each
    returned ``_source`` dict gets a ``user`` entry taken from
    ``get_user_profile_weibo``.

    :param mid: id of the root post
    :param weibo_timestamp: timestamp of the root post; also selects the index
    :return: list of ``_source`` dicts, each extended with ``user``; empty
        when the search fails
    """
    index_name = flow_text_index_name_pre + ts2datetime(weibo_timestamp)
    query_body = {
        'query': {
            'bool': {
                'must': [
                    {'term': {'root_mid': mid}},
                    {'range': {'timestamp': {'gte': weibo_timestamp}}},
                    {'term': {'message_type': 2}},
                ]
            }
        }
    }
    try:
        flow_text_result = es_flow_text.search(index=index_name, doc_type=flow_text_index_type,
                                               body=query_body)['hits']['hits']
    except Exception:
        flow_text_result = []

    sources = [hit['_source'] for hit in flow_text_result]
    # presumably a mapping keyed by uid -- verify against get_user_profile_weibo
    repost_user_info_dict = get_user_profile_weibo([source['uid'] for source in sources])

    statuses = []
    for item_source in sources:
        # The uid lives in the document source, not on the hit envelope.
        item_source['user'] = repost_user_info_dict[item_source['uid']]
        statuses.append(item_source)
    return statuses
def get_repost_weibo(mid, weibo_timestamp):
    """Return the reposts of a post found in the index of the post's own day.

    Matches documents whose ``root_mid`` equals ``mid``, whose ``timestamp``
    is at or after ``weibo_timestamp`` and whose ``message_type`` is 2. Each
    returned ``_source`` dict gets a ``user`` entry taken from
    ``get_user_profile_weibo``.

    :param mid: id of the root post
    :param weibo_timestamp: timestamp of the root post; also selects the index
    :return: list of ``_source`` dicts, each extended with ``user``; empty
        when the search fails
    """
    index_name = flow_text_index_name_pre + ts2datetime(weibo_timestamp)
    query_body = {
        'query': {
            'bool': {
                'must': [
                    {'term': {'root_mid': mid}},
                    {'range': {'timestamp': {'gte': weibo_timestamp}}},
                    {'term': {'message_type': 2}},
                ]
            }
        }
    }
    try:
        flow_text_result = es_flow_text.search(index=index_name, doc_type=flow_text_index_type,
                                               body=query_body)['hits']['hits']
    except Exception:
        flow_text_result = []

    sources = [hit['_source'] for hit in flow_text_result]
    # presumably a mapping keyed by uid -- verify against get_user_profile_weibo
    repost_user_info_dict = get_user_profile_weibo([source['uid'] for source in sources])

    statuses = []
    for item_source in sources:
        # The uid lives in the document source, not on the hit envelope.
        item_source['user'] = repost_user_info_dict[item_source['uid']]
        statuses.append(item_source)
    return statuses
def get_sen_ratio(topic,start_ts,end_ts):
    """Count posts per sentiment label for a topic within a time range.

    Matches documents whose ``text`` satisfies the wildcard ``*topic*`` and
    whose ``timestamp`` lies in ``[start_ts, end_ts]``, then aggregates on
    ``sentiment``. The index is the one for ``'2013-09-07'`` when ``RUN_TYPE``
    is 0 and the one for the current date otherwise.

    :param topic: substring inserted between the two wildcards
    :param start_ts: inclusive lower timestamp bound
    :param end_ts: inclusive upper timestamp bound
    :return: the ``buckets`` list of the ``all_interests`` aggregation
    """
    text_pattern = '*' + topic + '*'
    time_window = {'lte': end_ts, 'gte': start_ts}
    query_body = {
        'query': {
            'bool': {
                'must': [
                    {'wildcard': {'text': text_pattern}},
                    {'range': {'timestamp': time_window}},
                ]
            }
        },
        'aggs': {
            'all_interests': {
                'terms': {'field': 'sentiment'}
            }
        },
    }

    if RUN_TYPE == 0:
        date = '2013-09-07'
    else:
        date = ts2datetime(time.time())
    print(query_body)

    response = es_flow_text.search(index=flow_text_index_name_pre + date,
                                   doc_type=flow_text_index_type, body=query_body)
    return response['aggregations']['all_interests']['buckets']
def get_psycho_status(uid_list):
    """Compute per-user sentiment ratios over ``WEEK`` daily indices.

    For every uid the function counts its posts per sentiment label in the
    indices for ``now - DAY * WEEK`` up to (not including) ``now``, where
    ``now`` is ``RUN_TEST_TIME`` when ``RUN_TYPE`` is 0 and the current date
    otherwise.

    :param uid_list: uids to evaluate
    :return: ``{uid: {'first': {...}, 'second': {...}}}``. ``second`` maps
        each label of ``SENTIMENT_SECOND`` to its share of all of the user's
        posts; ``first`` does the same for ``SENTIMENT_FIRST``, with label
        ``'7'`` standing for the combined ``SENTIMENT_SECOND`` count. Labels
        without posts, and users without any post, get ``0``.
    """
    def _ratio(counts, label, total):
        # A missing label or an empty history yields 0 instead of an error.
        if not total or label not in counts:
            return 0
        return float(counts[label]) / total

    if RUN_TYPE == 0:
        now_date_ts = datetime2ts(RUN_TEST_TIME)
    else:
        now_date_ts = datetime2ts(ts2datetime(time.time()))
    start_date_ts = now_date_ts - DAY * WEEK

    # uid -> {sentiment label (as str): number of posts}
    uid_sentiment_dict = {}
    for day_offset in range(0, WEEK):
        flow_text_index_name = flow_text_index_name_pre + ts2datetime(start_date_ts + DAY * day_offset)
        try:
            flow_text_exist = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type,
                                                  body={'query': {'filtered': {'filter': {'terms': {'uid': uid_list}}}},
                                                        'size': MAX_VALUE},
                                                  _source=False, fields=['uid', 'sentiment'])['hits']['hits']
        except Exception:
            flow_text_exist = []
        for flow_text_item in flow_text_exist:
            uid = flow_text_item['fields']['uid'][0]
            label = str(flow_text_item['fields']['sentiment'][0])
            counts = uid_sentiment_dict.setdefault(uid, {})
            counts[label] = counts.get(label, 0) + 1

    results = {}
    for uid in uid_list:
        # Work on a copy: the synthetic '7' entry added below must not leak
        # into the shared counter, or a uid listed twice would be evaluated
        # against an inflated total the second time.
        user_sentiment_result = dict(uid_sentiment_dict.get(uid, {}))
        all_count = sum(user_sentiment_result.values())

        # second level sentiment: the individual negative labels
        second_sentiment_all_count = sum(
            count for label, count in user_sentiment_result.items()
            if label in SENTIMENT_SECOND
        )
        second = {}
        for sentiment_item in SENTIMENT_SECOND:
            second[sentiment_item] = _ratio(user_sentiment_result, sentiment_item, all_count)

        # first level sentiment: '7' aggregates every second-level label
        user_sentiment_result['7'] = second_sentiment_all_count
        first = {}
        for sentiment_item in SENTIMENT_FIRST:
            first[sentiment_item] = _ratio(user_sentiment_result, sentiment_item, all_count)

        results[uid] = {'first': first, 'second': second}
    return results
def search_group_sentiment_weibo(task_name, start_ts, sentiment):
    """List one day of a group's posts that carry the requested sentiment.

    :param task_name: id of the group document holding ``uid_list``
    :param start_ts: lower bound of the window; the window is
        ``[start_ts, start_ts + DAY)`` and the index is the one named after
        the day of ``start_ts``
    :param sentiment: sentiment label; the string ``'2'`` expands to every
        label in ``SENTIMENT_SECOND``
    :return: list of dicts with ``uid``, ``uname``, ``ip``, ``geo``, ``text``,
        ``timestamp`` and ``sentiment`` ordered by ascending timestamp, or the
        string ``'task name invalid'`` / ``'task uid list null'`` when the
        group cannot be resolved
    """
    # step1: resolve the uid list of the group
    try:
        group_result = es_group_result.get(index=group_index_name, doc_type=group_index_type,
                                           id=task_name, _source=False, fields=['uid_list'])
    except Exception:
        group_result = {}
    if group_result == {}:
        return 'task name invalid'
    try:
        uid_list = group_result['fields']['uid_list']
    except Exception:
        uid_list = []
    if uid_list == []:
        return 'task uid list null'

    # step2: map uid -> uname for the users present in the portrait index
    uid2uname = {}
    try:
        user_portrait_result = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type,
                                                     body={'ids': uid_list}, _source=False, fields=['uname'])['docs']
    except Exception:
        user_portrait_result = []
    for item in user_portrait_result:
        if item.get('found'):
            uid2uname[item['_id']] = item['fields']['uname'][0]

    # step3: build the query for the day of start_ts
    flow_text_index_name = flow_text_index_name_pre + str(ts2datetime(start_ts))
    if sentiment != '2':
        sentiment_clause = {'term': {'sentiment': sentiment}}
    else:
        sentiment_clause = {'terms': {'sentiment': SENTIMENT_SECOND}}
    query_body = [
        {'terms': {'uid': uid_list}},
        sentiment_clause,
        {'range': {'timestamp': {'gte': start_ts, 'lt': start_ts + DAY}}},
    ]
    try:
        flow_text_result = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type,
                                               body={'query': {'bool': {'must': query_body}},
                                                     'sort': [{'timestamp': {'order': 'asc'}}],
                                                     'size': MAX_VALUE})['hits']['hits']
    except Exception:
        flow_text_result = []

    # step4: shape the hits
    weibo_list = []
    for flow_text_item in flow_text_result:
        source = flow_text_item['_source']
        weibo = {}
        weibo['uid'] = source['uid']
        # Users missing from the portrait lookup (or a failed lookup) must not
        # abort the listing with a KeyError.
        weibo['uname'] = uid2uname.get(weibo['uid'], 'unknown')
        weibo['ip'] = source['ip']
        try:
            weibo['geo'] = '\t'.join(source['geo'].split('&'))
        except (KeyError, AttributeError, TypeError):
            weibo['geo'] = ''
        weibo['text'] = source['text']
        weibo['timestamp'] = source['timestamp']
        weibo['sentiment'] = source['sentiment']
        weibo_list.append(weibo)
    return weibo_list
def get_influence_content(uid, timestamp_from, timestamp_to):
    """Return a user's posts with timestamps in ``[timestamp_from, timestamp_to)``.

    The range is cut at day boundaries so that every piece can be searched in
    the daily index it belongs to, including the day ``timestamp_to`` falls
    in. Within each day hits are sorted by ascending timestamp. No ``size``
    is sent with the search, so the server-side default page size applies to
    each day.

    :param uid: user id to filter on
    :param timestamp_from: inclusive lower bound
    :param timestamp_to: exclusive upper bound
    :return: list of dicts with ``timestamp`` (formatted by ``ts2date``),
        ``ip``, ``text`` and ``geo`` ('&' separators replaced by tabs, ``''``
        when the post has no geo); empty for an empty or inverted range
    """
    first_day_ts = datetime2ts(ts2datetime(timestamp_from))
    last_day_ts = datetime2ts(ts2datetime(timestamp_to))

    # One (lower, upper) pair per calendar day touched by the range. The loop
    # is inclusive of the last day; pieces that end up empty are skipped.
    day_windows = []
    day_start_ts = first_day_ts
    while day_start_ts <= last_day_ts:
        lower = max(day_start_ts, timestamp_from)
        upper = min(day_start_ts + DAY, timestamp_to)
        if lower < upper:
            day_windows.append((lower, upper))
        day_start_ts += DAY

    weibo_list = []
    for lower, upper in day_windows:
        flow_text_index_name = flow_text_index_name_pre + ts2datetime(lower)
        query = [
            {'term': {'uid': uid}},
            {'range': {'timestamp': {'gte': lower, 'lt': upper}}},
        ]
        try:
            flow_text_exist = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type,
                                                  body={'query': {'bool': {'must': query}},
                                                        'sort': [{'timestamp': 'asc'}]})['hits']['hits']
        except Exception:
            flow_text_exist = []

        # Convert every day's hits, not only those of the last day searched.
        for item in flow_text_exist:
            source = item['_source']
            geo = source.get('geo')
            weibo = {}
            weibo['timestamp'] = ts2date(source['timestamp'])
            weibo['ip'] = source['ip']
            weibo['text'] = source['text']
            weibo['geo'] = '\t'.join(geo.split('&')) if geo else ''
            weibo_list.append(weibo)
    return weibo_list
def new_get_user_weibo(uid, sort_type):
    """Return up to 100 of a user's posts from seven daily indices.

    Each of the indices for ``now - i * DAY`` (``i`` = 7..1) is asked for at
    most 100 posts sorted by ``sort_type``; the merged hits are then sorted
    by that field in descending order and cut to 100. When ``RUN_TYPE`` is 0
    ``now`` is ``RUN_TEST_TIME``, ``sort_type`` is forced to ``'timestamp'``
    and the three counters are reported as 0.

    :param uid: user id to filter on
    :param sort_type: ``_source`` field used for sorting
    :return: list of ``[mid, uid, text, geo, city, timestamp, date,
        retweet_count, comment_count, sensitive_score]``; empty when the user
        has no posts in the searched indices
    """
    if RUN_TYPE == 0:
        now_date = RUN_TEST_TIME
        sort_type = 'timestamp'
    else:
        now_date = ts2datetime(time.time())
    now_date_ts = datetime2ts(now_date)

    weibo_list = []
    for day_offset in range(7, 0, -1):
        index_name = flow_text_index_name_pre + ts2datetime(now_date_ts - day_offset * DAY)
        try:
            weibo_result = es_flow_text.search(index=index_name, doc_type=flow_text_index_type,
                                               body={'query': {'filtered': {'filter': {'term': {'uid': uid}}}},
                                                     'sort': sort_type, 'size': 100})['hits']['hits']
        except Exception:
            weibo_result = []
        weibo_list.extend(weibo_result)

    # Only peek at the first hit when there is one; an empty history is a
    # normal outcome and must yield an empty result instead of an IndexError.
    if weibo_list:
        print('weibo_list: %s' % (weibo_list[0],))

    ranked = sorted(weibo_list, key=lambda hit: hit['_source'][sort_type], reverse=True)[:100]

    results = []
    for weibo_item in ranked:
        source = weibo_item['_source']
        # NOTE(review): the value passed to ip2city is the 'geo' field, not
        # 'ip' -- confirm this is what ip2city expects.
        geo = source['geo']
        timestamp = source['timestamp']
        if RUN_TYPE == 1:
            retweet_count = source['retweet_count']
            comment_count = source['comment_count']
            sensitive_score = source['sensitive']
        else:
            retweet_count = 0
            comment_count = 0
            sensitive_score = 0
        results.append([
            source['mid'], source['uid'], source['text'], geo, ip2city(geo),
            timestamp, ts2date(timestamp),
            retweet_count, comment_count, sensitive_score,
        ])
    return results
def search_weibo(root_uid,uid,mtype):
    """Pair a user's posts of one message type with the root posts they refer to.

    Posts by ``uid`` with ``message_type`` ``mtype`` are searched in the seven
    daily indices before the current date (``RUN_TEST_TIME`` when ``RUN_TYPE``
    is not 1). For every hit, the post whose ``mid`` equals the hit's
    ``root_mid`` is looked up in the hit's own daily index and the six
    preceding ones.

    :param root_uid: uid used in the ``should`` clauses on ``root_uid`` and
        ``directed_uid``
    :param uid: author of the posts searched for
    :param mtype: value required for ``message_type``
    :return: list of ``{'last_text': [text, text, timestamp],
        'ori_text': [text, timestamp]}`` dicts, one per root post found;
        hits without a root post contribute nothing
    """
    import time  # local import so the block does not rely on a module-level one

    query_body = {
        'filter': {
            'bool': {
                'must': [{'term': {'uid': uid}}, {'term': {'message_type': mtype}}],
                'should': [{'term': {'root_uid': root_uid}}, {'term': {'directed_uid': root_uid}}],
            }
        }
    }

    # The reference day is computed here rather than read from a name that is
    # never assigned inside the function.
    if RUN_TYPE == 1:
        base_ts = datetime2ts(ts2datetime(time.time()))
    else:
        base_ts = datetime2ts(RUN_TEST_TIME)
    index_list = [
        flow_text_index_name_pre + ts2datetime(base_ts - i * DAY)
        for i in range(7, 0, -1)
    ]
    results = es_flow_text.search(index=index_list, doc_type=flow_text_index_type, body=query_body)['hits']['hits']

    prefix_len = len(flow_text_index_name_pre)
    f_result = []
    for result in results:
        source = result['_source']
        # NOTE(review): the text is stored twice in last_text -- confirm
        # whether the second slot was meant to hold another field.
        last_text = [source['text'], source['text'], source['timestamp']]

        # One week back from the hit's own index; use range(0, 30) for a month.
        hit_day_ts = datetime2ts(result['_index'][prefix_len:])
        root_index = [
            flow_text_index_name_pre + ts2datetime(hit_day_ts - j * DAY)
            for j in range(0, 7)
        ]
        root_hits = es_flow_text.search(index=root_index, doc_type=flow_text_index_type,
                                        body={'query': {'term': {'mid': source['root_mid']}}})['hits']['hits']
        for root_hit in root_hits:
            root_source = root_hit['_source']
            # A fresh dict per pair keeps last_text on every entry, even when
            # several root documents match.
            f_result.append({
                'last_text': list(last_text),
                'ori_text': [root_source['text'], root_source['timestamp']],
            })
    return f_result
def get_psycho_status(uid_list):
    """Compute per-user sentiment ratios over ``WEEK`` daily indices.

    For every uid the function counts its posts per sentiment label in the
    indices for ``now - DAY * WEEK`` up to (not including) ``now``, where
    ``now`` is ``RUN_TEST_TIME`` when ``RUN_TYPE`` is 0 and the current date
    otherwise.

    :param uid_list: uids to evaluate
    :return: ``{uid: {'first': {...}, 'second': {...}}}``. ``second`` maps
        each label of ``SENTIMENT_SECOND`` to its share of all of the user's
        posts; ``first`` does the same for ``SENTIMENT_FIRST``, with label
        ``'7'`` standing for the combined ``SENTIMENT_SECOND`` count. Labels
        without posts, and users without any post, get ``0``.
    """
    def _ratio(counts, label, total):
        # A missing label or an empty history yields 0 instead of an error.
        if not total or label not in counts:
            return 0
        return float(counts[label]) / total

    if RUN_TYPE == 0:
        now_date_ts = datetime2ts(RUN_TEST_TIME)
    else:
        now_date_ts = datetime2ts(ts2datetime(time.time()))
    start_date_ts = now_date_ts - DAY * WEEK

    # uid -> {sentiment label (as str): number of posts}
    uid_sentiment_dict = {}
    for day_offset in range(0, WEEK):
        flow_text_index_name = flow_text_index_name_pre + ts2datetime(start_date_ts + DAY * day_offset)
        try:
            flow_text_exist = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type,
                                                  body={'query': {'filtered': {'filter': {'terms': {'uid': uid_list}}}},
                                                        'size': MAX_VALUE},
                                                  _source=False, fields=['uid', 'sentiment'])['hits']['hits']
        except Exception:
            flow_text_exist = []
        for flow_text_item in flow_text_exist:
            uid = flow_text_item['fields']['uid'][0]
            label = str(flow_text_item['fields']['sentiment'][0])
            counts = uid_sentiment_dict.setdefault(uid, {})
            counts[label] = counts.get(label, 0) + 1

    results = {}
    for uid in uid_list:
        # Work on a copy: the synthetic '7' entry added below must not leak
        # into the shared counter, or a uid listed twice would be evaluated
        # against an inflated total the second time.
        user_sentiment_result = dict(uid_sentiment_dict.get(uid, {}))
        all_count = sum(user_sentiment_result.values())

        # second level sentiment: the individual negative labels
        second_sentiment_all_count = sum(
            count for label, count in user_sentiment_result.items()
            if label in SENTIMENT_SECOND
        )
        second = {}
        for sentiment_item in SENTIMENT_SECOND:
            second[sentiment_item] = _ratio(user_sentiment_result, sentiment_item, all_count)

        # first level sentiment: '7' aggregates every second-level label
        user_sentiment_result['7'] = second_sentiment_all_count
        first = {}
        for sentiment_item in SENTIMENT_FIRST:
            first[sentiment_item] = _ratio(user_sentiment_result, sentiment_item, all_count)

        results[uid] = {'first': first, 'second': second}
    return results
def get_activity_weibo(task_name, start_ts):
    """List a group's posts within ``FOUR_HOUR`` after ``start_ts``.

    :param task_name: id of the group document holding ``uid_list``
    :param start_ts: inclusive lower bound; the window is
        ``[start_ts, start_ts + FOUR_HOUR)`` and the index is the one named
        after the day of ``start_ts``
    :return: list of dicts with ``timestamp`` (formatted by ``ts2date``),
        ``ip``, ``text`` and ``geo`` sorted by timestamp, or the string
        ``'task name invalid'`` / ``'task uid list null'`` when the group
        cannot be resolved
    """
    # step1: resolve the uid list of the group
    try:
        group_result = es_group_result.get(index=group_index_name, doc_type=group_index_type,
                                           id=task_name, _source=False, fields=['uid_list'])
    except Exception:
        group_result = {}
    if group_result == {}:
        return 'task name invalid'
    try:
        uid_list = group_result['fields']['uid_list']
    except Exception:
        uid_list = []
    if uid_list == []:
        return 'task uid list null'

    # step2: search the posts of the time segment
    end_ts = start_ts + FOUR_HOUR
    flow_text_index_name = flow_text_index_name_pre + ts2datetime(start_ts)
    query = [
        {'terms': {'uid': uid_list}},
        {'range': {'timestamp': {'gte': start_ts, 'lt': end_ts}}},
    ]
    try:
        flow_text_es_result = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type,
                                                  body={'query': {'bool': {'must': query}},
                                                        'sort': 'timestamp', 'size': MAX_VALUE})['hits']['hits']
    except Exception:
        flow_text_es_result = []

    # step3: shape the hits
    results = []
    for item in flow_text_es_result:
        source = item['_source']
        geo = source['geo']
        weibo = {}
        weibo['timestamp'] = ts2date(source['timestamp'])
        weibo['ip'] = source['ip']
        weibo['text'] = source['text']
        if geo:
            # A geo string is '&'-separated; joining it directly would put a
            # tab between every character. Non-string sequences are joined
            # as they are.
            places = geo.split('&') if hasattr(geo, 'split') else geo
            weibo['geo'] = '\t'.join(places)
        else:
            weibo['geo'] = ''
        results.append(weibo)
    return results
def get_activity_weibo(task_name, start_ts):
    """List a group's posts within ``FOUR_HOUR`` after ``start_ts``.

    :param task_name: id of the group document holding ``uid_list``
    :param start_ts: inclusive lower bound; the window is
        ``[start_ts, start_ts + FOUR_HOUR)`` and the index is the one named
        after the day of ``start_ts``
    :return: list of dicts with ``timestamp`` (formatted by ``ts2date``),
        ``ip``, ``text`` and ``geo`` sorted by timestamp, or the string
        ``'task name invalid'`` / ``'task uid list null'`` when the group
        cannot be resolved
    """
    # step1: resolve the uid list of the group
    try:
        group_result = es_group_result.get(index=group_index_name, doc_type=group_index_type,
                                           id=task_name, _source=False, fields=['uid_list'])
    except Exception:
        group_result = {}
    if group_result == {}:
        return 'task name invalid'
    try:
        uid_list = group_result['fields']['uid_list']
    except Exception:
        uid_list = []
    if uid_list == []:
        return 'task uid list null'

    # step2: search the posts of the time segment
    end_ts = start_ts + FOUR_HOUR
    flow_text_index_name = flow_text_index_name_pre + ts2datetime(start_ts)
    query = [
        {'terms': {'uid': uid_list}},
        {'range': {'timestamp': {'gte': start_ts, 'lt': end_ts}}},
    ]
    try:
        flow_text_es_result = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type,
                                                  body={'query': {'bool': {'must': query}},
                                                        'sort': 'timestamp', 'size': MAX_VALUE})['hits']['hits']
    except Exception:
        flow_text_es_result = []

    # step3: shape the hits
    results = []
    for item in flow_text_es_result:
        source = item['_source']
        geo = source['geo']
        weibo = {}
        weibo['timestamp'] = ts2date(source['timestamp'])
        weibo['ip'] = source['ip']
        weibo['text'] = source['text']
        if geo:
            # A geo string is '&'-separated; joining it directly would put a
            # tab between every character. Non-string sequences are joined
            # as they are.
            places = geo.split('&') if hasattr(geo, 'split') else geo
            weibo['geo'] = '\t'.join(places)
        else:
            weibo['geo'] = ''
        results.append(weibo)
    return results
def new_get_user_weibo(uid, sort_type):
    """Return up to 100 of a user's posts from seven daily indices.

    Each of the indices for ``now - i * DAY`` (``i`` = 7..1) is asked for at
    most 100 posts sorted by ``sort_type``; the merged hits are then sorted
    by that field in descending order and cut to 100. When ``RUN_TYPE`` is 0
    ``now`` is ``RUN_TEST_TIME``, ``sort_type`` is forced to ``'timestamp'``
    and the three counters are reported as 0.

    :param uid: user id to filter on
    :param sort_type: ``_source`` field used for sorting
    :return: list of ``[mid, uid, text, geo, city, timestamp, date,
        retweet_count, comment_count, sensitive_score]``; empty when the user
        has no posts in the searched indices
    """
    if RUN_TYPE == 0:
        now_date = RUN_TEST_TIME
        sort_type = 'timestamp'
    else:
        now_date = ts2datetime(time.time())
    now_date_ts = datetime2ts(now_date)

    weibo_list = []
    for day_offset in range(7, 0, -1):
        index_name = flow_text_index_name_pre + ts2datetime(now_date_ts - day_offset * DAY)
        try:
            weibo_result = es_flow_text.search(index=index_name, doc_type=flow_text_index_type,
                                               body={'query': {'filtered': {'filter': {'term': {'uid': uid}}}},
                                                     'sort': sort_type, 'size': 100})['hits']['hits']
        except Exception:
            weibo_result = []
        weibo_list.extend(weibo_result)

    # Only peek at the first hit when there is one; an empty history is a
    # normal outcome and must yield an empty result instead of an IndexError.
    if weibo_list:
        print('weibo_list: %s' % (weibo_list[0],))

    ranked = sorted(weibo_list, key=lambda hit: hit['_source'][sort_type], reverse=True)[:100]

    results = []
    for weibo_item in ranked:
        source = weibo_item['_source']
        # NOTE(review): the value passed to ip2city is the 'geo' field, not
        # 'ip' -- confirm this is what ip2city expects.
        geo = source['geo']
        timestamp = source['timestamp']
        if RUN_TYPE == 1:
            retweet_count = source['retweet_count']
            comment_count = source['comment_count']
            sensitive_score = source['sensitive']
        else:
            retweet_count = 0
            comment_count = 0
            sensitive_score = 0
        results.append([
            source['mid'], source['uid'], source['text'], geo, ip2city(geo),
            timestamp, ts2date(timestamp),
            retweet_count, comment_count, sensitive_score,
        ])
    return results
def get_social_inter_content(uid1, uid2, type_mark):
    """List the posts in which two users address each other.

    Posts written by ``uid1`` with ``directed_uid`` equal to ``uid2`` are
    always included; when ``type_mark`` is ``'out'`` the opposite direction
    is included too. The seven daily indices for ``now - i * DAY``
    (``i`` = 7..1) are searched in that order, each sorted by ascending
    timestamp; ``now`` is the current date when ``RUN_TYPE`` is 1 and
    ``RUN_TEST_TIME`` otherwise.

    :param uid1: first user id; must be convertible with ``int()``
    :param uid2: second user id; must be convertible with ``int()``
    :param type_mark: ``'out'`` adds the ``uid2`` -> ``uid1`` direction
    :return: list of dicts with ``timestamp``, ``ip``, ``geo``, ``text``,
        ``uid``, ``uname``, ``directed_uid`` and ``directed_uname``; names
        that cannot be resolved are reported as ``'unknown'``
    """
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(int(time.time())))
    else:
        now_date_ts = datetime2ts(RUN_TEST_TIME)

    # uid -> uname for the two users
    uid2uname = {}
    try:
        portrait_result = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type,
                                                body={'ids': [uid1, uid2]}, _source=False,
                                                fields=['uid', 'uname'])['docs']
    except Exception:
        portrait_result = []
    for item in portrait_result:
        if item['found']:
            uid2uname[item['_id']] = item['fields']['uname'][0]
        else:
            uid2uname[item['_id']] = 'unknown'

    # The query does not depend on the day, so it is built once.
    directions = [
        {'bool': {'must': [{'term': {'uid': uid1}}, {'term': {'directed_uid': int(uid2)}}]}},
    ]
    if type_mark == 'out':
        directions.append(
            {'bool': {'must': [{'term': {'uid': uid2}}, {'term': {'directed_uid': int(uid1)}}]}}
        )
    request_body = {
        'query': {'bool': {'should': directions}},
        'sort': [{'timestamp': {'order': 'asc'}}],
        'size': MAX_VALUE,
    }

    weibo_list = []
    for day_offset in range(7, 0, -1):
        flow_text_index_name = flow_text_index_name_pre + str(ts2datetime(now_date_ts - day_offset * DAY))
        try:
            flow_text_result = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type,
                                                   body=request_body)['hits']['hits']
        except Exception:
            flow_text_result = []
        for flow_text in flow_text_result:
            source = flow_text['_source']
            directed_uid = str(source['directed_uid'])
            weibo = {}
            weibo['timestamp'] = source['timestamp']
            weibo['ip'] = source['ip']
            # NOTE(review): 'text' is split on '&' while 'geo' is passed
            # through unchanged -- confirm the two were not swapped.
            weibo['geo'] = source['geo']
            weibo['text'] = '\t'.join(source['text'].split('&'))
            weibo['uid'] = source['uid']
            # .get keeps the listing alive when the name lookup above failed.
            weibo['uname'] = uid2uname.get(weibo['uid'], 'unknown')
            weibo['directed_uid'] = directed_uid
            weibo['directed_uname'] = uid2uname.get(directed_uid, 'unknown')
            weibo_list.append(weibo)
    return weibo_list
def get_social_inter_content(uid1, uid2, type_mark):
    """List the posts in which two users address each other.

    Posts written by ``uid1`` with ``directed_uid`` equal to ``uid2`` are
    always included; when ``type_mark`` is ``'out'`` the opposite direction
    is included too. The seven daily indices for ``now - i * DAY``
    (``i`` = 7..1) are searched in that order, each sorted by ascending
    timestamp; ``now`` is the current date when ``RUN_TYPE`` is 1 and
    ``RUN_TEST_TIME`` otherwise.

    :param uid1: first user id; must be convertible with ``int()``
    :param uid2: second user id; must be convertible with ``int()``
    :param type_mark: ``'out'`` adds the ``uid2`` -> ``uid1`` direction
    :return: list of dicts with ``timestamp``, ``ip``, ``geo``, ``text``,
        ``uid``, ``uname``, ``directed_uid`` and ``directed_uname``; names
        that cannot be resolved are reported as ``'unknown'``
    """
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(int(time.time())))
    else:
        now_date_ts = datetime2ts(RUN_TEST_TIME)

    # uid -> uname for the two users
    uid2uname = {}
    try:
        portrait_result = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type,
                                                body={'ids': [uid1, uid2]}, _source=False,
                                                fields=['uid', 'uname'])['docs']
    except Exception:
        portrait_result = []
    for item in portrait_result:
        if item['found']:
            uid2uname[item['_id']] = item['fields']['uname'][0]
        else:
            uid2uname[item['_id']] = 'unknown'

    # The query does not depend on the day, so it is built once.
    directions = [
        {'bool': {'must': [{'term': {'uid': uid1}}, {'term': {'directed_uid': int(uid2)}}]}},
    ]
    if type_mark == 'out':
        directions.append(
            {'bool': {'must': [{'term': {'uid': uid2}}, {'term': {'directed_uid': int(uid1)}}]}}
        )
    request_body = {
        'query': {'bool': {'should': directions}},
        'sort': [{'timestamp': {'order': 'asc'}}],
        'size': MAX_VALUE,
    }

    weibo_list = []
    for day_offset in range(7, 0, -1):
        flow_text_index_name = flow_text_index_name_pre + str(ts2datetime(now_date_ts - day_offset * DAY))
        try:
            flow_text_result = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type,
                                                   body=request_body)['hits']['hits']
        except Exception:
            flow_text_result = []
        for flow_text in flow_text_result:
            source = flow_text['_source']
            directed_uid = str(source['directed_uid'])
            weibo = {}
            weibo['timestamp'] = source['timestamp']
            weibo['ip'] = source['ip']
            # NOTE(review): 'text' is split on '&' while 'geo' is passed
            # through unchanged -- confirm the two were not swapped.
            weibo['geo'] = source['geo']
            weibo['text'] = '\t'.join(source['text'].split('&'))
            weibo['uid'] = source['uid']
            # .get keeps the listing alive when the name lookup above failed.
            weibo['uname'] = uid2uname.get(weibo['uid'], 'unknown')
            weibo['directed_uid'] = directed_uid
            weibo['directed_uname'] = uid2uname.get(directed_uid, 'unknown')
            weibo_list.append(weibo)
    return weibo_list
def read_flow_text_sentiment(uid_list, now_date="2013-09-08"):
    """
    Read users' posts together with their sentiment labels.

    The ``WEEK`` daily indices ending the day before ``now_date`` are
    searched for posts written by any uid in ``uid_list``.

    :param uid_list: list of uid strings
    :param now_date: day string marking the (exclusive) end of the window;
        defaults to "2013-09-08"
    :return: ``(word_dict, weibo_list)``.
        ``word_dict`` holds the merged keyword counts per user, e.g.
        ``{uid1: {'w1': f1, 'w2': f2, ...}, ...}``.
        ``weibo_list`` holds one ``[uid, text, sentiment, timestamp]`` record
        per post.
    """
    word_dict = dict()  # keyword frequencies per uid
    weibo_list = []  # one record per post

    start_date_ts = datetime2ts(now_date) - DAY * WEEK
    for day_offset in range(0, WEEK):
        flow_text_index_name = flow_text_index_name_pre + ts2datetime(start_date_ts + DAY * day_offset)
        print(flow_text_index_name)
        try:
            flow_text_exist = es_flow_text.search(
                index=flow_text_index_name,
                doc_type=flow_text_index_type,
                body={"query": {"filtered": {"filter": {"terms": {"uid": uid_list}}}}, "size": MAX_VALUE},
                _source=False,
                fields=["text", "uid", "sentiment", "keywords_dict", "timestamp"],
            )["hits"]["hits"]
        except Exception:
            flow_text_exist = []

        for flow_text_item in flow_text_exist:
            fields = flow_text_item["fields"]
            uid = fields["uid"][0].encode("utf-8")
            text = fields["text"][0].encode("utf-8")
            sentiment = int(fields["sentiment"][0])
            ts = fields["timestamp"][0]

            # Round trip through JSON text and eval to obtain the keyword
            # dict in evaluated-literal form.
            # SECURITY NOTE(review): eval() runs on text derived from stored
            # documents; if those documents can contain untrusted content
            # this should be replaced by an explicit conversion.
            keywords_dict = json.loads(fields["keywords_dict"][0])
            keywords_dict = json.dumps(keywords_dict, encoding="UTF-8", ensure_ascii=False)
            keywords_dict = eval(keywords_dict)

            if uid in word_dict:
                word_dict[uid] = dict(Counter(word_dict[uid]) + Counter(keywords_dict))
            else:
                word_dict[uid] = keywords_dict
            weibo_list.append([uid, text, sentiment, ts])
    return word_dict, weibo_list
def get_user_geo(uid, dropped_geos=u"中国&美国"):
    """
    Return the set of places associated with a user.

    When the user's portrait document has a non-empty ``activity_geo`` the
    places listed there are returned with ``dropped_geos`` removed. Otherwise
    the places are collected from the ``geo`` field of up to 2000 of the
    user's posts in the seven daily indices for ``now - DAY * i``
    (``i`` = 7..1); ``dropped_geos`` is not removed in that case.

    :param uid: id of the user
    :param dropped_geos: '&'-separated places to leave out of the portrait
        based result
    :return: set of place names
    """
    dropped_geos = set(dropped_geos.split("&"))

    # NOTE(review): the portrait index is read with profile_index_type as the
    # doc type -- confirm this is intended.
    try:
        user_portrait_result = es_user_portrait. \
            get_source(index=portrait_index_name, doc_type=profile_index_type, id=uid)
    except NotFoundError:
        user_portrait_result = None

    activity_geo = user_portrait_result.get("activity_geo") if user_portrait_result else None
    if activity_geo:
        # The stored value is plain JSON data (a string or a list), never a
        # set, so it has to be converted before the set difference.
        places = activity_geo.split("&") if hasattr(activity_geo, "split") else activity_geo
        return set(places) - dropped_geos

    # No geo in the portrait: fall back to the user's recent posts.
    now_timestamp = datetime2ts(ts2datetime(time.time()))
    if RUN_TYPE == 0:
        now_timestamp = datetime2ts(RUN_TEST_TIME)
    flow_text_index_list = [
        flow_text_index_name_pre + ts2datetime(now_timestamp - DAY * i)
        for i in range(7, 0, -1)
    ]
    weibo_all = es_flow_text.search(index=flow_text_index_list, doc_type=flow_text_index_type,
                                    body={
                                        'query': {'filtered': {'filter': {'term': {'uid': uid}}}},
                                        'size': 2000,
                                    })['hits']['hits']
    geos = set()
    for hit in weibo_all:
        geo = hit["_source"].get("geo")
        if geo:  # posts without a geo value contribute nothing
            geos |= set(geo.split("&"))
    return geos
def get_influence_content(uid, timestamp_from, timestamp_to):
    """Return a user's posts with timestamps in ``[timestamp_from, timestamp_to)``.

    The range is cut at day boundaries so that every piece can be searched in
    the daily index it belongs to, including the day ``timestamp_to`` falls
    in. Within each day hits are sorted by ascending timestamp. No ``size``
    is sent with the search, so the server-side default page size applies to
    each day.

    :param uid: user id to filter on
    :param timestamp_from: inclusive lower bound
    :param timestamp_to: exclusive upper bound
    :return: list of dicts with ``timestamp`` (formatted by ``ts2date``),
        ``ip``, ``text`` and ``geo`` ('&' separators replaced by tabs, ``''``
        when the post has no geo); empty for an empty or inverted range
    """
    first_day_ts = datetime2ts(ts2datetime(timestamp_from))
    last_day_ts = datetime2ts(ts2datetime(timestamp_to))

    # One (lower, upper) pair per calendar day touched by the range. The loop
    # is inclusive of the last day; pieces that end up empty are skipped.
    day_windows = []
    day_start_ts = first_day_ts
    while day_start_ts <= last_day_ts:
        lower = max(day_start_ts, timestamp_from)
        upper = min(day_start_ts + DAY, timestamp_to)
        if lower < upper:
            day_windows.append((lower, upper))
        day_start_ts += DAY

    weibo_list = []
    for lower, upper in day_windows:
        flow_text_index_name = flow_text_index_name_pre + ts2datetime(lower)
        query = [
            {'term': {'uid': uid}},
            {'range': {'timestamp': {'gte': lower, 'lt': upper}}},
        ]
        try:
            flow_text_exist = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type,
                                                  body={'query': {'bool': {'must': query}},
                                                        'sort': [{'timestamp': 'asc'}]})['hits']['hits']
        except Exception:
            flow_text_exist = []

        # Convert every day's hits, not only those of the last day searched.
        for item in flow_text_exist:
            source = item['_source']
            geo = source.get('geo')
            weibo = {}
            weibo['timestamp'] = ts2date(source['timestamp'])
            weibo['ip'] = source['ip']
            weibo['text'] = source['text']
            weibo['geo'] = '\t'.join(geo.split('&')) if geo else ''
            weibo_list.append(weibo)
    return weibo_list
def read_flow_text_sentiment(uid_list):
    '''
    Read the users' weibos of the past week, including sentiment labels.

    NOTE: an identical definition of this function appears again later in
    this file; keep the two in sync (or drop one).

    :param uid_list: list of uid strings
    :return: tuple ``(word_dict, weibo_list)``
        word_dict: per-user keyword frequencies,
            e.g. {uid1: {'w1': f1, 'w2': f2, ...}, ...}
        weibo_list: one record per weibo,
            [[uid, text, sentiment, timestamp], ...]
    '''
    word_dict = dict()   # keyword frequencies per uid
    weibo_list = []      # weibo records

    # "Today" is the real date, or RUN_TEST_TIME when running in test mode
    # (same convention as the other functions of this module).
    now_date_ts = datetime2ts(ts2datetime(time.time()))
    if RUN_TYPE == 0:
        now_date_ts = datetime2ts(RUN_TEST_TIME)
    start_date_ts = now_date_ts - DAY * WEEK

    for i in range(WEEK):
        iter_date_ts = start_date_ts + DAY * i
        flow_text_index_name = flow_text_index_name_pre + ts2datetime(iter_date_ts)
        print(flow_text_index_name)
        try:
            flow_text_exist = es_flow_text.search(
                index=flow_text_index_name,
                doc_type=flow_text_index_type,
                body={'query': {'filtered': {'filter': {'terms': {'uid': uid_list}}}},
                      'size': MAX_VALUE},
                _source=False,
                fields=['text', 'uid', 'sentiment', 'keywords_dict', 'timestamp'])['hits']['hits']
        except Exception:
            # Best effort: a failing daily search contributes nothing.
            flow_text_exist = []

        for flow_text_item in flow_text_exist:
            fields = flow_text_item['fields']
            uid = fields['uid'][0].encode('utf-8')
            text = fields['text'][0].encode('utf-8')
            sentiment = int(fields['sentiment'][0])
            ts = fields['timestamp'][0]
            # Keys are converted to UTF-8 encoded strings explicitly; this
            # replaces a json.dumps + eval round trip on document content.
            raw_keywords = json.loads(fields['keywords_dict'][0])
            keywords_dict = dict(
                (word.encode('utf-8'), freq) for word, freq in raw_keywords.items())
            if uid in word_dict:
                merged = Counter(word_dict[uid]) + Counter(keywords_dict)
                word_dict[uid] = dict(merged)
            else:
                word_dict[uid] = keywords_dict
            weibo_list.append([uid, text, sentiment, ts])
    return word_dict, weibo_list
def cctv_video_rec(uid, k=10):
    """Recommend up to ``k`` "tiger" and ``k`` "rio" videos for a user.

    The user's weibos from the seven daily flow-text indices preceding
    "now" (``RUN_TEST_TIME`` when ``RUN_TYPE == 0``) are segmented with
    jieba; "rio" videos are taken from the topics whose name occurs among
    the user's words, or from randomly chosen topics when none matches.
    "tiger" videos are a random sample.

    :param uid: user id
    :param k: maximum number of videos per category
    :return: dict ``{"tiger": [...], "rio": [...]}``
    """
    now_timestamp = datetime2ts(ts2datetime(time.time()))
    if RUN_TYPE == 0:
        now_timestamp = datetime2ts(RUN_TEST_TIME)
    flow_text_index_list = [
        flow_text_index_name_pre + ts2datetime(now_timestamp - DAY * i)
        for i in range(7, 0, -1)
    ]
    weibo_all = es_flow_text.search(
        index=flow_text_index_list,
        doc_type=flow_text_index_type,
        body={
            'query': {
                'filtered': {
                    'filter': {
                        'term': {
                            'uid': uid
                        }
                    }
                }
            },
            'size': 100,
        })['hits']['hits']

    # Segment the weibo text (not the poster's IP) into words.
    user_words = set()
    for weibo in weibo_all:
        weibo_text = weibo["_source"].get("text") or ""
        user_words.update(jieba.cut(weibo_text))

    rio_dict = load_topic_video_dict(RIO_VIDEO_INFO_FILE)
    tiger_videos = load_videos(TIGER_VIDEO_INFO_FILE)

    ret_dict = dict()
    # random.sample raises when asked for more items than exist, so the
    # sample size is capped at the population size.
    ret_dict["tiger"] = random.sample(tiger_videos, min(k, len(tiger_videos)))

    user_pref_topic = set(rio_dict) & user_words
    # No topic matches the user's words: assign random topics instead.
    if not user_pref_topic:
        all_topics = list(rio_dict)
        user_pref_topic = set(random.sample(all_topics, min(k, len(all_topics))))

    rio_videos = []
    for topic in user_pref_topic:
        rio_videos.extend(rio_dict[topic])
        if len(rio_videos) >= k:
            break
    ret_dict["rio"] = rio_videos[:k]
    return ret_dict
def read_flow_text_sentiment(uid_list):
    '''
    Read the users' weibos of the past week, including sentiment labels.

    NOTE: an identical definition of this function appears earlier in this
    file; this later one is the definition that takes effect.

    :param uid_list: list of uid strings
    :return: tuple ``(word_dict, weibo_list)``
        word_dict: per-user keyword frequencies,
            e.g. {uid1: {'w1': f1, 'w2': f2, ...}, ...}
        weibo_list: one record per weibo,
            [[uid, text, sentiment, timestamp], ...]
    '''
    word_dict = dict()   # keyword frequencies per uid
    weibo_list = []      # weibo records

    # "Today" is the real date, or RUN_TEST_TIME when running in test mode
    # (same convention as the other functions of this module).
    now_date_ts = datetime2ts(ts2datetime(time.time()))
    if RUN_TYPE == 0:
        now_date_ts = datetime2ts(RUN_TEST_TIME)
    start_date_ts = now_date_ts - DAY * WEEK

    for i in range(WEEK):
        iter_date_ts = start_date_ts + DAY * i
        flow_text_index_name = flow_text_index_name_pre + ts2datetime(iter_date_ts)
        print(flow_text_index_name)
        try:
            flow_text_exist = es_flow_text.search(
                index=flow_text_index_name,
                doc_type=flow_text_index_type,
                body={'query': {'filtered': {'filter': {'terms': {'uid': uid_list}}}},
                      'size': MAX_VALUE},
                _source=False,
                fields=['text', 'uid', 'sentiment', 'keywords_dict', 'timestamp'])['hits']['hits']
        except Exception:
            # Best effort: a failing daily search contributes nothing.
            flow_text_exist = []

        for flow_text_item in flow_text_exist:
            fields = flow_text_item['fields']
            uid = fields['uid'][0].encode('utf-8')
            text = fields['text'][0].encode('utf-8')
            sentiment = int(fields['sentiment'][0])
            ts = fields['timestamp'][0]
            # Keys are converted to UTF-8 encoded strings explicitly; this
            # replaces a json.dumps + eval round trip on document content.
            raw_keywords = json.loads(fields['keywords_dict'][0])
            keywords_dict = dict(
                (word.encode('utf-8'), freq) for word, freq in raw_keywords.items())
            if uid in word_dict:
                merged = Counter(word_dict[uid]) + Counter(keywords_dict)
                word_dict[uid] = dict(merged)
            else:
                word_dict[uid] = keywords_dict
            weibo_list.append([uid, text, sentiment, ts])
    return word_dict, weibo_list
def localRec(uid, k=200):
    """Recommend weibos posted from the same network neighbourhood as a user.

    The first two octets of the user's IP are used as a prefix query over
    the seven daily flow-text indices preceding "now" (``RUN_TEST_TIME``
    when ``RUN_TYPE == 0``).  At most one weibo per IP is considered, and
    only weibos accepted by ``is_suit`` are returned.

    :param uid: user id
    :param k: maximum number of weibos to return
    :return: list of weibo ``_source`` dicts, each extended with ``len``,
        ``weibo_url``, ``photo_url`` and ``nick_name``
    """
    now_timestamp = datetime2ts(ts2datetime(time.time()))
    if RUN_TYPE == 0:
        now_timestamp = datetime2ts(RUN_TEST_TIME)
    flow_text_index_list = [
        flow_text_index_name_pre + ts2datetime(now_timestamp - DAY * i)
        for i in range(7, 0, -1)
    ]

    # An earlier variant matched on geo plus a keyword instead of IP;
    # filtering on text length with a regexp inside the query was found to
    # be very slow, so suitability is checked client-side via is_suit().
    ip = get_user_ip(uid)
    ip = ".".join(ip.split(".")[:-2])
    weibo_all = es_flow_text.search(
        index=flow_text_index_list,
        doc_type=ads_weibo_index_type,
        body={
            "query": {
                "bool": {
                    "must": [{
                        "prefix": {
                            "text.ip": ip
                        }
                    }]
                }
            },
            "size": 2000
        })["hits"]["hits"]

    # Hits without a uid are skipped below, so they are skipped here too.
    weibo_user_uids = [
        hit["_source"]["uid"] for hit in weibo_all if "uid" in hit["_source"]
    ]
    user_profiles = search_user_profile_by_user_ids(weibo_user_uids)

    local_weibo_rec = []
    exists_ip = set()
    for hit in weibo_all:
        if len(local_weibo_rec) >= k:
            break
        weibo = hit["_source"]
        weibo_text = weibo["text"]
        # Only the first weibo seen for each IP is considered.
        if weibo["ip"] in exists_ip:
            continue
        exists_ip.add(weibo["ip"])
        if not is_suit(weibo_text):
            continue
        weibo["len"] = len(weibo_text)
        try:
            mid = weibo["mid"]
            author_uid = weibo["uid"]
        except KeyError:
            continue
        weibo["weibo_url"] = weiboinfo2url(author_uid, mid)
        # Many authors may have no user profile available.
        if author_uid in user_profiles:
            weibo["photo_url"] = user_profiles[author_uid]["photo_url"]
            weibo["nick_name"] = user_profiles[author_uid]["nick_name"]
        else:
            weibo["photo_url"] = "None"
            weibo["nick_name"] = "None"
        local_weibo_rec.append(weibo)
    return local_weibo_rec
def adsRec(uid, queryInterval=HOUR * 4):
    '''
    Recommend advertisement weibos to a user.

    Weibos are read from the seven daily flow-text indices preceding "now",
    restricted to timestamps not older than ``queryInterval`` seconds
    before the start of the reference date, and ranked by
    ``adsPreferred`` against the user's keyword-derived topic distribution.

    :param uid: user id
    :param queryInterval: how far back (in seconds) from the reference date
        weibos are taken
    :return: the ranking produced by ``adsPreferred``
    '''
    # Reference date: the current date when RUN_TYPE == 1, otherwise the
    # day before RUN_TEST_TIME.
    if RUN_TYPE == 1:
        now_date = ts2datetime(time.time())
    else:
        now_date = ts2datetime(datetime2ts(RUN_TEST_TIME) - DAY)

    # User preferences: the "&"-separated keywords stored in the portrait.
    portrait = es_user_portrait.get_source(
        index=portrait_index_name, doc_type=profile_index_type, id=uid)
    user_key_words = set(portrait["keywords_string"].split("&"))

    # For now the candidates are computed on the fly from the raw weibos
    # of the last seven daily indices.
    now_timestamp = datetime2ts(ts2datetime(time.time()))
    if RUN_TYPE == 0:
        now_timestamp = datetime2ts(RUN_TEST_TIME)
    ads_weibo_index_name = [
        flow_text_index_name_pre + ts2datetime(now_timestamp - DAY * days_back)
        for days_back in range(7, 0, -1)
    ]

    oldest_timestamp = datetime2ts(now_date) - queryInterval
    search_body = {
        'query': {
            "filtered": {
                "filter": {
                    "range": {
                        "timestamp": {
                            "gte": oldest_timestamp
                        }
                    }
                }
            }
        },
        'size': 2000,
    }
    response = es_flow_text.search(
        index=ads_weibo_index_name,
        doc_type=ads_weibo_index_type,
        body=search_body)
    ads_weibo_all = response['hits']['hits']

    # TF-IDF weights of words per advertisement topic.
    topic_word_weight_dic = construct_topic_word_weight_dic(ADS_TOPIC_TFIDF_DIR)

    # Project the user's keywords onto the advertisement topics (the
    # generic topics do not fit the advertisement categories well).
    user_topic_dic = construct_topic_feature_dic(user_key_words,
                                                 topic_word_weight_dic)

    return adsPreferred(user_topic_dic, ads_weibo_all,
                        topic_word_weight_dic, 30)
def get_sensitive_weibo_detail(ts, social_sensors, sensitive_words_list, message_type, size=100):
    """Return the newest weibos of one message type that contain sensitive words.

    The search covers ``[ts - time_interval, ts)``.  When both ends fall on
    the same day that day's index is searched; otherwise the index of the
    day on which the window starts is searched.  A missing index yields an
    empty result.

    :param ts: exclusive end of the time window (unix timestamp)
    :param social_sensors: optional list of uids to restrict the search to
    :param sensitive_words_list: keywords to match against ``keywords_string``
    :param message_type: value required in the ``message_type`` field
    :param size: maximum number of weibos to fetch
    :return: list of rows ``[uid, nick_name, photo_url, text, sentiment,
        date, geo, common_keywords, message_type]``
    """
    must_clauses = [
        {"range": {"timestamp": {"gte": ts - time_interval, "lt": ts}}},
        {"term": {"message_type": message_type}},
        {"terms": {"keywords_string": sensitive_words_list}},
    ]
    if social_sensors:
        must_clauses.append({"terms": {"uid": social_sensors}})
    query_body = {
        "query": {"filtered": {"filter": {"bool": {"must": must_clauses}}}},
        "size": size,
        "sort": {"timestamp": {"order": "desc"}},
    }

    # Decide which daily index to search.
    end_day = ts2datetime(ts)
    start_day = ts2datetime(ts - time_interval)
    end_day_index = flow_text_index_name_pre + end_day
    end_day_index_exists = es_text.indices.exists(end_day_index)
    start_day_index = flow_text_index_name_pre + start_day
    start_day_index_exists = es_text.indices.exists(start_day_index)

    target_index = None
    if end_day == start_day:
        if end_day_index_exists:
            target_index = end_day_index
    elif start_day_index_exists:
        target_index = start_day_index

    hits = []
    if target_index is not None:
        hits = es_text.search(index=target_index, doc_type=flow_text_index_type,
                              body=query_body)["hits"]["hits"]

    results = []
    uid_list = [hit["_source"]['uid'] for hit in hits]
    if not uid_list:
        return results

    # Profiles are requested in hit order, so docs[i] belongs to hits[i].
    profile_docs = es_profile.mget(index=profile_index_name, doc_type=profile_index_type,
                                   body={"ids": uid_list},
                                   fields=['nick_name', 'photo_url'])["docs"]
    for position, hit in enumerate(hits):
        source = hit['_source']
        profile = profile_docs[position]
        if profile['found']:
            nick_name = profile["fields"]["nick_name"][0]
            photo_url = profile["fields"]["photo_url"][0]
        else:
            nick_name = "unknown"
            photo_url = ""
        weibo_keywords = set(source['keywords_string'].split('&'))
        common_keywords = set(sensitive_words_list) & weibo_keywords
        results.append([
            source['uid'],
            nick_name,
            photo_url,
            source["text"],
            source["sentiment"],
            ts2date(source['timestamp']),
            source['geo'],
            list(common_keywords),
            source['message_type'],
        ])
    return results
def get_sensitive_text_detail(task_name, ts, user, order):
    """Return the sensitive weibos recorded for a sensing task at time ``ts``.

    The task document lists the sensitive weibo mids together with their
    counters; the weibo texts are fetched from the flow-text indices of the
    day of ``ts`` and the day before, and joined with the authors' profiles.

    :param task_name: task name (combined with ``user`` into the doc type)
    :param ts: timestamp identifying the task document
    :param user: task owner
    :param order: ``"total"``, ``"retweeted"`` or ``"comment"`` sort by the
        corresponding counter; ``"ts"`` sorts by date and ``"sensitive"``
        by the sensitive score; any other value keeps the stored order
    :return: list of rows ``[uid, nick_name, photo_url, text, sentiment,
        date, geo, message_type, retweeted, comment, sensitive]``
    """
    _id = user + '-' + task_name
    task_detail = es.get(index=index_sensing_task, doc_type=_id, id=ts)['_source']
    # A JSON null is treated like "no sensitive weibo".
    weibo_detail = json.loads(task_detail['sensitive_weibo_detail']) or {}

    # Rows: [mid, total, retweeted, comment]
    weibo_detail_list = [
        [iter_mid, item[iter_mid], item['retweeted'], item['comment']]
        for iter_mid, item in weibo_detail.items()
    ]
    mid_list = list(weibo_detail.keys())

    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "terms": {"mid": mid_list}
                }
            }
        },
        # Without an explicit size only the default number of hits is
        # returned and most of the requested mids would be dropped.
        "size": min(len(mid_list), 10000)
    }

    # Search the index of the day of ts and of the previous day, if present.
    index_list = []
    for day in (ts2datetime(ts), ts2datetime(ts - DAY)):
        index_name = flow_text_index_name_pre + day
        if es_text.indices.exists(index_name):
            index_list.append(index_name)

    if index_list and mid_list:
        search_results = es_text.search(index=index_list, doc_type=flow_text_index_type,
                                        body=query_body)["hits"]["hits"]
    else:
        search_results = []

    uid_list = []
    text_dict = dict()      # weibo content, keyed by mid (the document _id)
    portrait_dict = dict()  # author profile, keyed by uid
    for item in search_results:
        uid_list.append(item["_source"]['uid'])
        text_dict[item['_id']] = item['_source']

    if uid_list:
        portrait_result = es_profile.mget(index=profile_index_name, doc_type=profile_index_type,
                                          body={"ids": uid_list},
                                          fields=['nick_name', 'photo_url'])["docs"]
        for item in portrait_result:
            if item['found']:
                portrait_dict[item['_id']] = {
                    "nick_name": item["fields"]["nick_name"][0],
                    "photo_url": item["fields"]["photo_url"][0]
                }
            else:
                # Unknown users are shown with their uid as nick name.
                portrait_dict[item['_id']] = {"nick_name": item['_id'], "photo_url": ""}

    # Counter-based orderings are applied to the task rows up front.
    counter_column = {"total": 1, "retweeted": 2, "comment": 3}.get(order)
    if counter_column is None:
        sorted_list = weibo_detail_list
    else:
        sorted_list = sorted(weibo_detail_list,
                             key=lambda row: row[counter_column], reverse=True)

    results = []
    for item in sorted_list:
        mid = item[0]
        iter_text = text_dict.get(mid, {})
        if not iter_text:
            # The weibo text was not found in the searched indices.
            continue
        uid = iter_text['uid']
        iter_portrait = portrait_dict.get(uid, {})
        if iter_portrait:
            nick_name = iter_portrait['nick_name']
            photo_url = iter_portrait['photo_url']
        else:
            nick_name, photo_url = uid, ''
        results.append([
            uid,
            nick_name,
            photo_url,
            iter_text["text"],
            iter_text["sentiment"],
            ts2date(iter_text['timestamp']),
            iter_text['geo'],
            iter_text['message_type'],
            item[2],
            item[3],
            iter_text.get('sensitive', 0),
        ])

    # Orderings that depend on the weibo content are applied afterwards.
    if results and order == "ts":
        results = sorted(results, key=lambda x: x[5], reverse=True)
    if results and order == "sensitive":
        results = sorted(results, key=lambda x: x[-1], reverse=True)
    return results
def search_sentiment_detail_all_keywords(start_ts, task_type, task_detail, time_segment, sentiment, sort_type):
    """Collect weibos, keywords and users for a keyword/sentiment time slot.

    :param start_ts: start of the time slot (unix timestamp, int or str)
    :param task_type: not used by this function
    :param task_detail: comma-separated keywords; a weibo matches when its
        text contains any of them
    :param time_segment: key into ``str2segment`` giving the slot length
    :param sentiment: sentiment label to match; ``'7'`` selects every label
        in ``SENTIMENT_SECOND``
    :param sort_type: passed through as the Elasticsearch ``sort`` value
    :return: dict with keys ``weibo``, ``in_portrait_result``,
        ``out_portrait_result`` and ``keywords``
    """
    results = {}
    must_query_list = []

    # step0: build the keyword clause (any keyword may match)
    query_keywords = task_detail.split(',')
    print('keywords_list: %s' % (query_keywords,))
    keyword_nest_body_list = [
        {'wildcard': {'text': '*' + keyword + '*'}} for keyword in query_keywords
    ]
    must_query_list.append({'bool': {'should': keyword_nest_body_list}})

    # step1: get weibo from flow_text
    start_ts = int(start_ts)
    start_date = ts2datetime(start_ts)
    end_ts = start_ts + str2segment[time_segment]
    if sentiment == '7':
        query_sentiment_list = SENTIMENT_SECOND
    else:
        query_sentiment_list = [sentiment]
    must_query_list.append({'range': {'timestamp': {'gte': start_ts, 'lt': end_ts}}})
    must_query_list.append({'terms': {'sentiment': query_sentiment_list}})
    query_body = {
        'query': {
            'bool': {
                'must': must_query_list
            }
        },
        'size': SENTIMENT_MAX_TEXT,
        'sort': sort_type
    }
    flow_text_index_name = flow_text_index_name_pre + start_date
    print('flow_text_index_name: %s' % (flow_text_index_name,))
    try:
        flow_text_result = es_flow_text.search(
            index=flow_text_index_name, doc_type=flow_text_index_type,
            body=query_body)['hits']['hits']
    except Exception:
        # Best effort: e.g. the daily index does not exist.
        flow_text_result = []
    print('flow_text_result: %d' % len(flow_text_result))
    print('show weibo list')
    show_weibo_list, user_set = deal_show_weibo_list(flow_text_result)
    print('get keyword')

    # step2: get keywords from flow_text
    keyword_query_dict = {
        'query': {
            'bool': {
                'must': must_query_list
            }
        },
        'aggs': {
            'all_interests': {
                'terms': {
                    'field': 'keywords_string',
                    'size': SENTIMENT_MAX_KEYWORDS
                }
            }
        }
    }
    try:
        keyword_buckets = es_flow_text.search(
            index=flow_text_index_name, doc_type=flow_text_index_type,
            body=keyword_query_dict)['aggregations']['all_interests']['buckets']
    except Exception:
        # Same best-effort policy as the hit search above, so a missing
        # index yields an empty keyword list instead of an exception.
        keyword_buckets = []
    keyword_counts = [[bucket['key'], bucket['doc_count']] for bucket in keyword_buckets]

    # step3: get user information
    filter_type = 'in-out'
    in_portrait_result, out_portrait_result = identify_user_portrait(user_set, filter_type)

    # step4: add uname to show weibo list
    show_weibo_list = add_uname2weibo(show_weibo_list, in_portrait_result, out_portrait_result)

    # step5: results
    results['weibo'] = show_weibo_list
    results['in_portrait_result'] = sorted(
        in_portrait_result.items(), key=lambda x: x[1][1], reverse=True)[:SENTIMENT_MAX_USER]
    results['out_portrait_result'] = sorted(
        out_portrait_result.items(), key=lambda x: x[1][3], reverse=True)[:SENTIMENT_MAX_USER]
    results['keywords'] = keyword_counts
    return results
def get_origin_weibo_detail(ts, user, task_name, size, order, message_type=1):
    """Return the weibos recorded for a sensing task, with duplicates grouped.

    The task document stored under ``ts`` lists weibo mids with their
    counters; texts are fetched from the flow-text indices of the day of
    ``ts`` and the day before and joined with the authors' profiles.  Rows
    are ranked by (sensitive score, topic value, retweet count), descending,
    and weibos marked as duplicates of each other are merged into the group
    of the best ranked one.

    :param ts: timestamp identifying the task document
    :param user: task owner
    :param task_name: task name (combined with ``user`` into the doc type)
    :param size: not used by this function
        NOTE(review): presumably meant to limit the result - confirm.
    :param order: ``"total"``, ``"retweeted"`` or ``"comment"`` pre-sort the
        task rows by that counter before the final ranking
    :param message_type: 1 reads ``origin_weibo_detail``, 2 reads
        ``retweeted_weibo_detail``, anything else ``sensitive_weibo_detail``
    :return: list of groups; each group is a list of rows ``[uid,
        nick_name, photo_url, text, sentiment, date, geo, message_type,
        keywords_string, retweeted, comment, sensitive, timestamp,
        topic_value, mid]``
    """
    _id = user + '-' + task_name
    task_detail = es_user_portrait.get(index=index_sensing_task, doc_type=_id, id=ts)['_source']
    mid_value = json.loads(task_detail['mid_topic_value'])
    duplicate_dict = json.loads(task_detail['duplicate_dict'])

    # Invert {mid: representative} into {representative: [mids...]}.
    tmp_duplicate_dict = dict()
    for k, v in duplicate_dict.items():
        if v in tmp_duplicate_dict:
            tmp_duplicate_dict[v].append(k)
        else:
            tmp_duplicate_dict[v] = [k, v]

    if message_type == 1:
        weibo_detail = json.loads(task_detail['origin_weibo_detail'])
    elif message_type == 2:
        weibo_detail = json.loads(task_detail['retweeted_weibo_detail'])
    else:
        weibo_detail = json.loads(task_detail['sensitive_weibo_detail'])
    weibo_detail = weibo_detail or {}

    # Rows: [mid, total, retweeted, comment]
    weibo_detail_list = [
        [iter_mid, item[iter_mid], item['retweeted'], item['comment']]
        for iter_mid, item in weibo_detail.items()
    ]
    mid_list = list(weibo_detail.keys())
    print(len(mid_list))

    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "terms": {
                        "mid": mid_list
                    }
                }
            }
        },
        "size": 1000,
        "sort": {
            "timestamp": {
                "order": "desc"
            }
        }
    }

    index_list = []
    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts - DAY)
    index_name = flow_text_index_name_pre + datetime
    print(es_text)
    exist_es = es_text.indices.exists(index_name)
    print(exist_es)
    if exist_es:
        index_list.append(index_name)
    index_name_1 = flow_text_index_name_pre + datetime_1
    if es_text.indices.exists(index_name_1):
        index_list.append(index_name_1)

    if index_list and mid_list:
        search_results = es_text.search(index=index_list, doc_type=flow_text_index_type,
                                        body=query_body)["hits"]["hits"]
    else:
        search_results = []

    uid_list = []
    text_dict = dict()      # weibo content, keyed by mid (the document _id)
    portrait_dict = dict()  # author profile, keyed by uid
    for item in search_results:
        uid_list.append(item["_source"]['uid'])
        text_dict[item['_id']] = item['_source']

    if uid_list:
        portrait_result = es_profile.mget(
            index=profile_index_name,
            doc_type=profile_index_type,
            body={"ids": uid_list},
            fields=['nick_name', 'photo_url'])["docs"]
        for item in portrait_result:
            if item['found']:
                portrait_dict[item['_id']] = {
                    "nick_name": item["fields"]["nick_name"][0],
                    "photo_url": item["fields"]["photo_url"][0]
                }
            else:
                # Unknown users are shown with their uid as nick name.
                portrait_dict[item['_id']] = {
                    "nick_name": item['_id'],
                    "photo_url": ""
                }

    counter_column = {"total": 1, "retweeted": 2, "comment": 3}.get(order)
    if counter_column is None:
        sorted_list = weibo_detail_list
    else:
        sorted_list = sorted(weibo_detail_list,
                             key=lambda row: row[counter_column], reverse=True)

    results = []
    for item in sorted_list:
        mid = item[0]
        iter_text = text_dict.get(mid, {})
        if not iter_text:
            # The weibo text was not found in the searched indices.
            continue
        uid = iter_text['uid']
        iter_portrait = portrait_dict.get(uid, {})
        if iter_portrait:
            nick_name = iter_portrait['nick_name']
            photo_url = iter_portrait['photo_url']
        else:
            nick_name, photo_url = uid, ''
        if message_type == 1:
            shown_message_type = 1
        elif message_type == 2:
            shown_message_type = 3
        else:
            shown_message_type = iter_text['message_type']
        results.append([
            uid,
            nick_name,
            photo_url,
            iter_text["text"],
            iter_text["sentiment"],
            ts2date(iter_text['timestamp']),
            iter_text['geo'],
            shown_message_type,
            iter_text['keywords_string'],
            item[2],
            item[3],
            iter_text.get('sensitive', 0),
            iter_text['timestamp'],
            mid_value[mid],
            mid,
        ])

    # Rank by sensitive score (-4), topic value (-2), retweet count (-6).
    results = sorted(results, key=operator.itemgetter(-4, -2, -6), reverse=True)

    # Every row starts as its own group; remember each mid's position.
    sort_results = []
    mid_index_dict = dict()
    for position, row in enumerate(results):
        sort_results.append([row])
        mid_index_dict[row[-1]] = position

    # Merge duplicates into the best ranked member of each duplicate set.
    merged_positions = set()
    for duplicate_mids in tmp_duplicate_dict.values():
        # Membership test rather than truthiness: position 0 is valid.
        positions = sorted(set(
            mid_index_dict[mid] for mid in duplicate_mids if mid in mid_index_dict))
        if len(positions) < 2:
            continue
        keep = positions[0]
        for position in positions[1:]:
            sort_results[keep].extend(sort_results[position])
            merged_positions.add(position)
    if merged_positions:
        sort_results = [
            group for position, group in enumerate(sort_results)
            if position not in merged_positions
        ]
    return sort_results
def get_positive_weibo_detail(ts, social_sensors, keywords_list, size, sentiment_type=1):
    """Return recent weibos of a given sentiment that relate to the keywords.

    The related mids are collected with ``query_mid_list`` for the current
    and the preceding window; weibos in ``[ts - time_interval, ts)`` whose
    ``mid`` or ``root_mid`` is among them are then fetched, newest first.

    :param ts: exclusive end of the time window (unix timestamp)
    :param social_sensors: optional list of uids used when collecting mids
    :param keywords_list: keywords used when collecting mids
    :param size: maximum number of weibos to fetch (100 when falsy)
    :param sentiment_type: 0 or 1 select exactly that sentiment; any other
        value selects the sentiments "2" and "3"
    :return: list of rows ``[uid, nick_name, photo_url, text, sentiment,
        date, geo, common_keywords, message_type]``
    """
    # mids from the preceding window, then from the current window
    former_mid_list = query_mid_list(ts - time_interval, keywords_list, time_segment, social_sensors)
    current_mid_list = query_mid_list(ts, keywords_list, time_interval, social_sensors)
    mid_list = []
    mid_list.extend(former_mid_list)
    mid_list.extend(current_mid_list)

    must_clauses = [
        {"range": {"timestamp": {"gte": ts - time_interval, "lt": ts}}}
    ]
    if int(sentiment_type) == 1 or int(sentiment_type) == 0:
        must_clauses.append({"term": {"sentiment": sentiment_type}})
    else:
        # Each entry of "must" has to be a clause object, not a nested list.
        must_clauses.append({"terms": {"sentiment": ["2", "3"]}})

    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": must_clauses,
                        "should": [
                            {"terms": {"root_mid": mid_list}},
                            {"terms": {"mid": mid_list}},
                        ]
                    }
                }
            }
        },
        "sort": {"timestamp": {"order": "desc"}},
        "size": size if size else 100
    }

    # When ts and ts - time_interval fall on the same day that day's index
    # is searched; otherwise the index of the earlier day is searched.
    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts - time_interval)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    index_name_1 = flow_text_index_name_pre + datetime_1
    exist_es_1 = es_text.indices.exists(index_name_1)

    if datetime == datetime_1 and exist_es:
        search_results = es_text.search(index=index_name, doc_type=flow_text_index_type,
                                        body=query_body)["hits"]["hits"]
    elif datetime != datetime_1 and exist_es_1:
        search_results = es_text.search(index=index_name_1, doc_type=flow_text_index_type,
                                        body=query_body)["hits"]["hits"]
    else:
        search_results = []

    results = []
    uid_list = [item["_source"]['uid'] for item in search_results]
    if not uid_list:
        return results

    # Profiles are requested in hit order, so docs[i] belongs to hit i.
    portrait_result = es_profile.mget(index=profile_index_name, doc_type=profile_index_type,
                                      body={"ids": uid_list},
                                      fields=['nick_name', 'photo_url'])["docs"]
    wanted_keywords = set(keywords_list)
    for i, hit in enumerate(search_results):
        item = hit['_source']
        if portrait_result[i]['found']:
            nick_name = portrait_result[i]["fields"]["nick_name"][0]
            photo_url = portrait_result[i]["fields"]["photo_url"][0]
        else:
            nick_name = "unknown"
            photo_url = ""
        common_keywords = wanted_keywords & set(item['keywords_string'].split('&'))
        results.append([
            item['uid'],
            nick_name,
            photo_url,
            item["text"],
            item["sentiment"],
            ts2date(item['timestamp']),
            item['geo'],
            list(common_keywords),
            item['message_type'],
        ])
    return results
def _count_retweet_partners(index_dates, must_clauses, partner_field):
    """Count, per value of ``partner_field``, the weibos matching ``must_clauses``
    in the daily flow-text indices of ``index_dates`` (100 hits per day at most)."""
    partner_counts = {}
    for iter_date in index_dates:
        flow_text_index_name = flow_text_index_name_pre + iter_date
        query_body = {
            'query': {
                'bool': {
                    'must': must_clauses
                }
            },
            'size': 100
        }
        flow_text_result = es_flow_text.search(
            index=flow_text_index_name, doc_type=flow_text_index_type,
            body=query_body)['hits']['hits']
        for item in flow_text_result:
            partner_uid = item['_source'][partner_field]
            partner_counts[partner_uid] = partner_counts.get(partner_uid, 0) + 1
    return partner_counts


def search_retweet_network_keywords(task_id, uid):
    """Build a user's retweet network restricted to a task's keywords.

    The task document provides the date range (both days inclusive) and the
    "&"-separated keywords; a weibo matches when its text contains any of
    them.

    :param task_id: id of the network task document
    :param uid: user id
    :return: dict with ``retweet`` (users this user directed weibos to) and
        ``be_retweet`` (users who directed weibos to this user), each
        shaped by ``retweet_dict2results``
    """
    task_results = es_network_task.get(
        index=network_keywords_index_name,
        doc_type=network_keywords_index_type, id=task_id)['_source']
    start_ts = datetime2ts(task_results['start_date'])
    end_ts = datetime2ts(task_results['end_date'])

    # step1: the dates to search, e.g. ['2013-09-01', '2013-09-02']
    iter_query_date_list = []
    iter_date_ts = start_ts
    while iter_date_ts <= end_ts:
        iter_query_date_list.append(ts2datetime(iter_date_ts))
        iter_date_ts += DAY

    # step2: the keyword clause shared by both queries
    keywords_list = task_results['query_keywords'].split('&')
    keyword_clause = {
        'bool': {
            'should': [
                {'wildcard': {'text': '*' + keywords_item + '*'}}
                for keywords_item in keywords_list
            ]
        }
    }

    # Each direction gets its own clause list so that the uid filter of
    # one does not leak into the other.
    retweet_query = [keyword_clause, {'term': {'uid': uid}}]
    be_retweet_query = [keyword_clause, {'term': {'directed_uid': uid}}]

    network_results = {}
    # retweet: weibos written by uid, counted per target user
    retweet_counts = _count_retweet_partners(
        iter_query_date_list, retweet_query, 'directed_uid')
    network_results['retweet'] = retweet_dict2results(uid, retweet_counts)
    # be_retweet: weibos directed at uid, counted per author
    be_retweet_counts = _count_retweet_partners(
        iter_query_date_list, be_retweet_query, 'uid')
    network_results['be_retweet'] = retweet_dict2results(uid, be_retweet_counts)
    return network_results
def new_get_user_weibo(uid, sort_type):
    """Return a user's recent weibos, de-duplicated by mid and sorted.

    When ``RUN_TYPE == 1`` the seven daily flow-text indices before today
    are searched.  Otherwise the fixed test index of '2013-09-01' is
    searched and the retweet, comment and sensitive values are reported as
    0; with ``RUN_TYPE == 0`` the sort order is additionally forced to
    ``'timestamp'``.

    :param uid: user id
    :param sort_type: ``'timestamp'``, ``'retweet_count'``,
        ``'comment_count'`` or ``'sensitive'``; any other value falls back
        to ``'timestamp'``
    :return: list of rows ``[mid, uid, text, ip, city, timestamp, date,
        retweet_count, comment_count, sensitive_score, weibo_url]``,
        sorted descending by the chosen column
    """
    now_date = ts2datetime(time.time())
    # run_type
    if RUN_TYPE == 0:
        now_date = RUN_TEST_TIME
        sort_type = 'timestamp'

    # The daily indices to search.  Outside live mode there is a single
    # fixed test index, which needs to be queried only once.
    if RUN_TYPE == 1:
        now_ts = datetime2ts(now_date)
        index_dates = [ts2datetime(now_ts - i * DAY) for i in range(7, 0, -1)]
    else:
        index_dates = ['2013-09-01']

    weibo_list = []
    for iter_date in index_dates:
        index_name = flow_text_index_name_pre + iter_date
        try:
            weibo_result = es_flow_text.search(
                index=index_name, doc_type=flow_text_index_type,
                body={'query': {'filtered': {'filter': {'term': {'uid': uid}}}},
                      'size': MAX_VALUE})['hits']['hits']
        except Exception:
            # Best effort: e.g. the daily index does not exist.
            weibo_result = []
        weibo_list.extend(weibo_result)

    results = []
    mid_set = set()
    for weibo_item in weibo_list:
        source = weibo_item['_source']
        mid = source['mid']
        author_uid = source['uid']
        ip = source['ip']
        timestamp = source['timestamp']
        # run_type: the counters are only reported in live mode
        if RUN_TYPE == 1:
            retweet_count = source.get('retweeted', 0)
            comment_count = source.get('comment', 0)
            sensitive_score = source.get('sensitive', 0)
        else:
            retweet_count = 0
            comment_count = 0
            sensitive_score = 0
        city = ip2city(ip)
        if mid not in mid_set:
            results.append([
                mid, author_uid, source['text'], ip, city, timestamp,
                ts2date(timestamp), retweet_count, comment_count,
                sensitive_score, weiboinfo2url(author_uid, mid)
            ])
            mid_set.add(mid)

    # Column of each supported sort key; unknown keys sort by timestamp
    # instead of leaving the result undefined.
    sort_column = {
        'timestamp': 5,
        'retweet_count': 7,
        'comment_count': 8,
        'sensitive': 9,
    }.get(sort_type, 5)
    return sorted(results, key=lambda row: row[sort_column], reverse=True)
def influenced_detail(uid, date, style):
    """Return the texts of a user's most influential weibos of one day.

    The user's influence document holds, per weibo mid, how often it was
    retweeted or commented.  The selected counter table is ranked, reduced
    to the mids the user actually posted with the matching message type on
    that day, and the top 20 are passed to ``get_text``.

    :param uid: user id
    :param date: day as 'YYYY-MM-DD'
    :param style: 0 original weibos by retweets, 1 original weibos by
        comments, 2 retweeted weibos by retweets, anything else retweeted
        weibos by comments (converted with ``int``)
    :return: the value of ``get_text``; ``{}`` when the user's influence
        document cannot be read
    """
    date1 = str(date).replace('-', '')
    index_name = pre_index + date1
    index_text = "flow_text_" + date
    style = int(style)

    try:
        user_info = es_cluster.get(index=index_name, doc_type=influence_doctype, id=uid)["_source"]
    except Exception:
        # No influence document for this user and day.
        return {}

    # (counter table in the influence document, message_type of the weibos)
    detail_field, message_type = {
        0: ("origin_weibo_retweeted_detail", 1),
        1: ("origin_weibo_comment_detail", 1),
        2: ("retweeted_weibo_retweeted_detail", 3),
    }.get(style, ("retweeted_weibo_comment_detail", 3))

    counter_dict = json.loads(user_info[detail_field])
    ranked = sorted(counter_dict.items(), key=lambda x: x[1], reverse=True)

    # mids of the weibos of that message type the user posted on that day
    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [
                            {"term": {"message_type": message_type}},
                            {"term": {"uid": uid}}
                        ]
                    }
                }
            }
        },
        "size": 10000
    }
    hits = es.search(index=index_text, doc_type="text", body=query_body)['hits']['hits']
    posted_mids = set(item['_id'] for item in hits)

    # Build a filtered copy; removing entries from the list while
    # iterating over it would skip elements.
    ranked = [item for item in ranked if item[0] in posted_mids]

    return get_text(ranked[:20], date, user_info, style)
def influenced_people(uid, mid, influence_style, date, default_number=20):
    """Profile the users who retweeted or commented on one weibo of ``uid``.

    The weibo ``mid`` is read from the flow-text index of ``date``. If it
    carries a non-empty ``root_mid`` it is treated as a retweet and reactions
    are searched against that root weibo; otherwise against ``mid`` itself.
    The reacting users (excluding ``uid``) are looked up in the user-portrait
    index and split into users with a portrait ("in") and without ("out").

    Args:
        uid: author of the weibo.
        mid: id of the weibo in the flow-text index.
        influence_style: ``0`` (after ``int()``) selects retweets
            (message_type 3); any other value selects comments
            (message_type 2).
        date: day string appended to ``pre_text_index`` to name the index.
        default_number: maximum number of in/out users passed to
            ``get_user_url``.

    Returns:
        dict with two keys:
          * ``"influence_users"``: ``[in_portrait_url, out_portrait_url]`` as
            returned by ``get_user_url``; "in" users are ordered by their
            ``importance`` field, descending.
          * ``"influence_distribution"``: dict with ``"domian"`` (sic, the
            key is spelled this way in the output), ``"topic"``, ``"geo"``
            (top 5 ``(key, value)`` pairs each), ``"influence"`` (mean of the
            ``influence`` field over "in" users, 0 when there are none),
            ``"in_portrait_number"`` and ``"out_portrait_number"``.
    """
    index_flow_text = pre_text_index + date
    text_result = es.get(index=index_flow_text, doc_type=flow_text_index_type, id=mid)["_source"]

    # A non-empty root_mid marks the weibo as a retweet of another weibo.
    root_mid = text_result.get("root_mid", '')
    wants_retweets = int(influence_style) == 0

    # Only retweets of an original weibo are matched through root_uid; every
    # other combination is matched through directed_uid.
    if not root_mid and wants_retweets:
        owner_clause = {"term": {"root_uid": uid}}
    else:
        owner_clause = {"term": {"directed_uid": uid}}

    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [
                            owner_clause,
                            {"term": {"message_type": 3 if wants_retweets else 2}},
                            {"term": {"root_mid": root_mid if root_mid else mid}}
                        ]
                    }
                }
            }
        },
        "size": 100000
    }
    search_results = es.search(index=index_flow_text, doc_type=flow_text_index_type,
                               body=query_body, fields=["uid"], timeout=30)["hits"]["hits"]

    # Distinct reacting users; the author never counts as influenced.
    results = list(set(
        hit["fields"]["uid"][0] for hit in search_results
        if int(hit["fields"]["uid"][0]) != int(uid)
    ))

    if results:
        portrait_results = es_user_portrait.mget(
            index=user_portrait, doc_type=portrait_index_type, body={"ids": results},
            fields=["domain", "topic_string", "activity_geo_dict", "importance", "influence"])["docs"]
    else:
        portrait_results = []

    in_portrait = []    # [uid, importance] for users that have a portrait
    out_portrait = []   # uids without a portrait
    retweeted_domain = {}
    retweeted_topic = {}
    retweeted_geo = {}
    total_influence = 0
    count = 0
    for item in portrait_results:
        if not item["found"]:
            out_portrait.append(item['_id'])
            continue
        fields = item["fields"]
        count += 1
        in_portrait.append([item['_id'], fields["importance"][0]])
        temp_domain = fields["domain"][0].split('&')
        temp_topic = fields["topic_string"][0].split('&')
        # Only the keys of the last element of the decoded geo list are used.
        temp_geo = json.loads(fields["activity_geo_dict"][0])[-1].keys()
        total_influence += fields["influence"][0]
        retweeted_domain = aggregation(temp_domain, retweeted_domain)
        retweeted_topic = aggregation(temp_topic, retweeted_topic)
        retweeted_geo = aggregation(temp_geo, retweeted_geo)

    retweeted_domain = proportion(retweeted_domain)
    retweeted_topic = proportion(retweeted_topic)
    retweeted_geo = proportion(retweeted_geo)
    average_influence = total_influence / count if count else 0

    def top_five(distribution):
        # Five largest entries of a {key: value} mapping as (key, value) pairs.
        return sorted(distribution.items(), key=lambda pair: pair[1], reverse=True)[:5]

    in_portrait.sort(key=lambda pair: pair[1], reverse=True)
    in_uids = [pair[0] for pair in in_portrait]

    retweeted_results = {
        "domian": top_five(retweeted_domain),
        "topic": top_five(retweeted_topic),
        "geo": top_five(retweeted_geo),
        "influence": average_influence,
        "in_portrait_number": len(in_uids),
        "out_portrait_number": len(out_portrait),
    }
    in_portrait_url = get_user_url(in_uids[:default_number])
    out_portrait_url = get_user_url(out_portrait[:default_number])

    return {
        "influence_users": [in_portrait_url, out_portrait_url],
        "influence_distribution": retweeted_results,
    }
def influenced_user_detail(uid, date, origin_retweeted_mid, retweeted_retweeted_mid, message_type, default_number=20):
    """Summarise the users who reacted to a set of weibos of ``uid``.

    Two independent searches run against the flow-text index of ``date``:
    reactions whose ``root_mid`` is in ``origin_retweeted_mid`` and whose
    ``root_uid`` is ``uid``, and reactions whose ``root_mid`` is in
    ``retweeted_retweeted_mid`` and whose ``directed_uid`` is ``uid``. Both
    are restricted to ``message_type``. The union of reacting users (minus
    ``uid``) is looked up in the user-portrait index.

    Args:
        uid: the influencing user.
        date: day string appended to ``pre_text_index`` to name the index.
        origin_retweeted_mid: list of mids for the ``root_uid`` lookup; an
            empty list skips that lookup.
        retweeted_retweeted_mid: list of mids for the ``directed_uid``
            lookup; an empty list skips that lookup.
        message_type: value matched against the ``message_type`` field.
        default_number: maximum number of in/out users passed to
            ``get_user_url``.

    Returns:
        dict with keys ``"domian"`` (sic), ``"topic"``, ``"geo"`` (top 5
        ``(key, value)`` pairs each), ``"influence"`` (mean ``influence`` of
        users with a portrait, 0 if none), ``"in_portrait_number"``,
        ``"out_portrait_number"``, ``"in_portrait"``, ``"out_portrait"``
        (results of ``get_user_url``) and ``"total_number"``.
    """
    index_flow_text = pre_text_index + date

    def reacting_uids(root_mids, owner_field):
        # Uids of reactions to ``root_mids`` addressed to ``uid`` through
        # ``owner_field``. Every call builds its own query body so that the
        # clauses of one lookup can never leak into the other.
        if not root_mids:
            return []
        body = {
            "query": {
                "filtered": {
                    "filter": {
                        "bool": {
                            "must": [
                                {"terms": {"root_mid": root_mids}},
                                {"term": {"message_type": message_type}},
                                {"term": {owner_field: uid}}
                            ]
                        }
                    }
                }
            },
            "size": 10000
        }
        hits = es.search(index=index_flow_text, doc_type=flow_text_index_type,
                         body=body, fields=["uid"])["hits"]["hits"]
        return [hit["fields"]["uid"][0] for hit in hits]

    origin_retweeted_uid = reacting_uids(origin_retweeted_mid, "root_uid")
    retweeted_retweeted_uid = reacting_uids(retweeted_retweeted_mid, "directed_uid")

    # Distinct reacting users; the influencing user is never counted.
    retweeted_uid_list = list(set(origin_retweeted_uid + retweeted_retweeted_uid) - set([uid]))

    in_portrait = []    # [uid, importance] for users that have a portrait
    out_portrait = []   # uids without a portrait
    retweeted_domain = {}
    retweeted_topic = {}
    retweeted_geo = {}
    total_influence = 0
    count = 0
    if retweeted_uid_list:
        user_portrait_result = es_user_portrait.mget(
            index=user_portrait, doc_type=portrait_index_type, body={"ids": retweeted_uid_list},
            fields=["domain", "topic_string", "activity_geo_dict", "importance", "influence"])["docs"]
        for item in user_portrait_result:
            if not item["found"]:
                out_portrait.append(item['_id'])
                continue
            fields = item["fields"]
            count += 1
            in_portrait.append([item['_id'], fields["importance"][0]])
            temp_domain = fields["domain"][0].split('&')
            temp_topic = fields["topic_string"][0].split('&')
            # Only the keys of the last element of the decoded geo list are used.
            temp_geo = json.loads(fields["activity_geo_dict"][0])[-1].keys()
            total_influence += fields["influence"][0]
            retweeted_domain = aggregation(temp_domain, retweeted_domain)
            retweeted_topic = aggregation(temp_topic, retweeted_topic)
            retweeted_geo = aggregation(temp_geo, retweeted_geo)

    retweeted_domain = proportion(retweeted_domain)
    retweeted_topic = proportion(retweeted_topic)
    retweeted_geo = proportion(retweeted_geo)
    average_influence = total_influence / count if count else 0

    def top_five(distribution):
        # Five largest entries of a {key: value} mapping as (key, value) pairs.
        return sorted(distribution.items(), key=lambda pair: pair[1], reverse=True)[:5]

    in_portrait.sort(key=lambda pair: pair[1], reverse=True)
    in_uids = [pair[0] for pair in in_portrait]

    retweeted_results = {
        "domian": top_five(retweeted_domain),
        "topic": top_five(retweeted_topic),
        "geo": top_five(retweeted_geo),
        "influence": average_influence,
        "in_portrait_number": len(in_uids),
        "out_portrait_number": len(out_portrait),
    }
    retweeted_results["in_portrait"] = get_user_url(in_uids[:default_number])
    retweeted_results["out_portrait"] = get_user_url(out_portrait[:default_number])
    retweeted_results["total_number"] = len(in_uids) + len(out_portrait)
    return retweeted_results
def statistics_influence_people(uid, date, style):
    """Return statistics about the users influenced by ``uid`` on ``date``.

    The user's weibos are collected from the flow-text index of ``date``: the
    ids of message_type 1 weibos, and the ``root_mid`` of message_type 3
    weibos. Both lists are handed to ``influenced_user_detail``, which
    produces the actual statistics.

    Args:
        uid: the influencing user.
        date: day string; appended to ``pre_text_index`` for the text index
            and, with ``-`` removed, to ``pre_index`` for the influence index.
        style: ``0`` (after ``int()``) asks for retweeting users
            (message_type 3); any other value for commenting users
            (message_type 2).

    Returns:
        The dict returned by ``influenced_user_detail``, or ``{}`` when the
        user has no record in the influence index for that day.
    """
    index_name = pre_index + str(date).replace('-', '')
    index_flow_text = pre_text_index + date

    try:
        # Only the existence of the influence record matters here.
        es_cluster.get(index=index_name, doc_type=influence_doctype, id=uid)["_source"]
    except Exception:
        return {}

    def user_weibos(message_type):
        # Hits for weibos of ``uid`` with the given message_type; a fresh
        # query body is built per call so each search carries its own filter.
        body = {
            "query": {
                "filtered": {
                    "filter": {
                        "bool": {
                            "must": [
                                {"term": {"message_type": message_type}},
                                {"term": {"uid": uid}}
                            ]
                        }
                    }
                }
            },
            "size": 1000
        }
        return es.search(index=index_flow_text, doc_type=flow_text_index_type, body=body)["hits"]["hits"]

    origin_mid = [hit['_id'] for hit in user_weibos(1)]
    retweeted_mid = [
        hit['_source']['root_mid'] for hit in user_weibos(3)
        if hit['_source'].get('root_mid', '')
    ]

    reaction_type = 3 if int(style) == 0 else 2
    return influenced_user_detail(uid, date, origin_mid, retweeted_mid, reaction_type)
def group_user_weibo(task_name, submit_user, sort_type):
    """List recent weibos posted by the members of a user group.

    NOTE(review): this file defines ``group_user_weibo`` more than once; the
    definition that appears last in the module is the one bound at import.

    The group is read from ``es_group_result`` under the id
    ``submit_user + '-' + task_name``. For each of the seven days ending at
    the current date (``RUN_TEST_TIME`` when ``RUN_TYPE == 0``), up to 100
    weibos of the group's users are fetched, sorted by ``sort_type``
    descending. Days whose search fails contribute nothing.

    Args:
        task_name: name of the group task.
        submit_user: user who submitted the task.
        sort_type: ``'timestamp'``, ``'retweeted'`` (``'retweet'`` is
            accepted as an alias), ``'comment'`` or ``'sensitive'``. Forced
            to ``'timestamp'`` when ``RUN_TYPE == 0``.

    Returns:
        The string ``'group no exist'`` when the group cannot be read,
        otherwise a list of rows
        ``[mid, uid, uname, text, ip, city, timestamp, date, retweet_count,
        comment_count, sensitive_score, weibo_url]`` sorted descending on the
        column matching ``sort_type``. For an unrecognised ``sort_type`` the
        rows are returned in fetch order.
    """
    now_date = ts2datetime(time.time())
    if sort_type == 'retweet':
        sort_type = 'retweeted'
    # run_type
    if RUN_TYPE == 0:
        now_date = RUN_TEST_TIME
        sort_type = 'timestamp'

    # step1: get group user
    task_id = submit_user + '-' + task_name
    try:
        group_exist_result = es_group_result.get(index=group_index_name, doc_type=group_index_type,
                                                 id=task_id)['_source']
    except Exception:
        group_exist_result = {}
    if not group_exist_result:
        return 'group no exist'

    # step2: get user weibo list, oldest day first
    uid_list = group_exist_result['uid_list']
    now_ts = datetime2ts(now_date)
    search_body = {
        'query': {'filtered': {'filter': {'terms': {'uid': uid_list}}}},
        'sort': [{sort_type: {'order': 'desc'}}],
        'size': 100
    }
    weibo_hits = []
    for days_back in range(6, -1, -1):
        index_name = flow_text_index_name_pre + ts2datetime(now_ts - days_back * DAY)
        try:
            day_hits = es_flow_text.search(index=index_name, doc_type=flow_text_index_type,
                                           body=search_body)['hits']['hits']
        except Exception:
            # Best effort: a day whose search fails is simply skipped.
            day_hits = []
        if day_hits:
            weibo_hits.extend(day_hits)

    # step3: get user name
    try:
        portrait_exist_result = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type,
                                                      body={'ids': uid_list})['docs']
    except Exception:
        portrait_exist_result = []
    uid2uname_dict = {}
    for portrait_item in portrait_exist_result:
        if portrait_item['found'] == True:
            uid2uname_dict[portrait_item['_id']] = portrait_item['_source']['uname']
        else:
            uid2uname_dict[portrait_item['_id']] = 'unknown'

    weibo_list = []
    for weibo_item in weibo_hits:
        source = weibo_item['_source']
        mid = source['mid']
        uid = source['uid']
        # The name map is empty when the portrait lookup failed; fall back to
        # the same placeholder used for users without a portrait.
        uname = uid2uname_dict.get(uid, 'unknown')
        ip = source['geo']
        timestamp = source['timestamp']
        # run_type: the counters are only read when RUN_TYPE == 1
        if RUN_TYPE == 1:
            retweet_count = source.get('retweeted', 0)
            comment_count = source.get('comment', 0)
            sensitive_score = source.get('sensitive', 0)
        else:
            retweet_count = 0
            comment_count = 0
            sensitive_score = 0
        weibo_list.append([
            mid, uid, uname, source['text'], ip, ip2city(ip), timestamp, ts2date(timestamp),
            retweet_count, comment_count, sensitive_score, weiboinfo2url(uid, mid)
        ])

    # Row column that holds the value each sort_type orders by.
    sort_column = {'timestamp': 6, 'retweeted': 8, 'comment': 9, 'sensitive': 10}.get(sort_type)
    if sort_column is None:
        return weibo_list
    return sorted(weibo_list, key=lambda row: row[sort_column], reverse=True)
def search_sentiment_detail_in_topic(start_ts, task_type, task_detail, time_segment, sentiment, sort_type):
    """Collect weibos, users and keywords for a sentiment within one topic.

    NOTE(review): this file defines ``search_sentiment_detail_in_topic`` more
    than once; the definition that appears last in the module is the one
    bound at import.

    Weibos of the requested sentiment inside
    ``[start_ts, start_ts + str2segment[time_segment])`` are paged through in
    descending ``sort_type`` order. Each page is reduced to the users that
    ``identify_user_portrait_domain_topic`` keeps for the topic
    ``task_detail``. Paging stops once ``SENTIMENT_MAX_USER`` users have been
    gathered or a page comes back empty (a failed search counts as empty).

    Args:
        start_ts: window start; converted with ``int()``.
        task_type: unused by this function.
        task_detail: topic passed to ``identify_user_portrait_domain_topic``.
        time_segment: key into ``str2segment`` giving the window length.
        sentiment: ``'7'`` selects every value in ``SENTIMENT_SECOND``; any
            other value is matched as-is.
        sort_type: document field used both to sort and to page.

    Returns:
        dict with ``'weibo'`` (filtered weibo rows), ``'in_portrait_result'``
        (``(uid, info)`` pairs sorted descending by ``info[1]``) and
        ``'keywords'`` (``[keyword, doc_count]`` pairs aggregated over the
        gathered users' weibos in the window).
    """
    start_ts = int(start_ts)
    start_date = ts2datetime(start_ts)
    end_ts = start_ts + str2segment[time_segment]
    if sentiment == '7':
        query_sentiment_list = SENTIMENT_SECOND
    else:
        query_sentiment_list = [sentiment]
    flow_text_index_name = flow_text_index_name_pre + start_date

    # step1: iter get weibo and user in topic
    in_user_result = {}
    all_filter_weibo_list = []
    sort_evaluate_max = SENTIMENT_SORT_EVALUATE_MAX
    while len(in_user_result) < SENTIMENT_MAX_USER:
        query_body = {
            'query': {
                'filtered': {
                    'filter': {
                        'bool': {
                            'must': [
                                # Paging: only documents strictly below the
                                # last sort value already seen.
                                {'range': {sort_type: {'lt': sort_evaluate_max}}},
                                {'terms': {'sentiment': query_sentiment_list}},
                                {'range': {'timestamp': {'gte': start_ts, 'lt': end_ts}}}
                            ]
                        }
                    }
                }
            },
            'sort': [{sort_type: {'order': 'desc'}}],
            'size': SENTIMENT_ITER_TEXT_COUNT
        }
        try:
            flow_text_result = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type,
                                                   body=query_body)['hits']['hits']
        except Exception:
            flow_text_result = []
        if not flow_text_result:
            break

        weibo_list, user_set = deal_show_weibo_list(flow_text_result)
        # filter topic user
        in_portrait_result = identify_user_portrait_domain_topic(user_set, 'topic', task_detail)
        filter_weibo_list = filter_weibo_in(weibo_list, in_portrait_result)
        if filter_weibo_list:
            all_filter_weibo_list.extend(filter_weibo_list)
        if in_portrait_result:
            in_user_result.update(in_portrait_result)
        sort_evaluate_max = flow_text_result[-1]['_source'][sort_type]

    # step2: get keywords from flow_text
    keyword_query_dict = {
        'query': {
            'filtered': {
                'filter': {
                    'bool': {
                        'must': [
                            {'range': {'timestamp': {'gte': start_ts, 'lt': end_ts}}},
                            {'terms': {'uid': list(in_user_result)}}
                        ]
                    }
                }
            }
        },
        'aggs': {
            'all_interests': {
                'terms': {
                    'field': 'keywords_string',
                    'size': SENTIMENT_MAX_KEYWORDS
                }
            }
        }
    }
    keyword_buckets = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type,
                                          body=keyword_query_dict)['aggregations']['all_interests']['buckets']

    # step3: get results
    return {
        'weibo': all_filter_weibo_list,
        'in_portrait_result': sorted(in_user_result.items(), key=lambda x: x[1][1], reverse=True),
        'keywords': [[bucket['key'], bucket['doc_count']] for bucket in keyword_buckets],
    }
def get_retweet_weibo_detail(ts, user, task_name, size, text_type, type_value):
    """Return the latest reactions to the weibos tracked by a sensing task.

    The task document (doc type ``user + '-' + task_name``, id ``ts``) lists
    tracked weibos in its ``origin_weibo_detail`` and
    ``retweeted_weibo_detail`` JSON fields. Weibos whose ``root_mid`` is one
    of those mids and whose timestamp lies in ``[ts - time_interval, ts)``
    are searched, newest first, at most 100 of them.

    Args:
        ts: end of the time window; also the id of the task document.
        user: task owner, first part of the task doc type.
        task_name: task name, second part of the task doc type.
        size: not used by this function (the query size is fixed at 100).
        text_type: ``"message_type"`` or ``"sentiment"`` adds a filter on
            that field; any other value adds none.
        type_value: value for the extra filter. For ``"sentiment"`` a value
            of length 1 is matched with ``term``, otherwise with ``terms``.

    Returns:
        List of rows ``[uid, nick_name, photo_url, text, sentiment, date,
        geo, message_type]``; when the user has no profile, ``nick_name`` is
        the uid and ``photo_url`` is ``""``. Empty list when the selected
        index does not exist or nothing matches.
    """
    task_doc_type = user + '-' + task_name
    task_detail = es_user_portrait.get(index=index_sensing_task, doc_type=task_doc_type, id=ts)['_source']
    origin_weibo_detail = json.loads(task_detail['origin_weibo_detail'])
    retweeted_weibo_detail = json.loads(task_detail['retweeted_weibo_detail'])
    mid_list = list(origin_weibo_detail.keys()) + list(retweeted_weibo_detail.keys())

    must_clauses = [
        {"range": {"timestamp": {"gte": ts - time_interval, "lt": ts}}},
        {"terms": {"root_mid": mid_list}}
    ]
    if text_type == "message_type":
        must_clauses.append({"term": {text_type: type_value}})
    if text_type == "sentiment":
        match_kind = "term" if len(type_value) == 1 else "terms"
        must_clauses.append({match_kind: {text_type: type_value}})
    query_body = {
        "query": {"filtered": {"filter": {"bool": {"must": must_clauses}}}},
        "sort": {"timestamp": {"order": "desc"}},
        "size": 100
    }

    day_end = ts2datetime(ts)
    day_start = ts2datetime(ts - time_interval)
    index_end = flow_text_index_name_pre + day_end
    end_exists = es_text.indices.exists(index_end)
    index_start = flow_text_index_name_pre + day_start
    start_exists = es_text.indices.exists(index_start)

    # 1. query the weibos: the end day's index when the window stays within
    # one day, otherwise the start day's index
    if day_end == day_start:
        chosen_index = index_end if end_exists else None
    else:
        chosen_index = index_start if start_exists else None
    if chosen_index is None:
        return []
    search_results = es_text.search(index=chosen_index, doc_type=flow_text_index_type,
                                    body=query_body)["hits"]["hits"]
    if not search_results:
        return []

    # 2. fetch information related to the weibos
    uid_list = [hit["_source"]['uid'] for hit in search_results]
    portrait_result = es_profile.mget(index=profile_index_name, doc_type=profile_index_type,
                                      body={"ids": uid_list}, fields=['nick_name', 'photo_url'])["docs"]
    results = []
    for position, hit in enumerate(search_results):
        item = hit['_source']
        profile = portrait_result[position]
        # uid, nick_name, photo_url, text, sentiment, timestamp, geo, message_type
        row = [item['uid']]
        if profile['found']:
            row.append(profile["fields"]["nick_name"][0])
            row.append(profile["fields"]["photo_url"][0])
        else:
            row.append(item['uid'])
            row.append("")
        row.extend([
            item["text"],
            item["sentiment"],
            ts2date(item['timestamp']),
            item['geo'],
            item["message_type"],
        ])
        results.append(row)
    return results
def group_user_weibo(task_name, submit_user, sort_type):
    """List the top weibos posted by the members of a user group.

    NOTE(review): this file defines ``group_user_weibo`` more than once; the
    definition that appears last in the module is the one bound at import.

    The group is read from ``es_group_result`` under the id
    ``submit_user + '-' + task_name``. For each of the seven days before the
    current date (``RUN_TEST_TIME`` when ``RUN_TYPE == 0``), up to 100 weibos
    of the group's users are fetched with ``sort_type`` as the search sort.
    The combined hits are ordered by their ``sort_type`` field, descending,
    and cut to 100.

    Args:
        task_name: name of the group task.
        submit_user: user who submitted the task.
        sort_type: document field to sort by; forced to ``'timestamp'`` when
            ``RUN_TYPE == 0``.

    Returns:
        The string ``'group no exist'`` when the group cannot be read,
        otherwise a list of at most 100 rows
        ``[mid, uid, uname, text, ip, city, timestamp, date, retweet_count,
        comment_count, sensitive_score, weibo_url]``.
    """
    now_date = ts2datetime(time.time())
    # run_type
    if RUN_TYPE == 0:
        now_date = RUN_TEST_TIME
        sort_type = 'timestamp'

    # step1: get group user
    task_id = submit_user + '-' + task_name
    try:
        group_exist_result = es_group_result.get(index=group_index_name, doc_type=group_index_type,
                                                 id=task_id)['_source']
    except Exception:
        group_exist_result = {}
    if not group_exist_result:
        return 'group no exist'

    # step2: get user weibo list
    uid_list = group_exist_result['uid_list']
    now_ts = datetime2ts(now_date)
    search_body = {
        'query': {'filtered': {'filter': {'terms': {'uid': uid_list}}}},
        'sort': sort_type,
        'size': 100
    }
    weibo_hits = []
    for days_back in range(7, 0, -1):
        index_name = flow_text_index_name_pre + ts2datetime(now_ts - days_back * DAY)
        try:
            day_hits = es_flow_text.search(index=index_name, doc_type=flow_text_index_type,
                                           body=search_body)['hits']['hits']
        except Exception:
            # Best effort: a day whose search fails is simply skipped.
            day_hits = []
        if day_hits:
            weibo_hits.extend(day_hits)
    sort_weibo_list = sorted(weibo_hits, key=lambda hit: hit['_source'][sort_type], reverse=True)[:100]

    # step3: get user name
    try:
        portrait_exist_result = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type,
                                                      body={'ids': uid_list})['docs']
    except Exception:
        portrait_exist_result = []
    uid2uname_dict = {}
    for portrait_item in portrait_exist_result:
        if portrait_item['found'] == True:
            uid2uname_dict[portrait_item['_id']] = portrait_item['_source']['uname']
        else:
            uid2uname_dict[portrait_item['_id']] = 'unknown'

    weibo_list = []
    for weibo_item in sort_weibo_list:
        source = weibo_item['_source']
        mid = source['mid']
        uid = source['uid']
        # The name map is empty when the portrait lookup failed; fall back to
        # the same placeholder used for users without a portrait.
        uname = uid2uname_dict.get(uid, 'unknown')
        ip = source['geo']
        timestamp = source['timestamp']
        # run_type: the counters are only read when RUN_TYPE == 1; a weibo
        # lacking a counter field is reported with 0 instead of failing.
        if RUN_TYPE == 1:
            retweet_count = source.get('retweet_count', 0)
            comment_count = source.get('comment_count', 0)
            sensitive_score = source.get('sensitive_score', 0)
        else:
            retweet_count = 0
            comment_count = 0
            sensitive_score = 0
        weibo_list.append([
            mid, uid, uname, source['text'], ip, ip2city(ip), timestamp, ts2date(timestamp),
            retweet_count, comment_count, sensitive_score, weiboinfo2url(uid, mid)
        ])
    return weibo_list
def influenced_people(uid, mid, influence_style, date, default_number=20):
    """Profile the users who retweeted or commented on one weibo of ``uid``.

    NOTE(review): this file defines ``influenced_people`` more than once; the
    definition that appears last in the module is the one bound at import.

    The weibo ``mid`` is read from the flow-text index of ``date``. If it
    carries a non-empty ``root_mid`` it is treated as a retweet and reactions
    are searched against that root weibo; otherwise against ``mid`` itself.
    The reacting users (excluding ``uid``) are looked up in the user-portrait
    index and split into users with a portrait ("in") and without ("out").

    Args:
        uid: author of the weibo.
        mid: id of the weibo in the flow-text index.
        influence_style: ``0`` (after ``int()``) selects retweets
            (message_type 3); any other value selects comments
            (message_type 2).
        date: day string appended to ``pre_text_index`` to name the index.
        default_number: maximum number of in/out users passed to
            ``get_user_url``.

    Returns:
        dict with two keys:
          * ``"influence_users"``: ``[in_portrait_url, out_portrait_url]`` as
            returned by ``get_user_url``; "in" users are ordered by their
            ``importance`` field, descending.
          * ``"influence_distribution"``: dict with ``"domian"`` (sic, the
            key is spelled this way in the output), ``"topic"``, ``"geo"``
            (top 5 ``(key, value)`` pairs each), ``"influence"`` (mean of the
            ``influence`` field over "in" users, 0 when there are none),
            ``"in_portrait_number"`` and ``"out_portrait_number"``.
    """
    index_flow_text = pre_text_index + date
    text_result = es.get(index=index_flow_text, doc_type=flow_text_index_type, id=mid)["_source"]

    # A non-empty root_mid marks the weibo as a retweet of another weibo.
    root_mid = text_result.get("root_mid", '')
    wants_retweets = int(influence_style) == 0

    # Only retweets of an original weibo are matched through root_uid; every
    # other combination is matched through directed_uid.
    if not root_mid and wants_retweets:
        owner_clause = {"term": {"root_uid": uid}}
    else:
        owner_clause = {"term": {"directed_uid": uid}}

    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [
                            owner_clause,
                            {"term": {"message_type": 3 if wants_retweets else 2}},
                            {"term": {"root_mid": root_mid if root_mid else mid}}
                        ]
                    }
                }
            }
        },
        "size": 100000
    }
    search_results = es.search(index=index_flow_text, doc_type=flow_text_index_type,
                               body=query_body, fields=["uid"], timeout=30)["hits"]["hits"]

    # Distinct reacting users; the author never counts as influenced.
    results = list(set(
        hit["fields"]["uid"][0] for hit in search_results
        if int(hit["fields"]["uid"][0]) != int(uid)
    ))

    if results:
        portrait_results = es_user_portrait.mget(
            index=user_portrait, doc_type=portrait_index_type, body={"ids": results},
            fields=["domain", "topic_string", "activity_geo_dict", "importance", "influence"])["docs"]
    else:
        portrait_results = []

    in_portrait = []    # [uid, importance] for users that have a portrait
    out_portrait = []   # uids without a portrait
    retweeted_domain = {}
    retweeted_topic = {}
    retweeted_geo = {}
    total_influence = 0
    count = 0
    for item in portrait_results:
        if not item["found"]:
            out_portrait.append(item['_id'])
            continue
        fields = item["fields"]
        count += 1
        in_portrait.append([item['_id'], fields["importance"][0]])
        temp_domain = fields["domain"][0].split('&')
        temp_topic = fields["topic_string"][0].split('&')
        # Only the keys of the last element of the decoded geo list are used.
        temp_geo = json.loads(fields["activity_geo_dict"][0])[-1].keys()
        total_influence += fields["influence"][0]
        retweeted_domain = aggregation(temp_domain, retweeted_domain)
        retweeted_topic = aggregation(temp_topic, retweeted_topic)
        retweeted_geo = aggregation(temp_geo, retweeted_geo)

    retweeted_domain = proportion(retweeted_domain)
    retweeted_topic = proportion(retweeted_topic)
    retweeted_geo = proportion(retweeted_geo)
    average_influence = total_influence / count if count else 0

    def top_five(distribution):
        # Five largest entries of a {key: value} mapping as (key, value) pairs.
        return sorted(distribution.items(), key=lambda pair: pair[1], reverse=True)[:5]

    in_portrait.sort(key=lambda pair: pair[1], reverse=True)
    in_uids = [pair[0] for pair in in_portrait]

    retweeted_results = {
        "domian": top_five(retweeted_domain),
        "topic": top_five(retweeted_topic),
        "geo": top_five(retweeted_geo),
        "influence": average_influence,
        "in_portrait_number": len(in_uids),
        "out_portrait_number": len(out_portrait),
    }
    in_portrait_url = get_user_url(in_uids[:default_number])
    out_portrait_url = get_user_url(out_portrait[:default_number])

    return {
        "influence_users": [in_portrait_url, out_portrait_url],
        "influence_distribution": retweeted_results,
    }
def search_sentiment_detail_in_topic(start_ts, task_type, task_detail, time_segment, sentiment, sort_type):
    """Collect weibos, users and keywords for a sentiment within one topic.

    NOTE(review): this file defines ``search_sentiment_detail_in_topic`` more
    than once; the definition that appears last in the module is the one
    bound at import.

    Weibos of the requested sentiment inside
    ``[start_ts, start_ts + str2segment[time_segment])`` are paged through in
    descending ``sort_type`` order. Each page is reduced to the users that
    ``identify_user_portrait_domain_topic`` keeps for the topic
    ``task_detail``. Paging stops once ``SENTIMENT_MAX_USER`` users have been
    gathered or a page comes back empty (a failed search counts as empty).

    Args:
        start_ts: window start; converted with ``int()``.
        task_type: unused by this function.
        task_detail: topic passed to ``identify_user_portrait_domain_topic``.
        time_segment: key into ``str2segment`` giving the window length.
        sentiment: ``'7'`` selects every value in ``SENTIMENT_SECOND``; any
            other value is matched as-is.
        sort_type: document field used both to sort and to page.
            ``'retweet'`` is treated as ``'retweeted'``; when the window
            starts on the current day ``'timestamp'`` is used instead.

    Returns:
        dict with ``'weibo'`` (filtered weibo rows), ``'in_portrait_result'``
        (``(uid, info)`` pairs sorted descending by ``info[1]``) and
        ``'keywords'`` (``[keyword, doc_count]`` pairs aggregated over the
        gathered users' weibos in the window).
    """
    start_ts = int(start_ts)
    start_date = ts2datetime(start_ts)
    end_ts = start_ts + str2segment[time_segment]
    if sort_type == 'retweet':
        sort_type = 'retweeted'
    if start_date == ts2datetime(time.time()):
        sort_type = 'timestamp'
    if sentiment == '7':
        query_sentiment_list = SENTIMENT_SECOND
    else:
        query_sentiment_list = [sentiment]
    flow_text_index_name = flow_text_index_name_pre + start_date

    # step1: iter get weibo and user in topic
    in_user_result = {}
    all_filter_weibo_list = []
    sort_evaluate_max = SENTIMENT_SORT_EVALUATE_MAX
    while len(in_user_result) < SENTIMENT_MAX_USER:
        query_body = {
            'query': {
                'filtered': {
                    'filter': {
                        'bool': {
                            'must': [
                                # Paging: only documents strictly below the
                                # last sort value already seen.
                                {'range': {sort_type: {'lt': sort_evaluate_max}}},
                                {'terms': {'sentiment': query_sentiment_list}},
                                {'range': {'timestamp': {'gte': start_ts, 'lt': end_ts}}}
                            ]
                        }
                    }
                }
            },
            'sort': [{sort_type: {'order': 'desc'}}],
            'size': SENTIMENT_ITER_TEXT_COUNT
        }
        try:
            flow_text_result = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type,
                                                   body=query_body)['hits']['hits']
        except Exception:
            flow_text_result = []
        if not flow_text_result:
            break

        weibo_list, user_set = deal_show_weibo_list(flow_text_result)
        # filter topic user
        in_portrait_result = identify_user_portrait_domain_topic(user_set, 'topic', task_detail)
        filter_weibo_list = filter_weibo_in(weibo_list, in_portrait_result)
        if filter_weibo_list:
            all_filter_weibo_list.extend(filter_weibo_list)
        if in_portrait_result:
            in_user_result.update(in_portrait_result)
        sort_evaluate_max = flow_text_result[-1]['_source'][sort_type]

    # step2: get keywords from flow_text
    keyword_query_dict = {
        'query': {
            'filtered': {
                'filter': {
                    'bool': {
                        'must': [
                            {'range': {'timestamp': {'gte': start_ts, 'lt': end_ts}}},
                            {'terms': {'uid': list(in_user_result)}}
                        ]
                    }
                }
            }
        },
        'aggs': {
            'all_interests': {
                'terms': {
                    'field': 'keywords_string',
                    'size': SENTIMENT_MAX_KEYWORDS
                }
            }
        }
    }
    keyword_buckets = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type,
                                          body=keyword_query_dict)['aggregations']['all_interests']['buckets']

    # step3: get results
    return {
        'weibo': all_filter_weibo_list,
        'in_portrait_result': sorted(in_user_result.items(), key=lambda x: x[1][1], reverse=True),
        'keywords': [[bucket['key'], bucket['doc_count']] for bucket in keyword_buckets],
    }
def influenced_user_detail(uid, date, origin_retweeted_mid, retweeted_retweeted_mid, message_type, default_number=20):
    """Summarise the users who reacted to a set of weibos of ``uid``.

    NOTE(review): this file defines ``influenced_user_detail`` more than
    once; the definition that appears last in the module is the one bound at
    import.

    Two independent searches run against the flow-text index of ``date``:
    reactions whose ``root_mid`` is in ``origin_retweeted_mid`` and whose
    ``root_uid`` is ``uid``, and reactions whose ``root_mid`` is in
    ``retweeted_retweeted_mid`` and whose ``directed_uid`` is ``uid``. Both
    are restricted to ``message_type``. The union of reacting users (minus
    ``uid``) is looked up in the user-portrait index.

    Args:
        uid: the influencing user.
        date: day string appended to ``pre_text_index`` to name the index.
        origin_retweeted_mid: list of mids for the ``root_uid`` lookup; an
            empty list skips that lookup.
        retweeted_retweeted_mid: list of mids for the ``directed_uid``
            lookup; an empty list skips that lookup.
        message_type: value matched against the ``message_type`` field.
        default_number: maximum number of in/out users passed to
            ``get_user_url``.

    Returns:
        dict with keys ``"domian"`` (sic), ``"topic"``, ``"geo"`` (top 5
        ``(key, value)`` pairs each), ``"influence"`` (mean ``influence`` of
        users with a portrait, 0 if none), ``"in_portrait_number"``,
        ``"out_portrait_number"``, ``"in_portrait"`` and ``"out_portrait"``
        (results of ``get_user_url``).
    """
    index_flow_text = pre_text_index + date

    def reacting_uids(root_mids, owner_field):
        # Uids of reactions to any of ``root_mids`` addressed to ``uid``
        # through ``owner_field``. Every call builds its own query body so
        # that the clauses of one lookup can never leak into the other.
        if not root_mids:
            return []
        body = {
            "query": {
                "filtered": {
                    "filter": {
                        "bool": {
                            "must": [
                                {"terms": {"root_mid": list(root_mids)}},
                                {"term": {"message_type": message_type}},
                                {"term": {owner_field: uid}}
                            ]
                        }
                    }
                }
            },
            "size": 10000
        }
        hits = es.search(index=index_flow_text, doc_type=flow_text_index_type,
                         body=body, fields=["uid"])["hits"]["hits"]
        return [hit["fields"]["uid"][0] for hit in hits]

    origin_retweeted_uid = reacting_uids(origin_retweeted_mid, "root_uid")
    retweeted_retweeted_uid = reacting_uids(retweeted_retweeted_mid, "directed_uid")

    # Distinct reacting users; the influencing user is never counted.
    retweeted_uid_list = list(set(origin_retweeted_uid + retweeted_retweeted_uid) - set([uid]))

    in_portrait = []    # [uid, importance] for users that have a portrait
    out_portrait = []   # uids without a portrait
    retweeted_domain = {}
    retweeted_topic = {}
    retweeted_geo = {}
    total_influence = 0
    count = 0
    if retweeted_uid_list:
        user_portrait_result = es_user_portrait.mget(
            index=user_portrait, doc_type=portrait_index_type, body={"ids": retweeted_uid_list},
            fields=["domain", "topic_string", "activity_geo_dict", "importance", "influence"])["docs"]
        for item in user_portrait_result:
            if not item["found"]:
                out_portrait.append(item['_id'])
                continue
            fields = item["fields"]
            count += 1
            in_portrait.append([item['_id'], fields["importance"][0]])
            temp_domain = fields["domain"][0].split('&')
            temp_topic = fields["topic_string"][0].split('&')
            # Only the keys of the last element of the decoded geo list are used.
            temp_geo = json.loads(fields["activity_geo_dict"][0])[-1].keys()
            total_influence += fields["influence"][0]
            retweeted_domain = aggregation(temp_domain, retweeted_domain)
            retweeted_topic = aggregation(temp_topic, retweeted_topic)
            retweeted_geo = aggregation(temp_geo, retweeted_geo)

    retweeted_domain = proportion(retweeted_domain)
    retweeted_topic = proportion(retweeted_topic)
    retweeted_geo = proportion(retweeted_geo)
    average_influence = total_influence / count if count else 0

    def top_five(distribution):
        # Five largest entries of a {key: value} mapping as (key, value) pairs.
        return sorted(distribution.items(), key=lambda pair: pair[1], reverse=True)[:5]

    in_portrait.sort(key=lambda pair: pair[1], reverse=True)
    in_uids = [pair[0] for pair in in_portrait]

    retweeted_results = {
        "domian": top_five(retweeted_domain),
        "topic": top_five(retweeted_topic),
        "geo": top_five(retweeted_geo),
        "influence": average_influence,
        "in_portrait_number": len(in_uids),
        "out_portrait_number": len(out_portrait),
    }
    retweeted_results["in_portrait"] = get_user_url(in_uids[:default_number])
    retweeted_results["out_portrait"] = get_user_url(out_portrait[:default_number])
    return retweeted_results
def search_sentiment_detail_all_keywords(start_ts, task_type, task_detail, time_segment, sentiment, sort_type):
    """Return weibo, users and hot keywords for a keyword + sentiment query.

    Args:
        start_ts: start of the time segment in epoch seconds (int or
            numeric string).
        task_type: not used by this function; kept so callers do not break.
        task_detail: comma-separated keywords. A weibo matches when its text
            contains at least one of them (wildcard match).
        time_segment: key into ``str2segment`` giving the segment length.
        sentiment: sentiment label to match; '7' expands to the whole
            ``SENTIMENT_SECOND`` list.
        sort_type: field used to rank weibo ('retweet' is accepted as an
            alias of 'retweeted'). Forced to 'timestamp' when the segment
            starts today.

    Returns:
        dict with the keys 'weibo', 'in_portrait_result',
        'out_portrait_result' and 'keywords'.
    """
    results = {}
    if sort_type == 'retweet':
        sort_type = 'retweeted'
    start_ts = int(start_ts)
    start_date = ts2datetime(start_ts)
    # A segment that starts today is always ranked by timestamp.
    # NOTE(review): presumably because the other ranking fields are not
    # final for the current day yet -- confirm.
    if start_date == ts2datetime(time.time()):
        sort_type = 'timestamp'

    # step0: one wildcard clause per keyword, OR-ed together
    keyword_nest_body_list = [
        {'wildcard': {'text': '*' + keyword + '*'}}
        for keyword in task_detail.split(',')
    ]
    keyword_filter = {'bool': {'should': keyword_nest_body_list}}

    # step1: get weibo from flow_text
    end_ts = start_ts + str2segment[time_segment]
    if sentiment == '7':
        query_sentiment_list = SENTIMENT_SECOND
    else:
        query_sentiment_list = [sentiment]
    time_filter = {'range': {'timestamp': {'gte': start_ts, 'lt': end_ts}}}
    sentiment_filter = {'terms': {'sentiment': query_sentiment_list}}
    must_query_list = [keyword_filter, time_filter, sentiment_filter]

    # Defined before the loop so that step 2 can always use it, even when
    # the loop body never runs.
    flow_text_index_name = flow_text_index_name_pre + start_date

    in_user_result = {}
    out_user_result = {}
    all_filter_weibo_list = []
    sort_evaluate_max = SENTIMENT_SORT_EVALUATE_MAX
    # Page downwards through the ranking until enough in-portrait users have
    # been collected or the index is exhausted. Each round only asks for
    # weibo whose sort value is strictly below the last one seen, so the
    # bound decreases every round.
    while len(in_user_result) < SENTIMENT_MAX_USER:
        query_body = {
            'query': {
                'bool': {
                    'must': [
                        {'range': {sort_type: {'lt': sort_evaluate_max}}},
                        time_filter,
                        sentiment_filter,
                        keyword_filter
                    ]
                }
            },
            'size': SENTIMENT_ITER_TEXT_COUNT,
            'sort': [{sort_type: {'order': 'desc'}}]
        }
        try:
            flow_text_result = es_flow_text.search(
                index=flow_text_index_name, doc_type=flow_text_index_type,
                body=query_body)['hits']['hits']
        except Exception:
            # Best effort: a failed search is treated as "no more results".
            flow_text_result = []
        if not flow_text_result:
            break
        show_weibo_list, user_set = deal_show_weibo_list(flow_text_result)
        filter_type = 'in-out'
        in_portrait_result, out_portrait_result = identify_user_portrait(user_set, filter_type)
        if len(all_filter_weibo_list) <= SENTIMENT_MAX_TEXT and show_weibo_list:
            all_filter_weibo_list.extend(show_weibo_list)
        if in_portrait_result:
            in_user_result.update(in_portrait_result)
        if out_portrait_result:
            out_user_result.update(out_portrait_result)
        sort_evaluate_max = flow_text_result[-1]['_source'][sort_type]

    # step2: get keywords from flow_text
    keyword_query_dict = {
        'query': {
            'bool': {
                'must': must_query_list
            }
        },
        'aggs': {
            'all_interests': {
                'terms': {
                    'field': 'keywords_string',
                    'size': SENTIMENT_MAX_KEYWORDS
                }
            }
        }
    }
    keyword_buckets = es_flow_text.search(
        index=flow_text_index_name, doc_type=flow_text_index_type,
        body=keyword_query_dict)['aggregations']['all_interests']['buckets']
    keywords_list = [[bucket['key'], bucket['doc_count']] for bucket in keyword_buckets]

    # step3/4: attach user names to the collected weibo
    show_weibo_list = add_uname2weibo(all_filter_weibo_list, in_user_result, out_user_result)

    # step5: results
    # NOTE(review): the positions 1 and 3 used as sort keys depend on the
    # value layout produced by identify_user_portrait -- confirm there.
    results['weibo'] = show_weibo_list
    results['in_portrait_result'] = sorted(
        in_user_result.items(), key=lambda x: x[1][1], reverse=True)[:SENTIMENT_MAX_USER]
    results['out_portrait_result'] = sorted(
        out_user_result.items(), key=lambda x: x[1][3], reverse=True)[:SENTIMENT_MAX_USER]
    results['keywords'] = keywords_list
    return results
def aggregation_hot_keywords(start_time, stop_time, keywords_list):
    """Count the keywords co-occurring with ``keywords_list`` in a time range.

    The same terms aggregation is run on every daily flow_text index from
    the day of ``stop_time`` back to the day of ``start_time``; per-day
    counts are summed.

    Args:
        start_time: range start in epoch seconds (anything ``int()`` accepts).
        stop_time: range end (exclusive) in epoch seconds.
        keywords_list: keywords a weibo must contain (``keywords_string``).

    Returns:
        list of ``(keyword, count)`` tuples sorted by count descending,
        truncated to ``AGGRAGATION_KEYWORDS_NUMBER`` entries.
    """
    start_time = int(start_time)
    stop_time = int(stop_time)
    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [
                            {"terms": {"keywords_string": keywords_list}},
                            {"range": {
                                "timestamp": {
                                    "gte": start_time,
                                    "lt": stop_time
                                }
                            }}
                        ]
                    }
                }
            }
        },
        "aggs": {
            "all_keywords": {
                "terms": {"field": "keywords_string", "size": PRE_AGGREGATION_NUMBER}
            }
        }
    }

    keywords_dict = {}
    start_date = ts2datetime(float(start_time))
    ts = float(stop_time)
    while True:
        current_date = ts2datetime(ts)
        index_name = flow_text_index_name_pre + current_date
        if es_text.indices.exists(index_name):
            buckets = es_text.search(
                index=index_name, doc_type=flow_text_index_type,
                body=query_body)["aggregations"]['all_keywords']['buckets']
            for bucket in buckets or []:
                keyword = bucket['key']
                keywords_dict[keyword] = keywords_dict.get(keyword, 0) + bucket['doc_count']
        # Stop on the start day. The numeric bound guarantees termination
        # when the start day is never matched (e.g. start_time is later
        # than stop_time), instead of walking back through the past forever.
        if current_date == start_date or ts <= start_time:
            break
        ts -= day_time

    return sorted(keywords_dict.items(), key=lambda x: x[1],
                  reverse=True)[:AGGRAGATION_KEYWORDS_NUMBER]
def get_origin_weibo_detail(ts, user, task_name, size, order, message_type=1):
    """Return the weibo recorded for a sensing task, grouped by duplicates.

    Args:
        ts: task timestamp; used as the document id of the task record and
            to pick the flow_text indices (that day and the day before).
        user: task owner; ``user + '-' + task_name`` is the task doc_type.
        task_name: name of the sensing task.
        size: not used by this function; kept so callers do not break.
        order: 'total', 'retweeted' or 'comment' pre-sorts the weibo by that
            counter. Because the final sort is stable, this only decides the
            order of rows that tie on the final sort key.
        message_type: 1 reads 'origin_weibo_detail', 2 reads
            'retweeted_weibo_detail', anything else 'sensitive_weibo_detail'.

    Returns:
        list of groups. Each group is a list of rows; the first row is the
        best-ranked weibo of the group and the following rows are its
        duplicates. A row is
        ``[uid, nick_name, photo_url, text, sentiment, date, geo,
        message_type, retweeted, comment, sensitive, timestamp, mid_value,
        mid]``.
    """
    _id = user + '-' + task_name
    task_detail = es_user_portrait.get(index=index_sensing_task, doc_type=_id, id=ts)['_source']
    mid_value = json.loads(task_detail['mid_topic_value'])
    duplicate_dict = json.loads(task_detail['duplicate_dict']) or {}

    # duplicate_dict maps a mid to the mid it duplicates; invert it into
    # {representative mid: [representative, duplicate, ...]}.
    duplicate_groups = {}
    for dup_mid, root_mid in duplicate_dict.items():
        duplicate_groups.setdefault(root_mid, [root_mid]).append(dup_mid)

    if message_type == 1:
        detail_field = 'origin_weibo_detail'
    elif message_type == 2:
        detail_field = 'retweeted_weibo_detail'
    else:
        detail_field = 'sensitive_weibo_detail'
    weibo_detail = json.loads(task_detail[detail_field]) or {}

    # [mid, value stored under the weibo's own mid, retweeted, comment]
    # NOTE(review): item[mid] is presumably the total count -- confirm.
    weibo_detail_list = [
        [mid, item[mid], item['retweeted'], item['comment']]
        for mid, item in weibo_detail.items()
    ]
    mid_list = list(weibo_detail.keys())
    if not mid_list:
        return []

    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "terms": {"mid": mid_list}
                }
            }
        },
        "size": 1000,
        "sort": {"timestamp": {"order": "desc"}}
    }

    # The weibo may live in the index of the task day or of the day before.
    index_list = []
    for day_ts in (ts, ts - DAY):
        index_name = flow_text_index_name_pre + ts2datetime(day_ts)
        if es_text.indices.exists(index_name):
            index_list.append(index_name)
    if not index_list:
        return []

    search_results = es_text.search(index=index_list, doc_type=flow_text_index_type,
                                    body=query_body)["hits"]["hits"]
    if not search_results:
        return []

    uid_list = []
    text_dict = dict()      # mid -> weibo _source
    portrait_dict = dict()  # uid -> {"nick_name": ..., "photo_url": ...}
    for hit in search_results:
        uid_list.append(hit["_source"]['uid'])
        text_dict[hit['_id']] = hit['_source']  # _id is the mid

    portrait_result = es_profile.mget(index=profile_index_name, doc_type=profile_index_type,
                                      body={"ids": uid_list},
                                      fields=['nick_name', 'photo_url'])["docs"]
    for doc in portrait_result:
        if doc['found']:
            portrait_dict[doc['_id']] = {"nick_name": doc["fields"]["nick_name"][0],
                                         "photo_url": doc["fields"]["photo_url"][0]}
        else:
            # Unknown profile: fall back to the uid as display name.
            portrait_dict[doc['_id']] = {"nick_name": doc['_id'], "photo_url": ""}

    if order == "total":
        sorted_list = sorted(weibo_detail_list, key=lambda x: x[1], reverse=True)
    elif order == "retweeted":
        sorted_list = sorted(weibo_detail_list, key=lambda x: x[2], reverse=True)
    elif order == "comment":
        sorted_list = sorted(weibo_detail_list, key=lambda x: x[3], reverse=True)
    else:
        sorted_list = weibo_detail_list

    results = []
    for item in sorted_list:
        mid = item[0]
        iter_text = text_dict.get(mid, {})
        if not iter_text:
            # The weibo text was not found in the searched indices.
            continue
        uid = iter_text['uid']
        temp = [uid]
        iter_portrait = portrait_dict.get(uid, {})
        if iter_portrait:
            temp.append(iter_portrait['nick_name'])
            temp.append(iter_portrait['photo_url'])
        else:
            temp.extend([uid, ''])
        temp.append(iter_text["text"])
        temp.append(iter_text["sentiment"])
        temp.append(ts2date(iter_text['timestamp']))
        temp.append(iter_text['geo'])
        if message_type == 1:
            temp.append(1)
        elif message_type == 2:
            temp.append(3)
        else:
            temp.append(iter_text['message_type'])
        temp.append(item[2])
        temp.append(item[3])
        temp.append(iter_text.get('sensitive', 0))
        temp.append(iter_text['timestamp'])
        temp.append(mid_value[mid])
        temp.append(mid)
        results.append(temp)

    # Final ranking: sensitive (-4), then mid_value (-2), then retweeted (-6).
    results = sorted(results, key=operator.itemgetter(-4, -2, -6), reverse=True)

    sort_results = []
    mid_index_dict = dict()  # mid -> position in sort_results
    for position, row in enumerate(results):
        sort_results.append([row])
        mid_index_dict[row[-1]] = position

    # Fold every duplicate into the best-ranked weibo of its group.
    merged_away = set()
    for group in duplicate_groups.values():
        # Membership test (not truthiness) so that position 0 counts; the
        # set() drops a mid that is listed twice in the same group.
        positions = sorted(set(
            mid_index_dict[mid] for mid in group
            if mid in mid_index_dict and mid_index_dict[mid] not in merged_away
        ))
        if len(positions) < 2:
            continue
        keep = positions[0]
        for position in positions[1:]:
            sort_results[keep].extend(sort_results[position])
            merged_away.add(position)

    # Drop merged groups by position rather than by list equality.
    return [group for position, group in enumerate(sort_results)
            if position not in merged_away]
def get_retweet_weibo_detail(ts, user, task_name, size, text_type, type_value):
    """Return recent weibo that derive from the weibo of a sensing task.

    Looks up the task record, collects the mids of its origin and retweeted
    weibo, and searches flow_text for weibo whose ``root_mid`` is one of
    them within ``[ts - time_interval, ts)``.

    Args:
        ts: task timestamp; document id of the task record and end of the
            search window.
        user: task owner; ``user + '-' + task_name`` is the task doc_type.
        task_name: name of the sensing task.
        size: not used by this function (the query is capped at 100 hits).
        text_type: "message_type" or "sentiment" adds a filter on that
            field; any other value adds no extra filter.
        type_value: value for the extra filter. For "sentiment" a list or
            tuple matches any of its values, a scalar matches exactly.

    Returns:
        list of rows ``[uid, nick_name, photo_url, text, sentiment, date,
        geo, message_type]``, newest first.
    """
    _id = user + '-' + task_name
    task_detail = es_user_portrait.get(index=index_sensing_task, doc_type=_id, id=ts)['_source']
    origin_weibo_detail = json.loads(task_detail['origin_weibo_detail'])
    retweeted_weibo_detail = json.loads(task_detail['retweeted_weibo_detail'])

    mid_list = []
    mid_list.extend(origin_weibo_detail.keys())
    mid_list.extend(retweeted_weibo_detail.keys())
    if not mid_list:
        return []

    must_filters = [
        {"range": {
            "timestamp": {
                "gte": ts - time_interval,
                "lt": ts
            }
        }},
        {"terms": {"root_mid": mid_list}}
    ]
    if text_type == "message_type":
        must_filters.append({"term": {text_type: type_value}})
    if text_type == "sentiment":
        # Decide on the type, not on len(): a one-element list still needs
        # "terms", and a multi-character string still needs "term".
        if isinstance(type_value, (list, tuple)):
            must_filters.append({"terms": {text_type: list(type_value)}})
        else:
            must_filters.append({"term": {text_type: type_value}})

    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": must_filters
                    }
                }
            }
        },
        "sort": {"timestamp": {"order": "desc"}},
        "size": 100
    }

    # 1. Search the weibo. The window may cross midnight, so search every
    # existing daily index it touches; the timestamp range filter keeps the
    # result restricted to the window.
    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts - time_interval)
    index_list = []
    for date_str in (datetime, datetime_1):
        index_name = flow_text_index_name_pre + date_str
        if index_name not in index_list and es_text.indices.exists(index_name):
            index_list.append(index_name)
    if not index_list:
        return []
    search_results = es_text.search(index=index_list, doc_type=flow_text_index_type,
                                    body=query_body)["hits"]["hits"]
    if not search_results:
        return []

    # 2. Fetch the profile of every author. mget answers in request order,
    # so docs line up with search_results one to one.
    uid_list = [hit["_source"]['uid'] for hit in search_results]
    portrait_result = es_profile.mget(
        index=profile_index_name, doc_type=profile_index_type,
        body={"ids": uid_list}, fields=['nick_name', 'photo_url'])["docs"]

    results = []
    for hit, profile in zip(search_results, portrait_result):
        item = hit['_source']
        temp = [item['uid']]
        if profile['found']:
            temp.append(profile["fields"]["nick_name"][0])
            temp.append(profile["fields"]["photo_url"][0])
        else:
            # Unknown profile: fall back to the uid as display name.
            temp.append(item['uid'])
            temp.append("")
        temp.append(item["text"])
        temp.append(item["sentiment"])
        temp.append(ts2date(item['timestamp']))
        temp.append(item['geo'])
        temp.append(item["message_type"])
        results.append(temp)
    return results
def influenced_detail(uid, date, style):
    """Return the text detail of a user's 20 most influential weibo of a day.

    Args:
        uid: user id whose weibo are ranked.
        date: day string appended to "flow_text_" to form the index name.
        style: ranking mode (anything ``int()`` accepts):
            0 - original weibo ranked by retweeted count,
            1 - original weibo ranked by comment count,
            2 - retweeted weibo ranked by retweeted count,
            any other value - retweeted weibo ranked by comment count.

    Returns:
        whatever ``get_text`` returns for the top 20 ``[mid, retweeted,
        comment]`` entries.
    """
    style = int(style)
    index_text = "flow_text_" + date

    # Only the message type that the requested style ranks is queried.
    message_type = 1 if style in (0, 1) else 3
    # Column of [mid, retweeted, comment] used as ranking key.
    rank_column = 1 if style in (0, 2) else 2

    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [
                            {"term": {"message_type": message_type}},
                            {"term": {"uid": uid}}
                        ]
                    }
                }
            }
        },
        "size": 10000
    }
    hits = es.search(index=index_text, doc_type="text", body=query_body)['hits']['hits']

    # Missing counters are treated as 0.
    candidates = [
        [hit['_id'], hit['_source'].get("retweeted", 0), hit['_source'].get("comment", 0)]
        for hit in hits
    ]
    sorted_list = sorted(candidates, key=lambda x: x[rank_column], reverse=True)
    return get_text(sorted_list[:20], date, style)
def get_sensitive_text_detail(task_name, ts, user, order):
    """Return the sensitive weibo recorded for a sensing task.

    Args:
        task_name: name of the sensing task.
        ts: task timestamp; document id of the task record, also used to
            pick the flow_text indices (that day and the day before).
        user: task owner; ``user + '-' + task_name`` is the task doc_type.
        order: 'total', 'retweeted' or 'comment' sorts by that counter from
            the task record; 'ts' sorts by the formatted date column;
            'sensitive' sorts by the sensitive value. All descending. Any
            other value keeps the task record's order.

    Returns:
        list of rows ``[uid, nick_name, photo_url, text, sentiment, date,
        geo, message_type, retweeted, comment, sensitive]``.
    """
    _id = user + '-' + task_name
    task_detail = es.get(index=index_sensing_task, doc_type=_id, id=ts)['_source']
    weibo_detail = json.loads(task_detail['sensitive_weibo_detail']) or {}

    # [mid, value stored under the weibo's own mid, retweeted, comment]
    # NOTE(review): item[mid] is presumably the total count -- confirm.
    weibo_detail_list = [
        [mid, item[mid], item['retweeted'], item['comment']]
        for mid, item in weibo_detail.items()
    ]
    mid_list = list(weibo_detail.keys())
    if not mid_list:
        return []

    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "terms": {"mid": mid_list}
                }
            }
        },
        # Without an explicit size Elasticsearch only returns 10 hits; use
        # the same cap as get_origin_weibo_detail.
        "size": 1000
    }

    # The weibo may live in the index of the task day or of the day before.
    index_list = []
    for day_ts in (ts, ts - DAY):
        index_name = flow_text_index_name_pre + ts2datetime(day_ts)
        if es_text.indices.exists(index_name):
            index_list.append(index_name)
    if not index_list:
        return []

    search_results = es_text.search(index=index_list, doc_type=flow_text_index_type,
                                    body=query_body)["hits"]["hits"]
    if not search_results:
        return []

    uid_list = []
    text_dict = dict()      # mid -> weibo _source
    portrait_dict = dict()  # uid -> {"nick_name": ..., "photo_url": ...}
    for hit in search_results:
        uid_list.append(hit["_source"]['uid'])
        text_dict[hit['_id']] = hit['_source']  # _id is the mid

    portrait_result = es_profile.mget(index=profile_index_name, doc_type=profile_index_type,
                                      body={"ids": uid_list},
                                      fields=['nick_name', 'photo_url'])["docs"]
    for doc in portrait_result:
        if doc['found']:
            portrait_dict[doc['_id']] = {"nick_name": doc["fields"]["nick_name"][0],
                                         "photo_url": doc["fields"]["photo_url"][0]}
        else:
            # Unknown profile: fall back to the uid as display name.
            portrait_dict[doc['_id']] = {"nick_name": doc['_id'], "photo_url": ""}

    if order == "total":
        sorted_list = sorted(weibo_detail_list, key=lambda x: x[1], reverse=True)
    elif order == "retweeted":
        sorted_list = sorted(weibo_detail_list, key=lambda x: x[2], reverse=True)
    elif order == "comment":
        sorted_list = sorted(weibo_detail_list, key=lambda x: x[3], reverse=True)
    else:
        sorted_list = weibo_detail_list

    results = []
    for item in sorted_list:
        mid = item[0]
        iter_text = text_dict.get(mid, {})
        if not iter_text:
            # The weibo text was not found in the searched indices.
            continue
        uid = iter_text['uid']
        temp = [uid]
        iter_portrait = portrait_dict.get(uid, {})
        if iter_portrait:
            temp.append(iter_portrait['nick_name'])
            temp.append(iter_portrait['photo_url'])
        else:
            temp.extend([uid, ''])
        temp.append(iter_text["text"])
        temp.append(iter_text["sentiment"])
        temp.append(ts2date(iter_text['timestamp']))
        temp.append(iter_text['geo'])
        temp.append(iter_text['message_type'])
        temp.append(item[2])
        temp.append(item[3])
        temp.append(iter_text.get('sensitive', 0))
        results.append(temp)

    if results and order == "ts":
        # Column 5 is the date produced by ts2date.
        # NOTE(review): this orders chronologically only if ts2date yields a
        # lexicographically sortable format -- confirm.
        results = sorted(results, key=lambda x: x[5], reverse=True)
    if results and order == "sensitive":
        results = sorted(results, key=lambda x: x[-1], reverse=True)
    return results