def deal_show_weibo_list(flow_text_result):
    show_weibo_list = []
    user_set = set()
    for weibo_item in flow_text_result:
        source = weibo_item['_source']
        mid = source['mid']
        uid = source['uid']
        text = source['text']
        geo = source['geo']
        timestamp = source['timestamp']
        date = ts2date(timestamp)
        weibo_url = weiboinfo2url(uid, mid)
        if RUN_TYPE == 1:
            try:
                retweet_count = source['retweeted']
            except:
                retweet_count = 0
            try:
                comment_count = source['comment']
            except:
                comment_count = 0
            try:
                sensitive_score = source['sensitive']
            except:
                sensitive_score = 0
        else:
            retweet_count = 0
            comment_count = 0
            sensitive_score = 0
        show_weibo_list.append([mid, uid, text, geo, timestamp, date, retweet_count,
                                comment_count, sensitive_score, weibo_url])
        user_set.add(uid)
    return show_weibo_list, user_set

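# These functions lean on a handful of helpers defined elsewhere in the project
# (ts2date, ts2datetime, datetime2ts, weiboinfo2url). A minimal sketch of the
# behaviour the call sites assume; the exact output formats and the weibo URL
# scheme are assumptions, not the project's actual implementations.
import time

def ts2datetime(ts):
    # unix timestamp -> 'YYYY-MM-DD' (used to build daily index names)
    return time.strftime('%Y-%m-%d', time.localtime(int(ts)))

def datetime2ts(date_str):
    # 'YYYY-MM-DD' -> unix timestamp at local midnight (inverse of ts2datetime)
    return int(time.mktime(time.strptime(date_str, '%Y-%m-%d')))

def ts2date(ts):
    # unix timestamp -> human-readable 'YYYY-MM-DD HH:MM:SS'
    return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(int(ts)))

def weiboinfo2url(uid, mid):
    # assumed scheme: profile page plus the weibo mid; the real project may
    # convert mid into weibo.com's base62 URL form
    return 'http://weibo.com/%s/%s' % (uid, mid)
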
def get_text(top_list, date, style):
    # input: [[mid1, no.1], [mid2, no.2], [mid3, no.3]]
    # output: [[text1, no.1], [text2, no.2], [text3, no.3]]
    # mid, retweeted, comment, text, geo, timestamp, sentiment, mid_url
    results = []
    index_flow_text = pre_text_index + date
    if len(top_list) != 0:
        mid_list = []
        for item in top_list:
            mid_list.append(item[0])
        search_result = es.mget(index=index_flow_text, doc_type=flow_text_index_type,
                                body={'ids': mid_list})['docs']
        for i in range(len(top_list)):
            temp = []
            temp.extend(top_list[i])
            if search_result[i]['found']:
                source = search_result[i]['_source']
                temp.append(source['text'])
                temp.append(source['geo'])
                temp.append(ts2date(source['timestamp']))
                temp.append(source['sentiment'])
                temp.append(weiboinfo2url(source['uid'], source['mid']))
                temp.append(uid_url + source['uid'])
                temp.append(source['uid'])
                try:
                    uname = es_profile.get(index=profile_index_name, doc_type=profile_index_type,
                                           id=source['uid'])['_source']['nick_name']
                    temp.append(uname)
                except:
                    temp.append('unknown')
            else:
                temp.extend(['', '', '', '', '', '', '', ''])
            results.append(temp)
    return results

def get_network_task(submit_user):
    results = []
    #step1: query body
    query_body = {
        'query': {
            'filtered': {
                'filter': {
                    'term': {'submit_user': submit_user}
                }
            }
        },
        'sort': [{'submit_ts': {'order': 'desc'}}],
        'size': MAX_VALUE
    }
    #step2: search
    try:
        network_task_result = es_network_task.search(index=network_keywords_index_name,
                                                     doc_type=network_keywords_index_type,
                                                     body=query_body)['hits']['hits']
    except:
        network_task_result = []
    #step3: get results
    for task_item in network_task_result:
        source = task_item['_source']
        task_id = source['task_id']
        submit_ts = source['submit_ts']
        submit_date = ts2date(submit_ts)
        keywords = source['query_keywords']
        start_date = source['start_date']
        end_date = source['end_date']
        status = source['status']
        results.append([task_id, keywords, submit_date, start_date, end_date, status])
    return results

def get_group_detect(submit_user):
    results = []
    query_body = {
        'query': {
            'filtered': {
                'filter': {
                    'bool': {
                        'must': [
                            {'term': {'submit_user': submit_user}},
                            {'term': {'task_type': 'detect'}}
                        ]
                    }
                }
            }
        },
        'sort': [{'submit_date': {'order': 'desc'}}],
        'size': MAX_VALUE
    }
    #search group task
    try:
        group_task_result = es_group_result.search(index=group_index_name, doc_type=group_index_type,
                                                   body=query_body)['hits']['hits']
    except:
        group_task_result = []
    #group task results
    for group_item in group_task_result:
        source = group_item['_source']
        task_name = source['task_name']
        task_process = source['detect_process']
        submit_ts = source['submit_date']
        submit_date = ts2date(submit_ts)
        state = source['state']
        task_type = source['detect_type']
        results.append([task_name, submit_date, state, task_type, task_process])
    return results

def get_sensing_task(submit_user):
    results = []
    #step1: query_body
    query_body = {
        'query': {
            'filtered': {
                'filter': {
                    'term': {'create_by': submit_user}
                }
            }
        },
        'size': MAX_VALUE,
        'sort': [{'create_at': {'order': 'desc'}}]
    }
    #step2: search
    try:
        sensing_task_result = es_social_sensing.search(index=sensing_index_name,
                                                       doc_type=sensing_doc_type,
                                                       body=query_body)['hits']['hits']
    except:
        sensing_task_result = []
    #step3: task results
    for task_item in sensing_task_result:
        source = task_item['_source']
        task_name = source['task_name']
        status = source['processing_status']
        remark = source['remark']
        submit_ts = source['create_at']
        # guard against a missing create_at so submit_date is always bound
        submit_date = ts2date(int(submit_ts)) if submit_ts else ''
        results.append([task_name, submit_date, remark, status])
    return results

def get_influence_content(uid, timestamp_from, timestamp_to):
    weibo_list = []
    #split timestamp range to new_range_dict_list
    from_date_ts = datetime2ts(ts2datetime(timestamp_from))
    to_date_ts = datetime2ts(ts2datetime(timestamp_to))
    new_range_dict_list = []
    if from_date_ts != to_date_ts:
        iter_date_ts = from_date_ts
        while iter_date_ts < to_date_ts:
            iter_next_date_ts = iter_date_ts + DAY
            new_range_dict_list.append({'range': {'timestamp': {'gte': iter_date_ts,
                                                                'lt': iter_next_date_ts}}})
            iter_date_ts = iter_next_date_ts
        if new_range_dict_list[0]['range']['timestamp']['gte'] < timestamp_from:
            new_range_dict_list[0]['range']['timestamp']['gte'] = timestamp_from
        if new_range_dict_list[-1]['range']['timestamp']['lt'] > timestamp_to:
            new_range_dict_list[-1]['range']['timestamp']['lt'] = timestamp_to
    else:
        new_range_dict_list = [{'range': {'timestamp': {'gte': timestamp_from,
                                                        'lt': timestamp_to}}}]
    #iter date to search flow_text
    for range_item in new_range_dict_list:
        range_from_ts = range_item['range']['timestamp']['gte']
        range_from_date = ts2datetime(range_from_ts)
        flow_text_index_name = flow_text_index_name_pre + range_from_date
        query = []
        query.append({'term': {'uid': uid}})
        query.append(range_item)
        try:
            flow_text_exist = es_flow_text.search(index=flow_text_index_name,
                                                  doc_type=flow_text_index_type,
                                                  body={'query': {'bool': {'must': query}},
                                                        'sort': [{'timestamp': 'asc'}]})['hits']['hits']
        except:
            flow_text_exist = []
        # get weibo list
        for item in flow_text_exist:
            source = item['_source']
            weibo = {}
            weibo['timestamp'] = ts2date(source['timestamp'])
            weibo['ip'] = source['ip']
            weibo['text'] = source['text']
            if source['geo']:
                weibo['geo'] = '\t'.join(source['geo'].split('&'))
            else:
                weibo['geo'] = ''
            weibo_list.append(weibo)
    return weibo_list

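# A usage sketch for get_influence_content (uid and dates hypothetical),
# assuming datetime2ts accepts 'YYYY-MM-DD' strings as elsewhere in this module:
#   the window below spans two daily indices, so the function splits it into
#   per-day range filters before querying.
if __name__ == '__main__':
    weibos = get_influence_content('1234567890',
                                   datetime2ts('2013-09-01'),
                                   datetime2ts('2013-09-03') + 3600)
    for weibo in weibos:
        print weibo['timestamp'], weibo['text']
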
def new_get_user_weibo(uid, sort_type):
    results = []
    weibo_list = []
    now_date = ts2datetime(time.time())
    #run_type
    if RUN_TYPE == 0:
        now_date = RUN_TEST_TIME
        sort_type = 'timestamp'
    #step1: get user name
    try:
        user_profile_result = es_user_profile.get(index=profile_index_name, doc_type=profile_index_type,
                                                  id=uid, _source=False, fields=['nick_name'])
    except:
        user_profile_result = {}
    if user_profile_result:
        uname = user_profile_result['fields']['nick_name'][0]
    else:
        uname = ''
    #step2: get user weibo
    for i in range(7, 0, -1):
        iter_date = ts2datetime(datetime2ts(now_date) - i * DAY)
        index_name = flow_text_index_name_pre + iter_date
        try:
            weibo_result = es_flow_text.search(index=index_name, doc_type=flow_text_index_type,
                                               body={'query': {'filtered': {'filter': {'term': {'uid': uid}}}},
                                                     'sort': sort_type, 'size': 100})['hits']['hits']
        except:
            weibo_result = []
        if weibo_result:
            weibo_list.extend(weibo_result)
    sort_weibo_list = sorted(weibo_list, key=lambda x: x['_source'][sort_type], reverse=True)[:100]
    for weibo_item in sort_weibo_list:
        source = weibo_item['_source']
        mid = source['mid']
        uid = source['uid']
        text = source['text']
        ip = source['geo']
        timestamp = source['timestamp']
        date = ts2date(timestamp)
        sentiment = source['sentiment']
        #run_type
        if RUN_TYPE == 1:
            retweet_count = source['retweet_count']
            comment_count = source['comment_count']
            sensitive_score = source['sensitive']
        else:
            retweet_count = 0
            comment_count = 0
            sensitive_score = 0
        city = ip2city(ip)
        results.append([mid, uid, text, ip, city, timestamp, date,
                        retweet_count, comment_count, sensitive_score])
    return results

def get_activity_weibo(task_name, start_ts):
    results = []
    #step1: get task_name uid
    try:
        group_result = es_group_result.get(index=group_index_name, doc_type=group_index_type,
                                           id=task_name, _source=False, fields=['uid_list'])
    except:
        group_result = {}
    if group_result == {}:
        return 'task name invalid'
    try:
        uid_list = group_result['fields']['uid_list']
    except:
        uid_list = []
    if uid_list == []:
        return 'task uid list null'
    #step2: get uid2uname
    uid2uname = {}
    try:
        user_portrait_result = es_user_portrait.mget(index=portrait_index_name,
                                                     doc_type=portrait_index_type,
                                                     body={'ids': uid_list}, _source=False,
                                                     fields=['uname'])['docs']
    except:
        user_portrait_result = []
    for item in user_portrait_result:
        uid = item['_id']
        if item['found'] == True:
            uname = item['fields']['uname'][0]
            uid2uname[uid] = uname
    #step3: search time_segment weibo
    time_segment = FOUR_HOUR
    end_ts = start_ts + time_segment
    time_date = ts2datetime(start_ts)
    flow_text_index_name = flow_text_index_name_pre + time_date
    query = []
    query.append({'terms': {'uid': uid_list}})
    query.append({'range': {'timestamp': {'gte': start_ts, 'lt': end_ts}}})
    try:
        flow_text_es_result = es_flow_text.search(index=flow_text_index_name,
                                                  doc_type=flow_text_index_type,
                                                  body={'query': {'bool': {'must': query}},
                                                        'sort': 'timestamp',
                                                        'size': MAX_VALUE})['hits']['hits']
    except:
        flow_text_es_result = []
    for item in flow_text_es_result:
        weibo = {}
        source = item['_source']
        weibo['timestamp'] = ts2date(source['timestamp'])
        weibo['ip'] = source['ip']
        weibo['text'] = source['text']
        if source['geo']:
            weibo['geo'] = '\t'.join(source['geo'])
        else:
            weibo['geo'] = ''
        results.append(weibo)
    return results

def get_text(top_list, date, user_info, style):
    # input: [[mid1, no.1], [mid2, no.2], [mid3, no.3]]
    # output: [[text1, no.1], [text2, no.2], [text3, no.3]]
    # mid, retweeted, comment, text, geo, timestamp, sentiment, mid_url
    results = []
    detail_list = ['origin_weibo_retweeted_detail', 'origin_weibo_comment_detail',
                   'retweeted_weibo_retweeted_detail', 'retweeted_weibo_comment_detail']
    index_flow_text = pre_text_index + date
    if len(top_list) != 0:
        mid_list = []
        for i in range(len(top_list)):
            mid_list.append(top_list[i][0])
        search_result = es.mget(index=index_flow_text, doc_type=flow_text_index_type,
                                body={'ids': mid_list})['docs']
        for i in range(len(top_list)):
            temp = []
            temp.append(mid_list[i])
            if int(style) == 0:
                temp.append(top_list[i][1])
                temp.append(json.loads(user_info[detail_list[1]]).get(top_list[i][0], 0))
            elif int(style) == 1:
                temp.append(json.loads(user_info[detail_list[0]]).get(top_list[i][0], 0))
                temp.append(top_list[i][1])
            elif int(style) == 2:
                temp.append(top_list[i][1])
                temp.append(json.loads(user_info[detail_list[3]]).get(top_list[i][0], 0))
            else:
                temp.append(json.loads(user_info[detail_list[2]]).get(top_list[i][0], 0))
                temp.append(top_list[i][1])
            if search_result[i]['found']:
                source = search_result[i]['_source']
                temp.append(source['text'])
                temp.append(source['geo'])
                temp.append(ts2date(source['timestamp']))
                temp.append(source['sentiment'])
                temp.append(weiboinfo2url(source['uid'], source['mid']))
                temp.append(uid_url + source['uid'])
                temp.append(source['uid'])
                try:
                    uname = es_profile.get(index=profile_index_name, doc_type=profile_index_type,
                                           id=source['uid'])['_source']['nick_name']
                    temp.append(uname)
                except:
                    temp.append('unknown')
            else:
                temp.extend(['', '', '', '', '', '', '', ''])
            results.append(temp)
    return results

def format_count(count_dict):
    # collapse the nested counts into a per-timestamp total
    count = {}
    for key in count_dict:
        count[key] = sum(count_dict[key].values())
    # iterate in timestamp order so peak detection sees a real time series
    # (plain dict iteration order is arbitrary in Python 2)
    sorted_items = sorted(count.iteritems())
    peak_index = detect_peaks([value for key, value in sorted_items])
    res = []
    for idx, (key, value) in enumerate(sorted_items):
        peak = '1' if idx in peak_index else '0'
        res.append([str(value), ts2date(key), peak])
    return res

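# detect_peaks is imported from elsewhere in the project. A minimal sketch of
# the contract format_count assumes, a list of indices of local maxima; the
# real detector may additionally smooth the series or apply thresholds.
def detect_peaks(series):
    # indices of points strictly greater than both neighbours
    return [i for i in range(1, len(series) - 1)
            if series[i - 1] < series[i] > series[i + 1]]
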
def get_group_analysis(submit_user):
    results = []
    #step1: get query body
    query_body = {
        'query': {
            'filtered': {
                'filter': {
                    'bool': {
                        'must': [
                            {'term': {'submit_user': submit_user}},
                            {'term': {'task_type': 'analysis'}}
                        ]
                    }
                }
            }
        },
        'sort': [{'submit_date': {'order': 'desc'}}],
        'size': MAX_VALUE
    }
    #step2: search
    try:
        group_task_result = es_group_result.search(index=group_index_name, doc_type=group_index_type,
                                                   body=query_body)['hits']['hits']
    except:
        group_task_result = []
    #step3: task results
    for group_item in group_task_result:
        source = group_item['_source']
        task_name = source['task_name']
        if not task_name:
            continue
        task_status = source['status']
        submit_ts = source['submit_date']
        submit_date = ts2date(submit_ts)
        try:
            state = source['state']
        except:
            state = ''
        results.append([task_name, submit_date, state, task_status])
    return results

def search_sentiment_all_keywords_task(submit_date, keywords_string, submit_user,
                                       start_date, end_date, status):
    results = []
    query_list = []
    if submit_date:
        submit_ts_start = datetime2ts(submit_date)
        submit_ts_end = submit_ts_start + DAY
        query_list.append({'range': {'submit_ts': {'gte': submit_ts_start, 'lt': submit_ts_end}}})
    if keywords_string:
        keywords_list = keywords_string.split(',')
        query_list.append({'terms': {'query_keywords': keywords_list}})
    if submit_user:
        query_list.append({'term': {'submit_user': submit_user}})
    if start_date:
        start_s_ts = datetime2ts(start_date)
        if end_date:
            start_e_ts = datetime2ts(end_date)
        else:
            start_e_ts = start_s_ts + DAY * 30
        start_date_nest_body_list = [ts2datetime(ts) for ts in range(start_s_ts, start_e_ts + DAY, DAY)]
        query_list.append({'terms': {'start_date': start_date_nest_body_list}})
    if end_date:
        end_e_ts = datetime2ts(end_date)
        if start_date:
            end_s_ts = datetime2ts(start_date)
        else:
            end_s_ts = end_e_ts - DAY * 30
        end_date_nest_body_list = [ts2datetime(ts) for ts in range(end_s_ts, end_e_ts + DAY, DAY)]
        query_list.append({'terms': {'end_date': end_date_nest_body_list}})
    if status:
        query_list.append({'term': {'status': status}})
    try:
        task_results = es_sentiment_task.search(index=sentiment_keywords_index_name,
                                                doc_type=sentiment_keywords_index_type,
                                                body={'query': {'bool': {'must': query_list}}})['hits']['hits']
    except:
        task_results = []
    for task_item in task_results:
        task_source = task_item['_source']
        task_id = task_source['task_id']
        start_date = task_source['start_date']
        end_date = task_source['end_date']
        keywords = task_source['query_keywords']
        submit_ts = ts2date(task_source['submit_ts'])
        status = task_source['status']
        segment = task_source['segment']
        results.append([task_id, start_date, end_date, keywords, submit_ts, status, segment])
    return results

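# Why the function above enumerates dates into a terms filter: start_date and
# end_date are apparently stored as 'YYYY-MM-DD' strings, so a numeric range
# filter cannot be applied directly. Enumerating every date in the window turns
# the range test into a terms membership test (dates hypothetical, DST ignored):
example_dates = [ts2datetime(ts) for ts in range(datetime2ts('2013-09-01'),
                                                 datetime2ts('2013-09-03') + DAY, DAY)]
# example_dates == ['2013-09-01', '2013-09-02', '2013-09-03']
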
def get_sentiment_task(submit_user):
    results = []
    #run type
    if RUN_TYPE == 0:
        submit_user = '******'
    #step1: query_body
    query_body = {
        'query': {
            'filtered': {
                'filter': {
                    'term': {'submit_user': submit_user}
                }
            }
        },
        'size': MAX_VALUE
    }
    #step2: search
    try:
        sentiment_task_result = es_sentiment_task.search(index=sentiment_keywords_index_name,
                                                         doc_type=sentiment_keywords_index_type,
                                                         body=query_body)['hits']['hits']
    except:
        sentiment_task_result = []
    #step3: query results
    for task_item in sentiment_task_result:
        source = task_item['_source']
        task_id = source['task_id']
        query_keywords = source['query_keywords']
        submit_ts = source['submit_ts']
        submit_date = ts2date(submit_ts)
        start_date = source['start_date']
        end_date = source['end_date']
        status = source['status']
        results.append([task_id, query_keywords, start_date, end_date,
                        submit_date, status, submit_ts])
    #step4: sort by submit_ts, newest first
    sort_results = sorted(results, key=lambda x: x[6], reverse=True)
    return sort_results

def search_sentiment_detail_in_domain(start_ts, task_type, task_detail, time_segment,
                                      sentiment, sort_type):
    results = {}
    start_ts = int(start_ts)
    start_date = ts2datetime(start_ts)
    end_ts = start_ts + str2segment[time_segment]
    print 'start_ts:', ts2date(start_ts)
    print 'end_ts:', ts2date(end_ts)
    if sentiment == '7':
        query_sentiment_list = SENTIMENT_SECOND
    else:
        query_sentiment_list = [sentiment]
    user_domain = task_detail
    #step1: iter get weibo and user in domain
    in_user_result = {}
    all_filter_weibo_list = []
    sort_evaluate_max = SENTIMENT_SORT_EVALUATE_MAX
    flow_text_index_name = flow_text_index_name_pre + start_date
    print 'flow_text_index_name:', flow_text_index_name
    while len(in_user_result) < SENTIMENT_MAX_USER:
        print 'in_user_result:', len(in_user_result)
        print 'sort_evaluate_max:', sort_evaluate_max
        query_body = {
            'query': {
                'filtered': {
                    'filter': {
                        'bool': {
                            'must': [
                                {'range': {sort_type: {'lt': sort_evaluate_max}}},
                                {'terms': {'sentiment': query_sentiment_list}},
                                {'range': {'timestamp': {'gte': start_ts, 'lt': end_ts}}}
                            ]
                        }
                    }
                }
            },
            'sort': [{sort_type: {'order': 'desc'}}],
            'size': SENTIMENT_ITER_TEXT_COUNT
        }
        try:
            flow_text_result = es_flow_text.search(index=flow_text_index_name,
                                                   doc_type=flow_text_index_type,
                                                   body=query_body)['hits']['hits']
        except:
            flow_text_result = []
        print 'len flow_text_result:', len(flow_text_result)
        if not flow_text_result:
            break
        weibo_list, user_set = deal_show_weibo_list(flow_text_result)
        #filter domain user
        filter_type = 'domain'
        in_portrait_result = identify_user_portrait_domain_topic(user_set, filter_type, user_domain)
        filter_weibo_list = filter_weibo_in(weibo_list, in_portrait_result)
        if filter_weibo_list:
            all_filter_weibo_list.extend(filter_weibo_list)
        if in_portrait_result:
            in_user_result = dict(in_user_result, **in_portrait_result)
        # advance the cursor to the smallest sort value seen this round
        sort_evaluate_max = flow_text_result[-1]['_source'][sort_type]
    query_uid_list = in_user_result.keys()
    #step2: get keywords from flow_text
    keyword_query_dict = {
        'query': {
            'filtered': {
                'filter': {
                    'bool': {
                        'must': [
                            {'range': {'timestamp': {'gte': start_ts, 'lt': end_ts}}},
                            {'terms': {'uid': query_uid_list}}
                        ]
                    }
                }
            }
        },
        'aggs': {
            'all_interests': {
                'terms': {
                    'field': 'keywords_string',
                    'size': SENTIMENT_MAX_KEYWORDS
                }
            }
        }
    }
    show_keywords_dict = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type,
                                             body=keyword_query_dict)['aggregations']['all_interests']['buckets']
    keywords_list = [[item['key'], item['doc_count']] for item in show_keywords_dict]
    #step3: get results
    results['weibo'] = all_filter_weibo_list
    results['in_portrait_result'] = sorted(in_user_result.items(), key=lambda x: x[1][1], reverse=True)
    results['keywords'] = keywords_list
    return results

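# The while loop above is a cursor-style deep scan: each round filters on
# sort_type < sort_evaluate_max and moves the bound down to the last hit's
# value, walking the sorted axis without from/size pagination. Distilled into
# a generic sketch (names hypothetical; the caller above also caps the number
# of users collected, which this sketch omits):
def cursor_scan(search_fn, sort_field, start_bound, page_size):
    bound = start_bound
    while True:
        hits = search_fn(sort_field, bound, page_size)
        if not hits:
            break
        for hit in hits:
            yield hit
        # next round starts strictly below the smallest value seen so far
        bound = hits[-1]['_source'][sort_field]
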
def group_user_weibo(task_name, submit_user, sort_type):
    weibo_list = []
    now_date = ts2datetime(time.time())
    #run_type
    if RUN_TYPE == 0:
        now_date = RUN_TEST_TIME
        sort_type = 'timestamp'
    #step1: get group user
    task_id = submit_user + '-' + task_name
    try:
        group_exist_result = es_group_result.get(index=group_index_name, doc_type=group_index_type,
                                                 id=task_id)['_source']
    except:
        group_exist_result = {}
    if not group_exist_result:
        return 'group no exist'
    #step2: get user weibo list
    uid_list = group_exist_result['uid_list']
    for i in range(7, 0, -1):
        iter_date = ts2datetime(datetime2ts(now_date) - i * DAY)
        index_name = flow_text_index_name_pre + iter_date
        try:
            weibo_result = es_flow_text.search(index=index_name, doc_type=flow_text_index_type,
                                               body={'query': {'filtered': {'filter': {'terms': {'uid': uid_list}}}},
                                                     'sort': sort_type, 'size': 100})['hits']['hits']
        except:
            weibo_result = []
        if weibo_result:
            weibo_list.extend(weibo_result)
    sort_weibo_list = sorted(weibo_list, key=lambda x: x['_source'][sort_type], reverse=True)[:100]
    #step3: get user name
    try:
        portrait_exist_result = es_user_portrait.mget(index=portrait_index_name,
                                                      doc_type=portrait_index_type,
                                                      body={'ids': uid_list})['docs']
    except:
        portrait_exist_result = []
    uid2uname_dict = {}
    for portrait_item in portrait_exist_result:
        uid = portrait_item['_id']
        if portrait_item['found'] == True:
            source = portrait_item['_source']
            uname = source['uname']
        else:
            uname = 'unknown'
        uid2uname_dict[uid] = uname
    weibo_list = []
    for weibo_item in sort_weibo_list:
        source = weibo_item['_source']
        mid = source['mid']
        uid = source['uid']
        uname = uid2uname_dict[uid]
        text = source['text']
        ip = source['geo']
        timestamp = source['timestamp']
        date = ts2date(timestamp)
        sentiment = source['sentiment']
        weibo_url = weiboinfo2url(uid, mid)
        #run_type
        if RUN_TYPE == 1:
            retweet_count = source['retweet_count']
            comment_count = source['comment_count']
            sensitive_score = source['sensitive_score']
        else:
            retweet_count = 0
            comment_count = 0
            sensitive_score = 0
        city = ip2city(ip)
        weibo_list.append([mid, uid, uname, text, ip, city, timestamp, date,
                           retweet_count, comment_count, sensitive_score, weibo_url])
    return weibo_list

def get_retweet_weibo_detail(ts, user, task_name, size, text_type, type_value):
    _id = user + '-' + task_name
    task_detail = es_user_portrait.get(index=index_sensing_task, doc_type=_id, id=ts)['_source']
    origin_weibo_detail = json.loads(task_detail['origin_weibo_detail'])
    retweeted_weibo_detail = json.loads(task_detail['retweeted_weibo_detail'])
    mid_list = []
    mid_list.extend(origin_weibo_detail.keys())
    mid_list.extend(retweeted_weibo_detail.keys())
    query_body = {
        'query': {
            'filtered': {
                'filter': {
                    'bool': {
                        'must': [
                            {'range': {'timestamp': {'gte': ts - time_interval, 'lt': ts}}},
                            {'terms': {'root_mid': mid_list}}
                        ]
                    }
                }
            }
        },
        'sort': {'timestamp': {'order': 'desc'}},
        'size': 100
    }
    if text_type == 'message_type':
        query_body['query']['filtered']['filter']['bool']['must'].append({'term': {text_type: type_value}})
    if text_type == 'sentiment':
        if len(type_value) == 1:
            query_body['query']['filtered']['filter']['bool']['must'].append({'term': {text_type: type_value}})
        else:
            query_body['query']['filtered']['filter']['bool']['must'].append({'terms': {text_type: type_value}})
    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts - time_interval)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    index_name_1 = flow_text_index_name_pre + datetime_1
    exist_es_1 = es_text.indices.exists(index_name_1)
    # 1. query weibo text
    if datetime == datetime_1 and exist_es:
        search_results = es_text.search(index=index_name, doc_type=flow_text_index_type,
                                        body=query_body)['hits']['hits']
    elif datetime != datetime_1 and exist_es_1:
        search_results = es_text.search(index=index_name_1, doc_type=flow_text_index_type,
                                        body=query_body)['hits']['hits']
    else:
        search_results = []
    # 2. attach user info to each weibo
    results = []
    uid_list = []
    if search_results:
        for item in search_results:
            uid_list.append(item['_source']['uid'])
        if uid_list:
            portrait_result = es_profile.mget(index=profile_index_name, doc_type=profile_index_type,
                                              body={'ids': uid_list},
                                              fields=['nick_name', 'photo_url'])['docs']
        for i in range(len(uid_list)):
            item = search_results[i]['_source']
            temp = []
            # uid, nick_name, photo_url, text, sentiment, timestamp, geo, message_type
            temp.append(item['uid'])
            if portrait_result[i]['found']:
                temp.append(portrait_result[i]['fields']['nick_name'][0])
                temp.append(portrait_result[i]['fields']['photo_url'][0])
            else:
                temp.append(item['uid'])
                temp.append('')
            temp.append(item['text'])
            temp.append(item['sentiment'])
            temp.append(ts2date(item['timestamp']))
            temp.append(item['geo'])
            temp.append(item['message_type'])
            results.append(temp)
    return results

def get_origin_weibo_detail(ts, user, task_name, size, order, message_type=1):
    _id = user + '-' + task_name
    task_detail = es_user_portrait.get(index=index_sensing_task, doc_type=_id, id=ts)['_source']
    mid_value = json.loads(task_detail['mid_topic_value'])
    duplicate_dict = json.loads(task_detail['duplicate_dict'])
    # invert {duplicate_mid: root_mid} into {root_mid: [mids in the cluster]}
    tmp_duplicate_dict = dict()
    for k, v in duplicate_dict.iteritems():
        try:
            tmp_duplicate_dict[v].append(k)
        except:
            tmp_duplicate_dict[v] = [k, v]
    if message_type == 1:
        weibo_detail = json.loads(task_detail['origin_weibo_detail'])
    elif message_type == 2:
        weibo_detail = json.loads(task_detail['retweeted_weibo_detail'])
    else:
        weibo_detail = json.loads(task_detail['sensitive_weibo_detail'])
    weibo_detail_list = []
    if weibo_detail:
        for iter_mid, item in weibo_detail.iteritems():
            tmp = []
            tmp.append(iter_mid)
            tmp.append(item[iter_mid])
            tmp.append(item['retweeted'])
            tmp.append(item['comment'])
            weibo_detail_list.append(tmp)
    mid_list = weibo_detail.keys()
    results = []
    query_body = {
        'query': {
            'filtered': {
                'filter': {
                    'terms': {'mid': mid_list}
                }
            }
        },
        'size': 1000,
        'sort': {'timestamp': {'order': 'desc'}}
    }
    index_list = []
    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts - DAY)
    index_name = flow_text_index_name_pre + datetime
    if es_text.indices.exists(index_name):
        index_list.append(index_name)
    index_name_1 = flow_text_index_name_pre + datetime_1
    if es_text.indices.exists(index_name_1):
        index_list.append(index_name_1)
    if index_list and mid_list:
        search_results = es_text.search(index=index_list, doc_type=flow_text_index_type,
                                        body=query_body)['hits']['hits']
    else:
        search_results = []
    uid_list = []
    text_dict = dict()      # weibo text info
    portrait_dict = dict()  # user profile info
    sort_results = []
    if search_results:
        for item in search_results:
            uid_list.append(item['_source']['uid'])
            text_dict[item['_id']] = item['_source']  # _id is the mid
        if uid_list:
            portrait_result = es_profile.mget(index=profile_index_name, doc_type=profile_index_type,
                                              body={'ids': uid_list},
                                              fields=['nick_name', 'photo_url'])['docs']
            for item in portrait_result:
                if item['found']:
                    portrait_dict[item['_id']] = {'nick_name': item['fields']['nick_name'][0],
                                                  'photo_url': item['fields']['photo_url'][0]}
                else:
                    portrait_dict[item['_id']] = {'nick_name': item['_id'], 'photo_url': ''}
    if order == 'total':
        sorted_list = sorted(weibo_detail_list, key=lambda x: x[1], reverse=True)
    elif order == 'retweeted':
        sorted_list = sorted(weibo_detail_list, key=lambda x: x[2], reverse=True)
    elif order == 'comment':
        sorted_list = sorted(weibo_detail_list, key=lambda x: x[3], reverse=True)
    else:
        sorted_list = weibo_detail_list
    mid_index_dict = dict()
    for item in sorted_list:
        mid = item[0]
        iter_text = text_dict.get(mid, {})
        temp = []
        # uid, nick_name, photo_url, text, sentiment, timestamp, geo, keywords_string, message_type
        if iter_text:
            uid = iter_text['uid']
            temp.append(uid)
            iter_portrait = portrait_dict.get(uid, {})
            if iter_portrait:
                temp.append(iter_portrait['nick_name'])
                temp.append(iter_portrait['photo_url'])
            else:
                temp.extend([uid, ''])
            temp.append(iter_text['text'])
            temp.append(iter_text['sentiment'])
            temp.append(ts2date(iter_text['timestamp']))
            temp.append(iter_text['geo'])
            if message_type == 1:
                temp.append(1)
            elif message_type == 2:
                temp.append(3)
            else:
                temp.append(iter_text['message_type'])
            temp.append(iter_text['keywords_string'])
            temp.append(item[2])
            temp.append(item[3])
            temp.append(iter_text.get('sensitive', 0))
            temp.append(iter_text['timestamp'])
            temp.append(mid_value[mid])
            temp.append(mid)
            results.append(temp)
    # sort by sensitive score, then topic value, then retweet count
    results = sorted(results, key=operator.itemgetter(-4, -2, -6), reverse=True)
    sort_results = []
    count = 0
    for item in results:
        sort_results.append([item])
        mid_index_dict[item[-1]] = count
        count += 1
    # fold duplicate weibos into the best-ranked row of their cluster
    if tmp_duplicate_dict:
        remove_list = []
        value_list = tmp_duplicate_dict.values()  # [[mid, mid, ...], ...]
        for item in value_list:
            tmp = []
            for mid in item:
                if mid_index_dict.get(mid, 0):
                    tmp.append(mid_index_dict[mid])
            if len(tmp) > 1:
                tmp_min = min(tmp)
            else:
                continue
            tmp.remove(tmp_min)
            for iter_count in tmp:
                sort_results[tmp_min].extend(sort_results[iter_count])
                remove_list.append(sort_results[iter_count])
        if remove_list:
            for item in remove_list:
                sort_results.remove(item)
    return sort_results

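# A toy illustration of the duplicate folding above (mids hypothetical).
# duplicate_dict maps each duplicate mid to its cluster's representative mid;
# the inversion then lists every mid in the cluster under the representative:
example_duplicate_dict = {'mid_b': 'mid_a', 'mid_c': 'mid_a'}
example_tmp = {}
for k, v in example_duplicate_dict.iteritems():
    try:
        example_tmp[v].append(k)
    except KeyError:
        example_tmp[v] = [k, v]
# example_tmp == {'mid_a': ['mid_b', 'mid_a', 'mid_c']}  (element order may vary)
# rows for mid_b and mid_c are then merged into mid_a's row and removed
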
def group_user_weibo(task_name, submit_user, sort_type):
    weibo_list = []
    now_date = ts2datetime(time.time())
    if sort_type == 'retweet':
        sort_type = 'retweeted'
    #run_type
    if RUN_TYPE == 0:
        now_date = RUN_TEST_TIME
        sort_type = 'timestamp'
    #step1: get group user
    task_id = submit_user + '-' + task_name
    try:
        group_exist_result = es_group_result.get(index=group_index_name, doc_type=group_index_type,
                                                 id=task_id)['_source']
    except:
        group_exist_result = {}
    if not group_exist_result:
        return 'group no exist'
    #step2: get user weibo list
    uid_list = group_exist_result['uid_list']
    for i in range(6, -1, -1):
        iter_date = ts2datetime(datetime2ts(now_date) - i * DAY)
        index_name = flow_text_index_name_pre + iter_date
        try:
            weibo_result = es_flow_text.search(index=index_name, doc_type=flow_text_index_type,
                                               body={'query': {'filtered': {'filter': {'terms': {'uid': uid_list}}}},
                                                     'sort': [{sort_type: {'order': 'desc'}}],
                                                     'size': 100})['hits']['hits']
        except:
            weibo_result = []
        if weibo_result:
            weibo_list.extend(weibo_result)
    sort_weibo_list = weibo_list
    #step3: get user name
    try:
        portrait_exist_result = es_user_portrait.mget(index=portrait_index_name,
                                                      doc_type=portrait_index_type,
                                                      body={'ids': uid_list})['docs']
    except:
        portrait_exist_result = []
    uid2uname_dict = {}
    for portrait_item in portrait_exist_result:
        uid = portrait_item['_id']
        if portrait_item['found'] == True:
            source = portrait_item['_source']
            uname = source['uname']
        else:
            uname = 'unknown'
        uid2uname_dict[uid] = uname
    weibo_list = []
    for weibo_item in sort_weibo_list:
        source = weibo_item['_source']
        mid = source['mid']
        uid = source['uid']
        uname = uid2uname_dict[uid]
        text = source['text']
        ip = source['geo']
        timestamp = source['timestamp']
        date = ts2date(timestamp)
        sentiment = source['sentiment']
        weibo_url = weiboinfo2url(uid, mid)
        #run_type
        if RUN_TYPE == 1:
            try:
                retweet_count = source['retweeted']
            except:
                retweet_count = 0
            try:
                comment_count = source['comment']
            except:
                comment_count = 0
            try:
                sensitive_score = source['sensitive']
            except:
                sensitive_score = 0
        else:
            retweet_count = 0
            comment_count = 0
            sensitive_score = 0
        city = ip2city(ip)
        weibo_list.append([mid, uid, uname, text, ip, city, timestamp, date,
                           retweet_count, comment_count, sensitive_score, weibo_url])
    if sort_type == 'timestamp':
        new_weibo_list = sorted(weibo_list, key=lambda x: x[6], reverse=True)
    elif sort_type == 'retweeted':
        new_weibo_list = sorted(weibo_list, key=lambda x: x[8], reverse=True)
    elif sort_type == 'comment':
        new_weibo_list = sorted(weibo_list, key=lambda x: x[9], reverse=True)
    elif sort_type == 'sensitive':
        new_weibo_list = sorted(weibo_list, key=lambda x: x[10], reverse=True)
    else:
        # fall back unsorted so new_weibo_list is always bound
        new_weibo_list = weibo_list
    return new_weibo_list

def new_get_user_weibo(uid, sort_type):
    results = []
    weibo_list = []
    now_date = ts2datetime(time.time())
    #run_type
    if RUN_TYPE == 0:
        now_date = RUN_TEST_TIME
        sort_type = 'timestamp'
    #step1: get user name
    try:
        user_profile_result = es_user_profile.get(index=profile_index_name, doc_type=profile_index_type,
                                                  id=uid, _source=False, fields=['nick_name'])
    except:
        user_profile_result = {}
    if user_profile_result:
        uname = user_profile_result['fields']['nick_name'][0]
    else:
        uname = ''
    #step2: get user weibo
    for i in range(7, 0, -1):
        if RUN_TYPE == 1:
            iter_date = ts2datetime(datetime2ts(now_date) - i * DAY)
        else:
            iter_date = '2013-09-01'
        index_name = flow_text_index_name_pre + iter_date
        try:
            weibo_result = es_flow_text.search(index=index_name, doc_type=flow_text_index_type,
                                               body={'query': {'filtered': {'filter': {'term': {'uid': uid}}}},
                                                     'size': MAX_VALUE})['hits']['hits']
        except:
            weibo_result = []
        if weibo_result:
            weibo_list.extend(weibo_result)
    # deduplicate by mid while formatting
    mid_set = set()
    for weibo_item in weibo_list:
        source = weibo_item['_source']
        mid = source['mid']
        uid = source['uid']
        text = source['text']
        ip = source['ip']
        timestamp = source['timestamp']
        date = ts2date(timestamp)
        sentiment = source['sentiment']
        weibo_url = weiboinfo2url(uid, mid)
        #run_type
        if RUN_TYPE == 1:
            try:
                retweet_count = source['retweeted']
            except:
                retweet_count = 0
            try:
                comment_count = source['comment']
            except:
                comment_count = 0
            try:
                sensitive_score = source['sensitive']
            except:
                sensitive_score = 0
        else:
            retweet_count = 0
            comment_count = 0
            sensitive_score = 0
        city = ip2city(ip)
        if mid not in mid_set:
            results.append([mid, uid, text, ip, city, timestamp, date,
                            retweet_count, comment_count, sensitive_score, weibo_url])
            mid_set.add(mid)
    if sort_type == 'timestamp':
        sort_results = sorted(results, key=lambda x: x[5], reverse=True)
    elif sort_type == 'retweet_count':
        sort_results = sorted(results, key=lambda x: x[7], reverse=True)
    elif sort_type == 'comment_count':
        sort_results = sorted(results, key=lambda x: x[8], reverse=True)
    elif sort_type == 'sensitive':
        sort_results = sorted(results, key=lambda x: x[9], reverse=True)
    else:
        # fall back unsorted so sort_results is always bound
        sort_results = results
    return sort_results

def get_positive_weibo_detail(ts, social_sensors, keywords_list, size, sentiment_type=1):
    # mid lists of matching weibo from the previous interval and the current one
    former_mid_list = query_mid_list(ts - time_interval, keywords_list, time_segment, social_sensors)
    current_mid_list = query_mid_list(ts, keywords_list, time_interval, social_sensors)
    mid_list = []
    mid_list.extend(former_mid_list)
    mid_list.extend(current_mid_list)
    query_body = {
        'query': {
            'filtered': {
                'filter': {
                    'bool': {
                        'must': [
                            {'range': {'timestamp': {'gte': ts - time_interval, 'lt': ts}}}
                        ],
                        'should': [
                            {'terms': {'root_mid': mid_list}},
                            {'terms': {'mid': mid_list}}
                        ]
                    }
                }
            }
        },
        'sort': {'timestamp': {'order': 'desc'}},
        'size': 100
    }
    if int(sentiment_type) == 1 or int(sentiment_type) == 0:
        query_body['query']['filtered']['filter']['bool']['must'].append({'term': {'sentiment': sentiment_type}})
    else:
        # append the clause itself, not a list wrapping it
        query_body['query']['filtered']['filter']['bool']['must'].append({'terms': {'sentiment': ['2', '3']}})
    # check whether ts and ts - time_interval fall on the same day to pick the es index
    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts - time_interval)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    index_name_1 = flow_text_index_name_pre + datetime_1
    exist_es_1 = es_text.indices.exists(index_name_1)
    # 1. query the matching weibo
    if datetime == datetime_1 and exist_es:
        search_results = es_text.search(index=index_name, doc_type=flow_text_index_type,
                                        body=query_body)['hits']['hits']
    elif datetime != datetime_1 and exist_es_1:
        search_results = es_text.search(index=index_name_1, doc_type=flow_text_index_type,
                                        body=query_body)['hits']['hits']
    else:
        search_results = []
    uid_list = []
    results = []
    if search_results:
        for item in search_results:
            uid_list.append(item['_source']['uid'])
        if uid_list:
            portrait_result = es_profile.mget(index=profile_index_name, doc_type=profile_index_type,
                                              body={'ids': uid_list},
                                              fields=['nick_name', 'photo_url'])['docs']
        for i in range(len(uid_list)):
            item = search_results[i]['_source']
            temp = []
            # uid, nick_name, photo_url, text, sentiment, timestamp, geo, common_keywords, message_type
            temp.append(item['uid'])
            if portrait_result[i]['found']:
                temp.append(portrait_result[i]['fields']['nick_name'][0])
                temp.append(portrait_result[i]['fields']['photo_url'][0])
            else:
                temp.append('unknown')
                temp.append('')
            temp.append(item['text'])
            temp.append(item['sentiment'])
            temp.append(ts2date(item['timestamp']))
            temp.append(item['geo'])
            keywords_set = set(item['keywords_string'].split('&'))
            common_keywords = set(keywords_list) & keywords_set
            temp.append(list(common_keywords))
            temp.append(item['message_type'])
            results.append(temp)
    return results

def get_sensitive_weibo_detail(ts, social_sensors, sensitive_words_list, message_type, size=100):
    results = []
    query_body = {
        'query': {
            'filtered': {
                'filter': {
                    'bool': {
                        'must': [
                            {'range': {'timestamp': {'gte': ts - time_interval, 'lt': ts}}},
                            {'term': {'message_type': message_type}},
                            {'terms': {'keywords_string': sensitive_words_list}}
                        ]
                    }
                }
            }
        },
        'size': size,
        'sort': {'timestamp': {'order': 'desc'}}
    }
    if social_sensors:
        query_body['query']['filtered']['filter']['bool']['must'].append({'terms': {'uid': social_sensors}})
    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts - time_interval)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    index_name_1 = flow_text_index_name_pre + datetime_1
    exist_es_1 = es_text.indices.exists(index_name_1)
    if datetime == datetime_1 and exist_es:
        search_results = es_text.search(index=index_name, doc_type=flow_text_index_type,
                                        body=query_body)['hits']['hits']
    elif datetime != datetime_1 and exist_es_1:
        search_results = es_text.search(index=index_name_1, doc_type=flow_text_index_type,
                                        body=query_body)['hits']['hits']
    else:
        search_results = []
    uid_list = []
    if search_results:
        for item in search_results:
            uid_list.append(item['_source']['uid'])
        if uid_list:
            portrait_result = es_profile.mget(index=profile_index_name, doc_type=profile_index_type,
                                              body={'ids': uid_list},
                                              fields=['nick_name', 'photo_url'])['docs']
        for i in range(len(uid_list)):
            item = search_results[i]['_source']
            temp = []
            # uid, nick_name, photo_url, text, sentiment, timestamp, geo, common_keywords, message_type
            temp.append(item['uid'])
            if portrait_result[i]['found']:
                temp.append(portrait_result[i]['fields']['nick_name'][0])
                temp.append(portrait_result[i]['fields']['photo_url'][0])
            else:
                temp.append('unknown')
                temp.append('')
            temp.append(item['text'])
            temp.append(item['sentiment'])
            temp.append(ts2date(item['timestamp']))
            temp.append(item['geo'])
            keywords_set = set(item['keywords_string'].split('&'))
            common_keywords = set(sensitive_words_list) & keywords_set
            temp.append(list(common_keywords))
            temp.append(item['message_type'])
            results.append(temp)
    return results

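# Several functions above repeat the same index-selection branch. A
# hypothetical helper distilling that pattern, mirroring the branch logic
# exactly rather than extending it: a same-day window uses the current day's
# index; a window crossing midnight uses the previous day's index.
def select_flow_text_index(es_client, ts, interval):
    cur_date = ts2datetime(ts)
    prev_date = ts2datetime(ts - interval)
    if cur_date == prev_date and es_client.indices.exists(flow_text_index_name_pre + cur_date):
        return flow_text_index_name_pre + cur_date
    if cur_date != prev_date and es_client.indices.exists(flow_text_index_name_pre + prev_date):
        return flow_text_index_name_pre + prev_date
    return None
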
def search_sentiment_detail_in_domain(start_ts, task_type, task_detail, time_segment, sentiment, sort_type):
    results = {}
    start_ts = int(start_ts)
    start_date = ts2datetime(start_ts)
    end_ts = start_ts + str2segment[time_segment]
    print 'start_ts:', ts2date(start_ts)
    print 'end_ts:', ts2date(end_ts)
    if sentiment == '7':
        query_sentiment_list = SENTIMENT_SECOND
    else:
        query_sentiment_list = [sentiment]
    user_domain = task_detail
    #step1: iteratively fetch weibo and keep the users who belong to the domain
    in_user_result = {}
    all_filter_weibo_list = []
    sort_evaluate_max = SENTIMENT_SORT_EVALUATE_MAX
    flow_text_index_name = flow_text_index_name_pre + start_date
    print 'flow_text_index_name:', flow_text_index_name
    while len(in_user_result) < SENTIMENT_MAX_USER:
        print 'in_user_result:', len(in_user_result)
        print 'sort_evaluate_max:', sort_evaluate_max
        query_body = {
            'query':{
                'filtered':{
                    'filter':{
                        'bool':{
                            'must':[
                                {'range':{sort_type: {'lt': sort_evaluate_max}}},
                                {'terms':{'sentiment': query_sentiment_list}},
                                {'range':{'timestamp': {'gte': start_ts, 'lt': end_ts}}}
                            ]
                        }
                    }
                }
            },
            'sort': [{sort_type: {'order': 'desc'}}],
            'size': SENTIMENT_ITER_TEXT_COUNT
        }
        try:
            flow_text_result = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type,\
                    body=query_body)['hits']['hits']
        except:
            flow_text_result = []
        print 'len flow_text_result:', len(flow_text_result)
        if not flow_text_result:
            break
        weibo_list, user_set = deal_show_weibo_list(flow_text_result)
        #filter domain users
        filter_type = 'domain'
        print 'identify user portrait domain topic'
        in_portrait_result = identify_user_portrait_domain_topic(user_set, filter_type, user_domain)
        filter_weibo_list = filter_weibo_in(weibo_list, in_portrait_result)
        if filter_weibo_list:
            all_filter_weibo_list.extend(filter_weibo_list)
        if in_portrait_result:
            in_user_result = dict(in_user_result, **in_portrait_result)
        # page onward: the next iteration only sees weibo below the smallest sort value seen so far
        sort_evaluate_max = flow_text_result[-1]['_source'][sort_type]
    query_uid_list = in_user_result.keys()
    #step2: get keywords from flow_text
    print 'get keyword'
    keyword_query_dict = {
        'query':{
            'filtered':{
                'filter':{
                    'bool':{
                        'must':[
                            {'range':{'timestamp': {'gte': start_ts, 'lt': end_ts}}},
                            {'terms': {'uid': query_uid_list}}
                        ]
                    }
                }
            }
        },
        'aggs':{
            'all_interests':{
                'terms':{
                    'field': 'keywords_string',
                    'size': SENTIMENT_MAX_KEYWORDS
                }
            }
        }
    }
    show_keywords_dict = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type,\
            body=keyword_query_dict)['aggregations']['all_interests']['buckets']
    keywords_list = [[item['key'], item['doc_count']] for item in show_keywords_dict]
    #step3: assemble results
    results['weibo'] = all_filter_weibo_list
    results['in_portrait_result'] = sorted(in_user_result.items(), key=lambda x: x[1][1], reverse=True)
    results['keywords'] = keywords_list
    return results
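
# The while loop above implements "search after" by hand: each round queries
# weibo whose sort field is strictly below the smallest value already returned.
# A stripped-down generator sketch of that pattern (illustrative only; assumes
# this module's es_flow_text connection and flow_text_index_type constant):
def iter_weibo_by_sort_field(index_name, sort_field, start_max, batch_size):
    upper_bound = start_max
    while True:
        body = {
            'query': {'filtered': {'filter': {
                'range': {sort_field: {'lt': upper_bound}}}}},
            'sort': [{sort_field: {'order': 'desc'}}],
            'size': batch_size
        }
        hits = es_flow_text.search(index=index_name, doc_type=flow_text_index_type,
                                   body=body)['hits']['hits']
        if not hits:
            return
        for hit in hits:
            yield hit
        # lower the bound to just below the last hit, so pages never overlap
        upper_bound = hits[-1]['_source'][sort_field]
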
def search_task(task_name, submit_date, state, status, submit_user):
    query = []
    if task_name:
        task_name_list = task_name.split(' ')
        for item in task_name_list:
            query.append({'wildcard': {'task_name': '*' + item + '*'}})
    if submit_date:
        submit_date_start = datetime2ts(submit_date)
        submit_date_end = submit_date_start + DAY
        query.append({
            'range': {
                'submit_date': {
                    'gte': submit_date_start,
                    'lt': submit_date_end
                }
            }
        })
    if state:
        state_list = state.split(' ')
        for item in state_list:
            query.append({'wildcard': {'state': '*' + item + '*'}})
    if status:
        query.append({'match': {'status': status}})
    if submit_user:
        query.append({'term': {'submit_user': submit_user}})
    # only 'analysis' tasks are listed here; the group-result connection is used
    # whether or not any optional filter was supplied
    query.append({'term': {'task_type': 'analysis'}})
    source = es_group_result.search(index=group_index_name, doc_type=group_index_type, body={
        'query': {
            'bool': {
                'must': query
            }
        },
        'sort': [{
            'count': {
                'order': 'desc'
            }
        }],
        'size': MAX_VALUE
    })
    try:
        task_dict_list = source['hits']['hits']
    except:
        return None
    result = []
    for task_dict in task_dict_list:
        try:
            status = task_dict['_source']['status']
        except:
            status = 0
        result.append({
            'task_name': task_dict['_source']['task_name'],
            'submit_date': ts2date(task_dict['_source']['submit_date']),
            'group_count': task_dict['_source']['count'],
            'status': status
        })
    return result
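
# Usage sketch for search_task (argument values are hypothetical): match this
# user's analysis tasks whose name contains "event", submitted on 2016-11-27:
#     tasks = search_task('event', '2016-11-27', '', '', 'admin')
# Empty strings skip the corresponding filter, so
#     tasks = search_task('', '', '', '', '')
# lists every analysis task, sorted by group count in descending order.
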
def get_origin_weibo_detail(ts, user, task_name, size, order, message_type=1):
    _id = user + '-' + task_name
    task_detail = es_user_portrait.get(index=index_sensing_task, doc_type=_id, id=ts)['_source']
    if message_type == 1:
        weibo_detail = json.loads(task_detail['origin_weibo_detail'])
    elif message_type == 2:
        weibo_detail = json.loads(task_detail['retweeted_weibo_detail'])
    else:
        weibo_detail = {}  # guard against an unexpected message_type
    weibo_detail_list = []
    if weibo_detail:
        for iter_mid, item in weibo_detail.iteritems():
            tmp = []
            tmp.append(iter_mid)
            tmp.append(item[iter_mid])  # total count, keyed by the mid itself
            tmp.append(item['retweeted'])
            tmp.append(item['comment'])
            weibo_detail_list.append(tmp)
    mid_list = weibo_detail.keys()

    results = []
    query_body = {
        "query":{
            "filtered":{
                "filter":{
                    "terms": {"mid": mid_list}
                }
            }
        },
        "size": 1000,
        "sort": {"timestamp": {"order": "desc"}}
    }

    index_list = []
    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts-DAY)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    if exist_es:
        index_list.append(index_name)
    index_name_1 = flow_text_index_name_pre + datetime_1
    exist_es_1 = es_text.indices.exists(index_name_1)
    if exist_es_1:
        index_list.append(index_name_1)

    if index_list and mid_list:
        search_results = es_text.search(index=index_list, doc_type=flow_text_index_type, body=query_body)["hits"]["hits"]
    else:
        search_results = []

    uid_list = []
    text_dict = dict()      # weibo text, keyed by mid
    portrait_dict = dict()  # user profile information, keyed by uid
    if search_results:
        for item in search_results:
            uid_list.append(item["_source"]['uid'])
            text_dict[item['_id']] = item['_source']  # _id is the mid
        if uid_list:
            portrait_result = es_profile.mget(index=profile_index_name, doc_type=profile_index_type, body={"ids": uid_list}, fields=['nick_name', 'photo_url'])["docs"]
            for item in portrait_result:
                if item['found']:
                    portrait_dict[item['_id']] = {"nick_name": item["fields"]["nick_name"][0], "photo_url": item["fields"]["photo_url"][0]}
                else:
                    portrait_dict[item['_id']] = {"nick_name": item['_id'], "photo_url": ""}

    if order == "total":
        sorted_list = sorted(weibo_detail_list, key=lambda x: x[1], reverse=True)
    elif order == "retweeted":
        sorted_list = sorted(weibo_detail_list, key=lambda x: x[2], reverse=True)
    elif order == "comment":
        sorted_list = sorted(weibo_detail_list, key=lambda x: x[3], reverse=True)
    else:
        sorted_list = weibo_detail_list

    count_n = 0
    for item in sorted_list:
        mid = item[0]
        iter_text = text_dict.get(mid, {})
        temp = []
        # uid, nick_name, photo_url, text, sentiment, date, geo, message_type, retweeted, comment, timestamp
        if iter_text:
            uid = iter_text['uid']
            temp.append(uid)
            iter_portrait = portrait_dict.get(uid, {})
            if iter_portrait:
                temp.append(iter_portrait['nick_name'])
                temp.append(iter_portrait['photo_url'])
            else:
                temp.extend([uid, ''])
            temp.append(iter_text["text"])
            temp.append(iter_text["sentiment"])
            temp.append(ts2date(iter_text['timestamp']))
            temp.append(iter_text['geo'])
            temp.append(iter_text['message_type'])
            temp.append(item[2])
            temp.append(item[3])
            temp.append(iter_text['timestamp'])
            count_n += 1
            results.append(temp)
            if count_n == size:
                break

    if results and order == "ts":
        results = sorted(results, key=lambda x: x[-1], reverse=True)
    return results
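
# The order dispatch above can equivalently be written as a key-index table;
# an illustrative alternative (not the project's code), shown for comparison:
ORDER_KEY_INDEX = {'total': 1, 'retweeted': 2, 'comment': 3}

def sort_weibo_detail_list(weibo_detail_list, order):
    idx = ORDER_KEY_INDEX.get(order)
    if idx is None:
        return weibo_detail_list  # unknown order (e.g. "ts"): keep the incoming order
    return sorted(weibo_detail_list, key=lambda x: x[idx], reverse=True)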