def get_community_keyword(uid_list, date_time):
    # Aggregate keyword counts over the community's recent flow texts,
    # then re-rank them with TextRank to derive a community name.
    query_body = {
        'query': {
            'filtered': {
                'filter': {
                    'bool': {
                        'must': [{'terms': {'uid': uid_list}}]
                    }
                }
            }
        },
        'aggs': {
            'keywords': {
                'terms': {'field': 'keywords_string', 'size': 1000}
            }
        }
    }
    flow_text_index_name_list = get_flow_text_index_list(date_time)
    flow_text_exist = es_flow_text.search(index=flow_text_index_name_list, doc_type=flow_text_index_type,
                                          body=query_body)['aggregations']['keywords']['buckets']
    word_dict = dict()
    word_dict_new = dict()
    words = []
    for item in flow_text_exist:
        word_dict[item['key']] = item['doc_count']
        words.append(item['key'])
    keywords_string = '&'.join(words)
    k_dict = extract_keywords(keywords_string)
    for item_item in k_dict:
        keyword = item_item.word
        if keyword in word_dict:
            word_dict_new[keyword] = word_dict[keyword]
        else:
            word_dict_new[keyword] = 1
    keyword_dict = sorted(word_dict_new.items(), key=lambda d: d[1], reverse=True)
    try:
        keyword_name = keyword_dict[0][0] + '_' + keyword_dict[1][0]
    except IndexError:
        # Fewer than two keywords survived the re-ranking.
        keyword_name = 'X'
    return json.dumps(keyword_dict), keyword_name
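# Hedged usage sketch for get_community_keyword: the uids and the date are
# made-up placeholders, the call needs a reachable es_flow_text cluster with
# flow-text indices covering that day, and it assumes the project's
# datetime2ts helper is imported in this module.
def _demo_get_community_keyword():
    demo_uids = ['1234567890', '2345678901']
    demo_ts = datetime2ts('2016-11-27')
    keyword_json, community_name = get_community_keyword(demo_uids, demo_ts)
    print 'community keywords:', keyword_json
    print 'suggested community name:', community_name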
def get_user_keywords(uid, today_datetime):
    # Collect and deduplicate the '&'-joined keywords of one user's recent texts.
    flow_text_index_list = get_flow_text_index_list(today_datetime)
    query_body = {
        '_source': ['keywords_string'],
        'query': {
            'filtered': {
                'filter': {
                    'bool': {
                        'must': [
                            {'term': {'uid': uid}}
                        ]
                    }
                }
            }
        },
        'size': MAX_SEARCH_SIZE
    }
    results = es_flow_text.search(index=flow_text_index_list, doc_type=flow_text_index_type,
                                  body=query_body)['hits']['hits']
    keywords_list = []
    for item in results:
        keywords_list.extend(item['_source']['keywords_string'].split('&'))
    temp_keywords = list(set(keywords_list))
    keywords = '&'.join(temp_keywords)
    return keywords
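# Hedged usage sketch for get_user_keywords: the uid is a placeholder and the
# call needs live flow-text indices for the given day.
def _demo_get_user_keywords():
    import time
    print get_user_keywords('1234567890', int(time.time()))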
def followers_domain_update():
    if S_TYPE == 'test':
        current_time = datetime2ts(S_DATE)
    else:
        current_time = int(time.time())
    flow_text_index_name_list = get_flow_text_index_list(current_time)
    query_body = {'query': {'match_all': {}}, 'size': MAX_VALUE}
    search_results = es_xnr.search(index=weibo_xnr_fans_followers_index_name,
                                   doc_type=weibo_xnr_fans_followers_index_type,
                                   body=query_body)['hits']['hits']
    followers_list_all = []
    for result in search_results:
        result = result['_source']
        followers_list = result['followers_list']
        followers_list_all.extend(followers_list)
    followers_list_all_set_list = list(set(followers_list_all))
    uid_weibo_keywords_dict, keywords_dict_all_users = uid_list_2_uid_keywords_dict(
        followers_list_all_set_list, flow_text_index_name_list)
    # Guard against invalid uids in the follow lists; only recently active
    # uids are meaningful here.
    uids_active_list = uid_weibo_keywords_dict.keys()
    ## Domain classification
    r_domain = dict()
    print 'uids_active_list::', uids_active_list
    domain, r_domain = domain_classfiy(uids_active_list, uid_weibo_keywords_dict)
    print 'r_domain::', r_domain
    for uid, domain in r_domain.iteritems():
        domain_name = domain_en2ch_dict[domain]
        _id = uid
        try:
            print '_id:::', _id
            get_result = es_xnr.get(index=user_domain_index_name, doc_type=user_domain_index_type,
                                    id=_id)['_source']
            get_result['domain_name'] = domain_name
            get_result['update_time'] = int(time.time())
            es_xnr.update(index=user_domain_index_name, doc_type=user_domain_index_type,
                          id=_id, body={'doc': get_result})
        except:
            # The document does not exist yet: create it.
            item_dict = {}
            item_dict['uid'] = uid
            item_dict['domain_name'] = domain_name
            item_dict['update_time'] = int(time.time())
            es_xnr.index(index=user_domain_index_name, doc_type=user_domain_index_type,
                         id=_id, body=item_dict)
def caculate_sensitive_user(today_datetime):
    flow_text_index_list = get_flow_text_index_list(today_datetime)
    # Find the users with the highest accumulated sensitivity.
    query_body = {
        'query': {
            'filtered': {
                'filter': {
                    'bool': {
                        'must': [
                            {'range': {'sensitive': {'gt': 0}}}
                        ]
                    }
                }
            }
        },
        'aggs': {
            'user_sensitive_sum': {
                'terms': {'field': 'uid', 'size': MAX_CACULATE_USER_NUM,
                          'order': [{'sensitive_sum': 'desc'}]},
                'aggs': {
                    'sensitive_sum': {
                        'sum': {'field': 'sensitive'}
                    }
                }
            }
        }
    }
    sensitive_uid_info = []
    sensitive_uidlist = []
    try:
        sensitive_result = es_user_portrait.search(index=flow_text_index_list, doc_type=flow_text_index_type,
                                                   body=query_body)['aggregations']['user_sensitive_sum']['buckets']
        for i in xrange(0, len(sensitive_result)):
            user_sensitive = sensitive_result[i]['sensitive_sum']['value']
            user_dict = dict()
            user_dict['uid'] = sensitive_result[i]['key']
            user_dict['sensitive'] = user_sensitive
            sensitive_uid_info.append(user_dict)
            sensitive_uidlist.append(sensitive_result[i]['key'])
    except:
        sensitive_uid_info = []
        sensitive_uidlist = []
    return sensitive_uidlist, sensitive_uid_info
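# For reference, each bucket consumed by the loop above has this shape
# (values are illustrative, not real data):
#   {'key': '1234567890', 'doc_count': 12, 'sensitive_sum': {'value': 37.0}}
# 'key' is the uid and the nested sum aggregation carries the ranking value.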
def get_psy_feature_sort(uids_list, create_time):
    index_name_list = get_flow_text_index_list(create_time)
    query_body = {
        'query': {
            'filtered': {
                'filter': {
                    'terms': {'uid': uids_list}
                }
            }
        },
        'aggs': {
            'sentiment_all': {
                'terms': {'field': 'sentiment', 'size': MAX_SEARCH_SIZE}
            }
        }
    }
    es_sentiment_counts = es_flow_text.search(index=index_name_list, doc_type=flow_text_index_type,
                                              body=query_body)['aggregations']['sentiment_all']['buckets']
    sentiment_dict = dict()
    for item in es_sentiment_counts:
        sen_no = str(item['key'])
        sen_count = item['doc_count']
        sen_zh = SENTIMENT_DICT_NEW[sen_no]
        sentiment_dict[sen_zh] = sen_count
    sentiment_sort = sorted(sentiment_dict.items(), key=lambda x: x[1], reverse=True)
    return sentiment_sort
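# Hedged usage sketch for get_psy_feature_sort: the uid is a placeholder and
# the call needs reachable flow-text indices covering the given time.
def _demo_get_psy_feature_sort():
    import time
    ranked = get_psy_feature_sort(['1234567890'], int(time.time()))
    for sentiment_label, count in ranked:
        print sentiment_label, count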
import re
import sys
import time

from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan
from textrank4zh import TextRank4Keyword, TextRank4Sentence

sys.path.append('../')
from global_utils import es_flow_text, flow_text_index_name_pre, flow_text_index_type
from global_config import S_DATE
from time_utils import get_flow_text_index_list

abs_path = './'
K1 = 1.5
B = 0.75
K3 = 500
MAX_SIZE = 999999
OPINION_CLUSTER = 5
index_list = get_flow_text_index_list(
    int(time.mktime(time.strptime(S_DATE, "%Y-%m-%d"))))


## Preprocess weibo text.
def cut_filter(text):
    # Strip URLs and "shared from ..." suffixes before further processing.
    pattern_list = [r'\(分享自 .*\)', r'http://\w*']
    for i in pattern_list:
        p = re.compile(i)
        text = p.sub('', text)
    return text


def re_cut(w_text):
    # Filter irrelevant content out of the raw text according to a few rules.
    w_text = cut_filter(w_text)
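# Pure-regex demo of the preprocessing above; runs without Elasticsearch.
# The sample weibo text is made up for illustration.
def _demo_cut_filter():
    sample = '今天空气不错 http://t123abc (分享自 测试应用)'
    # The URL and the "(分享自 ...)" suffix are stripped, leaving the text body.
    print cut_filter(sample)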
def get_opinions(task_source, task_id, xnr_user_no, opinion_keywords_list,
                 opinion_type, intel_type):
    query_item = 'text'
    nest_query_list = []
    tweets_list = []
    # At least one keyword variant has to match.
    SHOULD_PERCENT = 1

    def build_query_body(uid_list=None):
        # Keyword should-clauses plus an optional terms filter that restricts
        # the search to a given uid population.
        body = {
            'query': {
                'bool': {
                    'should': nest_query_list,
                    'minimum_should_match': SHOULD_PERCENT
                }
            },
            'sort': {sort_item: {'order': 'desc'}},
            'size': MAX_SEARCH_SIZE
        }
        if uid_list is not None:
            body['query']['bool']['must'] = [{'terms': {'uid': uid_list}}]
        return body

    if task_source == 'weibo':
        if S_TYPE == 'test':
            current_time = datetime2ts(S_DATE)
        else:
            current_time = int(time.time())
        index_name_list = get_flow_text_index_list(current_time, days=5)
        sort_item = 'retweeted'
        for keyword in opinion_keywords_list:
            nest_query_list.append({'wildcard': {query_item: '*' + keyword + '*'}})
        uid_list = []
        if intel_type == 'all':
            query_body = build_query_body()
        elif intel_type == 'follow':
            try:
                follow_results = es_xnr.get(index=weibo_xnr_fans_followers_index_name,
                                            doc_type=weibo_xnr_fans_followers_index_type,
                                            id=xnr_user_no)['_source']
                # The fans/followers document stores the list under 'followers_list'.
                uid_list = follow_results.get('followers_list', [])
            except:
                uid_list = []
            query_body = build_query_body(uid_list)
        elif intel_type == 'influence':
            date = ts2datetime(current_time - 24 * 3600)
            if S_TYPE == 'test':
                date = S_DATE_BCI
            weibo_bci_index_name = weibo_bci_index_name_pre + date[:4] + date[5:7] + date[8:10]
            query_body_bci = {
                'query': {'match_all': {}},
                'sort': {'user_index': {'order': 'desc'}},
                'size': 500
            }
            weibo_bci_results = es_user_portrait.search(index=weibo_bci_index_name,
                                                        doc_type=weibo_bci_index_type,
                                                        body=query_body_bci)['hits']['hits']
            for bci_result in weibo_bci_results:
                uid_list.append(bci_result['_source']['user'])
            query_body = build_query_body(uid_list)
        else:
            query_sensitive = {
                'query': {'match_all': {}},
                'aggs': {
                    'uids': {
                        'terms': {'field': 'uid', 'order': {'avg_sensitive': 'desc'}},
                        'aggs': {'avg_sensitive': {'avg': {'field': 'sensitive'}}}
                    }
                },
                'size': 500000
            }
            es_sensitive_result = es_flow_text.search(index=index_name_list, doc_type='text',
                                                      body=query_sensitive)['aggregations']['uids']['buckets']
            for item in es_sensitive_result:
                uid_list.append(item['key'])
            query_body = build_query_body(uid_list)
        # Collect the matched texts.
        tweets_results = es_flow_text.search(index=index_name_list, doc_type='text',
                                             body=query_body)['hits']['hits']
        for item in tweets_results:
            tweets_list.append(item['_source']['text'])
    else:
        if S_TYPE == 'test':
            current_time = datetime2ts(S_DATE_FB)
        else:
            current_time = int(time.time())
        uid_list = []
        sort_item = 'share'
        # Texts may contain English or traditional Chinese, so match all variants.
        opinion_keywords_list = [word.encode('utf-8') for word in opinion_keywords_list]
        en_keywords_list = trans(opinion_keywords_list, target_language='en')
        for i in range(len(opinion_keywords_list)):
            keyword = opinion_keywords_list[i].decode('utf-8')
            traditional_keyword = simplified2traditional(keyword)
            if len(en_keywords_list) == len(opinion_keywords_list):  # translation succeeded
                en_keyword = en_keywords_list[i]
                nest_query_list.append({'wildcard': {query_item: '*' + en_keyword + '*'}})
            nest_query_list.append({'wildcard': {query_item: '*' + keyword + '*'}})
            nest_query_list.append({'wildcard': {query_item: '*' + traditional_keyword + '*'}})
        if task_source == 'facebook':
            index_name_list = fb_get_flow_text_index_list(current_time, days=5)
            if intel_type == 'all':
                query_body = build_query_body()
            elif intel_type == 'follow':
                try:
                    follow_results = es_xnr.get(index=fb_xnr_fans_followers_index_name,
                                                doc_type=fb_xnr_fans_followers_index_type,
                                                id=xnr_user_no)['_source']
                    uid_list = follow_results.get('fans_list', [])
                except:
                    uid_list = []
                query_body = build_query_body(uid_list)
            elif intel_type == 'influence':
                fb_bci_index_name = fb_bci_index_name_pre + ts2datetime(current_time)
                query_body_bci = {
                    'query': {'match_all': {}},
                    'sort': {'influence': {'order': 'desc'}},
                    'size': 500
                }
                fb_bci_results = es_xnr.search(index=fb_bci_index_name, doc_type=fb_bci_index_type,
                                               body=query_body_bci)['hits']['hits']
                for bci_result in fb_bci_results:
                    uid_list.append(bci_result['_source']['uid'])
                query_body = build_query_body(uid_list)
            else:
                query_sensitive = {
                    'query': {'match_all': {}},
                    'aggs': {
                        'uids': {
                            'terms': {'field': 'uid', 'order': {'avg_sensitive': 'desc'}},
                            'aggs': {'avg_sensitive': {'avg': {'field': 'sensitive'}}}
                        }
                    },
                    'size': 500
                }
                es_sensitive_result = es_xnr.search(index=index_name_list, doc_type='text',
                                                    body=query_sensitive)['aggregations']['uids']['buckets']
                for item in es_sensitive_result:
                    uid_list.append(item['key'])
                query_body = build_query_body(uid_list)
            tweets_results = es_xnr.search(index=index_name_list, doc_type='text',
                                           body=query_body)['hits']['hits']
            for item in tweets_results:
                tweets_list.append(item['_source']['text'])
        else:
            index_name_list = tw_get_flow_text_index_list(current_time, days=5)
            if intel_type == 'all':
                query_body = build_query_body()
            elif intel_type == 'follow':
                try:
                    follow_results = es_xnr.get(index=tw_xnr_fans_followers_index_name,
                                                doc_type=tw_xnr_fans_followers_index_type,
                                                id=xnr_user_no)['_source']
                    uid_list = follow_results.get('followers_list', [])
                except:
                    uid_list = []
                query_body = build_query_body(uid_list)
            elif intel_type == 'influence':
                tw_bci_index_name = tw_bci_index_name_pre + ts2datetime(current_time)
                query_body_bci = {
                    'query': {'match_all': {}},
                    'sort': {'influence': {'order': 'desc'}},
                    'size': 500
                }
                tw_bci_results = es_xnr.search(index=tw_bci_index_name, doc_type=tw_bci_index_type,
                                               body=query_body_bci)['hits']['hits']
                for bci_result in tw_bci_results:
                    uid_list.append(bci_result['_source']['uid'])
                query_body = build_query_body(uid_list)
            else:
                query_sensitive = {
                    'query': {'match_all': {}},
                    'aggs': {
                        'uids': {
                            'terms': {'field': 'uid', 'order': {'avg_sensitive': 'desc'}},
                            'aggs': {'avg_sensitive': {'avg': {'field': 'sensitive'}}}
                        }
                    },
                    'size': 500
                }
                es_sensitive_result = es_xnr.search(index=index_name_list, doc_type='text',
                                                    body=query_sensitive)['aggregations']['uids']['buckets']
                for item in es_sensitive_result:
                    uid_list.append(item['key'])
                query_body = build_query_body(uid_list)
            print 'index_name_list...', index_name_list
            print 'query_body........', query_body
            tweets_results = es_xnr.search(index=index_name_list, doc_type='text',
                                           body=query_body)['hits']['hits']
            for item in tweets_results:
                tweets_list.append(item['_source']['text'])

    if tweets_list:
        opinion_name, word_result, text_list = opinion_main(tweets_list, k_cluster=5)
        sub_opinion_results = dict()
        topic_keywords_list = []
        summary_text_list = []
        for topic, text in text_list.iteritems():
            topic_name = opinion_name[topic]
            sub_opinion_results[topic_name] = text[:SUB_OPINION_WEIBO_LIMIT]
            topic_keywords_list.extend(topic_name.split('&'))
            summary_text_list.extend(text)
        print 'summary_text_list..', len(summary_text_list)
        print 'topic_keywords_list..', topic_keywords_list
        summary = text_generation_main(summary_text_list, topic_keywords_list)
        # summary = summary_main(summary_text_list)  # alternative summarizer
    else:
        sub_opinion_results = {}
        summary = ''
    print 'Saving sub-opinion results......'
    print 'summary....', summary
    mark = save_intelligent_opinion_results(task_id, sub_opinion_results, summary, intel_type)
    return mark
def find_flow_texts(task_source, task_id, event_keywords):
    # Build nest_query_list from the '&'-separated event keywords.
    nest_query_list = []
    keywords_list = event_keywords.split('&')
    keywords_list = [word.encode('utf-8') for word in keywords_list]
    query_item = 'text'
    if task_source != 'weibo':
        # Texts may contain English or traditional Chinese, so match all variants.
        en_keywords_list = trans(keywords_list, target_language='en')
        for i in range(len(keywords_list)):
            keyword = keywords_list[i].decode('utf-8')
            traditional_keyword = simplified2traditional(keyword)
            if len(en_keywords_list) == len(keywords_list):  # translation succeeded
                en_keyword = en_keywords_list[i]
                nest_query_list.append({'wildcard': {query_item: '*' + en_keyword + '*'}})
            nest_query_list.append({'wildcard': {query_item: '*' + keyword + '*'}})
            nest_query_list.append({'wildcard': {query_item: '*' + traditional_keyword + '*'}})
    else:
        for keyword in keywords_list:
            nest_query_list.append({'wildcard': {query_item: '*' + keyword + '*'}})
    # At least one keyword variant has to match.
    SHOULD_PERCENT = 1
    # Match the texts.
    if task_source == 'weibo':
        sort_item = 'retweeted'
        if S_TYPE == 'test':
            current_time = datetime2ts(S_DATE)
        else:
            current_time = int(time.time() + 24 * 3600)
        index_name_list = get_flow_text_index_list(current_time, days=2)
        es_name = es_flow_text
    elif task_source == 'facebook':
        sort_item = 'share'
        if S_TYPE == 'test':
            current_time = datetime2ts(S_DATE_FB)
        else:
            current_time = int(time.time() + 24 * 3600)
        index_name_list = fb_get_flow_text_index_list(current_time, days=2)
        es_name = es_xnr
    else:
        sort_item = 'share'
        if S_TYPE == 'test':
            current_time = datetime2ts(S_DATE_TW)
        else:
            current_time = int(time.time() + 24 * 3600)
        index_name_list = tw_get_flow_text_index_list(current_time, days=2)
        es_name = es_xnr
    query_body = {
        'query': {
            'bool': {
                'should': nest_query_list,
                'minimum_should_match': SHOULD_PERCENT
            }
        },
        'sort': {sort_item: {'order': 'desc'}},
        'size': 100000
    }
    print 'es_name...', es_name
    print 'index_name_list..', index_name_list
    search_results = es_name.search(index=index_name_list, doc_type='text',
                                    body=query_body)['hits']['hits']
    print 'len..search_results..', len(search_results)
    save2topic_es(task_source, task_id, search_results)
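# For reference, for the keywords '雾霾' and 'smog' (illustrative) the
# should-clause list built above looks like:
#   [{'wildcard': {'text': '*雾霾*'}}, {'wildcard': {'text': '*smog*'}}]
# With minimum_should_match = 1, a text matches if it contains at least one
# of the keyword variants.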
def get_related_recommendation(task_detail):
    avg_sort_uid_dict = {}
    xnr_user_no = task_detail['xnr_user_no']
    sort_item = task_detail['sort_item']
    es_result = es.get(index=weibo_xnr_index_name, doc_type=weibo_xnr_index_type,
                       id=xnr_user_no)['_source']
    uid = es_result['uid']
    monitor_keywords = es_result['monitor_keywords']
    monitor_keywords_list = monitor_keywords.split(',')
    nest_query_list = []
    for monitor_keyword in monitor_keywords_list:
        nest_query_list.append({'wildcard': {'keywords_string': '*' + monitor_keyword + '*'}})
    try:
        recommend_list = es.get(index=weibo_xnr_fans_followers_index_name,
                                doc_type=weibo_xnr_fans_followers_index_type,
                                id=xnr_user_no)['_source']['followers_list']
    except:
        recommend_list = []
    recommend_set_list = list(set(recommend_list))
    if S_TYPE == 'test':
        current_date = S_DATE
    else:
        current_date = ts2datetime(int(time.time() - 24 * 3600))
    flow_text_index_name = flow_text_index_name_pre + current_date
    if sort_item != 'friend':
        uid_list = []
        if sort_item == 'influence':
            sort_item = 'user_fansnum'
        query_body_rec = {
            'query': {
                'bool': {
                    'should': nest_query_list
                }
            },
            'aggs': {
                'uid_list': {
                    'terms': {'field': 'uid', 'size': TOP_ACTIVE_SOCIAL,
                              'order': {'avg_sort': 'desc'}},
                    'aggs': {'avg_sort': {'avg': {'field': sort_item}}}
                }
            }
        }
        es_rec_result = es_flow_text.search(index=flow_text_index_name, doc_type='text',
                                            body=query_body_rec)['aggregations']['uid_list']['buckets']
        for item in es_rec_result:
            uid = item['key']
            uid_list.append(uid)
            avg_sort_uid_dict[uid] = {}
            if sort_item == 'user_fansnum':
                avg_sort_uid_dict[uid]['sort_item_value'] = int(item['avg_sort']['value'])
            else:
                avg_sort_uid_dict[uid]['sort_item_value'] = round(item['avg_sort']['value'], 2)
    else:
        if S_TYPE == 'test':
            uid_list = FRIEND_LIST
        else:
            uid_list = []
        friends_list = get_friends_list(recommend_set_list)
        friends_set_list = list(set(friends_list))
        sort_item_new = 'fansnum'
        query_body_rec = {
            'query': {
                'bool': {
                    'must': [
                        {'terms': {'uid': friends_set_list}},
                        {'bool': {'should': nest_query_list}}
                    ]
                }
            },
            'aggs': {
                'uid_list': {
                    'terms': {'field': 'uid', 'size': TOP_ACTIVE_SOCIAL,
                              'order': {'avg_sort': 'desc'}},
                    'aggs': {'avg_sort': {'avg': {'field': sort_item_new}}}
                }
            }
        }
        es_friend_result = es_flow_text.search(index=flow_text_index_name, doc_type='text',
                                               body=query_body_rec)['aggregations']['uid_list']['buckets']
        for item in es_friend_result:
            uid = item['key']
            uid_list.append(uid)
            avg_sort_uid_dict[uid] = {}
            if not item['avg_sort']['value']:
                avg_sort_uid_dict[uid]['sort_item_value'] = 0
            else:
                avg_sort_uid_dict[uid]['sort_item_value'] = int(item['avg_sort']['value'])
    results_all = []
    for uid in uid_list:
        query_body = {
            'query': {
                'filtered': {
                    'filter': {
                        'term': {'uid': uid}
                    }
                }
            }
        }
        es_results = es_user_portrait.search(index=portrait_index_name, doc_type=portrait_index_type,
                                             body=query_body)['hits']['hits']
        if es_results:
            for item in es_results:
                uid = item['_source']['uid']
                item['_source']['nick_name'] = uid  # nick_name lookup disabled
                item['_source']['photo_url'] = ''   # photo_url lookup disabled
                weibo_type = judge_follow_type(xnr_user_no, uid)
                sensor_mark = judge_sensing_sensor(xnr_user_no, uid)
                item['_source']['weibo_type'] = weibo_type
                item['_source']['sensor_mark'] = sensor_mark
                try:
                    del item['_source']['group']
                    del item['_source']['activity_geo_dict']
                except:
                    pass
                if sort_item == 'friend':
                    if S_TYPE != 'test':
                        item['_source']['fansnum'] = avg_sort_uid_dict[uid]['sort_item_value']
                elif sort_item == 'sensitive':
                    item['_source']['sensitive'] = avg_sort_uid_dict[uid]['sort_item_value']
                else:
                    item['_source']['fansnum'] = avg_sort_uid_dict[uid]['sort_item_value']
                if S_TYPE == 'test':
                    current_time = datetime2ts(S_DATE)
                else:
                    current_time = int(time.time())
                index_name = get_flow_text_index_list(current_time)
                query_body = {
                    'query': {
                        'bool': {
                            'must': [
                                {'term': {'uid': uid}},
                                {'terms': {'message_type': [1, 3]}}
                            ]
                        }
                    },
                    'sort': {'retweeted': {'order': 'desc'}},
                    'size': 5
                }
                es_weibo_results = es_flow_text.search(index=index_name, doc_type=flow_text_index_type,
                                                       body=query_body)['hits']['hits']
                weibo_list = []
                for weibo in es_weibo_results:
                    weibo_list.append(weibo['_source'])
                item['_source']['weibo_list'] = weibo_list
                item['_source']['portrait_status'] = True
                results_all.append(item['_source'])
        else:
            item_else = dict()
            item_else['uid'] = uid
            item_else['nick_name'] = uid  # nick_name lookup disabled
            item_else['photo_url'] = ''   # photo_url lookup disabled
            weibo_type = judge_follow_type(xnr_user_no, uid)
            sensor_mark = judge_sensing_sensor(xnr_user_no, uid)
            item_else['weibo_type'] = weibo_type
            item_else['sensor_mark'] = sensor_mark
            item_else['portrait_status'] = False
            if S_TYPE == 'test':
                current_time = datetime2ts(S_DATE)
            else:
                current_time = int(time.time())
            index_name = get_flow_text_index_list(current_time)
            query_body = {
                'query': {
                    'term': {'uid': uid}
                },
                'sort': {'retweeted': {'order': 'desc'}}
            }
            es_weibo_results = es_flow_text.search(index=index_name, doc_type=flow_text_index_type,
                                                   body=query_body)['hits']['hits']
            weibo_list = []
            for weibo in es_weibo_results:
                item_else['fansnum'] = weibo['_source']['user_fansnum']
                weibo_list.append(weibo['_source'])
            item_else['weibo_list'] = weibo_list
            item_else['friendsnum'] = 0
            item_else['statusnum'] = 0
            if sort_item == 'sensitive':
                item_else['sensitive'] = avg_sort_uid_dict[uid]['sort_item_value']
            else:
                item_else['fansnum'] = avg_sort_uid_dict[uid]['sort_item_value']
            results_all.append(item_else)
    return results_all
sensor_mark = ''
item['_source']['weibo_type'] = weibo_type
item['_source']['sensor_mark'] = sensor_mark
try:
    del item['_source']['group']
    del item['_source']['activity_geo_dict']
except:
    pass
item['_source']['user_index'] = avg_sort_uid_dict[uid]['sort_item_value']
current_time = datetime2ts(current_date)
index_name = get_flow_text_index_list(current_time)
query_body = {
    'query': {
        'bool': {
            'must': [
                {'term': {'uid': uid}},
                {'terms': {'message_type': [1, 3]}}
            ]
        }
    },
    'sort': {'retweeted': {'order': 'desc'}},
    'size': 5
}
es_weibo_results = es_flow_text.search(index=index_name, doc_type=flow_text_index_type,
                                       body=query_body)['hits']['hits']
def compute_penetration_num(xnr_user_no):
    if S_TYPE == 'test':
        current_time = datetime2ts(S_DATE) - DAY
    else:
        current_time = time.time() - DAY
    current_date = ts2datetime(current_time)
    timestamp = datetime2ts(current_date)
    # Find the top sensitive users.
    query_body = {
        'query': {'match_all': {}},
        'sort': {'sensitive': {'order': 'desc'}},
        'size': TOP_ASSESSMENT_NUM
    }
    top_sensitive_users = es_user_portrait.search(index=portrait_index_name, doc_type=portrait_index_type,
                                                  body=query_body)['hits']['hits']
    top_sensitive_uid_list = []
    for user in top_sensitive_users:
        top_sensitive_uid_list.append(user['_source']['uid'])
    # Average weibo sensitivity of the top sensitive users.
    query_body_count = {
        'query': {
            'filtered': {
                'filter': {
                    'terms': {'uid': top_sensitive_uid_list}
                }
            }
        },
        'aggs': {
            'avg_sensitive': {
                'avg': {'field': 'sensitive'}
            }
        }
    }
    if S_TYPE == 'test':
        index_name = get_flow_text_index_list(timestamp)
    else:
        index_name = flow_text_index_name_pre + current_date
    es_sensitive_result = es_flow_text.search(index=index_name, doc_type=flow_text_index_type,
                                              body=query_body_count)['aggregations']
    sensitive_value_top_avg = es_sensitive_result['avg_sensitive']['value']
    if S_TYPE == 'test':
        if not sensitive_value_top_avg:
            sensitive_value_top_avg = 1
    print 'es_sensitive_result::', es_sensitive_result
    # Sensitivity of the feedback the xnr receives; fall back to fixed baseline
    # values when the per-xnr statistics are unavailable.
    try:
        feedback_mark_at = get_pene_feedback_sensitive(xnr_user_no, 'be_at')['sensitive_info'][timestamp]
        feedback_mark_retweet = get_pene_feedback_sensitive(xnr_user_no, 'be_retweet')['sensitive_info'][timestamp]
        feedback_mark_comment = get_pene_feedback_sensitive(xnr_user_no, 'be_comment')['sensitive_info'][timestamp]
    except:
        feedback_mark_at = 0.0839
        feedback_mark_retweet = 0.1199
        feedback_mark_comment = 0.01311
    # Penetration score: total feedback sensitivity relative to the average
    # sensitivity of the top sensitive users, scaled to a percentage.
    pene_mark = 100 * float(feedback_mark_at + feedback_mark_retweet + feedback_mark_comment) / sensitive_value_top_avg
    pene_mark = round(pene_mark, 2)
    return pene_mark
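# Worked example with made-up numbers: feedback marks 0.08 (at), 0.12 (retweet)
# and 0.01 (comment) against a top-user average sensitivity of 1.5 give
# pene_mark = 100 * (0.08 + 0.12 + 0.01) / 1.5 = 14.0.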
def compute_recommend_subopnion(task_detail):
    print 'Starting analysis......'
    task_id = task_detail['task_id'].strip('"')
    keywords_string = task_detail['keywords_string']
    keywords_list = keywords_string.split('&')  # keywords are '&'-separated
    xnr_user_no = task_detail['xnr_user_no']
    mid = task_detail['mid']
    query_item = 'keywords_string'
    nest_query_list = []
    for keyword in keywords_list:
        nest_query_list.append({'wildcard': {query_item: '*' + keyword + '*'}})
    if S_TYPE == 'test':
        create_time = datetime2ts(S_DATE)
    else:
        create_time = datehour2ts(ts2datehour(time.time() - 3600))
    index_name_list = get_flow_text_index_list(create_time)
    print 'index_name_list::', index_name_list
    es_results = es_flow_text.search(index=index_name_list, doc_type='text',
                                     body={'query': {'bool': {'must': nest_query_list}},
                                           'size': MAX_SEARCH_SIZE})['hits']['hits']
    # Input texts shared by content recommendation and sub-opinion analysis.
    weibo_list = []
    if es_results:
        for item in es_results:
            weibo_list.append(item['_source']['text'])
    ## Content recommendation: get the list of recommended sentences.
    print 'Starting content recommendation......'
    if weibo_list:
        content_results = summary_main(weibo_list)
    else:
        content_results = []
    print 'Saving content recommendation results......'
    mark = save_content_recommendation_results(xnr_user_no, mid, task_id.encode('utf-8'), content_results)
    print 'mark_content:::', mark
    if mark == False:
        print 'Saving the content recommendation results failed; pushing the task back onto the queue'
        add_task_2_queue(keyword_task_queue_name, task_detail)
    else:
        print 'Content recommendation results saved......'
    ## Sub-opinion analysis
    # Input:
    #   weibo_data: list of weibo texts, [weibo1, weibo2, ...]
    #   k_cluster: number of sub-topics (default 5)
    # Output:
    #   opinion_name: sub-topic names, {topic1: name1, topic2: name2, ...}
    #   word_result: sub-topic keyword pairs, {topic1: [w1, w2, ...], ...}
    #   text_list: texts per sub-topic, {topic1: [text1, text2, ...], ...}
    print 'Starting sub-opinion analysis......'
    if weibo_list:
        opinion_name, word_result, text_list = opinion_main(weibo_list, k_cluster=5)
        sub_opinion_results = dict()
        for topic, text in text_list.iteritems():
            topic_name = opinion_name[topic]
            sub_opinion_results[topic_name] = text[:SUB_OPINION_WEIBO_LIMIT]
    else:
        sub_opinion_results = {}
    print 'Saving sub-opinion results......'
    mark = save_subopnion_results(xnr_user_no, mid, task_id, sub_opinion_results)
    print 'mark_opinion:::', mark
    if mark == False:
        print 'Saving the sub-opinion results failed; pushing the task back onto the queue'
        add_task_2_queue(keyword_task_queue_name, task_detail)
    else:
        print 'Sub-opinion results saved......'