def get_group_list(task_name):
    """Fetch the uid list stored under *task_name* and return one row per
    user: [uid, uname, gender, location, importance, influence], with the
    two scores log-rescaled to 0-100 against the corpus maxima.

    Returns [] when the task document is absent; users with incomplete
    portrait documents collapse to a bare [uid] row.
    """
    rows = []
    try:
        task_source = es.get(index=index_name, doc_type=index_type,
                             id=task_name)['_source']
    except:
        return rows
    uids = task_source['uid_list']
    docs = es.mget(index='user_portrait', doc_type='user',
                   body={'ids': uids})['docs']
    evaluate_max = get_evaluate_max()
    for doc in docs:
        doc_uid = doc['_id']
        try:
            src = doc['_source']
            uname = src['uname']
            gender = src['gender']
            location = src['location']
            # Log curve maps raw score in [0, max] onto [0, 100].
            normal_importance = math.log(
                src['importance'] / evaluate_max['importance'] * 9 + 1,
                10) * 100
            normal_influence = math.log(
                src['influence'] / evaluate_max['influence'] * 9 + 1,
                10) * 100
            rows.append([doc_uid, uname, gender, location,
                         normal_importance, normal_influence])
        except:
            rows.append([doc_uid])
    return rows
def get_group_list(task_name):
    """Return [uid, uname, gender, location, importance, influence] rows
    for every uid stored in the group task *task_name*.

    Scores are rescaled to 0-100 on a log curve against the corpus-wide
    maxima from get_evaluate_max().  Returns [] when the task document is
    missing; users whose portrait lacks any field yield a bare [uid] row.
    """
    results = []
    try:
        es_results = es.get(index=index_name, doc_type=index_type,
                            id=task_name)['_source']
    except:
        # Task not found (or ES error): nothing to report.
        return results
    uid_list = es_results['uid_list']
    user_portrait_attribute = es.mget(index='user_portrait',
                                      doc_type='user',
                                      body={'ids': uid_list})['docs']
    evaluate_max = get_evaluate_max()
    for item in user_portrait_attribute:
        uid = item['_id']
        try:
            source = item['_source']
            uname = source['uname']
            gender = source['gender']
            location = source['location']
            importance = source['importance']
            # log10(x/max * 9 + 1) * 100 maps [0, max] onto [0, 100].
            normal_importance = math.log(
                importance / evaluate_max['importance'] * 9 + 1, 10) * 100
            influence = source['influence']
            normal_influence = math.log(
                influence / evaluate_max['influence'] * 9 + 1, 10) * 100
            results.append([
                uid, uname, gender, location, normal_importance,
                normal_influence
            ])
        except:
            # Missing document or missing field: record the uid alone.
            results.append([uid])
    return results
def test_influence_rank(domain, date, order):
    """Rank the top-100 bci users on index *date* by the metric mapped from
    *order*, joined with their sensitive-portrait attributes.

    Each result row: [score, uid, uname, photo_url, activeness, importance,
    sensitive], sorted by score descending.
    """
    uid_list = domain_dict[domain]
    order = str(order)
    query_body = {
        "query": {
            "match_all": {}
        },
        "sort": {search_order[order]: {"order": "desc"}},
        "size": 100
    }
    search_result = es.search(index=date, doc_type="bci",
                              body=query_body)['hits']['hits']
    portrait_result = es.mget(index='sensitive_user_portrait',
                              doc_type='user',
                              body={"ids": uid_list})['docs']
    results = []
    for i in range(len(search_result)):
        detail = []
        try:
            detail.append(search_result[i]['_source'][search_order[order]])
        except:
            # Hit lacks the sort field: log the uid and score it 0.
            print uid_list[i]
            detail.append(0)
        try:
            temp = portrait_result[i]['_source']
        except:
            # No portrait document for this position: skip the row.
            continue
        detail.extend([temp['uid'], temp['uname'], temp['photo_url'],
                       temp['activeness'], temp['importance'],
                       temp['sensitive']])
        results.append(detail)
    sorted_list = sorted(results, key=lambda x: x[0], reverse=True)
    return sorted_list
def search_domain(domain, date, order, number=100):
    """Return up to *number* top-influence users (sorted by the bci field
    mapped from *order* on *date*) whose portrait ``topic_string`` contains
    *domain*.

    Each row: [uid, uname, photo_url, influence, sensitive, importance,
    activeness].

    Fixes two defects in the original:
    * the ``while 1`` loop re-ran the same size-1 query forever — the page
      size ``1 + count`` was computed once before the loop and the inner
      ``break`` only left the ``for``; we now page through ES with
      from/size and stop when the index is exhausted or enough matches
      were collected;
    * documents missing from the portrait index raised KeyError on
      ``_source``; they are now skipped.
    """
    number = int(number)
    result_list = []
    date = str(date).replace('-', '')
    order = str(order)
    page_size = 100
    offset = 0
    query_body = {
        "query": {"match_all": {}},
        "sort": {search_order[order]: {"order": "desc"}},
        "size": page_size,
    }
    while len(result_list) < number:
        query_body["from"] = offset
        search_results = es.search(index=date, doc_type='bci',
                                   body=query_body)['hits']['hits']
        if not search_results:
            break  # bci index exhausted
        offset += len(search_results)
        uid_list = [item['_id'] for item in search_results]
        portrait_results = es.mget(index='sensitive_user_portrait',
                                   doc_type='user',
                                   body={"ids": uid_list})['docs']
        for item in portrait_results:
            if not item.get('found'):
                continue  # user has no sensitive portrait document
            source = item['_source']
            domain_list = source['topic_string'].split('&')  # attention
            if domain in set(domain_list):
                result_list.append([
                    item['_id'], source['uname'], source['photo_url'],
                    source['influence'], source['sensitive'],
                    source['importance'], source['activeness']
                ])
                if len(result_list) >= number:
                    break
    return result_list
def identify_uid_list_in(uid_list):
    """Return the subset of *uid_list* that exists in the
    sensitive_user_portrait index, as a set."""
    docs = es.mget(index='sensitive_user_portrait', doc_type="user",
                   body={'ids': uid_list})['docs']
    # mget marks missing ids with found == False.
    return set(doc['_id'] for doc in docs if doc['found'])
def identify_uid_list_in(uid_list):
    """Return the set of uids from *uid_list* that have a document in the
    sensitive_user_portrait index."""
    in_set = set()
    search_result = es.mget(index="sensitive_user_portrait",
                            doc_type="user",
                            body={"ids": uid_list})["docs"]
    for item in search_result:
        # mget returns found=False stubs for missing ids.
        if item["found"]:
            in_set.add(item["_id"])
    return in_set
def identify_uid_list_in(uid_list):
    """Collect the uids in *uid_list* that are present in
    sensitive_user_portrait."""
    found_uids = set()
    hits = es.mget(index='sensitive_user_portrait', doc_type="user",
                   body={'ids': uid_list})['docs']
    for hit in hits:
        if not hit['found']:
            continue
        found_uids.add(hit['_id'])
    return found_uids
def get_user_info(uid_list):
    """Map uid -> [uid, uname, domain, sensitive, importance, influence,
    activeness] for every uid in *uid_list* with a complete portrait
    document.

    Fix: the original indexed ``item['_source']`` without checking
    ``found``, so one uid absent from the index raised KeyError and
    aborted the whole call.  Missing or incomplete documents are now
    skipped.
    """
    results = {}
    fields = ['uid', 'uname', 'domain', 'sensitive', 'importance',
              'influence', 'activeness']
    search_results = es.mget(index='sensitive_user_portrait',
                             doc_type='user',
                             body={'ids': uid_list})['docs']
    for item in search_results:
        source = item.get('_source')
        if not source:
            continue  # uid not present in the portrait index
        try:
            results[item['_id']] = [source[field] for field in fields]
        except KeyError:
            continue  # portrait document lacks one of the fields
    return results
def search_follower(uid, sensitive):
    """Top-20 users who retweet *uid*, aggregated across every redis shard
    in R_DICT; *sensitive* switches to the sensitive-weibo hash keys.

    Returns ``[rows, total]`` where each row is
    ``[uid, [uname, count, in_portrait]]``, or ``[None, 0]`` when nobody
    retweeted *uid*.

    Cleanups vs the original: removed the unused ``results`` dict, stopped
    shadowing the ``uid`` parameter inside the result loop, narrowed the
    bare excepts to KeyError, and dropped the pointless try around
    ``sorted`` (it cannot fail on a non-empty dict).
    """
    stat_results = dict()
    for db_num in R_DICT:
        r = R_DICT[db_num]
        if sensitive:
            br_uid_results = r.hgetall('sensitive_be_retweet_' + str(uid))
        else:
            br_uid_results = r.hgetall('be_retweet_' + str(uid))
        if not br_uid_results:
            continue
        for br_uid in br_uid_results:
            if br_uid == uid:
                continue  # ignore self-retweets
            try:
                stat_results[br_uid] += br_uid_results[br_uid]
            except KeyError:
                stat_results[br_uid] = br_uid_results[br_uid]
    if not stat_results:
        return [None, 0]
    sort_stat_results = sorted(stat_results.items(), key=lambda x: x[1],
                               reverse=True)[:20]
    uid_list = [item[0] for item in sort_stat_results]
    es_profile_results = es_user_profile.mget(
        index='weibo_user', doc_type='user',
        body={'ids': uid_list})['docs']
    es_portrait_results = es.mget(
        index='sensitive_user_portrait', doc_type='user',
        body={'ids': uid_list})['docs']
    result_list = []
    for i in range(len(es_profile_results)):
        item = es_profile_results[i]
        follower_uid = item['_id']
        try:
            uname = item['_source']['nick_name']
        except KeyError:
            uname = u'unknown'
        # Missing portrait docs carry no '_source' key.
        in_status = 1 if '_source' in es_portrait_results[i] else 0
        result_list.append(
            [follower_uid, [uname, stat_results[follower_uid], in_status]])
    return [result_list[:20], len(stat_results)]
def search_retweet(uid, sensitive):
    """Top-20 users whom *uid* retweets, aggregated across every redis
    shard in R_DICT.

    Returns ``[rows, total]`` with rows shaped
    ``[uid, [uname, count, in_portrait]]``, or ``[None, 0]`` when *uid*
    retweeted nobody.

    Fix: ``dict.has_key()`` was removed in Python 3 — replaced with the
    ``in`` operator; also dropped the unused ``results`` dict.
    """
    stat_results = dict()
    for db_num in R_DICT:
        r = R_DICT[db_num]
        if not sensitive:
            ruid_results = r.hgetall('retweet_' + str(uid))
        else:
            # because of sensitive weibo
            ruid_results = r.hgetall('sensitive_retweet_' + str(uid))
        if not ruid_results:
            continue
        for ruid in ruid_results:
            if ruid == uid:
                continue
            if ruid in stat_results:
                stat_results[ruid] += ruid_results[ruid]
            else:
                stat_results[ruid] = ruid_results[ruid]
    if not stat_results:
        return [None, 0]
    sort_stat_results = sorted(stat_results.items(), key=lambda x: x[1],
                               reverse=True)[:20]
    uid_list = [item[0] for item in sort_stat_results]
    es_profile_results = es_user_profile.mget(
        index='weibo_user', doc_type='user',
        body={'ids': uid_list})['docs']
    es_portrait_results = es.mget(
        index='sensitive_user_portrait', doc_type='user',
        body={'ids': uid_list})['docs']
    result_list = []
    for i in range(len(es_profile_results)):
        item = es_profile_results[i]
        retweeted_uid = item['_id']
        if item['found']:
            uname = item['_source']['nick_name']
        else:
            uname = u'unknown'
        in_status = 1 if es_portrait_results[i]['found'] else 0
        result_list.append(
            [retweeted_uid, [uname, stat_results[retweeted_uid], in_status]])
    return [result_list[:20], len(stat_results)]
def search_domain(domain, date, order, number=100):
    """Top-influence users in *domain*: scan the bci ranking for *date*
    (sorted by the field mapped from *order*) and keep users whose
    portrait ``topic_string`` contains *domain*, up to *number* rows of
    [uid, uname, photo_url, influence, sensitive, importance, activeness].

    Fixes the original infinite loop: ``query_body["size"]`` was frozen at
    ``1 + count`` (i.e. 1) before the loop and the inner ``break`` never
    exited ``while 1``.  We now page with from/size and terminate when ES
    returns no more hits or enough matches are found.  Not-found portrait
    docs (which have no ``_source``) are skipped instead of raising.
    """
    wanted = int(number)
    collected = []
    date = str(date).replace('-', '')
    order = str(order)
    offset = 0
    page = 100
    query_body = {
        "query": {"match_all": {}},
        "sort": {search_order[order]: {"order": "desc"}},
        "size": page,
    }
    while len(collected) < wanted:
        query_body["from"] = offset
        hits = es.search(index=date, doc_type='bci',
                         body=query_body)['hits']['hits']
        if not hits:
            break  # ranking exhausted before reaching *wanted* matches
        offset += len(hits)
        uid_list = [hit['_id'] for hit in hits]
        portrait_docs = es.mget(index='sensitive_user_portrait',
                                doc_type='user',
                                body={"ids": uid_list})['docs']
        for doc in portrait_docs:
            if not doc.get('found'):
                continue
            source = doc['_source']
            if domain in source['topic_string'].split('&'):  # attention
                collected.append([
                    doc['_id'], source['uname'], source['photo_url'],
                    source['influence'], source['sensitive'],
                    source['importance'], source['activeness']
                ])
                if len(collected) >= wanted:
                    break
    return collected
def search_follower(uid, sensitive):
    """Top-20 users who retweet *uid*, read from the r_cluster redis;
    *sensitive* selects the sensitive-weibo hash.

    Returns ``[rows, total]`` with rows ``[uid, [uname, count,
    in_portrait]]``, or ``[None, 0]`` when nothing was found.

    Cleanups vs the original: dropped the dead ``if 1:`` wrapper and the
    unused ``results`` dict; narrowed the counter except to KeyError.
    """
    stat_results = dict()
    r = r_cluster
    if sensitive:
        br_uid_results = r.hgetall('sensitive_be_retweet_' + str(uid))
    else:
        br_uid_results = r.hgetall('be_retweet_' + str(uid))
    if br_uid_results:
        for br_uid in br_uid_results:
            if br_uid != uid:  # ignore self-retweets
                try:
                    stat_results[br_uid] += br_uid_results[br_uid]
                except KeyError:
                    stat_results[br_uid] = br_uid_results[br_uid]
    if not stat_results:
        return [None, 0]
    sort_stat_results = sorted(stat_results.items(), key=lambda x: x[1],
                               reverse=True)[:20]
    uid_list = [item[0] for item in sort_stat_results]
    es_profile_results = es_user_profile.mget(
        index='weibo_user', doc_type='user',
        body={'ids': uid_list})['docs']
    es_portrait_results = es.mget(
        index='sensitive_user_portrait', doc_type='user',
        body={'ids': uid_list})['docs']
    result_list = []
    for i in range(len(es_profile_results)):
        item = es_profile_results[i]
        follower_uid = item['_id']
        try:
            uname = item['_source']['nick_name']
        except KeyError:
            uname = u'unknown'
        # Not-found portrait docs have no '_source' key.
        in_status = 1 if '_source' in es_portrait_results[i] else 0
        result_list.append(
            [follower_uid, [uname, stat_results[follower_uid], in_status]])
    return [result_list[:20], len(stat_results)]
def search_follower(uid, sensitive):
    """Return the top-20 retweeters of *uid* plus the total retweeter count.

    Aggregates per-user retweet counters from every redis shard in R_DICT;
    the *sensitive* flag selects the sensitive-weibo hash keys.  Result:
    ``[[[uid, [uname, count, in_portrait]], ...], total]`` or
    ``[None, 0]`` when nothing was found.
    """
    results = dict()  # NOTE(review): never used below
    stat_results = dict()
    for db_num in R_DICT:
        r = R_DICT[db_num]
        if sensitive:
            br_uid_results = r.hgetall("sensitive_be_retweet_" + str(uid))
        else:
            br_uid_results = r.hgetall("be_retweet_" + str(uid))
        if br_uid_results:
            for br_uid in br_uid_results:
                if br_uid != uid:  # skip self-retweets
                    try:
                        stat_results[br_uid] += br_uid_results[br_uid]
                    except:
                        stat_results[br_uid] = br_uid_results[br_uid]
    if not stat_results:
        return [None, 0]
    try:
        sort_stat_results = sorted(stat_results.items(), key=lambda x: x[1],
                                   reverse=True)[:20]
    except:
        return [None, 0]
    uid_list = [item[0] for item in sort_stat_results]
    es_profile_results = es_user_profile.mget(
        index="weibo_user", doc_type="user",
        body={"ids": uid_list})["docs"]
    es_portrait_results = es.mget(
        index="sensitive_user_portrait", doc_type="user",
        body={"ids": uid_list})["docs"]
    result_list = []
    for i in range(len(es_profile_results)):
        item = es_profile_results[i]
        uid = item["_id"]  # NOTE(review): rebinds the parameter
        try:
            source = item["_source"]
            uname = source["nick_name"]
        except:
            uname = u"unknown"
        portrait_item = es_portrait_results[i]
        try:
            # Presence of '_source' means the user is in the portrait index.
            source = portrait_item["_source"]
            in_status = 1
        except:
            in_status = 0
        result_list.append([uid, [uname, stat_results[uid], in_status]])
    return [result_list[:20], len(stat_results)]
def get_group_tag(group_name):
    """Collect tag statistics for the group task *group_name*.

    NOTE(review): the body ends right after fetching the user documents —
    the "statistic tag" step announced in the comments below appears to be
    missing/truncated, so on success this implicitly returns None.
    """
    result = {}
    order_result = []
    # get group task uid list
    # get user tag
    # statistic tag
    try:
        group_task_result = es.get(index=group_index_name,
                                   doc_type=group_index_type,
                                   id=group_name)
    except:
        return 'no group task'
    try:
        uid_list = group_task_result['_source']['uid_list']
    except:
        return 'no user'
    try:
        user_result = es.mget(index=user_index_name,
                              doc_type=user_index_type,
                              body={'ids': uid_list})['docs']
    except Exception, e:
        raise e
def get_user_tag(uid_list):
    """Map uid -> list of "key:value" tag strings built from every portrait
    attribute not listed in identify_attribute_list.

    Uids without a portrait document map to an empty list.
    """
    result = {}
    user_result = es.mget(index=user_index_name, doc_type=user_index_type,
                          body={'ids':uid_list})['docs']
    print 'user_result:', user_result
    for user_item in user_result:
        uid = user_item['_id']
        result[uid] = []
        try:
            source = user_item['_source']
        except:
            # Missing document: treat as having no attributes.
            source = {}
        for key in source:
            if key not in identify_attribute_list:
                value = source[key]
                tag_string = key+':'+value
                result[uid].append(tag_string)
    return result
def search_retweet(uid, sensitive):
    """Top-20 users whom *uid* retweets, aggregated across the R_DICT redis
    shards; *sensitive* selects the sensitive-weibo hash.

    Returns ``[rows, total]`` with rows ``[uid, [uname, count,
    in_portrait]]``, or ``[None, 0]`` when *uid* retweeted nobody.

    Fix: ``dict.has_key()`` was removed in Python 3 — replaced with the
    ``in`` operator; the unused ``results`` dict is gone.
    """
    stat_results = dict()
    for db_num in R_DICT:
        r = R_DICT[db_num]
        if not sensitive:
            ruid_results = r.hgetall("retweet_" + str(uid))
        else:
            # because of sensitive weibo
            ruid_results = r.hgetall("sensitive_retweet_" + str(uid))
        if not ruid_results:
            continue
        for ruid in ruid_results:
            if ruid == uid:
                continue
            if ruid in stat_results:
                stat_results[ruid] += ruid_results[ruid]
            else:
                stat_results[ruid] = ruid_results[ruid]
    if not stat_results:
        return [None, 0]
    sort_stat_results = sorted(stat_results.items(), key=lambda x: x[1],
                               reverse=True)[:20]
    uid_list = [item[0] for item in sort_stat_results]
    es_profile_results = es_user_profile.mget(
        index="weibo_user", doc_type="user",
        body={"ids": uid_list})["docs"]
    es_portrait_results = es.mget(
        index="sensitive_user_portrait", doc_type="user",
        body={"ids": uid_list})["docs"]
    result_list = []
    for i in range(len(es_profile_results)):
        item = es_profile_results[i]
        retweeted_uid = item["_id"]
        if item["found"]:
            uname = item["_source"]["nick_name"]
        else:
            uname = u"unknown"
        in_status = 1 if es_portrait_results[i]["found"] else 0
        result_list.append(
            [retweeted_uid, [uname, stat_results[retweeted_uid], in_status]])
    return [result_list[:20], len(stat_results)]
def search_retweet(uid, sensitive):
    """Top-20 users whom *uid* retweets, read from the r_cluster redis.

    Returns ``[rows, total]`` with rows ``[uid, [uname, count,
    in_portrait]]``, or ``[None, 0]`` when *uid* retweeted nobody.

    Fixes: ``dict.has_key()`` (removed in Python 3) replaced with ``in``;
    dead ``if 1:`` wrapper and unused ``results`` dict removed.
    """
    stat_results = dict()
    r = r_cluster
    if not sensitive:
        ruid_results = r.hgetall('retweet_' + str(uid))
    else:
        # because of sensitive weibo
        ruid_results = r.hgetall('sensitive_retweet_' + str(uid))
    if ruid_results:
        for ruid in ruid_results:
            if ruid == uid:
                continue
            if ruid in stat_results:
                stat_results[ruid] += ruid_results[ruid]
            else:
                stat_results[ruid] = ruid_results[ruid]
    if not stat_results:
        return [None, 0]
    sort_stat_results = sorted(stat_results.items(), key=lambda x: x[1],
                               reverse=True)[:20]
    uid_list = [item[0] for item in sort_stat_results]
    es_profile_results = es_user_profile.mget(
        index='weibo_user', doc_type='user',
        body={'ids': uid_list})['docs']
    es_portrait_results = es.mget(
        index='sensitive_user_portrait', doc_type='user',
        body={'ids': uid_list})['docs']
    result_list = []
    for i in range(len(es_profile_results)):
        item = es_profile_results[i]
        retweeted_uid = item['_id']
        if item['found']:
            uname = item['_source']['nick_name']
        else:
            uname = u'unknown'
        in_status = 1 if es_portrait_results[i]['found'] else 0
        result_list.append(
            [retweeted_uid, [uname, stat_results[retweeted_uid], in_status]])
    return [result_list[:20], len(stat_results)]
def get_user_tag(uid_list): result = {} user_result = es.mget(index=user_index_name, doc_type=user_index_type, body={'ids': uid_list})['docs'] print 'user_result:', user_result for user_item in user_result: uid = user_item['_id'] result[uid] = [] try: source = user_item['_source'] except: source = {} for key in source: if key not in identify_attribute_list: value = source[key] tag_string = key + ':' + value result[uid].append(tag_string) return result
def get_group_tag(group_name):
    """Gather user tags for the group task *group_name*.

    Returns the strings 'no group task' / 'no user' on lookup failures.
    NOTE(review): after the mget succeeds the function simply falls off the
    end (implicit None) — the tag-statistics step the comments promise
    looks truncated in this source.
    """
    result = {}
    order_result = []
    # get group task uid list
    # get user tag
    # statistic tag
    try:
        group_task_result = es.get(index=group_index_name,
                                   doc_type=group_index_type,
                                   id=group_name)
    except:
        return 'no group task'
    try:
        uid_list = group_task_result['_source']['uid_list']
    except:
        return 'no user'
    try:
        user_result = es.mget(index=user_index_name,
                              doc_type=user_index_type,
                              body={'ids': uid_list})['docs']
    except Exception, e:
        raise e
def test_influence_rank(domain, date, order): uid_list = domain_dict[domain] order = str(order) query_body = { "query": { "match_all": {} }, "sort": { search_order[order]: { "order": "desc" } }, "size": 100 } search_result = es.search(index=date, doc_type="bci", body=query_body)['hits']['hits'] portrait_result = es.mget(index='sensitive_user_portrait', doc_type='user', body={"ids": uid_list})['docs'] results = [] for i in range(len(search_result)): detail = [] try: detail.append(search_result[i]['_source'][search_order[order]]) except: print uid_list[i] detail.append(0) try: temp = portrait_result[i]['_source'] except: continue detail.extend([ temp['uid'], temp['uname'], temp['photo_url'], temp['activeness'], temp['importance'], temp['sensitive'] ]) results.append(detail) sorted_list = sorted(results, key=lambda x: x[0], reverse=True) return sorted_list
def get_group_results(task_name, module):
    """Load the stored analysis document for group *task_name* and return
    the slice requested by *module*.

    Supported modules: 'overview', 'basic', 'activity', 'social', 'think',
    'text', 'influence'.  Returns None when the task document is missing,
    or [] for an unknown module name.
    """
    result = []
    try:
        es_result = es.get(index=index_name, doc_type=index_type,
                           id=task_name)['_source']
    except:
        return None
    # basic module: gender, count, verified
    if module == 'overview':
        task_name = es_result['task_name']
        submit_date = es_result['submit_date']
        state = es_result['state']
        tightness = es_result['tightness']
        activeness = es_result['activeness']
        importance = es_result['importance']
        influence = es_result['influence']
        result = [
            task_name, submit_date, state, tightness, activeness,
            importance, influence
        ]
    if module == 'basic':
        gender_dict = json.loads(es_result['gender'])
        count = es_result['count']
        verified = es_result['verified']
        if verified:
            # NOTE(review): verified_dict is computed but never used; the
            # raw JSON string is returned instead.
            verified_dict = json.loads(verified)
        result = [gender_dict, count, verified]
    if module == 'activity':
        activity_geo_dict = json.loads(es_result['activity_geo'])
        sort_activity_geo = sorted(activity_geo_dict.items(),
                                   key=lambda x: x[1],
                                   reverse=True)
        activity_geo = sort_activity_geo[:50]
        activity_trend = json.loads(es_result['activity_trend'])
        online_pattern_dict = json.loads(es_result['online_pattern'])
        sort_online_pattern = sorted(online_pattern_dict.items(),
                                     key=lambda x: x[1],
                                     reverse=True)
        online_pattern = sort_online_pattern[:50]
        geo_track = json.loads(es_result['geo_track'])
        result = [activity_geo, activity_trend, online_pattern, geo_track]
    if module == 'social':
        #degree_his = json.loads(es_result['degree_his'])
        density = es_result['density']
        retweet_weibo_count = es_result['retweet_weibo_count']
        retweet_user_count = es_result['retweet_user_count']
        retweet_relation = json.loads(es_result['retweet_relation'])
        # Flatten (source, target) pairs so one mget resolves all unames.
        uid_list = []
        for relation in retweet_relation:
            uid_list.append(relation[0])
            uid_list.append(relation[1])
        es_portrait_result = es.mget(index='user_portrait',
                                     doc_type='user',
                                     body={'ids': uid_list})['docs']
        es_count = 0
        new_retweet_relation = []
        for relation in retweet_relation:
            source_uid = relation[0]
            source_item = es_portrait_result[es_count]
            try:
                source = source_item['_source']
                source_uname = source['uname']
            except:
                source_uname = ''
            target_uid = relation[1]
            es_count += 1
            target_item = es_portrait_result[es_count]
            try:
                source = target_item['_source']
                target_uname = source['uname']
            except:
                target_uname = ''
            count = relation[2]
            new_retweet_relation.append(
                [source_uid, source_uname, target_uid, target_uname, count])
        uid_list = []
        out_beretweet_relation = json.loads(
            es_result['out_beretweet_relation'])
        uid_list = []
        uid_list = [item[0] for item in out_beretweet_relation]
        es_portrait_result = es.mget(index='user_portrait',
                                     doc_type='user',
                                     body={'ids': uid_list})['docs']
        es_count = 0
        new_out_beretweet_relation = []
        for i in range(len(uid_list)):
            item = es_portrait_result[i]
            uid = item['_id']
            try:
                source = item['_source']
                uname = source['uname']
            except:
                uname = ''
            out_relation_item = out_beretweet_relation[i][1:]
            a = [uid, uname]
            a.extend(out_relation_item)
            new_out_beretweet_relation.append(a)
        result = [
            new_retweet_relation, density, retweet_weibo_count,
            retweet_user_count, new_out_beretweet_relation
        ]
    if module == 'think':
        domain_dict = json.loads(es_result['domain'])
        topic_dict = json.loads(es_result['topic'])
        psycho_status = json.loads(es_result['psycho_status'])
        psycho_feature = json.loads(es_result['psycho_feature'])
        result = [domain_dict, topic_dict, psycho_status, psycho_feature]
    if module == 'text':
        hashtag_dict = json.loads(es_result['hashtag'])
        sort_hashtag = sorted(hashtag_dict.items(), key=lambda x: x[1],
                              reverse=True)
        hashtag = sort_hashtag[:50]
        emoticon_dict = json.loads(es_result['emoticon'])
        sort_emoticon = sorted(emoticon_dict.items(), key=lambda x: x[1],
                               reverse=True)
        emoticon = sort_emoticon[:5]
        keyword_dict = json.loads(es_result['keywords'])
        sort_keyword = sorted(keyword_dict.items(), key=lambda x: x[1],
                              reverse=True)
        keyword = sort_keyword[:50]
        result = [hashtag, keyword, emoticon]
    if module == 'influence':
        importance_dis = json.loads(es_result['importance_his'])
        activeness_his = json.loads(es_result['activeness_his'])
        influence_his = json.loads(es_result['influence_his'])
        user_influence_list = json.loads(es_result['user_influence_list'])
        user_influence_result = []
        for user_item in user_influence_list:
            uid = user_item[0]
            result_item = user_item[:5]
            # Items 5..8 are (number, mid) pairs; attach a weibo URL when
            # both mid and uid are present.
            for i in range(5, 9):
                item = user_item[i]
                mid = item[1]
                number = item[0]
                if mid != 0 and uid:
                    weibolink = weiboinfo2url(uid, mid)
                else:
                    weibolink = None
                result_item.append((number, mid, weibolink))
            user_influence_result.append(result_item)
        '''
        origin_max_retweeted_number =es_result['origin_max_retweeted_number']
        origin_max_retweeted_id = es_result['origin_max_retweeted_id']
        origin_max_retweeted_user = es_result['origin_max_retweeted_user']
        if origin_max_retweeted_id != 0 and origin_max_retweeted_user != 0:
            origin_max_retweeted_weibolink = weiboinfo2url(origin_max_retweeted_user, origin_max_retweeted_id)
        else:
            origin_max_retweeted_weibolink = None
        origin_max_comment_number = es_result['origin_max_comment_number']
        origin_max_comment_id = es_result['origin_max_comment_id']
        origin_max_comment_user = es_result['origin_max_comment_user']
        if origin_max_comment_id !=0 and origin_max_comment_user != 0:
            origin_max_comment_weibolink = weiboinfo2url(origin_max_comment_user, origin_max_comment_id)
        else:
            origin_max_comment_weibolink = None
        retweet_max_retweeted_number = es_result['retweet_max_retweeted_number']
        retweet_max_retweeted_id = es_result['retweet_max_retweeted_id']
        retweet_max_retweeted_user = es_result['retweet_max_retweeted_user']
        if retweet_max_retweeted_id != 0 and retweet_max_retweeted_user != 0:
            retweet_max_retweeted_weibolink = weiboinfo2url(retweet_max_retweeted_user, retweet_max_retweeted_id)
        else:
            retweet_max_retweeted_weibolink = None
        retweet_max_comment_number = es_result['retweet_max_comment_number']
        retweet_max_comment_id = es_result['retweet_max_comment_id']
        retweet_max_comment_user = es_result['retweet_max_comment_user']
        if retweet_max_comment_id != 0 and retweet_max_comment_user != 0:
            retweet_max_comment_weibolink = weiboinfo2url(retweet_max_comment_user, retweet_max_comment_id)
        else:
            retweet_max_comment_weibolink = None
        '''
        result = [
            importance_dis, activeness_his, influence_his,
            user_influence_result
        ]
    return result
def get_group_results(task_name, module):
    """Return the *module* slice of the stored analysis for group
    *task_name* (duplicate definition of get_group_results above).

    Modules: 'overview', 'basic', 'activity', 'social', 'think', 'text',
    'influence'.  None when the task is missing; [] for unknown modules.
    """
    result = []
    try:
        es_result = es.get(index=index_name, doc_type=index_type,
                           id=task_name)['_source']
    except:
        return None
    # basic module: gender, count, verified
    if module=='overview':
        task_name = es_result['task_name']
        submit_date = es_result['submit_date']
        state = es_result['state']
        tightness = es_result['tightness']
        activeness = es_result['activeness']
        importance = es_result['importance']
        influence = es_result['influence']
        result = [task_name, submit_date, state, tightness, activeness,
                  importance, influence]
    if module=='basic':
        gender_dict = json.loads(es_result['gender'])
        count = es_result['count']
        verified = es_result['verified']
        if verified:
            # NOTE(review): verified_dict is unused; the raw string goes out.
            verified_dict = json.loads(verified)
        result = [gender_dict, count, verified]
    if module=='activity':
        activity_geo_dict = json.loads(es_result['activity_geo'])
        sort_activity_geo = sorted(activity_geo_dict.items(),
                                   key=lambda x:x[1], reverse=True)
        activity_geo = sort_activity_geo[:50]
        activity_trend = json.loads(es_result['activity_trend'])
        online_pattern_dict = json.loads(es_result['online_pattern'])
        sort_online_pattern = sorted(online_pattern_dict.items(),
                                     key=lambda x:x[1], reverse=True)
        online_pattern = sort_online_pattern[:50]
        geo_track = json.loads(es_result['geo_track'])
        result = [activity_geo, activity_trend, online_pattern, geo_track]
    if module=='social':
        #degree_his = json.loads(es_result['degree_his'])
        density = es_result['density']
        retweet_weibo_count = es_result['retweet_weibo_count']
        retweet_user_count = es_result['retweet_user_count']
        retweet_relation = json.loads(es_result['retweet_relation'])
        # One mget over all (source, target) uids resolves every uname.
        uid_list = []
        for relation in retweet_relation:
            uid_list.append(relation[0])
            uid_list.append(relation[1])
        es_portrait_result = es.mget(index='user_portrait', doc_type='user',
                                     body={'ids':uid_list})['docs']
        es_count = 0
        new_retweet_relation = []
        for relation in retweet_relation:
            source_uid = relation[0]
            source_item = es_portrait_result[es_count]
            try:
                source = source_item['_source']
                source_uname = source['uname']
            except:
                source_uname = ''
            target_uid = relation[1]
            es_count += 1
            target_item = es_portrait_result[es_count]
            try:
                source = target_item['_source']
                target_uname = source['uname']
            except:
                target_uname = ''
            count = relation[2]
            new_retweet_relation.append([source_uid, source_uname,
                                         target_uid, target_uname, count])
        uid_list = []
        out_beretweet_relation = json.loads(
            es_result['out_beretweet_relation'])
        uid_list = []
        uid_list = [item[0] for item in out_beretweet_relation]
        es_portrait_result = es.mget(index='user_portrait', doc_type='user',
                                     body={'ids':uid_list})['docs']
        es_count = 0
        new_out_beretweet_relation = []
        for i in range(len(uid_list)):
            item = es_portrait_result[i]
            uid = item['_id']
            try:
                source = item['_source']
                uname = source['uname']
            except:
                uname = ''
            out_relation_item = out_beretweet_relation[i][1:]
            a = [uid, uname]
            a.extend(out_relation_item)
            new_out_beretweet_relation.append(a)
        result = [new_retweet_relation, density, retweet_weibo_count,
                  retweet_user_count, new_out_beretweet_relation]
    if module=='think':
        domain_dict = json.loads(es_result['domain'])
        topic_dict = json.loads(es_result['topic'])
        psycho_status = json.loads(es_result['psycho_status'])
        psycho_feature = json.loads(es_result['psycho_feature'])
        result = [domain_dict, topic_dict, psycho_status, psycho_feature]
    if module=='text':
        hashtag_dict = json.loads(es_result['hashtag'])
        sort_hashtag = sorted(hashtag_dict.items(), key=lambda x:x[1],
                              reverse=True)
        hashtag = sort_hashtag[:50]
        emoticon_dict = json.loads(es_result['emoticon'])
        sort_emoticon = sorted(emoticon_dict.items(), key=lambda x:x[1],
                               reverse=True)
        emoticon = sort_emoticon[:5]
        keyword_dict = json.loads(es_result['keywords'])
        sort_keyword = sorted(keyword_dict.items(), key=lambda x:x[1],
                              reverse=True)
        keyword = sort_keyword[:50]
        result = [hashtag, keyword, emoticon]
    if module=='influence':
        importance_dis = json.loads(es_result['importance_his'])
        activeness_his = json.loads(es_result['activeness_his'])
        influence_his = json.loads(es_result['influence_his'])
        user_influence_list = json.loads(es_result['user_influence_list'])
        user_influence_result = []
        for user_item in user_influence_list:
            uid = user_item[0]
            result_item = user_item[:5]
            # Items 5..8 hold (number, mid); attach a weibo URL when
            # both mid and uid are usable.
            for i in range(5,9):
                item = user_item[i]
                mid = item[1]
                number = item[0]
                if mid != 0 and uid:
                    weibolink = weiboinfo2url(uid, mid)
                else:
                    weibolink = None
                result_item.append((number, mid, weibolink))
            user_influence_result.append(result_item)
        '''
        origin_max_retweeted_number =es_result['origin_max_retweeted_number']
        origin_max_retweeted_id = es_result['origin_max_retweeted_id']
        origin_max_retweeted_user = es_result['origin_max_retweeted_user']
        if origin_max_retweeted_id != 0 and origin_max_retweeted_user != 0:
            origin_max_retweeted_weibolink = weiboinfo2url(origin_max_retweeted_user, origin_max_retweeted_id)
        else:
            origin_max_retweeted_weibolink = None
        origin_max_comment_number = es_result['origin_max_comment_number']
        origin_max_comment_id = es_result['origin_max_comment_id']
        origin_max_comment_user = es_result['origin_max_comment_user']
        if origin_max_comment_id !=0 and origin_max_comment_user != 0:
            origin_max_comment_weibolink = weiboinfo2url(origin_max_comment_user, origin_max_comment_id)
        else:
            origin_max_comment_weibolink = None
        retweet_max_retweeted_number = es_result['retweet_max_retweeted_number']
        retweet_max_retweeted_id = es_result['retweet_max_retweeted_id']
        retweet_max_retweeted_user = es_result['retweet_max_retweeted_user']
        if retweet_max_retweeted_id != 0 and retweet_max_retweeted_user != 0:
            retweet_max_retweeted_weibolink = weiboinfo2url(retweet_max_retweeted_user, retweet_max_retweeted_id)
        else:
            retweet_max_retweeted_weibolink = None
        retweet_max_comment_number = es_result['retweet_max_comment_number']
        retweet_max_comment_id = es_result['retweet_max_comment_id']
        retweet_max_comment_user = es_result['retweet_max_comment_user']
        if retweet_max_comment_id != 0 and retweet_max_comment_user != 0:
            retweet_max_comment_weibolink = weiboinfo2url(retweet_max_comment_user, retweet_max_comment_id)
        else:
            retweet_max_comment_weibolink = None
        '''
        result = [importance_dis, activeness_his, influence_his,
                  user_influence_result]
    return result
def influenced_user_detail(uid, date, origin_retweeted_mid,
                           retweeted_retweeted_mid, message_type,
                           default_number=20):
    """Profile the users influenced by *uid* on *date*.

    Gathers the uids who retweeted/commented the given mids, then splits
    them into in-portrait / out-of-portrait groups, aggregates their
    domain/topic/geo distributions and average bci influence.  Returns a
    dict with keys 'domian' (sic, preserved), 'topic', 'geo', 'influence',
    'in_portrait(_number)', 'out_portrait(_number)', 'total_number'.
    """
    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": []
                    }
                }
            }
        },
        "size": 20000,
    }
    if RUN_TYPE == 1:
        query_body["sort"] = {"user_fansnum": {"order": "desc"}}
    # the influenced users, in detail
    date1 = str(date).replace('-', '')
    index_name = pre_index + date1
    index_flow_text = pre_text_index + date
    origin_retweeted_uid = []  # influenced user uid_list
    retweeted_retweeted_uid = []
    origin_comment_uid = []
    retweeted_comment_uid = []
    query_origin = copy.deepcopy(query_body)
    query_retweeted = copy.deepcopy(query_body)
    if origin_retweeted_mid:
        # all users who forwarded the original weibo(s)
        query_origin["query"]["filtered"]["filter"]["bool"]["must"].append(
            {"terms": {"root_mid": origin_retweeted_mid}})
        query_origin["query"]["filtered"]["filter"]["bool"]["must"].extend(
            [{"term": {"message_type": message_type}},
             {"term": {"root_uid": uid}}])
        origin_retweeted_result = es.search(
            index=index_flow_text, doc_type=flow_text_index_type,
            body=query_origin, fields=["uid"])["hits"]["hits"]
        if origin_retweeted_result:
            for item in origin_retweeted_result:
                origin_retweeted_uid.append(item["fields"]["uid"][0])
    if retweeted_retweeted_mid:
        # all users who reacted to the retweeted weibo(s)
        query_retweeted["query"]["filtered"]["filter"]["bool"]["must"].append(
            {"terms": {"root_mid": retweeted_retweeted_mid}})
        query_retweeted["query"]["filtered"]["filter"]["bool"]["must"].extend(
            [{"term": {"message_type": message_type}},
             {"term": {"directed_uid": uid}}])
        retweeted_retweeted_result = es.search(
            index=index_flow_text, doc_type=flow_text_index_type,
            body=query_retweeted, fields=["uid"])["hits"]["hits"]
        if retweeted_retweeted_result:
            for item in retweeted_retweeted_result:
                retweeted_retweeted_uid.append(item["fields"]["uid"][0])
    retweeted_uid_list = []  # all retweeted user list
    retweeted_results = {}  # statistics of all retweeted uid information
    retweeted_domain = {}
    retweeted_topic = {}
    retweeted_geo = {}
    bci_results = {}
    in_portrait = []
    out_portrait = []
    average_influence = 0
    total_influence = 0
    count = 0
    all_uid_set = set(origin_retweeted_uid) | set(retweeted_retweeted_uid)
    retweeted_uid_list.extend(origin_retweeted_uid)
    retweeted_uid_list.extend(retweeted_retweeted_uid)
    # filter uids: dedupe and drop the influencer himself
    retweeted_uid_list = list(set(retweeted_uid_list) - set([uid]))
    if retweeted_uid_list:
        user_portrait_result = es_user_portrait.mget(
            index=user_portrait, doc_type=portrait_index_type,
            body={"ids": retweeted_uid_list},
            fields=["domain", "topic_string", "activity_geo_dict",
                    "importance", "influence"])["docs"]
        bci_index = "bci_" + date.replace('-', '')
        bci_results = es_cluster.mget(
            index=bci_index, doc_type="bci",
            body={"ids": retweeted_uid_list},
            fields=['user_index'])["docs"]
        for item in user_portrait_result:
            if item["found"]:
                temp = []
                count += 1
                temp.append(item['_id'])
                temp.append(item["fields"]["importance"][0])
                in_portrait.append(temp)
                temp_domain = item["fields"]["domain"][0].split('&')
                temp_topic = item["fields"]["topic_string"][0].split('&')
                # activity_geo_dict is a JSON list; the last entry is the
                # most recent geo histogram — keep its place names.
                temp_geo = json.loads(
                    item["fields"]["activity_geo_dict"][0])[-1].keys()
                #total_influence += item["fields"]["influence"][0]
                retweeted_domain = aggregation(temp_domain, retweeted_domain)
                retweeted_topic = aggregation(temp_topic, retweeted_topic)
                retweeted_geo = aggregation(temp_geo, retweeted_geo)
            else:
                out_portrait.append(item['_id'])
        retweeted_domain = proportion(retweeted_domain)
        retweeted_topic = proportion(retweeted_topic)
        retweeted_geo = proportion(retweeted_geo)
    if bci_results:
        total_influence = 0
        for item in bci_results:
            if item['found']:
                total_influence += item['fields']['user_index'][0]
        try:
            average_influence = total_influence / len(retweeted_uid_list)
        except:
            average_influence = 0
    sorted_retweeted_domain = sorted(retweeted_domain.items(),
                                     key=lambda x: x[1], reverse=True)
    sorted_retweeted_topic = sorted(retweeted_topic.items(),
                                    key=lambda x: x[1], reverse=True)
    sorted_retweeted_geo = sorted(retweeted_geo.items(),
                                  key=lambda x: x[1], reverse=True)
    retweeted_results["domian"] = sorted_retweeted_domain[:5]
    retweeted_results["topic"] = sorted_retweeted_topic[:5]
    retweeted_results["geo"] = sorted_retweeted_geo[:5]
    retweeted_results["influence"] = average_influence
    in_portrait = sorted(in_portrait, key=lambda x: x[1], reverse=True)
    temp_list = []
    for item in in_portrait:
        temp_list.append(item[0])
    retweeted_results['in_portrait_number'] = len(temp_list)
    retweeted_results['out_portrait_number'] = len(out_portrait)
    in_portrait_url = get_user_url(temp_list[:default_number])
    out_portrait_url = get_user_url(out_portrait[:default_number])
    retweeted_results["in_portrait"] = in_portrait_url
    retweeted_results["out_portrait"] = out_portrait_url
    retweeted_results["total_number"] = len(temp_list) + len(out_portrait)
    return retweeted_results
def influenced_people(uid, mid, influence_style, date, default_number=20):
    """Profile the users influenced by one weibo posted by `uid`.

    Parameters:
        uid: author uid of the weibo being analysed.
        mid: weibo id; may be an original weibo or a retweet (detected
            from the presence of `root_mid` on the flow-text document).
        influence_style: 0 -> collect retweeters, 1 -> collect commenters.
        date: 'YYYY-MM-DD' string selecting the flow-text and bci indices.
        default_number: max number of user profiles returned per group.

    Returns a dict:
        "influence_users": [in_portrait_urls, out_portrait_urls]
        "influence_distribution": top-5 domain/topic/geo aggregates,
            average influence, and in/out portrait counts.
            (NOTE: the "domian" key spelling is kept for caller compatibility.)
    """
    # NOTE(review): index_name is computed but never used below; the call is
    # kept so the date-format round-trip through datetime2ts still runs.
    date1 = ts2datetime(datetime2ts(date)).replace('-', '')
    index_name = pre_index + date1
    index_flow_text = pre_text_index + date

    text_result = es.get(index=index_flow_text, doc_type=flow_text_index_type, id=mid)["_source"]
    # A non-empty root_mid means `mid` is itself a retweet (mid_type 1);
    # otherwise it is an original weibo (mid_type 0).
    temp_mid = text_result.get("root_mid", '')
    if temp_mid:
        mid_type = 1
    else:
        mid_type = 0

    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": []
                    }
                }
            }
        },
        "size": 30000
    }
    if RUN_TYPE:
        query_body["sort"] = {"user_fansnum": {"order": "desc"}}

    # message_type 3 = retweet, 2 = comment (see influence_style contract).
    # For an original weibo the tree hangs off root_uid + mid; for a retweet
    # we follow directed_uid and the original weibo's root_mid instead.
    if int(mid_type) == 0:
        if int(influence_style) == 0:
            query_body["query"]["filtered"]["filter"]["bool"]["must"].extend(
                [{"term": {"root_uid": uid}}, {"term": {"message_type": 3}}, {"term": {"root_mid": mid}}])
        else:
            query_body["query"]["filtered"]["filter"]["bool"]["must"].extend(
                [{"term": {"directed_uid": uid}}, {"term": {"message_type": 2}}, {"term": {"root_mid": mid}}])
    else:
        if int(influence_style) == 0:
            query_body["query"]["filtered"]["filter"]["bool"]["must"].extend(
                [{"term": {"directed_uid": uid}}, {"term": {"message_type": 3}}, {"term": {"root_mid": temp_mid}}])
        else:
            query_body["query"]["filtered"]["filter"]["bool"]["must"].extend(
                [{"term": {"directed_uid": uid}}, {"term": {"message_type": 2}}, {"term": {"root_mid": temp_mid}}])

    search_results = es.search(index=index_flow_text, doc_type=flow_text_index_type,
                               body=query_body, _source=False, fields=["uid"],
                               timeout=30)["hits"]["hits"]

    # Distinct influenced uids, excluding the author himself.
    results = []
    if search_results:
        for item in search_results:
            if int(item["fields"]["uid"][0]) != int(uid):
                results.append(item["fields"]["uid"][0])
        results = list(set(results))

    bci_index = "bci_" + date.replace('-', '')
    if results:
        portrait_results = es_user_portrait.mget(
            index=user_portrait, doc_type=portrait_index_type, body={"ids": results},
            fields=["domain", "topic_string", "activity_geo_dict", "importance", "influence"])["docs"]
        bci_results = es_cluster.mget(index=bci_index, doc_type='bci',
                                      body={"ids": results}, fields=['user_index'])['docs']
    else:
        portrait_results = {}
        bci_results = {}

    in_portrait = []    # [uid, importance] pairs for users found in the portrait index
    out_portrait = []   # uids absent from the portrait index
    retweeted_domain = {}
    retweeted_topic = {}
    retweeted_geo = {}
    average_influence = 0
    total_influence = 0

    if bci_results:
        for item in bci_results:
            if item['found']:
                total_influence += item['fields']['user_index'][0]
        try:
            average_influence = total_influence / len(results)
        except ZeroDivisionError:
            # was a bare `except:` that hid every error, not just empty input
            average_influence = 0

    if portrait_results:
        for item in portrait_results:
            if item["found"]:
                in_portrait.append([item['_id'], item["fields"]["importance"][0]])
                temp_domain = item["fields"]["domain"][0].split('&')
                temp_topic = item["fields"]["topic_string"][0].split('&')
                temp_geo = json.loads(item["fields"]["activity_geo_dict"][0])[-1].keys()
                retweeted_domain = aggregation(temp_domain, retweeted_domain)
                retweeted_topic = aggregation(temp_topic, retweeted_topic)
                retweeted_geo = aggregation(temp_geo, retweeted_geo)
            else:
                out_portrait.append(item['_id'])
        retweeted_domain = proportion(retweeted_domain)
        retweeted_topic = proportion(retweeted_topic)
        retweeted_geo = proportion(retweeted_geo)

    sorted_retweeted_domain = sorted(retweeted_domain.items(), key=lambda x: x[1], reverse=True)
    sorted_retweeted_topic = sorted(retweeted_topic.items(), key=lambda x: x[1], reverse=True)
    sorted_retweeted_geo = sorted(retweeted_geo.items(), key=lambda x: x[1], reverse=True)

    retweeted_results = dict()
    retweeted_results["domian"] = sorted_retweeted_domain[:5]  # key typo preserved for callers
    retweeted_results["topic"] = sorted_retweeted_topic[:5]
    retweeted_results["geo"] = sorted_retweeted_geo[:5]
    retweeted_results["influence"] = average_influence

    # Rank in-portrait users by importance, keep only the uids.
    in_portrait = sorted(in_portrait, key=lambda x: x[1], reverse=True)
    temp_list = [item[0] for item in in_portrait]
    retweeted_results['in_portrait_number'] = len(temp_list)
    retweeted_results['out_portrait_number'] = len(out_portrait)
    in_portrait_url = get_user_url(temp_list[:default_number])
    out_portrait_url = get_user_url(out_portrait[:default_number])

    return_results = dict()
    return_results["influence_users"] = [in_portrait_url, out_portrait_url]
    return_results["influence_distribution"] = retweeted_results
    return return_results