def influence_distribute():
    """Count portrait users per influence bucket for each of 7 days.

    Buckets are the half-open ranges [row[i], row[i+1]); the ES field
    queried is the yyyymmdd date string of each day.

    Returns:
        [row, result]: `row` is the bucket-boundary list, `result` is a
        7-element list (one per day) of 6 per-bucket counts.
    """
    row = [0, 200, 500, 700, 900, 1100, 10000]
    result = []
    # Fixed: the original assigned ts = time.time() and immediately
    # overwrote it with the hard-coded test date below.
    # NOTE(review): test date -- the live version presumably starts from
    # the current day; confirm before deploying.
    ts = datetime2ts('2013-09-08')  # test
    ts = ts - 8 * 3600 * 24
    for j in range(7):
        detail = []
        ts += 3600 * 24
        date = ts2datetime(ts).replace('-', '')
        for i in range(6):
            low_limit = row[i]
            upper_limit = row[i + 1]
            # Half-open range: gte low, lt high, so buckets don't overlap.
            query_body = {
                "query": {
                    "filtered": {
                        "filter": {
                            "range": {
                                date: {
                                    "gte": low_limit,
                                    "lt": upper_limit
                                }
                            }
                        }
                    }
                }
            }
            number = es.count(index='copy_sensitive_user_portrait',
                              doc_type="user", body=query_body)['count']
            detail.append(number)
        result.append(detail)
    return [row, result]
def influence_distribute():
    """Tabulate how many portrait users fall in each influence bucket,
    day by day over a fixed 7-day window.

    Returns [bounds, per_day_counts]: the bucket boundaries and, per day,
    the six bucket counts.
    """
    # Bucket i covers the half-open interval [bounds[i], bounds[i+1]).
    bounds = [0, 200, 500, 700, 900, 1100, 10000]
    per_day_counts = []
    ts = time.time()
    ts = datetime2ts('2013-09-08')  # test
    ts = ts - 8 * 3600 * 24
    for _day in range(7):
        ts += 3600 * 24
        field = ts2datetime(ts).replace('-', '')
        day_counts = []
        for idx in range(6):
            body = {
                "query": {
                    "filtered": {
                        "filter": {
                            "range": {
                                field: {
                                    "gte": bounds[idx],
                                    "lt": bounds[idx + 1]
                                }
                            }
                        }
                    }
                }
            }
            day_counts.append(
                es.count(index='copy_sensitive_user_portrait',
                         doc_type="user", body=body)['count'])
        per_day_counts.append(day_counts)
    return [bounds, per_day_counts]
def _agg_buckets(field):
    """Run the query_body_module aggregation for *field* and return
    [key, doc_count] pairs."""
    buckets = es.search(
        index='sensitive_user_portrait', doc_type='user',
        body=query_body_module(field))['aggregations']['all_interests']['buckets']
    return [[b['key'], b['doc_count']] for b in buckets]


def _recommend_count(hash_name):
    """Total number of recommended uids stored in one redis hash
    (each non-empty value is a JSON list)."""
    total = 0
    for v in r.hgetall(hash_name).values():
        if v:
            total += len(json.loads(v))
    return total


def _top_bci(sort_field):
    """Top users of the day's bci index ordered by *sort_field* desc;
    returns [uid, uname, value] rows (uname 'unknown' when the profile
    lookup fails)."""
    query_body = {
        "query": {"match_all": {}},
        "sort": {sort_field: {"order": "desc"}}
    }
    date = ts2datetime(time.time() - 24 * 3600).replace('-', '')
    date = '20130907'  # test override -- remove for live data
    hits = es.search(index=date, doc_type="bci", body=query_body)['hits']['hits']
    rows = []
    for item in hits:
        uid = item['_source']['uid']
        try:
            uname = es_user_profile.get(index='weibo_user', doc_type='user',
                                        id=uid)['_source']['nick_name']
        except:
            uname = 'unknown'
        rows.append([uid, uname, item['_source'][sort_field]])
    return rows


def get_attr(date):
    """Build the overview-statistics dict for the sensitive-user
    portrait dashboard.

    Collects global counts, recommendation totals, top sensitive
    words/geo/hashtags, psycho status, domain/topic rankings, the
    per-axis user rankings, and yesterday's top users by comment /
    retweet / weibo count.
    """
    results = dict()
    number = es.count(index="sensitive_user_portrait", doc_type="user")['count']
    results['total_number'] = number
    query_body = {"query": {"filtered": {"filter": {"term": {"type": 1}}}}}
    sensitive_number = es.count(index="sensitive_user_portrait",
                                doc_type="user", body=query_body)['count']
    results['sensitive_number'] = sensitive_number
    results['influence_number'] = number - sensitive_number
    results['recommend_in'] = (_recommend_count('recommend_sensitive') +
                               _recommend_count('recommend_influence'))
    results['monitor_number'] = [4, 83]  # test
    results['new_sensitive_words'] = 5   # test
    results['sensitive_words'] = _agg_buckets('sensitive_words_string')
    # Fixed: the original ran the sensitive_geo_string aggregation twice
    # (copy-paste duplication) -- a single query yields the same result.
    results['sensitive_geo'] = _agg_buckets('sensitive_geo_string')
    results['sensitive_hashtag'] = _agg_buckets('sensitive_hashtag_string')
    results['psycho_status'] = _agg_buckets('psycho_status_string')
    # Placeholder until the political_tendency aggregation goes live.
    results['political_tendency'] = [['left', 123], ['middle', 768],
                                     ['right', 1095]]  # test
    # domain and topic
    results['domain_rank'] = get_top_user()
    results['topic_rank'] = get_topic_user()
    # per-axis rankings
    results['importance'] = search_in_portrait('importance')
    results['sensitive'] = search_in_portrait('sensitive')
    results['influence'] = search_in_portrait('influence')
    results['activeness'] = search_in_portrait('activeness')
    # yesterday's most-commented / most-retweeted / most-prolific users
    results['comment_total'] = _top_bci('s_origin_weibo_comment_total_number')
    results['retweeted_total'] = _top_bci('s_origin_weibo_retweeted_total_number')
    results['top_weibo_number'] = _top_bci('s_origin_weibo_number')
    return results
def _rank_count(field, lower, upper):
    """Number of portrait users whose *field* lies in [lower, upper];
    0 when the count query failed on every shard."""
    query_body = {
        'query': {
            'range': {
                field: {'from': lower, 'to': upper}
            }
        }
    }
    resp = es.count(index='sensitive_user_portrait', doc_type='user',
                    body=query_body)
    if resp['_shards']['successful'] != 0:
        return resp['count']
    return 0


def _weibo_text(weibo_id):
    """Text of one weibo from the sensitive_user_text index, '' when the
    id is falsy or the document is missing."""
    if not weibo_id:
        return ''
    try:
        return es.get(index='sensitive_user_text', doc_type='user',
                      id=weibo_id)['_source']['text']
    except:
        return ''


def search_attribute_portrait(uid):
    """Assemble the full attribute portrait for one user.

    Reads the user's document from sensitive_user_portrait plus several
    auxiliary indexes (profile, bci per-day influence, weibo text) and
    returns a flat dict for the front end, or None when the user has no
    portrait document.

    Fixes vs. original: removed unused keyword_list, dead
    `ts = time.time()` assignments and a stray debug print; replaced
    py2-only `has_key` with `in`; four duplicated rank-count blocks now
    share _rank_count and the two weibo-text fetches share _weibo_text.
    """
    return_results = {}
    try:
        search_result = es.get(index="sensitive_user_portrait",
                               doc_type="user", id=uid)
    except:
        return None
    results = search_result['_source']
    user_sensitive = user_type(uid)
    if user_sensitive:
        return_results['user_type'] = 1
        return_results['sensitive'] = 1
    else:
        return_results['user_type'] = 0
        return_results['sensitive'] = 0
    # 0 is the "missing value" marker in the portrait index.
    if results['photo_url'] == 0:
        results['photo_url'] = 'unknown'
    if results['location'] == 0:
        results['location'] = 'unknown'
    return_results['photo_url'] = results['photo_url']
    return_results['uid'] = results['uid']
    return_results['uname'] = results['uname']
    if return_results['uname'] == 0:
        return_results['uname'] = 'unknown'
    return_results['location'] = results['location']
    return_results['fansnum'] = results['fansnum']
    return_results['friendsnum'] = results['friendsnum']
    return_results['gender'] = results['gender']
    return_results['psycho_status'] = json.loads(results['psycho_status'])
    if results['keywords']:
        keywords_dict = json.loads(results['keywords'])
        return_results['keywords'] = sorted(keywords_dict.items(),
                                            key=lambda x: x[1], reverse=True)
    else:
        return_results['keywords'] = []
    return_results['retweet'] = search_retweet(uid, 0)
    return_results['follow'] = search_follower(uid, 0)
    return_results['at'] = search_mention(uid, 0)
    if results['ip'] and results['geo_activity']:
        ip_dict = json.loads(results['ip'])
        geo_dict = json.loads(results['geo_activity'])
        return_results['geo_description'] = active_geo_description(ip_dict,
                                                                   geo_dict)
    else:
        return_results['geo_description'] = ''
    geo_top = []
    temp_geo = {}
    if results['geo_activity']:
        geo_dict = json.loads(results['geo_activity'])
        if len(geo_dict) < 7:
            # Pad missing days with empty dicts across the 7-day window.
            ts = datetime2ts('2013-09-08') - 8 * 24 * 3600  # test date
            for i in range(7):
                ts = ts + 24 * 3600
                date = ts2datetime(ts).replace('-', '')
                if date not in geo_dict:
                    geo_dict[date] = {}
        activity_geo_list = sorted(geo_dict.items(), key=lambda x: x[0])
        for k, v in activity_geo_list:
            sort_v = sorted(v.items(), key=lambda x: x[1], reverse=True)
            top_geo = [item[0] for item in sort_v]
            geo_top.append([k, top_geo[0:2]])
            for iter_key in v:
                temp_geo[iter_key] = temp_geo.get(iter_key, 0) + v[iter_key]
        return_results['top_activity_geo'] = sorted(temp_geo.items(),
                                                    key=lambda x: x[1],
                                                    reverse=True)
        return_results['activity_geo_distribute'] = geo_top
    else:
        return_results['top_activity_geo'] = []
        return_results['activity_geo_distribute'] = geo_top
    return_results['hashtag'] = get_user_hashtag(uid)[0]
    # topic
    if results['topic']:
        topic_dict = json.loads(results['topic'])
        return_results['topic'] = sorted(topic_dict.items(),
                                         key=lambda x: x[1], reverse=True)[:5]
    else:
        return_results['topic'] = []
    # domain ('_'-separated string in the index)
    if results['domain']:
        return_results['domain'] = results['domain'].split('_')
    else:
        return_results['domain'] = []
    # on-line pattern
    if results['online_pattern']:
        online_pattern_dict = json.loads(results['online_pattern'])
        return_results['online_pattern'] = sorted(
            online_pattern_dict.items(), key=lambda x: x[1], reverse=True)[:5]
    else:
        return_results['online_pattern'] = []
    # self description from the profile index
    try:
        profile_result = es_user_profile.get(index='weibo_user',
                                             doc_type='user', id=uid)
        return_results['description'] = profile_result['_source'].get(
            'description', '')
    except:
        return_results['description'] = ''
    # Rank on each score axis = count of users scoring >= this user.
    # Upper bounds (100000 / 10000) reproduce the original queries.
    if results['importance']:
        return_results['importance_rank'] = _rank_count(
            'importance', results['importance'], 100000)
    else:
        return_results['importance_rank'] = 0
    return_results['importance'] = results['importance']
    if results['activeness']:
        return_results['activeness_rank'] = _rank_count(
            'activeness', results['activeness'], 10000)
    else:
        return_results['activeness_rank'] = 0
    return_results['activeness'] = results['activeness']
    if results['influence']:
        return_results['influence_rank'] = _rank_count(
            'influence', results['influence'], 100000)
    else:
        return_results['influence_rank'] = 0
    return_results['influence'] = results['influence']
    if results['sensitive']:
        return_results['sensitive_rank'] = _rank_count(
            'sensitive', results['sensitive'], 100000)
    else:
        return_results['sensitive_rank'] = 0
    return_results['sensitive'] = results['sensitive']
    all_count = es.count(index='sensitive_user_portrait', doc_type='user',
                         body={'query': {"match_all": {}}})
    if all_count['_shards']['successful'] != 0:
        return_results['all_count'] = all_count['count']
    else:
        print('es_sensitive_user_portrait error')
        return_results['all_count'] = 0
    # link ratio
    return_results['link'] = results['link']
    weibo_trend = get_user_trend(uid)[0]
    return_results['time_description'] = active_time_description(weibo_trend)
    return_results['time_trend'] = weibo_trend
    # Per-day influence trend over the 7-day window; a day with no bci
    # document contributes zeros.
    influence_detail = []
    influence_value = []
    attention_value = []
    ts = datetime2ts('2013-09-08') - 8 * 24 * 3600  # test date
    for i in range(1, 8):
        date = ts2datetime(ts + i * 24 * 3600).replace('-', '')
        detail = [0] * 10
        try:
            item = es.get(index=date, doc_type='bci', id=uid)['_source']
            detail[0] = item.get('origin_weibo_number', 0)
            detail[1] = item.get('retweeted_weibo_number', 0)
            detail[2] = (item.get('origin_weibo_retweeted_total_number', 0) +
                         item.get('retweeted_weibo_retweeted_total_number', 0))
            detail[3] = (item.get('origin_weibo_comment_total_number', 0) +
                         item.get('retweeted_weibo_comment_total_number', 0))
            detail[4] = item.get('origin_weibo_top_retweeted_id', '0')
            detail[5] = _weibo_text(detail[4])
            detail[6] = item.get('origin_weibo_retweeted_top_number', 0)
            detail[7] = item.get('origin_weibo_top_comment_id', '0')
            detail[8] = _weibo_text(detail[7])
            detail[9] = item.get('origin_weibo_comment_top_number', 0)
            attention_number = detail[2] + detail[3]
            # Logistic squash of total reactions into [0, 1).
            attention = 2 / (1 + math.exp(-0.005 * attention_number)) - 1
            influence_value.append([date, item['user_index']])
            influence_detail.append([date, detail])
            attention_value.append(attention)
        except:
            influence_value.append([date, 0])
            influence_detail.append([date, detail])
            attention_value.append(0)
    return_results['influence_trend'] = influence_value
    return_results['common_influence_detail'] = influence_detail
    return_results['attention_degree'] = attention_value
    return return_results
def get_attr(date):
    """Collect the overview statistics shown on the sensitive-user
    portrait dashboard and return them as one dict."""
    results = dict()
    total = es.count(index="sensitive_user_portrait", doc_type="user")['count']
    results['total_number'] = total
    query_body = {"query": {"filtered": {"filter": {"term": {"type": 1}}}}}
    n_sensitive = es.count(index="sensitive_user_portrait", doc_type="user",
                           body=query_body)['count']
    results['sensitive_number'] = n_sensitive
    results['influence_number'] = total - n_sensitive
    # Each redis hash value is a JSON list of recommended uids.
    recommend_in_sensitive = 0
    for v in r.hgetall('recommend_sensitive').values():
        if v:
            recommend_in_sensitive += len(json.loads(v))
    recommend_in_influence = 0
    for v in r.hgetall('recommend_influence').values():
        if v:
            recommend_in_influence += len(json.loads(v))
    results['recommend_in'] = recommend_in_influence + recommend_in_sensitive
    results['monitor_number'] = [4, 83]  # test
    results['new_sensitive_words'] = 5   # test

    def bucket_pairs(module_name):
        # Aggregation helper: [key, doc_count] pairs for one field.
        buckets = es.search(
            index='sensitive_user_portrait', doc_type='user',
            body=query_body_module(module_name)
        )['aggregations']['all_interests']['buckets']
        return [[b['key'], b['doc_count']] for b in buckets]

    results['sensitive_words'] = bucket_pairs('sensitive_words_string')
    results['sensitive_geo'] = bucket_pairs('sensitive_geo_string')
    results['sensitive_hashtag'] = bucket_pairs('sensitive_hashtag_string')
    # The original issues the sensitive_geo aggregation a second time here;
    # kept to preserve behavior exactly.
    results['sensitive_geo'] = bucket_pairs('sensitive_geo_string')
    results['psycho_status'] = bucket_pairs('psycho_status_string')
    results['political_tendency'] = [['left', 123], ['middle', 768],
                                     ['right', 1095]]
    # domain and topic
    results['domain_rank'] = get_top_user()
    results['topic_rank'] = get_topic_user()
    # rank
    results['importance'] = search_in_portrait('importance')
    results['sensitive'] = search_in_portrait('sensitive')
    results['influence'] = search_in_portrait('influence')
    results['activeness'] = search_in_portrait('activeness')

    def top_by(sort_field):
        # [uid, uname, value] rows from yesterday's bci index, sorted desc.
        body = {"query": {"match_all": {}},
                "sort": {sort_field: {"order": "desc"}}}
        day = ts2datetime(time.time() - 24 * 3600).replace('-', '')
        day = '20130907'  # test
        hits = es.search(index=day, doc_type="bci", body=body)['hits']['hits']
        rows = []
        for hit in hits:
            hit_uid = hit['_source']['uid']
            try:
                uname = es_user_profile.get(index='weibo_user',
                                            doc_type='user',
                                            id=hit_uid)['_source']['nick_name']
            except:
                uname = 'unknown'
            rows.append([hit_uid, uname, hit['_source'][sort_field]])
        return rows

    results['comment_total'] = top_by('s_origin_weibo_comment_total_number')
    results['retweeted_total'] = top_by('s_origin_weibo_retweeted_total_number')
    results['top_weibo_number'] = top_by('s_origin_weibo_number')
    return results
def imagine(uid, query_fields_dict, index_name="sensitive_user_portrait", doctype='user'):
    """Find users similar to *uid* by weighted wildcard match on the
    requested portrait fields.

    uid: search users related to uid
    query_fields_dict: field->weight mapping plus two control keys:
        'field' -- score axis used for field_value_factor
                   (activeness / importance / sensitive / influence)
        'size'  -- number of similar users to return
        other keys: portrait fields (domain, topic, keywords,
        psycho_status, psycho_feature, activity_geo, hashtag) with their
        boost weights, e.g. "domain": 2.

    Returns [] when uid shares no usable field, otherwise
    [personal_row, match_row..., total_count] where each match row is
    [uid, uname, activeness, importance, influence, sensitive, score,
    common_values_dict].

    Fixes vs. original: works on a copy instead of destructively popping
    keys from the caller's dict; `keys()` wrapped in list() so the
    remove() calls also work on py3 dict views; unused locals removed.
    """
    # Copy so the caller's dict is not mutated by the pops below.
    query_fields_dict = dict(query_fields_dict)
    personal_info = es.get(index="sensitive_user_portrait", doc_type="user",
                           id=uid, _source=True)['_source']
    keys_list = list(query_fields_dict.keys())
    keys_list.remove('field')
    keys_list.remove('size')
    search_dict = {}
    iter_list = []
    for iter_key in keys_list:
        if iter_key not in personal_info or personal_info[iter_key] == '':
            # uid has no data for this field -- drop it from the query.
            query_fields_dict.pop(iter_key)
        else:
            iter_list.append(iter_key)
            # Portrait fields store '&'-separated value lists.
            search_dict[iter_key] = personal_info[iter_key].split('&')
    if len(iter_list) == 0:
        return []
    query_body = {
        'query': {
            'function_score': {
                'query': {
                    'bool': {
                        'must': []
                    }
                },
                "field_value_factor": {}
            }
        }
    }
    # Scale factors normalize the very different ranges of each axis.
    score_standard = {}
    score_standard["modifier"] = "log1p"
    if query_fields_dict['field'] == "activeness":
        score_standard['field'] = "activeness"
        score_standard['factor'] = 100
    elif query_fields_dict['field'] == "importance":
        score_standard['field'] = "importance"
        score_standard['factor'] = 0.01
    elif query_fields_dict['field'] == "sensitive":
        score_standard['field'] = "sensitive"
        score_standard['factor'] = 100
    elif query_fields_dict['field'] == 'influence':
        score_standard['field'] = "influence"
        score_standard['factor'] = 0.1
    else:
        # Unknown axis: factor 0 disables the field contribution.
        score_standard['field'] = "influence"
        score_standard['factor'] = 0
    query_body['query']['function_score']['boost_mode'] = "sum"
    query_body['query']['function_score']['field_value_factor'] = score_standard
    query_fields_dict.pop('field')
    number = es.count(index=index_name, doc_type=doctype,
                      body=query_body)['count']
    query_body['size'] = 100  # default fetch window
    query_number = query_fields_dict['size']  # required number
    query_fields_dict.pop('size')
    # One boosted wildcard-should group per remaining portrait field.
    for (k, v) in query_fields_dict.items():
        temp_list = []
        for iter_key in search_dict[k]:
            temp_list.append({'wildcard': {k: {'wildcard': '*' + iter_key + '*',
                                               'boost': v}}})
        query_body['query']['function_score']['query']['bool']['must'].append(
            {'bool': {'should': temp_list}})
    result = es.search(index=index_name, doc_type=doctype,
                       body=query_body)['hits']['hits']
    field_list = ['uid', 'uname', 'activeness', 'importance', 'influence',
                  'sensitive']
    return_list = []
    count = 0
    for item in result:
        if uid == item['_id']:
            # Skip the query user himself.
            continue
        info = []
        if not item['_source']['uname']:
            item['_source']['uname'] = 'unknown'
        for field in field_list:
            info.append(item['_source'][field])
        info.append(item['_score'])
        # Values this match shares with uid, keyed by display name.
        common_dict = dict()
        for iter_key in iter_list:
            iter_common_list = item['_source'][iter_key].split('&')
            search_common_list = list(
                set(iter_common_list) & set(search_dict[iter_key]))
            common_dict[shift_dict[iter_key]] = search_common_list
        info.append(common_dict)
        return_list.append(info)
        count += 1
        if count == query_number:
            break
    return_list.append(number)
    temp_list = []
    for field in field_list:
        temp_list.append(personal_info[field])
    results = []
    results.append(temp_list)
    results.extend(return_list)
    return results
def search_attribute_portrait(uid):
    """Build the complete attribute-portrait dict for *uid* from the
    sensitive_user_portrait index and its auxiliary indexes; returns
    None when the user has no portrait document."""
    portrait = {}
    try:
        hit = es.get(index="sensitive_user_portrait", doc_type="user", id=uid)
    except:
        return None
    source = hit['_source']
    if user_type(uid):
        portrait['user_type'] = 1
        portrait['sensitive'] = 1
    else:
        portrait['user_type'] = 0
        portrait['sensitive'] = 0
    # 0 marks "missing" in the portrait index.
    if source['photo_url'] == 0:
        source['photo_url'] = 'unknown'
    if source['location'] == 0:
        source['location'] = 'unknown'
    portrait['photo_url'] = source['photo_url']
    portrait['uid'] = source['uid']
    portrait['uname'] = source['uname']
    if portrait['uname'] == 0:
        portrait['uname'] = 'unknown'
    for field in ('location', 'fansnum', 'friendsnum', 'gender'):
        portrait[field] = source[field]
    portrait['psycho_status'] = json.loads(source['psycho_status'])
    if source['keywords']:
        portrait['keywords'] = sorted(json.loads(source['keywords']).items(),
                                      key=lambda kv: kv[1], reverse=True)
    else:
        portrait['keywords'] = []
    portrait['retweet'] = search_retweet(uid, 0)
    portrait['follow'] = search_follower(uid, 0)
    portrait['at'] = search_mention(uid, 0)
    if source['ip'] and source['geo_activity']:
        portrait['geo_description'] = active_geo_description(
            json.loads(source['ip']), json.loads(source['geo_activity']))
    else:
        portrait['geo_description'] = ''
    geo_top = []
    geo_totals = {}
    if source['geo_activity']:
        geo_dict = json.loads(source['geo_activity'])
        if len(geo_dict) < 7:
            # Pad absent days with empty dicts across the test window.
            ts = time.time()
            ts = datetime2ts('2013-09-08') - 8 * 24 * 3600
            for _ in range(7):
                ts += 24 * 3600
                day = ts2datetime(ts).replace('-', '')
                if day not in geo_dict:
                    geo_dict[day] = {}
        for day, counts in sorted(geo_dict.items(), key=lambda kv: kv[0]):
            ranked = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)
            geo_top.append([day, [g for g, _n in ranked][0:2]])
            for geo in counts:
                if geo in geo_totals:
                    geo_totals[geo] += counts[geo]
                else:
                    geo_totals[geo] = counts[geo]
        portrait['top_activity_geo'] = sorted(geo_totals.items(),
                                              key=lambda kv: kv[1],
                                              reverse=True)
        portrait['activity_geo_distribute'] = geo_top
    else:
        portrait['top_activity_geo'] = []
        portrait['activity_geo_distribute'] = geo_top
    portrait['hashtag'] = get_user_hashtag(uid)[0]
    # topic
    if source['topic']:
        portrait['topic'] = sorted(json.loads(source['topic']).items(),
                                   key=lambda kv: kv[1], reverse=True)[:5]
    else:
        portrait['topic'] = []
    # domain is a '_'-separated string
    if source['domain']:
        portrait['domain'] = source['domain'].split('_')
    else:
        portrait['domain'] = []
    # on-line pattern
    if source['online_pattern']:
        portrait['online_pattern'] = sorted(
            json.loads(source['online_pattern']).items(),
            key=lambda kv: kv[1], reverse=True)[:5]
    else:
        portrait['online_pattern'] = []
    # self description
    try:
        profile = es_user_profile.get(index='weibo_user', doc_type='user',
                                      id=uid)
        portrait['description'] = profile['_source'].get('description', '')
    except:
        portrait['description'] = ''
    # Rank on each axis: count of users scoring at least this user's value.
    if source['importance']:
        body = {'query': {'range': {'importance': {'from': source['importance'],
                                                   'to': 100000}}}}
        resp = es.count(index='sensitive_user_portrait', doc_type='user',
                        body=body)
        portrait['importance_rank'] = (resp['count']
                                       if resp['_shards']['successful'] != 0
                                       else 0)
    else:
        portrait['importance_rank'] = 0
    portrait['importance'] = source['importance']
    if source['activeness']:
        body = {'query': {'range': {'activeness': {'from': source['activeness'],
                                                   'to': 10000}}}}
        resp = es.count(index='sensitive_user_portrait', doc_type='user',
                        body=body)
        print(resp)
        portrait['activeness_rank'] = (resp['count']
                                       if resp['_shards']['successful'] != 0
                                       else 0)
    else:
        portrait['activeness_rank'] = 0
    portrait['activeness'] = source['activeness']
    if source['influence']:
        body = {'query': {'range': {'influence': {'from': source['influence'],
                                                  'to': 100000}}}}
        resp = es.count(index='sensitive_user_portrait', doc_type='user',
                        body=body)
        portrait['influence_rank'] = (resp['count']
                                      if resp['_shards']['successful'] != 0
                                      else 0)
    else:
        portrait['influence_rank'] = 0
    portrait['influence'] = source['influence']
    if source['sensitive']:
        body = {'query': {'range': {'sensitive': {'from': source['sensitive'],
                                                  'to': 100000}}}}
        resp = es.count(index='sensitive_user_portrait', doc_type='user',
                        body=body)
        portrait['sensitive_rank'] = (resp['count']
                                      if resp['_shards']['successful'] != 0
                                      else 0)
    else:
        portrait['sensitive_rank'] = 0
    portrait['sensitive'] = source['sensitive']
    resp = es.count(index='sensitive_user_portrait', doc_type='user',
                    body={'query': {"match_all": {}}})
    if resp['_shards']['successful'] != 0:
        portrait['all_count'] = resp['count']
    else:
        print('es_sensitive_user_portrait error')
        portrait['all_count'] = 0
    # link ratio
    portrait['link'] = source['link']
    trend = get_user_trend(uid)[0]
    portrait['time_description'] = active_time_description(trend)
    portrait['time_trend'] = trend
    # Per-day influence trend over the 7-day window.
    influence_detail = []
    influence_value = []
    attention_value = []
    ts = time.time()
    ts = datetime2ts('2013-09-08') - 8 * 24 * 3600
    for day_offset in range(1, 8):
        day = ts2datetime(ts + day_offset * 24 * 3600).replace('-', '')
        detail = [0] * 10
        try:
            item = es.get(index=day, doc_type='bci', id=uid)['_source']
            detail[0] = item.get('origin_weibo_number', 0)
            detail[1] = item.get('retweeted_weibo_number', 0)
            detail[2] = (item.get('origin_weibo_retweeted_total_number', 0)
                         + item.get('retweeted_weibo_retweeted_total_number', 0))
            detail[3] = (item.get('origin_weibo_comment_total_number', 0)
                         + item.get('retweeted_weibo_comment_total_number', 0))
            retweeted_id = item.get('origin_weibo_top_retweeted_id', '0')
            detail[4] = retweeted_id
            if retweeted_id:
                try:
                    detail[5] = es.get(index='sensitive_user_text',
                                       doc_type='user',
                                       id=retweeted_id)['_source']['text']
                except:
                    detail[5] = ''
            else:
                detail[5] = ''
            detail[6] = item.get('origin_weibo_retweeted_top_number', 0)
            detail[7] = item.get('origin_weibo_top_comment_id', '0')
            if detail[7]:
                try:
                    detail[8] = es.get(index='sensitive_user_text',
                                       doc_type='user',
                                       id=detail[7])['_source']['text']
                except:
                    detail[8] = ''
            else:
                detail[8] = ''
            detail[9] = item.get('origin_weibo_comment_top_number', 0)
            attention_number = detail[2] + detail[3]
            attention = 2 / (1 + math.exp(-0.005 * attention_number)) - 1
            influence_value.append([day, item['user_index']])
            influence_detail.append([day, detail])
            attention_value.append(attention)
        except:
            influence_value.append([day, 0])
            influence_detail.append([day, detail])
            attention_value.append(0)
    portrait['influence_trend'] = influence_value
    portrait['common_influence_detail'] = influence_detail
    portrait['attention_degree'] = attention_value
    return portrait
def search_attribute_portrait(uid):
    # Build a full portrait dict for one user out of the
    # 'sensitive_user_portrait' ES index: identity fields, keyword/geo/topic
    # breakdowns, four percentile ranks (importance/activeness/influence/
    # sensitive), and a 7-day influence trend.
    # Returns the assembled dict, or None when the user document is missing.
    # NOTE(review): Python 2 code (print statements, dict.has_key).
    return_results = {}
    index_name = "sensitive_user_portrait"
    index_type = "user"
    try:
        search_result = es.get(index=index_name, doc_type=index_type, id=uid)
    except:
        # Unknown uid (or ES failure) -> no portrait at all.
        return None
    results = search_result["_source"]
    # return_results = results
    # user_type() decides whether this uid is a "sensitive" user;
    # both flags are mirrored into the result.
    user_sensitive = user_type(uid)
    if user_sensitive:
        # return_results.update(sensitive_attribute(uid))
        return_results["user_type"] = 1
        return_results["sensitive"] = 1
    else:
        return_results["user_type"] = 0
        return_results["sensitive"] = 0
    # Missing profile fields are stored as 0 in the index; normalize to
    # the placeholder string "unknown" for display.
    if results["photo_url"] == 0:
        results["photo_url"] = "unknown"
    if results["location"] == 0:
        results["location"] = "unknown"
    return_results["photo_url"] = results["photo_url"]
    return_results["uid"] = results["uid"]
    return_results["uname"] = results["uname"]
    if return_results["uname"] == 0:
        return_results["uname"] = "unknown"
    return_results["location"] = results["location"]
    return_results["fansnum"] = results["fansnum"]
    return_results["friendsnum"] = results["friendsnum"]
    return_results["gender"] = results["gender"]
    # psycho_status is stored as a JSON string in the document.
    return_results["psycho_status"] = json.loads(results["psycho_status"])
    keyword_list = []
    # Keywords: JSON dict of word -> weight, returned as (word, weight)
    # pairs sorted by descending weight.
    if results["keywords"]:
        keywords_dict = json.loads(results["keywords"])
        sort_word_list = sorted(keywords_dict.items(), key=lambda x: x[1], reverse=True)
        return_results["keywords"] = sort_word_list
    else:
        return_results["keywords"] = []
    # Social-graph neighbors (second arg 0 — presumably a "not sensitive
    # only" flag; confirm against the helpers' definitions).
    return_results["retweet"] = search_retweet(uid, 0)
    return_results["follow"] = search_follower(uid, 0)
    return_results["at"] = search_mention(uid, 0)
    # Human-readable geo summary needs both the ip and geo_activity JSON blobs.
    if results["ip"] and results["geo_activity"]:
        ip_dict = json.loads(results["ip"])
        geo_dict = json.loads(results["geo_activity"])
        geo_description = active_geo_description(ip_dict, geo_dict)
        return_results["geo_description"] = geo_description
    else:
        return_results["geo_description"] = ""
    geo_top = []
    temp_geo = {}
    if results["geo_activity"]:
        # geo_activity: JSON dict of date-string -> {place: count}.
        geo_dict = json.loads(results["geo_activity"])
        if len(geo_dict) < 7:
            # Pad to a full 7-day window ending 2013-09-08 (hard-coded test
            # date; ts from time.time() is immediately overwritten).
            ts = time.time()
            ts = datetime2ts("2013-09-08") - 8 * 24 * 3600
            for i in range(7):
                ts = ts + 24 * 3600
                date = ts2datetime(ts).replace("-", "")
                if geo_dict.has_key(date):
                    pass
                else:
                    geo_dict[date] = {}
        # Per day (chronological): keep the top-2 places; also accumulate
        # overall per-place totals in temp_geo.
        activity_geo_list = sorted(geo_dict.items(), key=lambda x: x[0], reverse=False)
        geo_list = geo_dict.values()
        for k, v in activity_geo_list:
            sort_v = sorted(v.items(), key=lambda x: x[1], reverse=True)
            top_geo = [item[0] for item in sort_v]
            geo_top.append([k, top_geo[0:2]])
            for iter_key in v.keys():
                if temp_geo.has_key(iter_key):
                    temp_geo[iter_key] += v[iter_key]
                else:
                    temp_geo[iter_key] = v[iter_key]
        sort_geo_dict = sorted(temp_geo.items(), key=lambda x: x[1], reverse=True)
        return_results["top_activity_geo"] = sort_geo_dict
        return_results["activity_geo_distribute"] = geo_top
    else:
        return_results["top_activity_geo"] = []
        return_results["activity_geo_distribute"] = geo_top
    hashtag_dict = get_user_hashtag(uid)[0]
    return_results["hashtag"] = hashtag_dict
    # Disabled emotion-words extraction, kept for reference.
    """
    emotion_result = {}
    emotion_conclusion_dict = {}
    if results['emotion_words']:
        emotion_words_dict = json.loads(results['emotion_words'])
        for word_type in emotion_mark_dict:
            try:
                word_dict = emotion_words_dict[word_type]
                if word_type=='126' or word_type=='127':
                    emotion_conclusion_dict[word_type] = word_dict
                sort_word_dict = sorted(word_dict.items(), key=lambda x:x[1], reverse=True)
                word_list = sort_word_dict[:5]
            except:
                results['emotion_words'] = emotion_result
            emotion_result[emotion_mark_dict[word_type]] = word_list
    return_results['emotion_words'] = emotion_result
    """
    # topic: JSON dict -> top-5 (topic, weight) pairs by weight.
    if results["topic"]:
        topic_dict = json.loads(results["topic"])
        sort_topic_dict = sorted(topic_dict.items(), key=lambda x: x[1], reverse=True)
        return_results["topic"] = sort_topic_dict[:5]
    else:
        return_results["topic"] = []
    # domain: underscore-separated string -> list.
    if results["domain"]:
        domain_string = results["domain"]
        domain_list = domain_string.split("_")
        return_results["domain"] = domain_list
    else:
        return_results["domain"] = []
    # Disabled emoticon extraction, kept for reference.
    """
    # emoticon
    if results['emotion']:
        emotion_dict = json.loads(results['emotion'])
        sort_emotion_dict = sorted(emotion_dict.items(), key=lambda x:x[1], reverse=True)
        return_results['emotion'] = sort_emotion_dict[:5]
    else:
        return_results['emotion'] = []
    """
    # on_line pattern: JSON dict -> top-5 patterns by count.
    if results["online_pattern"]:
        online_pattern_dict = json.loads(results["online_pattern"])
        sort_online_pattern_dict = sorted(online_pattern_dict.items(), key=lambda x: x[1], reverse=True)
        return_results["online_pattern"] = sort_online_pattern_dict[:5]
    else:
        return_results["online_pattern"] = []
    """
    #psycho_feature
    if results['psycho_feature']:
        psycho_feature_list = results['psycho_feature'].split('_')
        return_results['psycho_feature'] = psycho_feature_list
    else:
        return_results['psycho_feature'] = []
    """
    # self_state: user bio from the separate weibo_user profile index;
    # best-effort, empty string on any failure.
    try:
        profile_result = es_user_profile.get(index="weibo_user", doc_type="user", id=uid)
        self_state = profile_result["_source"].get("description", "")
        return_results["description"] = self_state
    except:
        return_results["description"] = ""
    # importance rank = number of users with importance >= this user's
    # (range query from own score to an upper bound).
    if results["importance"]:
        query_body = {"query": {"range": {"importance": {"from": results["importance"], "to": 100000}}}}
        importance_rank = es.count(index="sensitive_user_portrait", doc_type="user", body=query_body)
        if importance_rank["_shards"]["successful"] != 0:
            return_results["importance_rank"] = importance_rank["count"]
        else:
            return_results["importance_rank"] = 0
    else:
        return_results["importance_rank"] = 0
    return_results["importance"] = results["importance"]
    # activeness rank — same scheme; note the smaller upper bound (10000)
    # and the leftover debug print.
    if results["activeness"]:
        query_body = {"query": {"range": {"activeness": {"from": results["activeness"], "to": 10000}}}}
        activeness_rank = es.count(index="sensitive_user_portrait", doc_type="user", body=query_body)
        print activeness_rank
        if activeness_rank["_shards"]["successful"] != 0:
            return_results["activeness_rank"] = activeness_rank["count"]
        else:
            return_results["activeness_rank"] = 0
    else:
        return_results["activeness_rank"] = 0
    return_results["activeness"] = results["activeness"]
    # influence rank — same scheme.
    if results["influence"]:
        query_body = {"query": {"range": {"influence": {"from": results["influence"], "to": 100000}}}}
        influence_rank = es.count(index="sensitive_user_portrait", doc_type="user", body=query_body)
        if influence_rank["_shards"]["successful"] != 0:
            return_results["influence_rank"] = influence_rank["count"]
        else:
            return_results["influence_rank"] = 0
    else:
        return_results["influence_rank"] = 0
    return_results["influence"] = results["influence"]
    # sensitive rank — same scheme (reuses the influence_rank local name).
    if results["sensitive"]:
        query_body = {"query": {"range": {"sensitive": {"from": results["sensitive"], "to": 100000}}}}
        influence_rank = es.count(index="sensitive_user_portrait", doc_type="user", body=query_body)
        if influence_rank["_shards"]["successful"] != 0:
            return_results["sensitive_rank"] = influence_rank["count"]
        else:
            return_results["sensitive_rank"] = 0
    else:
        return_results["sensitive_rank"] = 0
    return_results["sensitive"] = results["sensitive"]
    # Total user count, used as the denominator for the ranks above.
    query_body = {"query": {"match_all": {}}}
    all_count = es.count(index="sensitive_user_portrait", doc_type="user", body=query_body)
    if all_count["_shards"]["successful"] != 0:
        return_results["all_count"] = all_count["count"]
    else:
        print "es_sensitive_user_portrait error"
        return_results["all_count"] = 0
    # link
    link_ratio = results["link"]
    return_results["link"] = link_ratio
    weibo_trend = get_user_trend(uid)[0]
    return_results["time_description"] = active_time_description(weibo_trend)
    return_results["time_trend"] = weibo_trend
    # user influence trend: one daily 'bci' document per day of the fixed
    # 7-day window ending 2013-09-08 (ts from time.time() is overwritten).
    influence_detail = []
    influence_value = []
    attention_value = []
    ts = time.time()
    ts = datetime2ts("2013-09-08") - 8 * 24 * 3600
    for i in range(1, 8):
        date = ts2datetime(ts + i * 24 * 3600).replace("-", "")
        # detail layout: [0] origin count, [1] retweeted count,
        # [2] total retweets received, [3] total comments received,
        # [4] top-retweeted weibo id, [5] its text, [6] its retweet count,
        # [7] top-commented weibo id, [8] its text, [9] its comment count.
        detail = [0] * 10
        try:
            item = es.get(index=date, doc_type="bci", id=uid)["_source"]
            # Disabled sensitive-field variant, kept for reference.
            """
            if return_results['utype']:
                detail[0] = item.get('s_origin_weibo_number', 0)
                detail[1] = item.get('s_retweeted_weibo_number', 0)
                detail[2] = item.get('s_origin_weibo_retweeted_total_number', 0) + item.get('s_retweeted_weibo_retweeted_total_number', 0)
                detail[3] = item.get('s_origin_weibo_comment_total_number', 0) + item.get('s_retweeted_weibo_comment_total_number', 0)
            else:
            """
            if 1:
                detail[0] = item.get("origin_weibo_number", 0)
                detail[1] = item.get("retweeted_weibo_number", 0)
                detail[2] = item.get("origin_weibo_retweeted_total_number", 0) + item.get(
                    "retweeted_weibo_retweeted_total_number", 0
                )
                detail[3] = item.get("origin_weibo_comment_total_number", 0) + item.get(
                    "retweeted_weibo_comment_total_number", 0
                )
                retweeted_id = item.get("origin_weibo_top_retweeted_id", "0")
                detail[4] = retweeted_id
                if retweeted_id:
                    # Text lookup is best-effort; missing text -> "".
                    try:
                        detail[5] = es.get(index="sensitive_user_text", doc_type="user", id=retweeted_id)["_source"][
                            "text"
                        ]
                    except:
                        detail[5] = ""
                else:
                    detail[5] = ""
                detail[6] = item.get("origin_weibo_retweeted_top_number", 0)
                detail[7] = item.get("origin_weibo_top_comment_id", "0")
                if detail[7]:
                    try:
                        detail[8] = es.get(index="sensitive_user_text", doc_type="user", id=detail[7])["_source"][
                            "text"
                        ]
                    except:
                        detail[8] = ""
                else:
                    detail[8] = ""
                detail[9] = item.get("origin_weibo_comment_top_number", 0)
            # Attention degree: logistic squash of (retweets + comments)
            # into [0, 1).
            attention_number = detail[2] + detail[3]
            attention = 2 / (1 + math.exp(-0.005 * attention_number)) - 1
            influence_value.append([date, item["user_index"]])
            influence_detail.append([date, detail])
            attention_value.append(attention)
        except:
            # No bci document for this day (or a lookup failed mid-way):
            # record zeros so the trend always has 7 points.
            influence_value.append([date, 0])
            influence_detail.append([date, detail])
            attention_value.append(0)
    return_results["influence_trend"] = influence_value
    return_results["common_influence_detail"] = influence_detail
    return_results["attention_degree"] = attention_value
    return return_results