def get_vary_detail_info(vary_detail_dict, uid_list):
    """Enrich location-vary records with user names and readable dates.

    vary_detail_dict: {vary_pattern: [[uid, start_ts, end_ts], ...]}
    uid_list: uids to resolve to unames via the user_portrait index.
    Returns {vary_pattern: [[uid, uname, start_date, end_date], ...]}
    where the dates are produced by ts2datetime.
    """
    results = {}
    # Resolve uid -> uname from user_portrait; a missing portrait doc
    # falls back to the uid itself.
    try:
        user_portrait_result = es.mget(index=portrait_index_name, doc_type=portrait_index_type,
                                       body={'ids': uid_list})['docs']
    except Exception:  # narrowed from bare except: keep best-effort fallback
        user_portrait_result = []
    uname_dict = {}
    for portrait_item in user_portrait_result:
        uid = portrait_item['_id']
        if portrait_item['found'] == True:
            uname_dict[uid] = portrait_item['_source']['uname']
        else:
            uname_dict[uid] = uid
    # Rewrite each vary record, replacing raw timestamps with dates.
    for vary_pattern in vary_detail_dict:
        user_info_list = vary_detail_dict[vary_pattern]
        new_pattern_list = []
        for user_item in user_info_list:
            uid = user_item[0]
            uname = uname_dict[uid]
            start_date = ts2datetime(int(user_item[1]))
            end_date = ts2datetime(int(user_item[2]))
            new_pattern_list.append([uid, uname, start_date, end_date])
        results[vary_pattern] = new_pattern_list
    return results
def get_group_user_track(uid):
    """Build a per-day 'main city' track for one user and reduce it to
    map payload: {'city': [distinct domestic cities], 'line': [[from, to], ...]}.

    Returns the string 'uid is not in user_portrait' when the portrait
    document cannot be fetched.
    NOTE(review): this function is redefined later in this file; the later
    definition shadows this one at import time — confirm which is intended.
    """
    results = []
    # step1: get activity_geo_dict from user_portrait
    try:
        portrait_result = es.get(index=portrait_index_name, doc_type=portrait_index_type,
                                 id=uid, _source=False, fields=['activity_geo_dict'])
    except Exception:  # narrowed from bare except: keep best-effort fallback
        portrait_result = {}
    if portrait_result == {}:
        return 'uid is not in user_portrait'
    activity_geo_dict = json.loads(portrait_result['fields']['activity_geo_dict'][0])
    now_date_ts = datetime2ts(ts2datetime(int(time.time())))
    start_ts = now_date_ts - DAY * len(activity_geo_dict)
    # step2: one [date, top_city] pair per day ('' when no activity that day)
    for geo_item in activity_geo_dict:
        iter_date = ts2datetime(start_ts)
        sort_day_dict = sorted(geo_item.items(), key=lambda x: x[1], reverse=True)
        if sort_day_dict:
            results.append([iter_date, sort_day_dict[0][0]])
        else:
            results.append([iter_date, ''])
        start_ts = start_ts + DAY
    # step3: distinct domestic cities; entries look like u'中国\t省\t市'
    geolist = []
    for day_item in results:
        city = day_item[1]
        if city and city.split('\t')[0] == u'中国':
            geolist.append(city)
    geolist = [i for i in set(geolist)]
    # step4: consecutive-day moves between two different domestic cities
    line_list = []
    for x in range(len(results) - 1):
        cur_city = results[x][1]
        next_city = results[x + 1][1]
        if cur_city != '' and next_city != '' \
                and cur_city.split('\t')[0] == u'中国' \
                and next_city.split('\t')[0] == u'中国' \
                and cur_city != next_city:
            line_list.append([cur_city, next_city])
    return {'city': geolist, 'line': line_list}
def get_group_user_track(uid):
    """Return a per-day [date, top_city] track for one portrait user.

    NOTE(review): this redefinition shadows the earlier get_group_user_track
    (which post-processes the track into city/line map data) — confirm which
    definition callers expect.
    Returns the string 'uid is not in user_portrait' when the portrait
    document cannot be fetched.
    """
    results = []
    # step1: fetch activity_geo_dict from user_portrait
    try:
        portrait_result = es.get(index=portrait_index_name, doc_type=portrait_index_type,
                                 id=uid, _source=False, fields=['activity_geo_dict'])
    except Exception:  # narrowed from bare except: keep best-effort fallback
        portrait_result = {}
    if portrait_result == {}:
        return 'uid is not in user_portrait'
    activity_geo_dict = json.loads(
        portrait_result['fields']['activity_geo_dict'][0])
    now_date_ts = datetime2ts(ts2datetime(int(time.time())))
    start_ts = now_date_ts - DAY * len(activity_geo_dict)
    # step2: per day, keep the city with the highest activity count
    for geo_item in activity_geo_dict:
        iter_date = ts2datetime(start_ts)
        sort_day_dict = sorted(geo_item.items(), key=lambda x: x[1], reverse=True)
        if sort_day_dict:
            results.append([iter_date, sort_day_dict[0][0]])
        else:
            results.append([iter_date, ''])
        start_ts = start_ts + DAY
    return results
def get_people_org_track(activity_geo_dict):
    """Compute a movement track from a list of per-day geo distributions.

    activity_geo_dict: list of {geo_string: count} dicts, one per day,
    assumed to cover the len(activity_geo_dict) days ending today.
    Returns {'city': [distinct domestic cities], 'line': [[from, to], ...]}.
    """
    track = []
    day_count = len(activity_geo_dict)
    # Walk dates forward from (today - day_count) days.
    cursor_ts = datetime2ts(ts2datetime(int(time.time()))) - DAY * day_count
    for day_geo in activity_geo_dict:
        date_str = ts2datetime(cursor_ts)
        ranked = sorted(day_geo.items(), key=lambda pair: pair[1], reverse=True)
        # Keep the most active city of the day, or '' for an empty day.
        top_city = ranked[0][0] if ranked else ''
        track.append([date_str, top_city])
        cursor_ts += DAY
    # Distinct domestic cities; geo strings look like u'中国\t省\t市'.
    city_set = set()
    for _, city in track:
        if city and city.split('\t')[0] == u'中国':
            city_set.add(city)
    # Consecutive-day movements between two different domestic cities.
    lines = []
    for prev, cur in zip(track, track[1:]):
        a, b = prev[1], cur[1]
        if a != '' and b != '' \
                and a.split('\t')[0] == u'中国' \
                and b.split('\t')[0] == u'中国' \
                and a != b:
            lines.append([a, b])
    return {'city': [i for i in city_set], 'line': lines}
def get_user_detail(date, input_result):
    """Return show-level detail dicts for a list of uids.

    date: 'yyyy-mm-dd' (BCI is read for the previous day) or 'all'.
    input_result: list of uids.
    Returns a list of {'uid', 'uname', 'location', 'fansnum', 'statusnum',
    'influence'} dicts; 'influence' is a 0-100 log-scaled score rounded to
    2 decimals, or '' when the uid has no BCI document.
    """
    bci_date = ts2datetime(datetime2ts(date) - DAY)
    results = []
    uid_list = input_result
    if date != 'all':
        index_name = 'bci_' + ''.join(bci_date.split('-'))
    else:
        now_ts = time.time()
        now_date = ts2datetime(now_ts)
        index_name = 'bci_' + ''.join(now_date.split('-'))
    index_type = 'bci'
    # influence / fans / status counts
    user_bci_result = es_cluster.mget(index=index_name, doc_type=index_type,
                                      body={'ids': uid_list}, _source=True)['docs']
    # nick name and registered location
    user_profile_result = es_user_profile.mget(index=profile_index_name, doc_type=profile_index_type,
                                               body={'ids': uid_list}, _source=True)['docs']
    max_evaluate_influ = get_evaluate_max(index_name)
    for i in range(0, len(uid_list)):
        uid = uid_list[i]
        bci_dict = user_bci_result[i]
        profile_dict = user_profile_result[i]
        bci_source = bci_dict.get('_source')
        if bci_source:
            influence = bci_source['user_index']
            influence = math.log(influence / float(max_evaluate_influ['user_index']) * 9 + 1, 10)
            # Round here: the original rounded at append time, which raised
            # TypeError for the '' (no-BCI-doc) case below.
            influence = round(influence * 100, 2)
        else:
            influence = ''
        profile_source = profile_dict.get('_source')
        if profile_source:
            uname = profile_source['nick_name']
            location = profile_source['user_location']
        else:
            uname = uid
            location = ''
        # mget was called with _source=True, so hits carry '_source', not
        # 'fields'; the original 'fields' lookups always failed and forced
        # both counts to 0. Read from _source, defaulting to 0.
        if bci_source:
            fansnum = bci_source.get('user_fansnum', 0)
            statusnum = bci_source.get('weibo_month_sum', 0)
        else:
            fansnum = 0
            statusnum = 0
        results.append({'uid': uid, 'uname': uname, 'location': location,
                        'fansnum': fansnum, 'statusnum': statusnum,
                        'influence': influence})
    return results
def recommentation_in(input_ts, recomment_type, submit_user, node_type):
    """Return pending recommend-in users for one day / type / node.

    input_ts: unix timestamp selecting the day.
    recomment_type, node_type: parts of the redis hash key
        'recomment_<date>_<type>_<node>'.
    submit_user: kept for interface compatibility; not used by the
        current filtering logic.
    Returns up to 1000 users (detail dicts from get_user_detail), minus
    anyone already in the 'compute' hash; [] when nothing is pending.
    """
    date = ts2datetime(input_ts)
    hash_name = 'recomment_' + str(
        date) + "_" + recomment_type + "_" + node_type
    # Nothing recommended for this day/type/node.
    results = r.hgetall(hash_name)
    if not results:
        return []
    # Drop users that were already moved into computation.
    recommend_list = set(r.hkeys(hash_name))
    identify_in_list = set(r.hkeys("compute"))
    recomment_results = list(recommend_list - identify_in_list)
    if recomment_results:
        return get_user_detail(date, recomment_results[:1000], 'show_in',
                               recomment_type)
    return []
def query_retweeted(uid, mid, ts, ttype=3):
    """Count messages of type `ttype` under root weibo `mid` directed at
    `uid`, over the flow-text indices for the day of `ts` and the next day
    (only indices that actually exist are queried)."""
    filters = [
        {"term": {"root_mid": mid}},
        {"term": {"directed_uid": uid}},
        {"term": {"message_type": ttype}},
    ]
    query_body = {"query": {"bool": {"must": filters}}}
    index_list = []
    day_ts = ts
    for _ in range(2):
        candidate = flow_text_index_name_pre + ts2datetime(day_ts)
        if es_flow_text.indices.exists(index=candidate):
            index_list.append(candidate)
        day_ts += 3600 * 24
    return es_flow_text.count(index=index_list,
                              doc_type=flow_text_index_type,
                              body=query_body)["count"]
def get_user_detail(date, input_result, status, user_type="influence", auth=""):
    # NOTE(review): this redefines get_user_detail from earlier in the file
    # (different arity), and the body visible here computes top_sensitive but
    # never returns anything — it looks truncated; confirm against the full
    # source before relying on it.
    #
    # date: 'yyyy-mm-dd' or 'all'; BCI data is read for the previous day.
    # input_result: a list of uids ('show_in') or a dict keyed by uid
    #     ('show_compute' / 'show_in_history').
    # user_type / auth: presumably select ranking flavor and permissions —
    #     TODO confirm; they are unused in the visible portion.
    bci_date = ts2datetime(datetime2ts(date) - DAY)
    results = []
    if status == 'show_in':
        uid_list = input_result
    if status == 'show_compute':
        uid_list = input_result.keys()
    if status == 'show_in_history':
        uid_list = input_result.keys()
    if date != 'all':
        index_name = 'bci_' + ''.join(bci_date.split('-'))
    else:
        now_ts = time.time()
        now_date = ts2datetime(now_ts)
        index_name = 'bci_' + ''.join(now_date.split('-'))
    # Daily max sensitive score, used as a normalization ceiling.
    tmp_ts = str(datetime2ts(date) - DAY)
    sensitive_string = "sensitive_score_" + tmp_ts
    query_sensitive_body = {
        "query": {
            "match_all": {}
        },
        "size": 1,
        "sort": {
            sensitive_string: {
                "order": "desc"
            }
        }
    }
    try:
        top_sensitive_result = es_bci_history.search(
            index=ES_SENSITIVE_INDEX,
            doc_type=DOCTYPE_SENSITIVE_INDEX,
            body=query_sensitive_body,
            _source=False,
            fields=[sensitive_string])['hits']['hits']
        top_sensitive = top_sensitive_result[0]['fields'][sensitive_string][0]
    except Exception, reason:
        # Fall back to a fixed ceiling when the sensitive index is missing
        # or empty (also hit when the day has no scored documents).
        print Exception, reason
        top_sensitive = 400
def recommentation_in_auto(date, submit_user): results = [] #run type if RUN_TYPE == 1: now_date = search_date else: now_date = ts2datetime(datetime2ts(RUN_TEST_TIME)) recomment_hash_name = 'recomment_' + now_date + '_auto' # print recomment_hash_name,'============' recomment_influence_hash_name = 'recomment_' + now_date + '_influence' recomment_sensitive_hash_name = 'recomment_' + now_date + '_sensitive' recomment_submit_hash_name = 'recomment_' + submit_user + '_' + now_date recomment_compute_hash_name = 'compute' # #step1: get auto # auto_result = r.hget(recomment_hash_name, 'auto') # if auto_result: # auto_user_list = json.loads(auto_result) # else: # auto_user_list = [] #step2: get admin user result admin_result = r.hget(recomment_hash_name, submit_user) admin_user_list = [] if admin_result: admin_result_dict = json.loads(admin_result) else: return None final_result = [] #step3: get union user and filter compute/influence/sensitive for k, v in admin_result_dict.iteritems(): admin_user_list = v union_user_auto_set = set(admin_user_list) influence_user = set(r.hkeys(recomment_influence_hash_name)) sensitive_user = set(r.hkeys(recomment_sensitive_hash_name)) compute_user = set(r.hkeys(recomment_compute_hash_name)) been_submit_user = set(r.hkeys(recomment_submit_hash_name)) filter_union_user = union_user_auto_set - ( influence_user | sensitive_user | compute_user | been_submit_user) auto_user_list = list(filter_union_user) #step4: get user detail if auto_user_list == []: return auto_user_list results = get_user_detail(now_date, auto_user_list, 'show_in', 'auto') for detail in results: #add root re_detail = detail re_detail.append(k) final_result.append(re_detail) return final_result
def recommentation_in(input_ts, recomment_type):
    """Fetch up to three pending recommend-in users for one day and type.

    Looks up the redis hash 'recomment_<date>_<type>', removes anyone
    already present in the 'compute' hash, and returns detail records for
    at most three of the remaining uids ([] when nothing is pending).
    """
    date = ts2datetime(input_ts)
    hash_name = 'recomment_' + str(date) + "_" + recomment_type
    identify_in_hashname = "identify_in_" + str(date)
    stored = r.hgetall(hash_name)
    if not stored:
        return []
    pending = set(r.hkeys(hash_name)) - set(r.hkeys("compute"))
    picked = list(pending)[0:3]
    if picked:
        return get_user_detail(date, picked)
    return []
def search_bci(uid, date=1480176000):
    """Fetch a user's fan / status / friend counts from the BCI index.

    uid: user id to look up.
    date: reference unix timestamp; the BCI index of the previous day is
        read. Defaults to the historical snapshot the original hard-coded
        (2016-11-27); pass int(time.time()) for 'yesterday'.
    Returns {'fansnum', 'statusnum', 'friendnum'}; all values are '' when
    the document is missing.
    """
    uid_list = [uid]
    bci_date = ts2datetime(date - DAY)
    index_name = 'bci_' + ''.join(bci_date.split('-'))
    index_type = 'bci'
    user_bci_result = es_cluster.mget(index=index_name, doc_type=index_type,
                                      body={'ids': uid_list})['docs']
    result = {'fansnum': '', 'statusnum': '', 'friendnum': ''}
    # Original guard was inverted (`if len(...)` returned the empty result
    # exactly when docs WERE found); bail out only when mget returned nothing.
    if not user_bci_result:
        return result
    for item in user_bci_result:
        if not item['found']:
            return result
        data = item['_source']
        fansnum = data['user_fansnum']
        friendsnum = data['user_friendsnum']
        statusnum = data['origin_weibo_number'] + data['retweeted_weibo_number']
        result = {'fansnum': fansnum, 'statusnum': statusnum,
                  'friendnum': friendsnum}
    return result
def get_final_submit_user_info(uid_list):
    """Return display rows for submitted users.

    uid_list: uids to enrich.
    Returns a list of [uid, uname, location, fansnum, statusnum, normal_bci]
    rows; normal_bci is the user's BCI log-normalized against yesterday's
    maximum onto a 0-100 scale ('' when unavailable).
    """
    final_results = []
    try:
        profile_results = es_user_profile.mget(index=profile_index_name,
                                               doc_type=profile_index_type,
                                               body={'ids': uid_list})['docs']
    except Exception:  # narrowed from bare except: keep best-effort fallback
        profile_results = []
    try:
        bci_history_results = es_bci_history.mget(
            index=bci_history_index_name,
            doc_type=bci_history_index_type,
            body={'ids': uid_list})['docs']
    except Exception:  # narrowed from bare except
        bci_history_results = []
    # Yesterday's BCI field name, e.g. 'bci_1480176000', and its maximum
    # value across the index (used to normalize each user's score).
    now_time_ts = time.time()
    search_date_ts = datetime2ts(ts2datetime(now_time_ts - DAY))
    bci_key = 'bci_' + str(search_date_ts)
    query_body = {
        'query': {
            'match_all': {}
        },
        'sort': [{
            bci_key: {
                'order': 'desc'
            }
        }],
        'size': 1
    }
    bci_max_result = es_bci_history.search(index=bci_history_index_name,
                                           doc_type=bci_history_index_type,
                                           body=query_body,
                                           _source=False,
                                           fields=[bci_key])['hits']['hits']
    if bci_max_result:
        bci_max_value = bci_max_result[0]['fields'][bci_key][0]
    else:
        bci_max_value = MAX_VALUE
    iter_count = 0
    for uid in uid_list:
        # mget result lists may be shorter than uid_list when a request
        # failed above; treat out-of-range as "not found".
        try:
            profile_item = profile_results[iter_count]
        except IndexError:
            profile_item = {}
        try:
            bci_history_item = bci_history_results[iter_count]
        except IndexError:
            bci_history_item = {}
        if profile_item and profile_item['found'] == True:
            uname = profile_item['_source']['nick_name']
            location = profile_item['_source']['user_location']
        else:
            uname = ''
            location = ''
        if bci_history_item and bci_history_item['found'] == True:
            fansnum = bci_history_item['_source']['user_fansnum']
            statusnum = bci_history_item['_source']['weibo_month_sum']
            try:
                bci = bci_history_item['_source'][bci_key]
                # NOTE(review): under Python 2, bci / bci_max_value is integer
                # division when both are ints — confirm bci_max_value is float.
                normal_bci = math.log(bci / bci_max_value * 9 + 1, 10) * 100
            except Exception:
                normal_bci = ''
        else:
            fansnum = ''
            statusnum = ''
            normal_bci = ''
        final_results.append(
            [uid, uname, location, fansnum, statusnum, normal_bci])
        iter_count += 1
    return final_results
def group_geo_vary(g_name, submit_user):
    # Aggregate location-change statistics for every member of a group.
    # g_name: group display name (pinyin-lowered into the group doc id);
    # submit_user: unused in the visible body — presumably kept for the
    # caller's interface; TODO confirm.
    # Returns a dict with: per-pattern vary counts ('activity_geo_vary'),
    # start/end city counts, per-pattern [uid, start_ts, end_ts] details,
    # the overall main activity geo, and the per-day merged distributions.
    group_id = p.get_pinyin(g_name)
    group_id = group_id.lower()
    uid_string = es_group.get(index=group_name, doc_type=group_type,
                              id=group_id, fields=['people'])
    # member uids are stored as one '&'-joined string
    uid_list = uid_string['fields']['people'][0].split('&')
    activity_geo_vary = {}
    main_start_geo = {}
    main_end_geo = {}
    vary_detail_geo = {}
    activity_geo_distribution_date = {}
    if RUN_TYPE == 1:
        now_ts = int(time.time())
    else:
        now_ts = datetime2ts(RUN_TEST_TIME)
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    try:
        iter_user_dict_list = es.mget(index=portrait_index_name, doc_type=portrait_index_type, \
            body={'ids': uid_list})['docs']
    except:
        iter_user_dict_list = []
    for user_dict in iter_user_dict_list:
        uid = user_dict['_id']
        source = user_dict['_source']
        #attr8: activity_geo_dict---distribution by date
        user_activity_geo = {}
        # one {geo: count} dict per day, oldest first — TODO confirm ordering
        activity_geo_dict_list = json.loads(source['activity_geo_dict'])
        activity_geo_date_count = len(activity_geo_dict_list)
        iter_ts = now_date_ts - activity_geo_date_count * DAY
        # [ [main_city, first_ts_seen], ... ] with consecutive dupes collapsed
        user_date_main_list = []
        for i in range(0, activity_geo_date_count):
            date_item = activity_geo_dict_list[i]
            # merge this member's day distribution into the group-wide one
            if iter_ts in activity_geo_distribution_date:
                activity_geo_distribution_date[iter_ts] = union_dict_list(
                    [activity_geo_distribution_date[iter_ts], date_item])
            else:
                activity_geo_distribution_date[iter_ts] = date_item
            #use to get activity_geo vary
            sort_date_item = sorted(date_item.items(),
                                    key=lambda x: x[1],
                                    reverse=True)
            if date_item != {}:
                main_date_city = sort_date_item[0][0]
                try:
                    last_user_date_main_item = user_date_main_list[-1][0]
                except:
                    # empty list: no previous main city yet
                    last_user_date_main_item = ''
                if main_date_city != last_user_date_main_item:
                    user_date_main_list.append([main_date_city, iter_ts])
            iter_ts += DAY
        #attr8: activity_geo_dict---location vary
        # every adjacent pair in user_date_main_list is one relocation
        if len(user_date_main_list) > 1:
            for i in range(1, len(user_date_main_list)):
                vary_city = [
                    geo_ts_item[0]
                    for geo_ts_item in user_date_main_list[i - 1:i + 1]
                ]
                vary_ts = [
                    geo_ts_item[1]
                    for geo_ts_item in user_date_main_list[i - 1:i + 1]
                ]
                vary_item = '&'.join(vary_city)
                #vary_item = '&'.join(user_date_main_list[i-1:i+1])
                #get activity geo vary for vary table and map
                try:
                    activity_geo_vary[vary_item] += 1
                except:
                    activity_geo_vary[vary_item] = 1
                #get main start geo
                try:
                    main_start_geo[vary_city[0]] += 1
                except:
                    main_start_geo[vary_city[0]] = 1
                #get main end geo
                try:
                    main_end_geo[vary_city[1]] += 1
                except:
                    main_end_geo[vary_city[1]] = 1
                #get vary detail geo
                try:
                    vary_detail_geo[vary_item].append(
                        [uid, vary_ts[0], vary_ts[1]])
                except:
                    vary_detail_geo[vary_item] = [[
                        uid, vary_ts[0], vary_ts[1]
                    ]]
    # overall most-active geo across all members and days
    all_activity_geo = union_dict_list(activity_geo_distribution_date.values())
    sort_all_activity_geo = sorted(all_activity_geo.items(),
                                   key=lambda x: x[1],
                                   reverse=True)
    try:
        main_activity_geo = sort_all_activity_geo[0][0]
    except:
        main_activity_geo = ''
    return {'main_start_geo':main_start_geo, 'main_end_geo': main_end_geo, \
        'vary_detail_geo': vary_detail_geo, 'activity_geo_vary':activity_geo_vary,\
        'main_activity_geo':main_activity_geo, 'activity_geo_distribution_date':activity_geo_distribution_date}
def current_status(mid): es_results = es_prediction.get(index="social_sensing_text", doc_type="text", id=mid)["_source"] uid = es_results["uid"] ts = es_results["timestamp"] print "mid result: ", es_results query_body = { "query": { "bool": { "must": [{ "term": { "root_mid": mid } }, { "term": { "message_type": 3 } }] } }, "aggs": { "hot_uid": { "terms": { "field": "directed_uid", "size": 11 } } } } index_list = [] for i in range(2): index_name = flow_text_index_name_pre + ts2datetime(ts) if es_flow_text.indices.exists(index=index_name): index_list.append(index_name) ts = ts + 3600 * 24 results = es_flow_text.search( index=index_list, doc_type=flow_text_index_type, body=query_body)["aggregations"]["hot_uid"]["buckets"] retweet_dict = dict() for item in results: iter_uid = item["key"] if str(iter_uid) == str(uid): continue else: retweet_dict[str(iter_uid)] = item["doc_count"] print "retweet_dict: ", retweet_dict query_body = { "query": { "bool": { "must": [{ "term": { "root_mid": mid } }, { "term": { "message_type": 2 } }] } }, "aggs": { "hot_uid": { "terms": { "field": "directed_uid", "size": 11 } } } } index_name = flow_text_index_name_pre + ts2datetime(ts) results = es_flow_text.search( index=index_list, doc_type=flow_text_index_type, body=query_body)["aggregations"]["hot_uid"]["buckets"] comment_dict = dict() for item in results: iter_uid = str(item["key"]) if iter_uid == str(uid): continue else: comment_dict[iter_uid] = item["doc_count"] print "comment_dict: ", comment_dict # user_profile uid_list = list(set(comment_dict.keys()) | set(retweet_dict.keys())) profile_results = es_user_profile.mget(index=profile_index_name, doc_type=profile_index_type, body={"ids": uid_list})["docs"] profile_dict = dict() for item in profile_results: if item["found"]: item = item["_source"] iter_uid = str(item["uid"]) tmp = dict() tmp["nick_name"] = item["nick_name"] if not tmp["nick_name"]: tmp["nick_name"] = iter_uid tmp["photo_url"] = item["photo_url"] profile_dict[iter_uid] = 
tmp else: tmp = dict() tmp["nick_name"] = item["_id"] tmp["photo_url"] = "" profile_dict[iter_uid] = tmp hot_retweet_list = [] retweet_uid_list = retweet_dict.keys() retweet_list = es_flow_text.search(index=index_list, doc_type="text", body={ "query": { "bool": { "must": [{ "terms": { "uid": retweet_uid_list } }, { "term": { "root_mid": mid } }] } }, "size": 100 })["hits"]["hits"] in_set = set() for item in retweet_list: item = item["_source"] iter_uid = str(item["uid"]) if iter_uid in in_set: continue else: in_set.add(iter_uid) item["retweeted"] = retweet_dict[iter_uid] item["comment"] = query_retweeted(iter_uid, mid, ts, 2) # 获取转发微博的评论量 item.update(profile_dict[iter_uid]) hot_retweet_list.append(item) hot_retweet_list = sorted(hot_retweet_list, key=lambda x: x["retweeted"], reverse=True) hot_comment_list = [] comment_uid_list = comment_dict.keys() comment_list = es_flow_text.search(index=index_list, doc_type="text", body={ "query": { "bool": { "must": [{ "terms": { "uid": comment_uid_list } }, { "term": { "root_mid": mid } }] } }, "size": 100 })["hits"]["hits"] in_set = set() for item in comment_list: item = item["_source"] iter_uid = str(item["uid"]) if iter_uid in in_set: continue else: in_set.add(iter_uid) item["comment"] = comment_dict[iter_uid] item["retweeted"] = query_retweeted(iter_uid, mid, ts, 3) # 获取转发微博的评论量 item.update(profile_dict[iter_uid]) hot_comment_list.append(item) hot_comment_list = sorted(hot_comment_list, key=lambda x: x["comment"], reverse=True) results = dict() results["hot_retweeted"] = hot_retweet_list results["hot_comment"] = hot_comment_list return results