def get_show_trace_followers(xnr_user_no):
    """Return profile records for every user a virtual persona is tracing.

    Reads the persona's ``trace_follow_list`` from the fans/followers index,
    then bulk-fetches each traced uid's profile from the twitter user index.
    Uids without a stored profile get a zeroed placeholder entry so the
    caller always receives one record per traced uid.

    :param xnr_user_no: document id of the virtual persona.
    :return: list of user-info dicts (empty when nothing is traced).
    """
    es_get_result = es.get(index=tw_xnr_fans_followers_index_name,
                           doc_type=tw_xnr_fans_followers_index_type,
                           id=xnr_user_no)['_source']
    trace_follow_list = es_get_result['trace_follow_list']
    weibo_user_info = []
    if trace_follow_list:
        mget_results = es.mget(index=twitter_user_index_name,
                               doc_type=twitter_user_index_type,
                               body={'ids': trace_follow_list})['docs']
        for result in mget_results:
            if result['found']:
                weibo_user_info.append(result['_source'])
            else:
                # No stored profile for this uid: emit a placeholder record
                # (nick_name falls back to the uid itself).
                uid = result['_id']
                weibo_user_info.append({
                    'uid': uid,
                    'statusnum': 0,
                    'fansnum': 0,
                    'friendsnum': 0,
                    'photo_url': '',
                    'sex': '',
                    'nick_name': uid,
                    'user_location': ''
                })
    # NOTE: the former "else: weibo_user_info = []" branch was redundant --
    # the list is already initialized above -- and has been removed.
    return weibo_user_info
def my_topic_classfiy(uid_list, datetime_list):
    """Classify the discussion topics of each uid over the given dates.

    Cached results are read back from the twitter portrait index; only
    uncached uids are recomputed from the flow-text indices and written
    back via save_data2es.

    :param uid_list: uids to classify.
    :param datetime_list: date strings used to build flow-text index names.
    :return: (topic_dict_by_uid, topic_label_list_by_uid) covering both
             cached and freshly computed uids.
    """
    topic_dict_results = {}
    topic_string_results = {}
    # Persist computed results to the DB, and before computing check whether
    # the DB already holds a previously stored result, to improve efficiency.
    uids = uid_list
    unresolved_uids = []
    res = es.mget(index=tw_portrait_index_name,
                  doc_type=tw_portrait_index_type,
                  body={'ids': uids})['docs']
    for r in res:
        uid = r['_id']
        if r.has_key('found'):
            found = r['found']
            if found and r['_source'].has_key('topic'):
                # Cache hit: 'topic' is a JSON dict, 'topic_string' is a
                # '&'-joined list of Chinese topic labels mapped back to
                # their English names.
                topic = r['_source']['topic']
                topic_string = r['_source']['topic_string']
                topic_dict_results[uid] = json.loads(topic)
                topic_string_results[uid] = [topic_ch2en_dict[ch_topic] for ch_topic in topic_string.split('&')]
            else:
                unresolved_uids.append(uid)
        else:
            # The ES table currently holds no record at all for this uid.
            unresolved_uids.append(uid)
    # Compute and store topics for the uids that were not in the database.
    user_topic_dict = {}
    user_topic_list = {}
    if unresolved_uids:
        tw_flow_text_index_list = []
        for datetime in datetime_list:
            tw_flow_text_index_list.append(flow_text_index_name_pre + datetime)
        user_topic_data = get_filter_keywords(tw_flow_text_index_list, unresolved_uids)
        user_topic_dict, user_topic_list = topic_classfiy(unresolved_uids, user_topic_data)
        # Build the '&'-joined Chinese label string stored alongside the
        # topic dict (zh_data / name_list map English names to Chinese).
        user_topic_string = {}
        for uid, topic_list in user_topic_list.items():
            li = []
            for t in topic_list:
                li.append(zh_data[name_list.index(t)].decode('utf8'))
            user_topic_string[uid] = '&'.join(li)
        user_topic = {}
        for uid in unresolved_uids:
            if uid in user_topic_dict:
                user_topic[uid] = {
                    'filter_keywords': json.dumps(user_topic_data[uid]),
                    'topic': json.dumps(user_topic_dict[uid]),
                    'topic_string': user_topic_string[uid]
                }
            else:
                # No classification produced for this uid: store empties so
                # the cache records that it has been processed.
                user_topic[uid] = {
                    'filter_keywords': json.dumps({}),
                    'topic': json.dumps({}),
                    'topic_string': ''
                }
        save_data2es(user_topic)
    # Merge the cached results into the freshly computed ones.
    user_topic_dict.update(topic_dict_results)
    user_topic_list.update(topic_string_results)
    return user_topic_dict, user_topic_list
def get_recommend_at_user(xnr_user_no):
    """Recommend users to @-mention, from yesterday's most recent tweets.

    Takes the top tweets of the previous day (most recent first) and maps
    their authors' uids to nick names via the twitter user index, capped at
    DAILY_AT_RECOMMEND_USER_TOP entries.

    :param xnr_user_no: document id of the virtual persona.
    :return: dict mapping uid -> nick_name (possibly empty).
    """
    es_result = es.get(index=tw_xnr_index_name,
                       doc_type=tw_xnr_index_type,
                       id=xnr_user_no)['_source']
    uid_nick_name_dict = dict()  # init up front so all paths can return it
    if es_result:
        if S_TYPE == 'test':
            now_ts = datetime2ts(S_DATE_TW)
        else:
            now_ts = int(time.time())
        # Look at yesterday's flow-text index.
        datetime = ts2datetime(now_ts - 24 * 3600)
        index_name = twitter_flow_text_index_name_pre + datetime
        # NOTE(review): the persona's 'daily_interests' were previously read
        # and split but never used -- the query below is a plain match_all.
        # Dead code removed; TODO: filter by daily interests if intended.
        es_results_daily = es.search(index=index_name,
                                     doc_type=twitter_flow_text_index_type,
                                     body={'query': {'match_all': {}},
                                           'size': 200,
                                           'sort': {'timestamp': {'order': 'desc'}}})['hits']['hits']
        uid_list = []
        for result in es_results_daily:
            uid_list.append(result['_source']['uid'])
        if uid_list:  # guard: mget with an empty ids list fails
            ## Resolve nick_name from the user index by uid
            ## (uid is stable, while nick_name may change over time).
            es_results_user = es.mget(index=twitter_user_index_name,
                                      doc_type=twitter_user_index_type,
                                      body={'ids': uid_list})['docs']
            i = 0
            for result in es_results_user:
                if result['found'] == True:
                    result = result['_source']
                    uid = result['uid']
                    nick_name = result['name']
                    if nick_name:
                        i += 1
                        uid_nick_name_dict[uid] = nick_name
                    if i >= DAILY_AT_RECOMMEND_USER_TOP:
                        break
    return uid_nick_name_dict
def create_event_warning(xnr_user_no, today_datetime, write_mark):
    """Build (and optionally persist) sensitive-event warnings for one day.

    For each hashtag event of the day, collects its sensitive facebook
    posts, scores each post's influence, aggregates per-event influence and
    average time, and gathers the main participating users.

    :param xnr_user_no: virtual persona id the warnings belong to.
    :param today_datetime: day (timestamp) being analyzed.
    :param write_mark: truthy -> write each warning to ES and collect the
                       write status; falsy -> collect the warning dicts.
    :return: list of write-status marks or warning dicts, one per event
             that had sensitive posts.
    """
    # Fetch the event (hashtag) names for the day.
    hashtag_list = get_hashtag(today_datetime)
    #print 'hashtag_list/:',hashtag_list
    facebook_flow_text_index_name = get_timets_set_indexset_list(
        facebook_flow_text_index_name_pre, today_datetime, today_datetime)
    # The virtual persona's friends list.
    friends_list = lookup_xnr_friends(xnr_user_no)
    event_warming_list = []
    for event_item in hashtag_list:
        event_warming_content = dict()
        # Per event: name, main participants, representative posts,
        # event influence, and average event time.
        event_warming_content['event_name'] = event_item['event_name']
        event_influence_sum = 0
        event_time_sum = 0
        # Sensitive (sensitive >= 1) posts of this hashtag, most sensitive
        # first, capped at MAX_WARMING_SIZE.
        query_body = {
            'query': {
                'filtered': {
                    'filter': {
                        'bool': {
                            'must': [{
                                'term': {
                                    'hashtag': event_item['event_name']
                                }
                            }, {
                                'range': {
                                    'sensitive': {
                                        'gte': 1
                                    }
                                }
                            }]
                        }
                    }
                }
            },
            'size': MAX_WARMING_SIZE,
            'sort': {
                'sensitive': {
                    'order': 'desc'
                }
            }
        }
        event_results = es_xnr_2.search(index=facebook_flow_text_index_name,
                                        doc_type=facebook_flow_text_index_type,
                                        body=query_body)['hits']['hits']
        if event_results:
            facebook_result = []
            friends_num_dict = dict()
            alluser_num_dict = dict()
            #print 'sencond_time:::',int(time.time())
            for item in event_results:
                # Look up the three engagement metrics for the post;
                # default all three to 0 when absent.
                fid_result = lookup_fid_attend_index(item['_source']['fid'], today_datetime)
                if fid_result:
                    item['_source']['comment'] = fid_result['comment']
                    item['_source']['share'] = fid_result['share']
                    item['_source']['favorite'] = fid_result['favorite']
                else:
                    item['_source']['comment'] = 0
                    item['_source']['share'] = 0
                    item['_source']['favorite'] = 0
                #print 'event_content:',item['_source']['text']
                # Tally per-user post counts; posts by the persona's
                # friends are weighted double.
                if alluser_num_dict.has_key(str(item['_source']['uid'])):
                    friends_mark = set_intersection(item['_source']['uid'], friends_list)
                    if friends_mark > 0:
                        alluser_num_dict[str(
                            item['_source']['uid'])] = alluser_num_dict[str(
                                item['_source']['uid'])] + 1 * 2
                    else:
                        alluser_num_dict[str(
                            item['_source']['uid'])] = alluser_num_dict[str(
                                item['_source']['uid'])] + 1
                else:
                    alluser_num_dict[str(item['_source']['uid'])] = 1
                # Influence score: engagement x sensitivity, scaled by the
                # author's relationship to the persona.
                origin_influence_value = (1 + item['_source']['comment'] +
                                          item['_source']['share'] +
                                          item['_source']['favorite']) * (
                                              1 + item['_source']['sensitive'])
                friends_value = judge_user_type(item['_source']['uid'], friends_list)
                item['_source'][
                    'facebook_influence_value'] = origin_influence_value * friends_value
                # Resolve the author's nick name.
                item['_source']['nick_name'] = get_user_nickname(
                    item['_source']['uid'])
                facebook_result.append(item['_source'])
                # Accumulate influence and time for the event averages.
                event_influence_sum = event_influence_sum + item['_source'][
                    'facebook_influence_value']
                event_time_sum = event_time_sum + item['_source']['timestamp']
            # print 'third_time:::',int(time.time())
            # Representative posts: highest influence first.
            facebook_result.sort(key=lambda k:
                                 (k.get('facebook_influence_value', 0)),
                                 reverse=True)
            event_warming_content['main_facebook_info'] = json.dumps(
                facebook_result)
            # Event influence and average event time.
            number = len(event_results)
            event_warming_content[
                'event_influence'] = event_influence_sum / number
            event_warming_content['event_time'] = event_time_sum / number
            # Rank users by their (weighted) post counts.
            alluser_num_dict = sorted(alluser_num_dict.items(),
                                      key=lambda d: d[1],
                                      reverse=True)
            main_userid_list = []
            for i in xrange(0, len(alluser_num_dict)):
                main_userid_list.append(alluser_num_dict[i][0])
            # Fetch profile details for the main participants; absent
            # fields/users fall back to empty/zero defaults.
            main_user_info = []
            user_es_result = es_xnr_2.mget(index=facebook_user_index_name,
                                           doc_type=facebook_user_index_type,
                                           body={'ids': main_userid_list})['docs']
            # print 'user_es_result:',user_es_result
            for item in user_es_result:
                user_dict = dict()
                if item['found']:
                    user_dict['uid'] = item['_id']
                    user_dict['username'] = item['_source']['username']
                    if item['_source'].has_key('talking_about_count'):
                        user_dict['talking_about_count'] = item['_source'][
                            'talking_about_count']
                    else:
                        user_dict['talking_about_count'] = 0
                    if item['_source'].has_key('likes'):
                        user_dict['likes'] = item['_source']['likes']
                    else:
                        user_dict['likes'] = 0
                    if item['_source'].has_key('category'):
                        user_dict['category'] = item['_source']['category']
                    else:
                        user_dict['category'] = ''
                else:
                    # user_dict['icon']=''
                    user_dict['uid'] = item['_id']
                    user_dict['username'] = ''
                    user_dict['talking_about_count'] = 0
                    user_dict['likes'] = 0
                    user_dict['category'] = ''
                main_user_info.append(user_dict)
            event_warming_content['main_user_info'] = json.dumps(
                main_user_info)
            # print 'fourth_time:::',int(time.time())
            event_warming_content['xnr_user_no'] = xnr_user_no
            event_warming_content['validity'] = 0
            event_warming_content['timestamp'] = today_datetime
            now_time = int(time.time())
            # task_id=xnr_user_no+'_'+str(now_time)
            task_id = xnr_user_no + '_' + event_warming_content['event_name']
            # Write to the database when requested; otherwise return the
            # warning content itself.
            if write_mark:
                # print 'today_datetime:::',ts2datetime(today_datetime)
                print 'task_id_event:', task_id
                mark = write_envent_warming(today_datetime,
                                            event_warming_content, task_id)
                event_warming_list.append(mark)
            else:
                event_warming_list.append(event_warming_content)
        else:
            # No sensitive posts for this event: skip it.
            pass
    # print 'fifth_time:::',int(time.time())
    return event_warming_list
def detect_by_seed_users(seed_users):
    """Expand a set of seed users into a related group via interactions.

    For each seed uid, unions the users it retweeted / was retweeted by /
    commented on / was commented on by (only be_retweet data is currently
    enabled) and returns the de-duplicated list of all related uids.

    :param seed_users: list of seed uids.
    :return: list of related uids (unordered, de-duplicated).
    """
    retweet_mark = 1  # only partial retweet data available currently
    comment_mark = 0  # no comment data yet
    group_uid_list = set()
    all_union_result_dict = {}
    # Pick the retweet/comment ES db shard for the current time.
    now_ts = time.time()
    db_number = get_db_num(now_ts)

    # Defaults so step3 degrades gracefully when a source is disabled or a
    # fetch fails (previously the missing names raised NameError that was
    # silently swallowed by the bare excepts below).
    retweet_result = []
    be_retweet_result = []
    comment_result = []
    be_comment_result = []

    # step1: mget be_retweet. (The plain-retweet fetch was disabled in the
    # original and remains so -- retweet_result stays empty.)
    if retweet_mark == 1:
        be_retweet_index_name = be_retweet_index_name_pre + str(db_number)
        try:
            be_retweet_result = es.mget(index=be_retweet_index_name,
                                        doc_type=be_retweet_index_type,
                                        body={'ids': seed_users},
                                        _source=True)['docs']
        except:
            be_retweet_result = []

    # step2: mget comment and be_comment -- disabled (comment_mark == 0),
    # kept behind the flag for when comment data becomes available.
    if comment_mark == 1:
        comment_index_name = comment_index_name_pre + str(db_number)
        be_comment_index_name = be_comment_index_name_pre + str(db_number)
        try:
            comment_result = es.mget(index=comment_index_name,
                                     doc_type=comment_index_type,
                                     body={'ids': seed_users},
                                     _source=True)['docs']
        except:
            comment_result = []
        try:
            be_comment_result = es.mget(index=be_comment_index_name,
                                        doc_type=be_comment_index_type,
                                        body={'ids': seed_users},
                                        _source=True)['docs']
        except:
            be_comment_result = []

    # step3: union the four relation dicts for each seed user. The mget
    # results are positional, parallel to seed_users.
    union_count = 0
    for iter_search_uid in seed_users:
        try:
            uid_retweet_dict = json.loads(
                retweet_result[union_count]['_source']['uid_retweet'])
        except:
            uid_retweet_dict = {}
        try:
            uid_be_retweet_dict = json.loads(
                be_retweet_result[union_count]['_source']['uid_be_retweet'])
        except:
            uid_be_retweet_dict = {}
        try:
            uid_comment_dict = json.loads(
                comment_result[union_count]['_source']['uid_comment'])
        except:
            uid_comment_dict = {}
        try:
            uid_be_comment_dict = json.loads(
                be_comment_result[union_count]['_source']['uid_be_comment'])
        except:
            uid_be_comment_dict = {}
        # Union the four interaction-user sets for this seed user.
        union_result = union_dict(uid_retweet_dict, uid_be_retweet_dict,
                                  uid_comment_dict, uid_be_comment_dict)
        all_union_result_dict[iter_search_uid] = union_result
        # BUG FIX: union_count was never incremented, so every seed user
        # read the first mget result instead of its own.
        union_count += 1

    # Flatten: extract every uid appearing in any seed user's union result.
    for seeder_uid, inter_dict in all_union_result_dict.iteritems():
        for uid, inter_count in inter_dict.iteritems():
            group_uid_list.add(uid)
    group_uid_list = list(group_uid_list)
    return group_uid_list
def my_domain_classfiy(uid_list, datetime_list): domain_results = {} #将处理后的结果保存到数据库中,并在处理前查询数据库中是否已经有了相应内容之前存储的结果,以提高效率 uids = uid_list unresolved_uids = [] res = es.mget(index=fb_portrait_index_name, doc_type=fb_portrait_index_type, body={'ids': uids})['docs'] for r in res: uid = r['_id'] if r.has_key('found'): found = r['found'] if found and r['_source'].has_key('domain'): domain = r['_source']['domain'] domain_results[uid] = domain else: unresolved_uids.append(uid) else: #es表中目前无任何记录 unresolved_uids.append(uid) #未在数据库中的进行计算并存储 user_domain = {} user_domain_temp = {} if unresolved_uids: fb_flow_text_index_list = [] for datetime in datetime_list: fb_flow_text_index_list.append(flow_text_index_name_pre + datetime) user_domain_data = {} #load num of text count_result = count_text_num(unresolved_uids, fb_flow_text_index_list) #load baseinfo fb_user_query_body = { 'query': { "filtered": { "filter": { "bool": { "must": [ { "terms": { "uid": unresolved_uids } }, ] } } } }, 'size': MAX_SEARCH_SIZE, "fields": ["bio_str", "category", "uid"] } try: search_results = es.search(index=facebook_user_index_name, doc_type=facebook_user_index_type, body=fb_user_query_body)['hits']['hits'] for item in search_results: content = item['fields'] uid = content['uid'][0] if not uid in user_domain_data: text_num = count_result[uid] user_domain_data[uid] = { 'bio_str': '', 'category': '', 'number_of_text': text_num } #对于长文本,Goslate 会在标点换行等分隔处把文本分拆为若干接近 2000 字节的子文本,再一一查询,最后将翻译结果拼接后返回用户。通过这种方式,Goslate 突破了文本长度的限制。 if content.has_key('category'): category = content.get('category')[0] else: category = '' if content.has_key('bio_str'): bio_str = content.get('bio_str')[0] else: bio_str = '____' user_domain_data[uid]['bio_str'] = bio_str user_domain_data[uid]['category'] = category except Exception, e: print e #domain计算 user_domain_temp = domain_main(user_domain_data) for uid in unresolved_uids: if uid in user_domain_temp: user_domain[uid] = {'domain': user_domain_temp[uid]} else: 
user_domain_temp[uid] = 'other' user_domain[uid] = {'domain': 'other'} save_data2es(user_domain)
def my_domain_classfiy(uid_list, datetime_list): domain_results = {} #将处理后的结果保存到数据库中,并在处理前查询数据库中是否已经有了相应内容之前存储的结果,以提高效率 uids = uid_list unresolved_uids = [] res = es.mget(index=tw_portrait_index_name, doc_type=tw_portrait_index_type, body={'ids': uids})['docs'] for r in res: uid = r['_id'] if r.has_key('found'): found = r['found'] if found and r['_source'].has_key('domain'): domain = r['_source']['domain'] domain_results[uid] = domain else: unresolved_uids.append(uid) else: #es表中目前无任何记录 unresolved_uids.append(uid) #未在数据库中的进行计算并存储 user_domain = {} user_domain_temp = {} if unresolved_uids: tw_flow_text_index_list = [] for datetime in datetime_list: tw_flow_text_index_list.append(flow_text_index_name_pre + datetime) user_domain_data = {} #load num of text count_result = count_text_num(unresolved_uids, tw_flow_text_index_list) #load baseinfo tw_user_query_body = { 'query': { "filtered": { "filter": { "bool": { "must": [ { "terms": { "uid": unresolved_uids } }, ] } } } }, 'size': MAX_SEARCH_SIZE, "fields": ["location", "username", "description", "uid"] } try: search_results = es.search(index=twitter_user_index_name, doc_type=twitter_user_index_type, body=tw_user_query_body)['hits']['hits'] for item in search_results: content = item['fields'] uid = content['uid'][0] if not uid in user_domain_data: text_num = count_result[uid] user_domain_data[uid] = { 'location': '', 'username': '', 'description': '', 'number_of_text': text_num } if content.has_key('location_ch'): location = content.get('location_ch')[0] else: location = '' if content.has_key('description_ch'): description = content.get('description_ch')[0] else: description = '' if content.has_key('username'): username = content.get('username')[0] else: username = '' user_domain_data[uid]['location'] = location user_domain_data[uid]['username'] = username user_domain_data[uid]['description'] = description except Exception, e: print e #domian计算 user_domain_temp = domain_main(user_domain_data) for uid in unresolved_uids: 
if uid in user_domain_temp: user_domain[uid] = {'domain': user_domain_temp[uid]} else: user_domain_temp[uid] = 'other' user_domain[uid] = {'domain': 'other'} save_data2es(user_domain)
def get_hot_sensitive_recommend_at_user(sort_item):
    """Recommend users to @-mention from yesterday's top tweets.

    Sorts yesterday's tweets by ``sort_item`` (e.g. 'sensitive'), orders
    the distinct authors by recency, and resolves uid -> nick name via the
    twitter user index, capped at HOT_AT_RECOMMEND_USER_TOP entries.

    :param sort_item: flow-text field to rank tweets by.
    :return: dict mapping uid -> nick_name (possibly empty).
    """
    if S_TYPE == 'test':
        now_ts = datetime2ts(S_DATE_TW)
    else:
        now_ts = int(time.time())
    datetime = ts2datetime(now_ts - 24 * 3600)
    sort_item_2 = 'timestamp'  # secondary ordering among the top tweets
    index_name = twitter_flow_text_index_name_pre + datetime
    query_body = {
        'query': {
            'match_all': {}
        },
        'sort': {
            sort_item: {
                'order': 'desc'
            }
        },
        'size': HOT_EVENT_TOP_USER,
        '_source': ['uid', 'user_fansnum', 'retweeted', 'timestamp']
    }
    es_results = es.search(index=index_name,
                           doc_type=twitter_flow_text_index_type,
                           body=query_body)['hits']['hits']
    # Latest timestamp per uid (last write wins, as before).
    uid_ts_dict = dict()
    for result in es_results:
        source = result['_source']
        uid_ts_dict[source['uid']] = source[sort_item_2]
    # BUG FIX: the sorted uids were previously pushed through a set(),
    # which destroyed the timestamp ordering. Keep the order instead.
    uid_list = [item[0] for item in sorted(uid_ts_dict.items(),
                                           key=lambda x: x[1],
                                           reverse=True)]
    ## Resolve nick_name from the user index by uid
    ## (uid is stable, while nick_name may change over time).
    uid_nick_name_dict = dict()
    if uid_list:  # guard: mget with an empty ids list fails
        es_results_user = es.mget(index=twitter_user_index_name,
                                  doc_type=twitter_user_index_type,
                                  body={'ids': uid_list})['docs']
        i = 0
        for result in es_results_user:
            if result['found'] == True:
                source = result['_source']
                nick_name = source['username']
                if nick_name:
                    i += 1
                    uid_nick_name_dict[source['uid']] = nick_name
                if i >= HOT_AT_RECOMMEND_USER_TOP:
                    break
    return uid_nick_name_dict