def get_user_profile_weibo(user_list):
    """Look up basic weibo profile fields for a list of user ids.

    The ids are fetched with a single ``mget`` against the user-profile
    index.  Every document in the response produces one entry in the result,
    keyed by the document ``_id``; ids that are not found get placeholder
    values.

    Args:
        user_list: list of user ids to look up.

    Returns:
        dict mapping uid -> dict with the keys ``uid``, ``uname``,
        ``location``, ``photo_url``, ``fansnum``, ``friendsnum``,
        ``statusnum`` and ``description``.  Empty when ``user_list`` is
        empty or the lookup fails.
    """
    user_info_dict = {}
    if not user_list:
        # Nothing to look up; skip the round trip entirely.
        return user_info_dict

    try:
        user_profile_docs = es_user_profile.mget(
            index=profile_index_name, doc_type=profile_index_type,
            body={'ids': user_list})['docs']
    except Exception:
        # Best effort: a failed lookup yields an empty result.
        user_profile_docs = []

    for user_dict in user_profile_docs:
        uid = user_dict['_id']
        if user_dict['found']:
            source = user_dict['_source']
            source_dict = {
                'uid': source['uid'],
                'uname': source['nick_name'],
                'location': source['location'],
                'photo_url': source['photo_url'],
                'fansnum': source['fansnum'],
                'friendsnum': source['friendsnum'],
                'statusnum': source['statusnum'],
                'description': source['description'],
            }
        else:
            # A missing document has no ``_source``; fall back to the
            # document id and neutral placeholder values.
            source_dict = {
                'uid': uid,
                'uname': 'unknown',
                'location': 'unknown',
                'photo_url': '',
                'fansnum': 0,
                'friendsnum': 0,
                'statusnum': 0,
                'description': '',
            }
        # A fresh dict per user, so entries never alias each other.
        user_info_dict[uid] = source_dict
    return user_info_dict
def query_vary_top_k(index_name, doctype, top_k, sort_index="vary"):
    """Return the ``top_k`` documents of an index ranked by ``sort_index``.

    Runs a match-all search sorted descending on ``sort_index`` and then
    looks the hit ids up in the "user_portrait" and "weibo_user" indices.

    Returns:
        A list with one row per hit, in search order:
        ``[rank, photo_url, uid, nick_name, vary, in_portrait]``.  ``rank``
        starts at 1, ``photo_url`` and ``nick_name`` stay '' when the
        profile document is missing, ``vary`` is the hit's
        ``_source['vary']`` and ``in_portrait`` is '1' or '0'.
    """
    ranking_query = {
        "query": {"match_all": {}},
        "size": top_k,
        "sort": [{sort_index: {"order": "desc"}}],
    }
    hits = es.search(index=index_name, doc_type=doctype,
                     body=ranking_query)['hits']['hits']
    uid_list = [hit['_id'] for hit in hits]

    portrait_docs = es_portrait.mget(index="user_portrait", doc_type="user",
                                     body={"ids": uid_list},
                                     _source=True)['docs']
    profile_docs = es_profile.mget(index="weibo_user", doc_type="user",
                                   body={"ids": uid_list},
                                   _source=True)['docs']

    return_list = []
    for position, hit in enumerate(hits):
        photo_url = ''
        nick_name = ''
        profile = profile_docs[position]
        if profile['found']:
            photo_url = profile['_source'].get('photo_url', '')
            nick_name = profile['_source'].get('nick_name', '')
        uid = hit.get('_id', '')
        vary_value = hit['_source']['vary']
        in_portrait = '1' if portrait_docs[position]['found'] else '0'
        return_list.append(
            [position + 1, photo_url, uid, nick_name, vary_value, in_portrait])
    return return_list
def search_portrait_user(es, number, active_index, active_type, portrait_index, portrait_type, field="user_index"):
    """Collect the first ``number`` ranked users that exist in the portrait index.

    Pages through ``active_index`` 100 entries at a time via ``search_k``
    (ranked by ``field``) and keeps the users whose id is found in
    ``portrait_index``.

    Args:
        es: client used for the existence check and passed on to ``search_k``.
        number: how many rows to return (converted with ``int``).
        active_index, active_type: index / doc type holding the ranking.
        portrait_index, portrait_type: index / doc type of the portrait store.
        field: ranking field; when it is "vary" the user id is read from the
            ``uid`` key of each entry, otherwise from ``user``.

    Returns:
        The string "no active_index exist" when ``active_index`` is missing.
        Otherwise a list of rows
        ``[rank, photo_url, uid, nick_name, <field value>, "1"]``; it holds
        fewer than ``number`` rows when the ranking runs out first.
    """
    if not es.indices.exists(index=active_index):
        return "no active_index exist"

    wanted = int(number)
    uid_key = 'uid' if field == "vary" else 'user'
    return_list = []
    start = 0
    rank = 1
    while True:
        user_list = search_k(es, active_index, active_type, start, field, 100)
        if not user_list:
            # Ranking exhausted: return what was collected instead of
            # sending an empty id list to mget.
            break
        start += 100
        search_list = [item.get(uid_key, '0') for item in user_list]

        search_result = es_portrait.mget(index=portrait_index, doc_type=portrait_type,
                                         body={"ids": search_list}, _source=True)["docs"]
        profile_result = es_profile.mget(index="weibo_user", doc_type="user",
                                         body={"ids": search_list}, _source=True)["docs"]

        # mget answers in request order, so one position addresses the
        # portrait doc, the profile doc and the ranking entry alike.
        for index, item in enumerate(search_result):
            if not item["found"]:
                continue
            info = [rank, '', item.get('_id', ''), '', user_list[index][field], "1"]
            profile = profile_result[index]
            if profile['found']:
                info[1] = profile['_source'].get('photo_url', '')
                info[3] = profile['_source'].get('nick_name', '')
            return_list.append(info)
            rank += 1
            if len(return_list) >= wanted:
                return return_list
    return return_list
def search_portrait_user_in_activity(es, number, active_index, active_type, portrait_index, portrait_type, field="user_index"):
    """Collect ranked portrait users, optionally with their activity counters.

    Pages through ``active_index`` 100 entries at a time via ``search_k``
    (ranked by ``field``) and keeps the users whose id is found in
    ``portrait_index``.

    Args:
        es: client used for the existence check and passed on to ``search_k``.
        number: how many rows to return (converted with ``int``).
        active_index, active_type: index / doc type holding the ranking.
        portrait_index, portrait_type: index / doc type of the portrait store.
        field: ranking field; when it is "vary" the user id is read from the
            ``uid`` key of each entry, otherwise from ``user``.

    Returns:
        The string "no active_index exist" when ``active_index`` is missing.
        Otherwise a list of rows
        ``[rank, photo_url, uid, nick_name, user_index, "1"]``.  When
        ``field`` is 'origin_weibo_retweeted_brust_average' or
        'origin_weibo_comment_brust_average' each row is extended with that
        field's value followed by the values of ``key_list``.  The list
        holds fewer than ``number`` rows when the ranking runs out first.
    """
    if not es.indices.exists(index=active_index):
        return "no active_index exist"

    # NOTE(review): "origin_weibo_retweeted_brust_average" and
    # "retweeted_weibo_retweeted_brust_average" each appear twice where a
    # "*_comment_brust_average" key would be expected -- looks like a
    # copy-paste slip.  Kept unchanged because consumers may rely on the
    # current column layout; confirm before changing.
    key_list = ["origin_weibo_retweeted_total_number", "origin_weibo_retweeted_average_number", "origin_weibo_retweeted_top_number", "origin_weibo_retweeted_brust_average",
                "origin_weibo_comment_total_number", "origin_weibo_comment_average_number", "origin_weibo_comment_top_number", "origin_weibo_retweeted_brust_average",
                "retweeted_weibo_retweeted_total_number", "retweeted_weibo_retweeted_average_number", "retweeted_weibo_retweeted_top_number", "retweeted_weibo_retweeted_brust_average",
                "retweeted_weibo_comment_total_number", "retweeted_weibo_comment_average_number", "retweeted_weibo_comment_top_number", "retweeted_weibo_retweeted_brust_average"]
    burst_fields = ('origin_weibo_retweeted_brust_average',
                    'origin_weibo_comment_brust_average')

    wanted = int(number)
    uid_key = 'uid' if field == "vary" else 'user'
    with_counters = field in burst_fields
    return_list = []
    start = 0
    rank = 1
    while True:
        user_list = search_k(es, active_index, active_type, start, field, 100)
        if not user_list:
            # Ranking exhausted: return what was collected instead of
            # sending an empty id list to mget.
            break
        start += 100
        search_list = [item.get(uid_key, '0') for item in user_list]

        search_result = es_portrait.mget(index=portrait_index, doc_type=portrait_type,
                                         body={"ids": search_list}, _source=True)["docs"]
        profile_result = es_profile.mget(index="weibo_user", doc_type="user",
                                         body={"ids": search_list}, _source=True)["docs"]

        # mget answers in request order, so one position addresses the
        # portrait doc, the profile doc and the ranking entry alike.
        for index, item in enumerate(search_result):
            if not item["found"]:
                continue
            activity = user_list[index]
            info = [rank, '', item.get('_id', ''), '', activity['user_index'], "1"]
            profile = profile_result[index]
            if profile['found']:
                info[1] = profile['_source'].get('photo_url', '')
                info[3] = profile['_source'].get('nick_name', '')
            if with_counters:
                info.append(activity[field])
                info.extend(activity[key] for key in key_list)
            return_list.append(info)
            rank += 1
            if len(return_list) >= wanted:
                return return_list
    return return_list
def search_user_info(es,index_name,doc_type,uid,result_name):
    """Describe the users that interact most with ``uid``.

    Reads document ``uid`` from ``index_name``; its ``result_name`` field is
    a JSON object mapping user id -> count.  The 20 highest counts are taken,
    ``uid`` itself is dropped, and the remaining ids are looked up in the
    user-profile index and in the bci-history index.

    Returns:
        ``None`` when the document cannot be read or its source is empty.
        Otherwise a list of dicts with the keys ``uid``, ``photo_url``,
        ``count``, ``uname``, ``influence``, ``fansnum``, ``friendsnum`` and
        ``weibo_count``; the last four are '' when no bci-history entry was
        found for that position.
    """
    try:
        stored = es.get(index=index_name, doc_type=doc_type, id=uid)['_source']
    except:
        return None
    if not stored:
        return None

    retweet_dict = json.loads(stored[result_name])
    ranked_pairs = sorted(retweet_dict.items(), key=lambda pair: pair[1], reverse=True)
    uid_list = [pair[0] for pair in ranked_pairs[:20] if pair[0] != uid]

    try:
        user_result = es_user_profile.mget(index=profile_index_name, doc_type=profile_index_type,
                                           body={'ids': uid_list})['docs']
    except:
        user_result = []
    try:
        bci_history_result = es_bci_history.mget(index=bci_history_index_name,
                                                 doc_type=bci_history_index_type,
                                                 body={'ids': uid_list}, fields=fields)['docs']
    except:
        bci_history_result = []

    out_portrait_list = []
    for position, profile_doc in enumerate(user_result):
        peer_uid = profile_doc['_id']
        if profile_doc['found'] == True:
            source = profile_doc['_source']
            uname = source['nick_name']
            photo_url = source['photo_url']
            if uname == '':
                uname = u'未知'
            # The value is not used below; the lookup is kept so a profile
            # without this key still fails the same way.
            source['friendsnum']
        else:
            uname = u'未知'
            photo_url = 'unknown'

        # Influence figures come from the bci-history doc at the same position.
        try:
            bci_item = bci_history_result[position]
        except:
            bci_item = {'found': False}
        if bci_item['found'] == True:
            bci_fields = bci_item['fields']
            fansnum = bci_fields[fields[0]][0]
            user_weibo_count = bci_fields[fields[1]][0]
            user_friendsnum = bci_fields[fields[2]][0]
            influence = bci_fields[fields[3]][0]
        else:
            fansnum = user_weibo_count = user_friendsnum = influence = ''

        out_portrait_list.append({
            'uid': peer_uid,
            'photo_url': photo_url,
            'count': retweet_dict[peer_uid],
            'uname': uname,
            'influence': influence,
            'fansnum': fansnum,
            'friendsnum': user_friendsnum,
            'weibo_count': user_weibo_count,
        })
    return out_portrait_list
def search_top_index(index_name, top_k=1, index_type="bci", top=False, sort_order="user_index"):
    """Query the ``top_k`` documents of an index sorted descending by ``sort_order``.

    Args:
        index_name: index to search.
        top_k: number of hits requested.
        index_type: doc type to search.
        top: when true, only the ``sort_order`` value of the first hit is
            returned.
        sort_order: field to sort on.

    Returns:
        With ``top`` set: the first hit's ``_source[sort_order]``.
        Otherwise a list of rows starting with
        ``[rank, photo_url, uid, nick_name]``.  For "user_index" and the two
        "origin_weibo_*_brust_average" fields the row continues with
        ``value, in_portrait``; for the two "origin_weibo_*_top_number"
        fields with ``value, weibo_url, in_portrait``.  Any other
        ``sort_order`` leaves the row at four elements.
    """
    query_body = {
        "query": {"match_all": {}},
        "size": top_k,
        "sort": [{sort_order: {"order": "desc"}}],
    }
    if top:
        hits = es.search(index=index_name, doc_type=index_type, body=query_body)['hits']['hits']
        return hits[0]['_source'][sort_order]

    search_result = es.search(index=index_name, doc_type=index_type, body=query_body)['hits']['hits']
    uid_list = [hit['_id'] for hit in search_result]
    profile_result = es_profile.mget(index=profile_index_name, doc_type=profile_index_type,
                                     body={"ids": uid_list}, _source=True)['docs']
    portrait_result = es_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type,
                                       body={"ids": uid_list}, _source=True)['docs']

    value_only_fields = ("user_index", "origin_weibo_retweeted_brust_average",
                         "origin_weibo_comment_brust_average")
    # Sort field -> field holding the mid of the corresponding top weibo.
    top_mid_fields = {
        "origin_weibo_retweeted_top_number": "origin_weibo_top_retweeted_id",
        "origin_weibo_comment_top_number": "origin_weibo_top_comment_id",
    }

    result = []
    for position, hit in enumerate(search_result):
        photo_url = ''
        nick_name = ''
        profile = profile_result[position]
        if profile['found']:
            photo_url = profile['_source'].get('photo_url', '')
            nick_name = profile['_source'].get('nick_name', '')
        uid = hit.get('_id', '')
        info = [position + 1, photo_url, uid, nick_name]

        if sort_order in value_only_fields:
            info.append(hit['_source'][sort_order])
            info.append("1" if portrait_result[position]['found'] else "0")
        elif sort_order in top_mid_fields:
            info.append(hit['_source'][sort_order])
            mid = hit['_source'][top_mid_fields[sort_order]]
            info.append(weiboinfo2url(uid, mid))
            info.append("1" if portrait_result[position]['found'] else "0")
        result.append(info)
    return result
def search_tag(es, number, active_index, active_type, portrait_index, portrait_type, tag):
    """Collect ranked portrait users whose ``domain`` contains ``tag``.

    Pages through ``active_index`` 10000 entries at a time via ``search_k``
    (ranked by "user_index") and keeps users that are found in
    ``portrait_index`` with ``tag`` in their ``domain`` field.

    Returns:
        A list of rows
        ``[rank, photo_url, uid, nick_name, user_index, activeness, importance]``.
        The scan stops once ``number`` rows are collected, or after a batch
        that brings the number of examined documents above 100000.
    """
    return_list = []
    examined = 0
    offset = 0
    rank = 1
    while True:
        user_list = search_k(es, active_index, active_type, offset, "user_index", 10000)
        offset += 10000
        search_list = [entry.get('user', '0') for entry in user_list]

        search_result = es_portrait.mget(index=portrait_index, doc_type=portrait_type,
                                         body={"ids": search_list}, _source=True)["docs"]
        profile_result = es_profile.mget(index="weibo_user", doc_type="user",
                                         body={"ids": search_list}, _source=True)["docs"]

        for portrait_doc in search_result:
            examined += 1
            if portrait_doc['found'] and tag in portrait_doc['_source']['domain']:
                # Position of the first equal document in this batch.
                position = search_result.index(portrait_doc)
                matched = search_result[position]
                photo_url = ''
                nick_name = ''
                profile_doc = profile_result[position]
                if profile_doc['found']:
                    photo_url = profile_doc['_source'].get('photo_url', '')
                    nick_name = profile_doc['_source'].get('nick_name', '')
                return_list.append([
                    rank,
                    photo_url,
                    matched.get('_id', ''),
                    nick_name,
                    user_list[position]['user_index'],
                    matched['_source'].get('activeness', ''),
                    matched['_source'].get('importance', ''),
                ])
                rank += 1
            if rank >= int(number) + 1:
                return return_list
        if examined > 100000:
            return return_list
def search_max_single_field(field, index_name, doctype, top_k=3):
    """Return the ``top_k`` portrait users ranked by a top-weibo counter.

    Pages through ``index_name`` 100 entries at a time via ``search_k``
    (ranked by ``field``) and keeps the users found in the portrait index.

    Args:
        field: ranking field, e.g. "origin_weibo_retweeted_top_number" or
            "origin_weibo_comment_top_number".  When it contains
            'retweeted' the retweet counter and top-retweeted mid are
            reported, otherwise the comment counter and top-comment mid.
        index_name, doctype: index / doc type holding the ranking.
        top_k: number of rows to return (converted with ``int``).

    Returns:
        A list of rows
        ``[rank, photo_url, uid, nick_name, top_number, weibo_url, '1']``;
        fewer than ``top_k`` rows when the ranking runs out first.
    """
    if 'retweeted' in field:
        mid_key = 'origin_weibo_top_retweeted_id'
        number_key = 'origin_weibo_retweeted_top_number'
    else:
        mid_key = 'origin_weibo_top_comment_id'
        number_key = 'origin_weibo_comment_top_number'

    wanted = int(top_k)
    return_list = []
    rank = 1
    start = 0
    while True:
        user_list = search_k(es, index_name, doctype, start, field, 100)
        if not user_list:
            # Ranking exhausted: return what was collected instead of
            # sending an empty id list to mget.
            return return_list
        start += 100
        search_list = [item.get('user', '0') for item in user_list]

        search_result = es_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type,
                                         body={"ids": search_list}, _source=True)["docs"]
        profile_result = es_profile.mget(index=profile_index_name, doc_type=profile_index_type,
                                         body={"ids": search_list}, _source=True)["docs"]

        for i, portrait_doc in enumerate(search_result):
            if portrait_doc['found']:
                uid = portrait_doc.get('_id', '')
                info = [rank, '', uid, '', '', '', '1']
                if profile_result[i]['found']:
                    info[1] = profile_result[i]['_source'].get('photo_url', '')
                    info[3] = profile_result[i]['_source'].get('nick_name', '')
                info[5] = weiboinfo2url(uid, user_list[i][mid_key])
                info[4] = user_list[i][number_key]
                rank += 1
                return_list.append(info)
            if rank >= wanted + 1:
                return return_list
def search_yangshi_attention(uid, top_count):
    """List the users most present in ``uid``'s ``uid_retweet`` record.

    Reads document ``uid`` from the retweet index selected by
    ``get_db_num`` for the current time.  Its ``uid_retweet`` field is a
    JSON object mapping user id -> count; the 20 highest counts are taken,
    ``uid`` itself is dropped, and nick names are looked up in the
    user-profile index.  ``top_count`` is not used.

    Returns:
        ``None`` when the document cannot be read or its source is empty,
        otherwise a list of ``{'uid', 'count', 'uname'}`` dicts.
    """
    index_name = retweet_index_name_pre + str(get_db_num(time.time()))
    try:
        stored = es_retweet.get(index=index_name, doc_type=retweet_index_type, id=uid)['_source']
    except:
        return None
    if not stored:
        return None

    retweet_dict = json.loads(stored['uid_retweet'])
    ranked_pairs = sorted(retweet_dict.items(), key=lambda pair: pair[1], reverse=True)
    uid_list = [pair[0] for pair in ranked_pairs[:20] if pair[0] != uid]

    try:
        user_result = es_user_profile.mget(index=profile_index_name, doc_type=profile_index_type,
                                           body={'ids': uid_list})['docs']
    except:
        user_result = []

    out_portrait_list = []
    for profile_doc in user_result:
        peer_uid = profile_doc['_id']
        uname = u'未知'
        if profile_doc['found'] == True:
            nick_name = profile_doc['_source']['nick_name']
            # An empty nick name is reported the same way as a missing profile.
            if nick_name != '':
                uname = nick_name
        out_portrait_list.append({
            'uid': peer_uid,
            'count': retweet_dict[peer_uid],
            'uname': uname,
        })
    return out_portrait_list
def search_user_profile_by_user_ids(users):
    """Fetch the stored profile documents for a collection of user ids.

    Args:
        users: iterable of user ids; it is materialised into a list first.

    Returns:
        dict mapping uid -> profile ``_source`` for every id that was found.
        Ids that are not found are omitted; a failed lookup gives an empty
        dict.
    """
    id_list = list(users)
    try:
        profile_docs = es_user_profile.mget(index=profile_index_name,
                                            doc_type=profile_index_type,
                                            body={'ids': id_list})['docs']
    except:
        profile_docs = []

    return {doc['_id']: doc['_source'] for doc in profile_docs if doc['found']}
def query_vary_top_k(index_name, doctype, top_k, sort_index="vary"):
    """Return the ``top_k`` documents of an index ranked by ``sort_index``.

    Runs a match-all search sorted descending on ``sort_index`` and then
    looks the hit ids up in the "user_portrait" and "weibo_user" indices.

    Args:
        index_name: index to search.
        doctype: doc type to search.
        top_k: number of hits requested.
        sort_index: field to sort on.

    Returns:
        A list with one row per hit, in search order:
        ``[rank, photo_url, uid, nick_name, vary, in_portrait]``.  ``rank``
        starts at 1, ``photo_url`` and ``nick_name`` stay '' when the
        profile document is missing, ``vary`` is the hit's
        ``_source['vary']`` and ``in_portrait`` is '1' or '0'.  An empty
        list when the search has no hits.
    """
    query_body = {
        "query": {
            "match_all": {}
        },
        "size": top_k,
        "sort": [{sort_index: {"order": "desc"}}],
    }
    result = es.search(index=index_name, doc_type=doctype, body=query_body)['hits']['hits']
    if not result:
        # mget refuses an empty id list, so stop before asking.
        return []

    uid_list = [item['_id'] for item in result]
    portrait_result = es_portrait.mget(index="user_portrait", doc_type="user",
                                       body={"ids": uid_list}, _source=True)['docs']
    profile_result = es_profile.mget(index="weibo_user", doc_type="user",
                                     body={"ids": uid_list}, _source=True)['docs']

    return_list = []
    # mget answers in request order, so position i lines up across all
    # three result lists.
    for i, hit in enumerate(result):
        info = [i + 1, '', hit.get('_id', ''), '', hit['_source']['vary']]
        if profile_result[i]['found']:
            info[1] = profile_result[i]['_source'].get('photo_url', '')
            info[3] = profile_result[i]['_source'].get('nick_name', '')
        info.append('1' if portrait_result[i]['found'] else '0')
        return_list.append(info)
    return return_list
def portrait_user_vary(es, number, active_index, active_type, portrait_index, portrait_type, field="vary"):
    """Collect the first ``number`` ranked users that exist in the portrait index.

    Pages through ``active_index`` 100 entries at a time via ``search_k``
    (ranked by ``field``), reading the user id from the ``uid`` key of each
    entry.  The scan ends when ``number`` rows are collected, when the
    ranking runs out, or after a batch that brings the number of examined
    documents above 10000.

    Args:
        es: client used for the existence check and passed on to ``search_k``.
        number: how many rows to return (converted with ``int``).
        active_index, active_type: index / doc type holding the ranking.
        portrait_index, portrait_type: accepted but not used; the lookups go
            to the fixed "user_portrait" / "user" index and doc type.
        field: ranking field handed to ``search_k``.

    Returns:
        The string "no active_index exist" when ``active_index`` is missing.
        Otherwise a list of rows
        ``[rank, photo_url, uid, nick_name, vary, '1']``.  A
        ``RequestError`` during the scan prints "timeout" and returns the
        rows collected so far.
    """
    if not es.indices.exists(index=active_index):
        return "no active_index exist"

    wanted = int(number)
    return_list = []
    examined = 0
    start = 0
    rank = 1
    try:
        while True:
            user_list = search_k(es, active_index, active_type, start, field, 100)
            if not user_list:
                # Ranking exhausted: stop here rather than provoke a
                # request error with an empty id list.
                break
            start += 100
            search_list = [item.get('uid', '0') for item in user_list]

            search_result = es_portrait.mget(index="user_portrait", doc_type="user",
                                             body={"ids": search_list}, _source=True)["docs"]
            profile_result = es_profile.mget(index="weibo_user", doc_type="user",
                                             body={"ids": search_list}, _source=True)["docs"]

            # mget answers in request order, so one position addresses the
            # portrait doc, the profile doc and the ranking entry alike.
            for index, item in enumerate(search_result):
                examined += 1
                if not item["found"]:
                    continue
                info = [rank, '', item.get('_id', ''), '', user_list[index]['vary'], '1']
                if profile_result[index]['found']:
                    info[1] = profile_result[index]['_source'].get('photo_url', '')
                    info[3] = profile_result[index]['_source'].get('nick_name', '')
                return_list.append(info)
                rank += 1
                if rank == wanted + 1:
                    return return_list
            if examined > 10000:
                break
    except RequestError:
        print("timeout")
    return return_list
def get_user_url(uid_list):
    """Return ``[photo_url, nick_name, uid]`` for each requested user id.

    The ids are looked up in the profile index.  Users that are not found
    are reported as ``["unknown", "unknown", uid]``; a failed lookup gives
    an empty list.
    """
    try:
        profile_docs = es_profile.mget(index=profile_index_name, doc_type=profile_index_type,
                                       body={"ids": uid_list})['docs']
    except:
        profile_docs = {}

    results = []
    for doc in profile_docs:
        if doc['found']:
            row = [doc['_source']["photo_url"], doc['_source']['nick_name'], doc['_id']]
        else:
            row = ["unknown", "unknown", doc['_id']]
        results.append(row)
    return results
def search_tag(es, number, active_index, active_type, portrait_index, portrait_type, tag):
    """Collect ranked portrait users whose ``domain`` contains ``tag``.

    Pages through ``active_index`` 10000 entries at a time via ``search_k``
    (ranked by "user_index") and keeps users that are found in
    ``portrait_index`` with ``tag`` in their ``domain`` field.

    Args:
        es: client passed on to ``search_k``.
        number: how many rows to return (converted with ``int``).
        active_index, active_type: index / doc type holding the ranking.
        portrait_index, portrait_type: index / doc type of the portrait store.
        tag: value tested with ``in`` against each portrait's ``domain``.

    Returns:
        A list of rows
        ``[rank, photo_url, uid, nick_name, user_index, activeness, importance]``.
        The scan stops once ``number`` rows are collected, when the ranking
        runs out, or after a batch that brings the number of examined
        documents above 100000.
    """
    wanted = int(number)
    return_list = []
    count_s = 0
    start = 0
    rank = 1
    while True:
        user_list = search_k(es, active_index, active_type, start, "user_index", 10000)
        if not user_list:
            # Ranking exhausted: return what was collected instead of
            # sending an empty id list to mget.
            return return_list
        start += 10000
        search_list = [item.get('user', '0') for item in user_list]

        search_result = es_portrait.mget(index=portrait_index, doc_type=portrait_type,
                                         body={"ids": search_list}, _source=True)["docs"]
        profile_result = es_profile.mget(index="weibo_user", doc_type="user",
                                         body={"ids": search_list}, _source=True)["docs"]

        # enumerate gives the position directly; a list.index() scan per
        # match would be quadratic on 10000-document batches.
        for index, item in enumerate(search_result):
            count_s += 1
            if item['found'] and tag in item['_source']['domain']:
                info = [
                    rank,
                    '',
                    item.get('_id', ''),
                    '',
                    user_list[index]['user_index'],
                    item['_source'].get('activeness', ''),
                    item['_source'].get('importance', ''),
                ]
                if profile_result[index]['found']:
                    info[1] = profile_result[index]['_source'].get('photo_url', '')
                    info[3] = profile_result[index]['_source'].get('nick_name', '')
                rank += 1
                return_list.append(info)
            if rank >= wanted + 1:
                return return_list
        if count_s > 100000:
            return return_list
def search_yangshi_follower(uid, top_count):
    """List the users most present in ``uid``'s ``uid_be_retweet`` record.

    Reads document ``uid`` from the be-retweet index selected by
    ``get_db_num`` for the current time.  Its ``uid_be_retweet`` field is a
    JSON object mapping user id -> count; the 20 highest counts are taken,
    ``uid`` itself is dropped, and nick names are looked up in the
    user-profile index.  ``top_count`` is not used.

    Returns:
        ``None`` when the document cannot be read or its source is empty,
        otherwise a list of ``{'uid', 'count', 'uname'}`` dicts.
    """
    current_db = get_db_num(time.time())
    index_name = be_retweet_index_name_pre + str(current_db)
    try:
        record = es_retweet.get(index=index_name, doc_type=be_retweet_index_type, id=uid)['_source']
    except:
        return None
    if not record:
        return None

    be_retweet_counts = json.loads(record['uid_be_retweet'])
    top_pairs = sorted(be_retweet_counts.items(), key=lambda kv: kv[1], reverse=True)[:20]
    uid_list = [other_uid for other_uid, _ in top_pairs if other_uid != uid]

    try:
        user_result = es_user_profile.mget(index=profile_index_name, doc_type=profile_index_type,
                                           body={'ids': uid_list})['docs']
    except:
        user_result = []

    out_portrait_list = []
    for user_doc in user_result:
        other_uid = user_doc['_id']
        if user_doc['found'] == True:
            uname = user_doc['_source']['nick_name']
            if uname == '':
                uname = u'未知'
        else:
            uname = u'未知'
        out_portrait_list.append(
            {'uid': other_uid, 'count': be_retweet_counts[other_uid], 'uname': uname})
    return out_portrait_list
def get_sensitive_text_detail(task_name, ts, user, order):
    """Return the sensitive weibos recorded for one sensing-task time slot.

    Loads the task document for ``ts`` (doc type ``user + '-' + task_name``),
    takes the mids from its ``sensitive_weibo_detail`` field, fetches their
    texts from the flow-text indices of the day of ``ts`` and the day
    before, and attaches the authors' nick name and photo.

    Args:
        task_name, user: combined into the task's doc type.
        ts: document id of the time slot; also used to pick the indices.
        order: "total", "retweeted" or "comment" sort the weibos by that
            counter (descending) before the rows are built; "ts" and
            "sensitive" sort the finished rows by date string or sensitive
            value (descending); anything else keeps the stored order.

    Returns:
        A list of rows ``[uid, nick_name, photo_url, text, sentiment, date,
        geo, message_type, retweeted, comment, sensitive]``.  Mids whose
        text was not returned by the search are skipped.
    """
    doc_type = user + '-' + task_name
    task_detail = es.get(index=index_sensing_task, doc_type=doc_type, id=ts)['_source']
    weibo_detail = json.loads(task_detail['sensitive_weibo_detail'])

    # One [mid, total, retweeted, comment] entry per recorded weibo; the
    # total is stored under the mid itself.
    weibo_detail_list = []
    if weibo_detail:
        for iter_mid, counters in weibo_detail.items():
            weibo_detail_list.append(
                [iter_mid, counters[iter_mid], counters['retweeted'], counters['comment']])
    mid_list = list(weibo_detail.keys())

    query_body = {"query": {"filtered": {"filter": {"terms": {"mid": mid_list}}}}}

    # Search whichever of the two daily indices (day of ts, day before) exist.
    day_names = [ts2datetime(ts), ts2datetime(ts - DAY)]
    index_list = []
    for day_name in day_names:
        candidate = flow_text_index_name_pre + day_name
        if es_text.indices.exists(candidate):
            index_list.append(candidate)

    if index_list and mid_list:
        search_results = es_text.search(index=index_list, doc_type=flow_text_index_type,
                                        body=query_body)["hits"]["hits"]
    else:
        search_results = []

    text_dict = dict()      # mid -> weibo source
    portrait_dict = dict()  # uid -> nick name / photo
    if search_results:
        uid_list = []
        for hit in search_results:
            uid_list.append(hit["_source"]['uid'])
            text_dict[hit['_id']] = hit['_source']  # _id is the mid
        if uid_list:
            portrait_result = es_profile.mget(index=profile_index_name,
                                              doc_type=profile_index_type,
                                              body={"ids": uid_list},
                                              fields=['nick_name', 'photo_url'])["docs"]
            for doc in portrait_result:
                if doc['found']:
                    portrait_dict[doc['_id']] = {
                        "nick_name": doc["fields"]["nick_name"][0],
                        "photo_url": doc["fields"]["photo_url"][0],
                    }
                else:
                    portrait_dict[doc['_id']] = {"nick_name": doc['_id'], "photo_url": ""}

    counter_column = {"total": 1, "retweeted": 2, "comment": 3}.get(order)
    if counter_column is None:
        sorted_list = weibo_detail_list
    else:
        sorted_list = sorted(weibo_detail_list,
                             key=lambda entry: entry[counter_column], reverse=True)

    results = []
    for entry in sorted_list:
        iter_text = text_dict.get(entry[0], {})
        if not iter_text:
            continue
        uid = iter_text['uid']
        iter_portrait = portrait_dict.get(uid, {})
        if iter_portrait:
            identity = [iter_portrait['nick_name'], iter_portrait['photo_url']]
        else:
            identity = [uid, '']
        results.append([uid] + identity + [
            iter_text["text"],
            iter_text["sentiment"],
            ts2date(iter_text['timestamp']),
            iter_text['geo'],
            iter_text['message_type'],
            entry[2],
            entry[3],
            iter_text.get('sensitive', 0),
        ])

    if results and order == "ts":
        results = sorted(results, key=lambda row: row[5], reverse=True)
    if results and order == "sensitive":
        results = sorted(results, key=lambda row: row[-1], reverse=True)
    return results
def get_positive_weibo_detail(ts, social_sensors, keywords_list, size, sentiment_type=1):
    """Return up to 100 recent weibos of one sentiment class for a sensing task.

    Gathers the task's mids with ``query_mid_list`` for the previous and the
    current window, then searches one daily flow-text index for weibos whose
    ``root_mid`` / ``mid`` is among them or whose ``keywords_string`` matches
    ``keywords_list``, newest first.  ``size`` is not used.

    Args:
        ts: end of the window; the window starts at ``ts - time_interval``.
        social_sensors, keywords_list: passed to ``query_mid_list``;
            ``keywords_list`` is also used in the filter and for the
            common-keyword column.
        sentiment_type: 0 or 1 adds an exact ``sentiment`` term to the
            filter; any other value filters on sentiments "2" and "3".

    Returns:
        A list of rows ``[uid, nick_name, photo_url, text, sentiment, date,
        geo, common_keywords, message_type]``; empty when the selected index
        does not exist or nothing matches.
    """
    # mids from the previous window, then from the current one
    former_mid_list = query_mid_list(ts - time_interval, keywords_list, time_segment, social_sensors)
    current_mid_list = query_mid_list(ts, keywords_list, time_interval, social_sensors)
    mid_list = list(former_mid_list) + list(current_mid_list)

    bool_filter = {
        "must": [
            {"range": {"timestamp": {"gte": ts - time_interval, "lt": ts}}},
        ],
        "should": [
            {"terms": {"root_mid": mid_list}},
            {"terms": {"mid": mid_list}},
            {"terms": {"keywords_string": keywords_list}},
        ],
    }
    if int(sentiment_type) in (0, 1):
        bool_filter["must"].append({"term": {"sentiment": sentiment_type}})
    else:
        # NOTE(review): this replaces the whole "must" list, so the time
        # range above no longer applies in this branch -- confirm intended.
        bool_filter["must"] = [{"terms": {"sentiment": ["2", "3"]}}]

    query_body = {
        "query": {"filtered": {"filter": {"bool": bool_filter}}},
        "sort": {"timestamp": {"order": "desc"}},
        "size": 100,
    }

    # Check whether ts and ts - time_interval fall on the same day to decide
    # which daily index to query.
    day_end = ts2datetime(ts)
    day_start = ts2datetime(ts - time_interval)
    index_end = flow_text_index_name_pre + day_end
    index_end_exists = es_text.indices.exists(index_end)
    index_start = flow_text_index_name_pre + day_start
    index_start_exists = es_text.indices.exists(index_start)

    target_index = None
    if day_end == day_start:
        if index_end_exists:
            target_index = index_end
    elif index_start_exists:
        target_index = index_start

    if target_index is None:
        search_results = []
    else:
        search_results = es_text.search(index=target_index, doc_type=flow_text_index_type,
                                        body=query_body)["hits"]["hits"]

    results = []
    uid_list = [hit["_source"]['uid'] for hit in search_results]
    if uid_list:
        portrait_result = es_profile.mget(index=profile_index_name,
                                          doc_type=profile_index_type,
                                          body={"ids": uid_list},
                                          fields=['nick_name', 'photo_url'])["docs"]
        for position, hit in enumerate(search_results):
            weibo = hit['_source']
            profile = portrait_result[position]
            if profile['found']:
                nick_name = profile["fields"]["nick_name"][0]
                photo_url = profile["fields"]["photo_url"][0]
            else:
                nick_name = "unknown"
                photo_url = ""
            row = [
                weibo['uid'],
                nick_name,
                photo_url,
                weibo["text"],
                weibo["sentiment"],
                ts2date(weibo['timestamp']),
                weibo['geo'],
            ]
            keywords_set = set(weibo['keywords_string'].split('&'))
            common_keywords = set(keywords_list) & keywords_set
            row.append(list(common_keywords))
            row.append(weibo['message_type'])
            results.append(row)
    return results
def get_origin_weibo_detail(ts, user, task_name, size, order, message_type=1):
    """Return the weibos recorded for one sensing-task time slot, grouped by duplicates.

    Loads the task document for ``ts`` (doc type ``user + '-' + task_name``),
    picks the weibo set selected by ``message_type``, fetches the texts from
    the flow-text indices of the day of ``ts`` and the day before, builds one
    row per weibo and merges rows that ``duplicate_dict`` marks as
    duplicates of each other.

    Args:
        ts: document id of the time slot; also used to pick the indices.
        user, task_name: combined into the task's doc type.
        size: not used.
        order: "total", "retweeted" or "comment" keep only the 10 weibos
            with the highest value of that counter; anything else keeps all.
        message_type: 1 reads ``origin_weibo_detail``, 2 reads
            ``retweeted_weibo_detail``, anything else reads
            ``sensitive_weibo_detail``.

    Returns:
        A list of groups.  Each group is a list of rows ``[uid, nick_name,
        photo_url, text, sentiment, date, geo, message_type, keywords,
        retweeted, comment, sensitive, timestamp, mid_topic_value, mid]``.
        Groups are ordered by (sensitive, mid_topic_value, retweeted)
        descending; duplicates are appended to the best-ranked group of
        their duplicate set.
    """
    doc_type = user + '-' + task_name
    task_detail = es_user_portrait.get(index=index_sensing_task, doc_type=doc_type, id=ts)['_source']
    mid_value = json.loads(task_detail['mid_topic_value'])
    duplicate_dict = json.loads(task_detail['duplicate_dict'])

    # Group duplicates: each value of duplicate_dict keys the list of mids
    # that share it (the value itself is included in the list).
    tmp_duplicate_dict = dict()
    for dup_mid, root_mid in duplicate_dict.items():
        if root_mid in tmp_duplicate_dict:
            tmp_duplicate_dict[root_mid].append(dup_mid)
        else:
            tmp_duplicate_dict[root_mid] = [dup_mid, root_mid]

    if message_type == 1:
        detail_key = 'origin_weibo_detail'
    elif message_type == 2:
        detail_key = 'retweeted_weibo_detail'
    else:
        detail_key = 'sensitive_weibo_detail'
    weibo_detail = json.loads(task_detail[detail_key])

    # One [mid, total, retweeted, comment] entry per recorded weibo; the
    # total is stored under the mid itself.
    weibo_detail_list = []
    if weibo_detail:
        for iter_mid, counters in weibo_detail.items():
            weibo_detail_list.append(
                [iter_mid, counters[iter_mid], counters['retweeted'], counters['comment']])
    mid_list = list(weibo_detail) if weibo_detail else []

    query_body = {
        "query": {"filtered": {"filter": {"terms": {"mid": mid_list}}}},
        "size": 1000,
        "sort": {"timestamp": {"order": "desc"}},
    }

    # Search whichever of the two daily indices (day of ts, day before) exist.
    day_names = [ts2datetime(ts), ts2datetime(ts - DAY)]
    index_list = []
    for day_name in day_names:
        candidate = flow_text_index_name_pre + day_name
        if es_text.indices.exists(candidate):
            index_list.append(candidate)

    if index_list and mid_list:
        search_results = es_text.search(index=index_list, doc_type=flow_text_index_type,
                                        body=query_body)["hits"]["hits"]
    else:
        search_results = []

    text_dict = dict()      # mid -> weibo source
    portrait_dict = dict()  # uid -> nick name / photo
    uid_list = []
    for hit in search_results:
        uid_list.append(hit["_source"]['uid'])
        text_dict[hit['_id']] = hit['_source']  # _id is the mid
    if uid_list:
        portrait_result = es_profile.mget(index=profile_index_name,
                                          doc_type=profile_index_type,
                                          body={"ids": uid_list},
                                          fields=['nick_name', 'photo_url'])["docs"]
        for doc in portrait_result:
            if doc['found']:
                portrait_dict[doc['_id']] = {
                    "nick_name": doc["fields"]["nick_name"][0],
                    "photo_url": doc["fields"]["photo_url"][0],
                }
            else:
                portrait_dict[doc['_id']] = {"nick_name": doc['_id'], "photo_url": ""}

    counter_column = {"total": 1, "retweeted": 2, "comment": 3}.get(order)
    if counter_column is None:
        sorted_list = weibo_detail_list
    else:
        sorted_list = sorted(weibo_detail_list,
                             key=lambda entry: entry[counter_column],
                             reverse=True)[:10]

    results = []
    for entry in sorted_list:
        mid = entry[0]
        iter_text = text_dict.get(mid, {})
        if not iter_text:
            # The search did not return this weibo's text; nothing to show.
            continue
        uid = iter_text['uid']
        temp = [uid]
        iter_portrait = portrait_dict.get(uid, {})
        if iter_portrait:
            temp.append(iter_portrait['nick_name'])
            temp.append(iter_portrait['photo_url'])
        else:
            temp.extend([uid, ''])
        temp.append(iter_text["text"])
        temp.append(iter_text["sentiment"])
        temp.append(ts2date(iter_text['timestamp']))
        temp.append(iter_text['geo'])
        if message_type == 1:
            temp.append(1)
        elif message_type == 2:
            temp.append(3)
        else:
            temp.append(iter_text['message_type'])
        # keywords extracted from the text, highest weight first
        f_key = get_weibo_single(iter_text['text'])
        temp.append(sorted(f_key.items(), key=lambda pair: pair[1], reverse=True))
        temp.append(entry[2])
        temp.append(entry[3])
        temp.append(iter_text.get('sensitive', 0))
        temp.append(iter_text['timestamp'])
        temp.append(mid_value[mid])
        temp.append(mid)
        results.append(temp)

    # Sort by (sensitive, mid_topic_value, retweeted), all descending.
    results = sorted(results, key=operator.itemgetter(-4, -2, -6), reverse=True)

    sort_results = [[row] for row in results]
    mid_index_dict = dict((row[-1], position) for position, row in enumerate(results))

    absorbed = set()
    for dup_mids in tmp_duplicate_dict.values():
        # Positions of this duplicate set's members that made it into the
        # results.  Membership is tested with ``in`` so that position 0 (the
        # top-ranked row) counts, and each position is kept only once.
        positions = []
        for dup_mid in dup_mids:
            if dup_mid in mid_index_dict:
                position = mid_index_dict[dup_mid]
                if position not in positions:
                    positions.append(position)
        if len(positions) < 2:
            continue
        keeper = min(positions)
        for position in positions:
            if position == keeper:
                continue
            sort_results[keeper].extend(sort_results[position])
            absorbed.add(position)

    # Drop absorbed groups by position, never by list equality.
    return [group for position, group in enumerate(sort_results)
            if position not in absorbed]
def get_task_detail_2(task_name, ts, user):
    """Assemble the detail payload of one sensing task.

    The task document is read from ``index_manage_sensing_task`` under the id
    ``user + "-" + task_name``.  Its ``social_sensors`` and ``history_status``
    JSON fields drive all further lookups.

    Args:
        task_name: task name; combined with ``user`` to form the document id.
        ts: timestamp (anything ``int()`` accepts).  It selects the previous
            day's ``bci_`` index and is passed to ``get_top_all_influence``.
        user: owner of the task.

    Returns:
        dict with the keys

        * ``social_sensors_detail`` - one row per sensor found in the
          portrait index, values in ``SOCIAL_SENSOR_INFO`` order, sorted by
          column 5 descending.
        * ``important_user_detail`` - rows ``[uid, uname, photo_url, domain,
          topics, importance, influence, activeness, is_sensor]`` sorted by
          influence (column 6) descending.
        * ``out_portrait_user_detail`` - rows ``[uid, nick_name, location,
          fansnum, influence]`` sorted by influence (column 4) descending.
        * ``time_series`` - ``history_status`` entries formatted with
          ``ts2date_min``.
        * ``origin_weibo_list`` / ``retweeted_weibo_list`` /
          ``all_weibo_list`` - per-timestamp counts taken from the flow
          documents.

    Raises:
        KeyError: if a fetched document lacks one of the fields read below
            (for example a flow document that was not found).
    """

    def scaled(value, top):
        # Log-scale a raw score against the current maximum: 0 for a raw
        # score of 0, 100 when the raw score equals ``top``.
        return math.log(value / float(top) * 9 + 1, 10) * 100

    results = dict()
    _id = user + "-" + task_name
    task_detail = es.get(index=index_manage_sensing_task,
                         doc_type=task_doc_type, id=_id)["_source"]
    social_sensors = json.loads(task_detail['social_sensors'])
    history_status = json.loads(task_detail['history_status'])

    top_influence = get_top_influence("influence")
    top_activeness = get_top_influence("activeness")
    top_importance = get_top_influence("importance")
    top_by_field = {
        "activeness": top_activeness,
        "importance": top_importance,
        "influence": top_influence,
    }

    # Portrait rows for the task's social sensors.
    portrait_detail = []
    if social_sensors:
        search_results = es.mget(index=portrait_index_name,
                                 doc_type=portrait_index_type,
                                 body={"ids": social_sensors},
                                 fields=SOCIAL_SENSOR_INFO)['docs']
        for item in search_results:
            if not item['found']:
                continue
            temp = []
            for iter_item in SOCIAL_SENSOR_INFO:
                value = item["fields"][iter_item][0]
                if iter_item == "topic_string":
                    temp.append(value.split('&'))
                elif iter_item in top_by_field:
                    temp.append(scaled(value, top_by_field[iter_item]))
                else:
                    temp.append(value)
            portrait_detail.append(temp)
        portrait_detail = sorted(portrait_detail, key=lambda x: x[5],
                                 reverse=True)

    all_weibo_list = []
    origin_weibo_list = []
    retweeted_weibo_list = []
    important_user_set = set()   # users flagged as important
    out_portrait_users = set()   # unfiltered users that are not important

    ts = int(ts)
    time_series = history_status

    # Per-timestamp flow documents of this task.
    if time_series:
        flow_detail = es.mget(index=index_sensing_task, doc_type=_id,
                              body={"ids": time_series})['docs']
    else:
        flow_detail = []
    for flow_doc in flow_detail:
        item = flow_doc['_source']
        origin_number = item["origin_weibo_number"]
        retweeted_number = item['retweeted_weibo_number']
        origin_weibo_list.append(origin_number)
        retweeted_weibo_list.append(retweeted_number)
        all_weibo_list.append(origin_number + retweeted_number)
        temp_important_user_list = json.loads(item['important_users'])
        unfiltered_users = json.loads(item['unfilter_users'])
        important_user_set |= set(temp_important_user_list)
        out_portrait_users |= (set(unfiltered_users)
                               - set(temp_important_user_list))

    important_uid_list = list(important_user_set)
    out_portrait_users_list = list(out_portrait_users)
    social_sensor_set = set(social_sensors)

    # Profile information of the important users.
    user_detail_info = []
    if important_uid_list:
        user_results = es.mget(
            index=portrait_index_name, doc_type=portrait_index_type,
            body={"ids": important_uid_list},
            fields=['uid', 'uname', 'domain', 'topic_string', "photo_url",
                    'importance', 'influence', 'activeness'])['docs']
        for item in user_results:
            if not item['found']:
                continue
            doc_fields = item['fields']
            uid = doc_fields['uid'][0]
            uname = doc_fields['uname'][0]
            if not uname or uname == "未知":
                uname = uid  # fall back to the uid when no usable name exists
            user_detail_info.append([
                uid,
                uname,
                doc_fields['photo_url'][0],
                doc_fields['domain'][0],
                doc_fields['topic_string'][0].split('&'),
                scaled(doc_fields['importance'][0], top_importance),
                scaled(doc_fields['influence'][0], top_influence),
                scaled(doc_fields['activeness'][0], top_activeness),
                1 if uid in social_sensor_set else 0,
            ])
    user_detail_info = sorted(user_detail_info, key=lambda x: x[6],
                              reverse=True)

    # Users outside the important set: profile, fans number and influence.
    # Always initialised so the result key below can never hit a NameError.
    out_user_detail_info = []
    if out_portrait_users_list:
        profile_results = es_profile.mget(
            index=profile_index_name, doc_type=profile_index_type,
            body={"ids": out_portrait_users_list})["docs"]
        bci_index = "bci_" + ts2datetime(ts - DAY).replace('-', '')
        influence_results = es.mget(
            index=bci_index, doc_type="bci",
            body={"ids": out_portrait_users_list},
            fields=["user_index"])['docs']
        bci_results = es_profile.mget(
            index="bci_history", doc_type="bci",
            body={"ids": out_portrait_users_list},
            fields=['user_fansnum'])['docs']
        top_user_index = get_top_all_influence("user_index", ts)
        for position, item in enumerate(profile_results):
            if item['found']:
                source = item['_source']
                temp = [source['uid'],
                        source['nick_name'] or source['uid'],
                        source['user_location']]
            else:
                temp = [item['_id'], item['_id'], '']
            try:
                user_fansnum = bci_results[position]["fields"]["user_fansnum"][0]
            except (KeyError, IndexError, TypeError):
                # Document missing or without the field: report 0 fans.
                user_fansnum = 0
            temp.append(user_fansnum)
            temp_influ = influence_results[position]
            if temp_influ.get('found', 0):
                user_index = temp_influ['fields']['user_index'][0]
                temp.append(scaled(user_index, top_user_index))
            else:
                temp.append(0)
            out_user_detail_info.append(temp)
    if out_user_detail_info:
        out_user_detail_info = sorted(out_user_detail_info,
                                      key=lambda x: x[4], reverse=True)

    results['important_user_detail'] = user_detail_info
    results['out_portrait_user_detail'] = out_user_detail_info
    results['time_series'] = [ts2date_min(item) for item in time_series]
    results['all_weibo_list'] = all_weibo_list
    results['origin_weibo_list'] = origin_weibo_list
    results['retweeted_weibo_list'] = retweeted_weibo_list
    results['social_sensors_detail'] = portrait_detail
    return results
def get_temporal_rank(task_type, sort="retweeted", number=100):
    """Return the top users of a Redis influence ranking with profile data.

    Args:
        task_type: ranking window selector (anything ``int()`` accepts).
            ``0`` reads ``influence_<sort>``, ``1``-``3`` read
            ``influence_<sort>_<n>`` and every other value reads
            ``influence_<sort>_4``.
        sort: metric the ranking is ordered by.  The counterpart metric is
            ``"comment"`` when ``sort`` is ``"retweeted"`` and
            ``"retweeted"`` otherwise.
        number: how many users to return.

    Returns:
        list of rows ``[uid, nick_name, weibo_count, location, fansnum,
        retweeted_count, comment_count, in_portrait]``.  ``in_portrait`` is
        ``"1"`` when the uid exists in the portrait index and ``"0"``
        otherwise.  Values from ``bci_history`` override the profile's weibo
        count and fans number when present.
    """
    number = int(number) - 1  # zrange's end index is inclusive
    window = int(task_type)
    if window == 0:
        rank_key = "influence_%s" % sort
    elif window in (1, 2, 3):
        rank_key = "influence_%s_%s" % (sort, window)
    else:
        rank_key = "influence_%s_4" % sort
    sort_list = r.zrange(rank_key, 0, number, withscores=True, desc=True)
    uid_list = [member for member, _score in sort_list]

    other = "comment" if sort == "retweeted" else "retweeted"
    # Key of the counterpart ranking; it is constant for the whole loop.
    if window == 0:
        other_key = "influence_%s" % other
    else:
        other_key = "influence_%s_%s" % (other, task_type)

    results = []
    if not uid_list:
        return results

    # Background profile information.
    profile_result = es_user_profile.mget(index=profile_index_name,
                                          doc_type=profile_index_type,
                                          body={"ids": uid_list})["docs"]
    bci_result = es_user_profile.mget(
        index="bci_history", doc_type="bci", body={"ids": uid_list},
        _source=False, fields=['user_fansnum', "weibo_month_sum"])["docs"]

    # mget returns the documents in request order, so the running index
    # addresses the matching entry of sort_list and bci_result directly.
    for index, item in enumerate(profile_result):
        _id = item['_id']
        tmp = [_id]
        if item['found']:
            source = item['_source']
            tmp.extend([source['nick_name'], source['statusnum'],
                        source['user_location'], source['fansnum']])
        else:
            tmp.extend(['', 0, '', 0])
        try:
            tmp[4] = bci_result[index]['fields']['user_fansnum'][0]
        except (KeyError, IndexError, TypeError):
            pass  # keep the profile value
        try:
            tmp[2] = bci_result[index]['fields']["weibo_month_sum"][0]
        except (KeyError, IndexError, TypeError):
            pass  # keep the profile value

        count_1 = int(sort_list[index][1])
        tmp_count = r.zscore(other_key, _id)
        count_2 = int(tmp_count) if tmp_count else 0
        if sort == "retweeted":
            tmp.extend([count_1, count_2])
        else:
            tmp.extend([count_2, count_1])
        results.append(tmp)

    portrait_result = es_user_portrait.mget(index=portrait_index_name,
                                            doc_type=portrait_index_type,
                                            body={"ids": uid_list})["docs"]
    for row, item in zip(results, portrait_result):
        row.append("1" if item['found'] else "0")
    return results
def get_retweet_weibo_detail(ts, user, task_name, size, text_type, type_value):
    """List weibo whose ``root_mid`` belongs to the task's tracked weibo.

    The tracked mids are the keys of ``origin_weibo_detail`` and
    ``retweeted_weibo_detail`` stored in the task document for ``ts``.
    Matching weibo are searched in the window
    ``[ts - time_interval, ts)``, newest first, at most 100 hits.

    NOTE: ``size`` is accepted but not used; the query size is fixed to 100.
    NOTE: this module defines ``get_retweet_weibo_detail`` a second time
    further down; the later definition is the one in effect after import.

    Args:
        ts: end of the time window; also the id of the task document.
        user, task_name: joined as ``user + '-' + task_name`` to form the
            doc type.
        size: unused.
        text_type: ``"message_type"`` or ``"sentiment"`` to add a filter on
            that field; any other value adds none.
        type_value: value(s) for that filter.  For ``"sentiment"`` a
            ``term`` filter is used when ``len(type_value) == 1`` and a
            ``terms`` filter otherwise.

    Returns:
        list of rows ``[uid, nick_name, photo_url, text, sentiment, date,
        geo, message_type]``.  Nick name falls back to the uid and photo url
        to ``""`` when the profile is not found.
    """
    _id = user + '-' + task_name
    task_detail = es_user_portrait.get(index=index_sensing_task, doc_type=_id, id=ts)['_source']
    origin_weibo_detail = json.loads(task_detail['origin_weibo_detail'])
    retweeted_weibo_detail = json.loads(task_detail['retweeted_weibo_detail'])
    mid_list = list(origin_weibo_detail.keys()) + list(retweeted_weibo_detail.keys())

    must_clauses = [
        {"range": {"timestamp": {"gte": ts - time_interval, "lt": ts}}},
        {"terms": {"root_mid": mid_list}},
    ]
    if text_type == "message_type":
        must_clauses.append({"term": {text_type: type_value}})
    elif text_type == "sentiment":
        clause_kind = "term" if len(type_value) == 1 else "terms"
        must_clauses.append({clause_kind: {text_type: type_value}})

    query_body = {
        "query": {"filtered": {"filter": {"bool": {"must": must_clauses}}}},
        "sort": {"timestamp": {"order": "desc"}},
        "size": 100,
    }

    end_day = ts2datetime(ts)
    start_day = ts2datetime(ts - time_interval)
    end_index = flow_text_index_name_pre + end_day
    end_index_exists = es_text.indices.exists(end_index)
    start_index = flow_text_index_name_pre + start_day
    start_index_exists = es_text.indices.exists(start_index)

    # 1. Search the weibo.  When the window crosses midnight only the index
    # of the window's first day is searched.
    if end_day == start_day:
        target_index = end_index if end_index_exists else None
    else:
        target_index = start_index if start_index_exists else None
    if target_index is None:
        search_results = []
    else:
        search_results = es_text.search(index=target_index, doc_type=flow_text_index_type,
                                        body=query_body)["hits"]["hits"]

    # 2. Attach profile information to every hit.
    results = []
    uid_list = [hit["_source"]['uid'] for hit in search_results]
    if not uid_list:
        return results
    portrait_result = es_profile.mget(index=profile_index_name, doc_type=profile_index_type,
                                      body={"ids": uid_list},
                                      fields=['nick_name', 'photo_url'])["docs"]
    for position, hit in enumerate(search_results):
        weibo = hit['_source']
        profile = portrait_result[position]
        row = [weibo['uid']]
        if profile['found']:
            row.append(profile["fields"]["nick_name"][0])
            row.append(profile["fields"]["photo_url"][0])
        else:
            row.extend([weibo['uid'], ""])
        row.append(weibo["text"])
        row.append(weibo["sentiment"])
        row.append(ts2date(weibo['timestamp']))
        row.append(weibo['geo'])
        row.append(weibo["message_type"])
        results.append(row)
    return results
def get_retweet_weibo_detail(ts, user, task_name, size, text_type, type_value):
    """Return the weibo that stem from the weibo tracked by a sensing task.

    The task document for ``ts`` provides the tracked mids (keys of its
    ``origin_weibo_detail`` and ``retweeted_weibo_detail`` JSON fields).
    The flow-text index is then queried for weibo whose ``root_mid`` is one
    of them and whose timestamp lies in ``[ts - time_interval, ts)``.  Hits
    are ordered newest first and capped at 100.

    NOTE: the ``size`` argument is not used by this function.

    Args:
        ts: window end and id of the task document.
        user, task_name: ``user + '-' + task_name`` is the doc type.
        size: unused.
        text_type: ``"message_type"`` or ``"sentiment"`` adds a filter on
            that field; other values add no filter.
        type_value: filter value(s).  With ``"sentiment"`` a single-element
            value becomes a ``term`` filter, anything else ``terms``.

    Returns:
        list of ``[uid, nick_name, photo_url, text, sentiment, date, geo,
        message_type]`` rows.  For users without a profile the nick name is
        the uid and the photo url is ``""``.
    """
    doc_type_id = user + '-' + task_name
    task_source = es_user_portrait.get(index=index_sensing_task,
                                       doc_type=doc_type_id,
                                       id=ts)['_source']
    origin_detail = json.loads(task_source['origin_weibo_detail'])
    retweeted_detail = json.loads(task_source['retweeted_weibo_detail'])
    tracked_mids = []
    tracked_mids += origin_detail.keys()
    tracked_mids += retweeted_detail.keys()

    window_filter = {"range": {"timestamp": {"gte": ts - time_interval,
                                             "lt": ts}}}
    root_filter = {"terms": {"root_mid": tracked_mids}}
    query_body = {
        "query": {
            "filtered": {
                "filter": {"bool": {"must": [window_filter, root_filter]}}
            }
        },
        "sort": {"timestamp": {"order": "desc"}},
        "size": 100
    }
    filters = query_body['query']['filtered']['filter']['bool']['must']
    if text_type == "message_type":
        filters.append({"term": {text_type: type_value}})
    if text_type == "sentiment":
        if len(type_value) == 1:
            filters.append({"term": {text_type: type_value}})
        else:
            filters.append({"terms": {text_type: type_value}})

    day_of_ts = ts2datetime(ts)
    day_of_start = ts2datetime(ts - time_interval)
    index_of_ts = flow_text_index_name_pre + day_of_ts
    has_index_of_ts = es_text.indices.exists(index_of_ts)
    index_of_start = flow_text_index_name_pre + day_of_start
    has_index_of_start = es_text.indices.exists(index_of_start)

    # 1. Query the weibo from the daily index that holds the window start.
    same_day = day_of_ts == day_of_start
    if same_day and has_index_of_ts:
        hits = es_text.search(index=index_of_ts,
                              doc_type=flow_text_index_type,
                              body=query_body)["hits"]["hits"]
    elif not same_day and has_index_of_start:
        hits = es_text.search(index=index_of_start,
                              doc_type=flow_text_index_type,
                              body=query_body)["hits"]["hits"]
    else:
        hits = []

    # 2. Combine every weibo with its author's profile.
    results = []
    uid_list = []
    for hit in hits:
        uid_list.append(hit["_source"]['uid'])
    if uid_list:
        profiles = es_profile.mget(
            index=profile_index_name,
            doc_type=profile_index_type,
            body={"ids": uid_list},
            fields=['nick_name', 'photo_url'])["docs"]
        for i, author_uid in enumerate(uid_list):
            weibo = hits[i]['_source']
            if profiles[i]['found']:
                nick_name = profiles[i]["fields"]["nick_name"][0]
                photo_url = profiles[i]["fields"]["photo_url"][0]
            else:
                nick_name = weibo['uid']
                photo_url = ""
            results.append([
                weibo['uid'],
                nick_name,
                photo_url,
                weibo["text"],
                weibo["sentiment"],
                ts2date(weibo['timestamp']),
                weibo['geo'],
                weibo["message_type"],
            ])
    return results
def search_user_info(es, index_name, doc_type, uid, result_name):
    """Describe the users that interact most with ``uid``.

    The document ``uid`` is read from ``index_name``; its ``result_name``
    field is a JSON object mapping other uids to a count.  The 20 highest
    counts are taken, ``uid`` itself is dropped, and each remaining user is
    enriched from the profile index and the bci-history index.

    Args:
        es: Elasticsearch client used for the interaction document.
        index_name, doc_type: location of that document.
        uid: user whose interactions are looked up.
        result_name: name of the JSON field holding ``{uid: count}``.

    Returns:
        ``None`` when the document cannot be fetched or is empty.  Otherwise
        a list of dicts with the keys ``uid``, ``photo_url``, ``count``,
        ``uname``, ``influence``, ``fansnum``, ``friendsnum`` and
        ``weibo_count``.  Values missing from bci-history are ``''``.  A
        user without a profile gets ``uname`` ``u'未知'`` and ``photo_url``
        ``'unknown'``.  The list is empty when the profile lookup fails.
    """
    try:
        retweet_result = es.get(index=index_name, doc_type=doc_type,
                                id=uid)['_source']
    except Exception:
        # Best effort: a missing document or index simply yields no result.
        return None
    if not retweet_result:
        return None

    retweet_dict = json.loads(retweet_result[result_name])
    sorted_list = sorted(retweet_dict.items(), key=lambda x: x[1],
                         reverse=True)[:20]
    uid_list = [pair[0] for pair in sorted_list if pair[0] != uid]

    try:
        user_result = es_user_profile.mget(index=profile_index_name,
                                           doc_type=profile_index_type,
                                           body={'ids': uid_list})['docs']
    except Exception:
        user_result = []
    try:
        bci_history_result = es_bci_history.mget(
            index=bci_history_index_name,
            doc_type=bci_history_index_type,
            body={'ids': uid_list},
            fields=fields)['docs']
    except Exception:
        bci_history_result = []

    out_portrait_list = []
    for position, profile_doc in enumerate(user_result):
        other_uid = profile_doc['_id']
        if profile_doc['found']:
            source = profile_doc['_source']
            uname = source['nick_name']
            photo_url = source['photo_url']
            if uname == '':
                uname = u'未知'
        else:
            uname = u'未知'
            photo_url = 'unknown'

        # Add the indexes from bci_history; it may be shorter than the
        # profile result when its lookup failed.
        if position < len(bci_history_result):
            bci_history_item = bci_history_result[position]
        else:
            bci_history_item = {'found': False}
        if bci_history_item['found']:
            bci_fields = bci_history_item['fields']
            fansnum = bci_fields[fields[0]][0]
            user_weibo_count = bci_fields[fields[1]][0]
            user_friendsnum = bci_fields[fields[2]][0]
            influence = bci_fields[fields[3]][0]
        else:
            fansnum = ''
            user_weibo_count = ''
            user_friendsnum = ''
            influence = ''

        out_portrait_list.append({
            'uid': other_uid,
            'photo_url': photo_url,
            'count': retweet_dict[other_uid],
            'uname': uname,
            'influence': influence,
            'fansnum': fansnum,
            'friendsnum': user_friendsnum,
            'weibo_count': user_weibo_count
        })
    return out_portrait_list
def search_fans(uid, top_count):
    """Describe the users that retweeted or commented on ``uid``.

    The be-retweet and be-comment documents of ``uid`` are read from the
    indexes selected by ``get_db_num`` for today's date.  Their
    ``{uid: count}`` maps are merged with ``union_dict`` and ``uid`` itself
    is removed.  The 1000 users with the highest merged count are enriched
    from the profile and bci-history indexes.

    NOTE: ``top_count`` is accepted but not used; the cut-off is fixed to
    1000.

    Args:
        uid: the centre user.
        top_count: unused.

    Returns:
        list of dicts with the keys ``uid``, ``photo_url``, ``uname``,
        ``count``, ``fansnum``, ``friendsnum`` and ``weibo_count``, in
        descending ``count`` order.  Values missing from bci-history are
        ``''``.  A user without a profile gets ``uname`` ``u'未知'`` and
        ``photo_url`` ``'unknown'``.  The list is empty when the profile
        lookup fails.
    """
    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    db_number = get_db_num(now_date_ts)
    be_comment_index_name = be_comment_index_name_pre + str(db_number)
    be_retweet_index_name = be_retweet_index_name_pre + str(db_number)
    center_uid = uid

    try:
        be_retweet_result = es_retweet.get(index=be_retweet_index_name,
                                           doc_type=be_retweet_index_type,
                                           id=uid)['_source']
    except Exception:
        # Best effort: treat a missing document or index as "no data".
        be_retweet_result = {}
    if be_retweet_result:
        be_retweet_uid_dict = json.loads(be_retweet_result['uid_be_retweet'])
    else:
        be_retweet_uid_dict = {}

    try:
        be_comment_result = es_be_comment.get(index=be_comment_index_name,
                                              doc_type=be_comment_index_type,
                                              id=uid)['_source']
    except Exception:
        be_comment_result = {}
    if be_comment_result:
        be_comment_uid_dict = json.loads(be_comment_result['uid_be_comment'])
    else:
        be_comment_uid_dict = {}

    fans_result = union_dict(be_retweet_uid_dict, be_comment_uid_dict)
    all_fans_dict = {}
    for fans_user in set(fans_result.keys()):
        if fans_user != center_uid:
            all_fans_dict[fans_user] = fans_result[fans_user]
    sort_all_fans_dict = sorted(all_fans_dict.items(), key=lambda x: x[1],
                                reverse=True)
    all_fans_uid_list = [pair[0] for pair in sort_all_fans_dict[:1000]]

    # Profile information of the fans.
    try:
        out_user_result = es_user_profile.mget(
            index=profile_index_name,
            doc_type=profile_index_type,
            body={'ids': all_fans_uid_list})['docs']
    except Exception:
        out_user_result = []
    # Indexes from bci_history.
    try:
        bci_history_result = es_bci_history.mget(
            index=bci_history_index_name,
            doc_type=bci_history_index_type,
            body={'ids': all_fans_uid_list},
            fields=fields)['docs']
    except Exception:
        bci_history_result = []

    out_portrait_list = []
    for position, profile_doc in enumerate(out_user_result):
        fan_uid = profile_doc['_id']
        if profile_doc['found']:
            source = profile_doc['_source']
            uname = source['nick_name']
            photo_url = source['photo_url']
            if uname == '':
                uname = u'未知'
        else:
            uname = u'未知'
            photo_url = 'unknown'

        # bci_history may be shorter than the profile result when its
        # lookup failed.
        if position < len(bci_history_result):
            bci_history_item = bci_history_result[position]
        else:
            bci_history_item = {'found': False}
        if bci_history_item['found']:
            bci_fields = bci_history_item['fields']
            fansnum = bci_fields[fields[0]][0]
            user_weibo_count = bci_fields[fields[1]][0]
            user_friendsnum = bci_fields[fields[2]][0]
        else:
            fansnum = ''
            user_weibo_count = ''
            user_friendsnum = ''

        out_portrait_list.append({
            'uid': fan_uid,
            'photo_url': photo_url,
            'uname': uname,
            'count': int(all_fans_dict[fan_uid]),
            'fansnum': fansnum,
            'friendsnum': user_friendsnum,
            'weibo_count': user_weibo_count
        })
    return out_portrait_list
def search_follower(uid, top_count):
    """Describe the users that retweeted ``uid`` most often.

    The be-retweet document of ``uid`` is read from the index selected by
    ``get_db_num`` for the current time.  Its ``uid_be_retweet`` JSON map is
    converted to integer counts, the 20 highest are taken, ``uid`` itself is
    dropped, and each remaining user is enriched from the profile and
    bci-history indexes.

    NOTE: ``top_count`` is accepted but not used; the cut-off is fixed to 20.

    Args:
        uid: the centre user.
        top_count: unused.

    Returns:
        ``None`` when the be-retweet document cannot be fetched or is empty.
        Otherwise a list of dicts with the keys ``uid``, ``photo_url``,
        ``count``, ``uname``, ``influence``, ``fansnum``, ``friendsnum`` and
        ``weibo_count``.  Values missing from bci-history are ``''``.  A
        user without a profile gets ``uname`` ``u'未知'`` and ``photo_url``
        ``''``.  The list is empty when the profile lookup fails.
    """
    now_ts = time.time()
    db_number = get_db_num(now_ts)
    index_name = be_retweet_index_name_pre + str(db_number)
    try:
        retweet_result = es_retweet.get(index=index_name,
                                        doc_type=be_retweet_index_type,
                                        id=uid)['_source']
    except Exception:
        # Best effort: a missing document or index simply yields no result.
        return None
    if not retweet_result:
        return None

    raw_counts = json.loads(retweet_result['uid_be_retweet'])
    retweet_dict = {}
    for key in raw_counts:
        retweet_dict[key] = int(raw_counts[key])
    sorted_list = sorted(retweet_dict.items(), key=lambda x: x[1],
                         reverse=True)[:20]
    uid_list = [pair[0] for pair in sorted_list if pair[0] != uid]

    try:
        user_result = es_user_profile.mget(index=profile_index_name,
                                           doc_type=profile_index_type,
                                           body={'ids': uid_list})['docs']
    except Exception:
        user_result = []
    try:
        bci_history_result = es_bci_history.mget(
            index=bci_history_index_name,
            doc_type=bci_history_index_type,
            body={'ids': uid_list},
            fields=fields)['docs']
    except Exception:
        bci_history_result = []

    out_portrait_list = []
    for position, profile_doc in enumerate(user_result):
        follower_uid = profile_doc['_id']
        if profile_doc['found']:
            source = profile_doc['_source']
            uname = source['nick_name']
            photo_url = source['photo_url']
            if uname == '':
                uname = u'未知'
        else:
            uname = u'未知'
            photo_url = ''

        # Add the indexes from bci_history; it may be shorter than the
        # profile result when its lookup failed.
        if position < len(bci_history_result):
            bci_history_item = bci_history_result[position]
        else:
            bci_history_item = {'found': False}
        if bci_history_item['found']:
            bci_fields = bci_history_item['fields']
            fansnum = bci_fields[fields[0]][0]
            user_weibo_count = bci_fields[fields[1]][0]
            user_friendsnum = bci_fields[fields[2]][0]
            influence = bci_fields[fields[3]][0]
        else:
            fansnum = ''
            user_weibo_count = ''
            user_friendsnum = ''
            influence = ''

        out_portrait_list.append({
            'uid': follower_uid,
            'photo_url': photo_url,
            'count': retweet_dict[follower_uid],
            'uname': uname,
            'influence': influence,
            'fansnum': fansnum,
            'friendsnum': user_friendsnum,
            'weibo_count': user_weibo_count
        })
    return out_portrait_list
def search_bidirect_interaction(uid, top_count):
    """Describe the users that interact with ``uid`` in both directions.

    Four ``{uid: count}`` maps are loaded for ``uid`` from the indexes
    selected by ``get_db_num`` for today's date: retweet, be-retweet,
    comment and be-comment.  Outgoing maps (retweet + comment) and incoming
    maps (be-retweet + be-comment) are each merged with ``union_dict``.
    Users present on both sides, except ``uid`` itself, are kept with the
    sum of both merged counts and enriched from the profile and bci-history
    indexes.

    The be-comment lookup previously referenced the misspelled name
    ``be_coment_index_name``.  The resulting ``NameError`` was swallowed by
    a bare ``except``, so be-comment data was silently never used.

    NOTE: ``top_count`` is accepted but not used.

    Args:
        uid: the centre user.
        top_count: unused.

    Returns:
        list of dicts with the keys ``uid``, ``photo_url``, ``uname``,
        ``count``, ``fansnum``, ``friendsnum`` and ``weibo_count``, in
        descending ``count`` order.  Values missing from bci-history are
        ``''``.  A user without a profile gets ``uname`` ``u'未知'`` and
        ``photo_url`` ``'unknown'``.  The list is empty when the profile
        lookup fails.
    """
    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    db_number = get_db_num(now_date_ts)
    retweet_index_name = retweet_index_name_pre + str(db_number)
    be_retweet_index_name = be_retweet_index_name_pre + str(db_number)
    comment_index_name = comment_index_name_pre + str(db_number)
    be_comment_index_name = be_comment_index_name_pre + str(db_number)
    center_uid = uid

    def load_counts(client, index, doc_type, field):
        # Best-effort fetch of one {uid: count} map; a missing document or
        # index yields an empty dict.
        try:
            source = client.get(index=index, doc_type=doc_type,
                                id=center_uid)['_source']
        except Exception:
            return {}
        return json.loads(source[field]) if source else {}

    # Bidirectional interaction in retweet and be_retweet.
    retweet_uid_dict = load_counts(es_retweet, retweet_index_name,
                                   retweet_index_type, 'uid_retweet')
    be_retweet_uid_dict = load_counts(es_retweet, be_retweet_index_name,
                                      be_retweet_index_type, 'uid_be_retweet')
    # Bidirectional interaction in comment and be_comment.
    comment_uid_dict = load_counts(es_comment, comment_index_name,
                                   comment_index_type, 'uid_comment')
    be_comment_uid_dict = load_counts(es_comment, be_comment_index_name,
                                      be_comment_index_type, 'uid_be_comment')

    retweet_comment_result = union_dict(retweet_uid_dict, comment_uid_dict)
    be_retweet_comment_result = union_dict(be_retweet_uid_dict,
                                           be_comment_uid_dict)
    interaction_user_set = (set(retweet_comment_result.keys())
                            & set(be_retweet_comment_result.keys()))
    all_interaction_dict = {}
    for interaction_user in interaction_user_set:
        if interaction_user != center_uid:
            all_interaction_dict[interaction_user] = (
                retweet_comment_result[interaction_user]
                + be_retweet_comment_result[interaction_user])
    sort_all_interaction_dict = sorted(all_interaction_dict.items(),
                                       key=lambda x: x[1], reverse=True)
    all_interaction_uid_list = [pair[0] for pair in sort_all_interaction_dict]

    # Profile information of the interacting users.
    try:
        out_user_result = es_user_profile.mget(
            index=profile_index_name,
            doc_type=profile_index_type,
            body={'ids': all_interaction_uid_list})['docs']
    except Exception:
        out_user_result = []
    # Indexes from bci_history.
    try:
        bci_history_result = es_bci_history.mget(
            index=bci_history_index_name,
            doc_type=bci_history_index_type,
            body={'ids': all_interaction_uid_list},
            fields=fields)['docs']
    except Exception:
        bci_history_result = []

    out_portrait_list = []
    for position, profile_doc in enumerate(out_user_result):
        other_uid = profile_doc['_id']
        if profile_doc['found']:
            source = profile_doc['_source']
            uname = source['nick_name']
            photo_url = source['photo_url']
            if uname == '':
                uname = u'未知'
        else:
            uname = u'未知'
            photo_url = 'unknown'

        # bci_history may be shorter than the profile result when its
        # lookup failed.
        if position < len(bci_history_result):
            bci_history_item = bci_history_result[position]
        else:
            bci_history_item = {'found': False}
        if bci_history_item['found']:
            bci_fields = bci_history_item['fields']
            fansnum = bci_fields[fields[0]][0]
            user_weibo_count = bci_fields[fields[1]][0]
            user_friendsnum = bci_fields[fields[2]][0]
        else:
            fansnum = ''
            user_weibo_count = ''
            user_friendsnum = ''

        out_portrait_list.append({
            'uid': other_uid,
            'photo_url': photo_url,
            'uname': uname,
            'count': int(all_interaction_dict[other_uid]),
            'fansnum': fansnum,
            'friendsnum': user_friendsnum,
            'weibo_count': user_weibo_count
        })
    return out_portrait_list
def get_sensitive_text_detail(task_name, ts, user, order):
    """Return the sensitive weibo recorded for a sensing task at ``ts``.

    The task document's ``sensitive_weibo_detail`` JSON field maps each mid
    to a dict that holds a value under the mid itself plus ``retweeted`` and
    ``comment`` values.  The weibo texts are searched in the flow-text
    indexes of the day of ``ts`` and the day before, and each author is
    looked up in the profile index.

    Args:
        task_name, user: ``user + '-' + task_name`` is the doc type.
        ts: id of the task document; also selects the daily indexes.
        order: ``"total"``, ``"retweeted"`` or ``"comment"`` sort the mids
            by the matching value (descending) before the rows are built.
            ``"ts"`` sorts the finished rows by the formatted date and
            ``"sensitive"`` by the sensitive value, both descending.  Any
            other value keeps the stored order.

    Returns:
        list of rows ``[uid, nick_name, photo_url, text, sentiment, date,
        geo, message_type, retweeted, comment, sensitive]``.  Only mids
        whose text was found produce a row.
    """
    _id = user + '-' + task_name
    task_detail = es.get(index=index_sensing_task, doc_type=_id, id=ts)['_source']
    weibo_detail = json.loads(task_detail['sensitive_weibo_detail'])

    weibo_detail_list = []
    if weibo_detail:
        weibo_detail_list = [
            [iter_mid, stats[iter_mid], stats['retweeted'], stats['comment']]
            for iter_mid, stats in weibo_detail.iteritems()
        ]
    mid_list = weibo_detail.keys()

    # NOTE(review): the query sets no "size"; Elasticsearch returns 10 hits
    # by default, so texts of further mids may be missing - confirm whether
    # that is intended.
    query_body = {"query": {"filtered": {"filter": {"terms": {"mid": mid_list}}}}}

    # Collect whichever of the two daily flow-text indexes exist.
    index_list = []
    index_name = flow_text_index_name_pre + ts2datetime(ts)
    index_name_1 = flow_text_index_name_pre + ts2datetime(ts - DAY)
    if es_text.indices.exists(index_name):
        index_list.append(index_name)
    if es_text.indices.exists(index_name_1):
        index_list.append(index_name_1)

    search_results = []
    if index_list and mid_list:
        search_results = es_text.search(index=index_list, doc_type=flow_text_index_type,
                                        body=query_body)["hits"]["hits"]

    uid_list = []
    text_dict = dict()      # weibo text documents, keyed by mid
    portrait_dict = dict()  # author profile information, keyed by uid
    for hit in search_results:
        uid_list.append(hit["_source"]['uid'])
        text_dict[hit['_id']] = hit['_source']  # _id is the mid
    if uid_list:
        portrait_result = es_profile.mget(index=profile_index_name, doc_type=profile_index_type,
                                          body={"ids": uid_list},
                                          fields=['nick_name', 'photo_url'])["docs"]
        for doc in portrait_result:
            if doc['found']:
                profile = {"nick_name": doc["fields"]["nick_name"][0],
                           "photo_url": doc["fields"]["photo_url"][0]}
            else:
                profile = {"nick_name": doc['_id'], "photo_url": ""}
            portrait_dict[doc['_id']] = profile

    # Column of weibo_detail_list that each count-based order sorts by.
    sort_column = {"total": 1, "retweeted": 2, "comment": 3}.get(order)
    if sort_column is None:
        sorted_list = weibo_detail_list
    else:
        sorted_list = sorted(weibo_detail_list, key=lambda x: x[sort_column], reverse=True)

    results = []
    for entry in sorted_list:
        iter_text = text_dict.get(entry[0], {})
        if not iter_text:
            continue  # no text was found for this mid
        uid = iter_text['uid']
        iter_portrait = portrait_dict.get(uid, {})
        if iter_portrait:
            row = [uid, iter_portrait['nick_name'], iter_portrait['photo_url']]
        else:
            row = [uid, uid, '']
        row.append(iter_text["text"])
        row.append(iter_text["sentiment"])
        row.append(ts2date(iter_text['timestamp']))
        row.append(iter_text['geo'])
        row.append(iter_text['message_type'])
        row.append(entry[2])
        row.append(entry[3])
        row.append(iter_text.get('sensitive', 0))
        results.append(row)

    if results and order == "ts":
        results = sorted(results, key=lambda x: x[5], reverse=True)
    if results and order == "sensitive":
        results = sorted(results, key=lambda x: x[-1], reverse=True)
    return results
def search_bidirect_interaction(uid, top_count):
    """Return users that interact with ``uid`` in both directions.

    A user counts as bidirectional when it appears in ``uid``'s retweet or
    comment record AND in ``uid``'s be-retweet or be-comment record.  The
    per-user count is the sum of both merged counters; results are ordered
    by that count, descending.

    Args:
        uid: the center user id.
        top_count: accepted for interface compatibility; the result is not
            truncated by it.

    Returns:
        A list of dicts with keys ``uid``, ``photo_url``, ``uname``,
        ``count``, ``fansnum``, ``friendsnum`` and ``weibo_count``.  Profile
        values fall back to placeholders when the user or its bci-history
        entry is not found.
    """
    center_uid = uid

    def _load_counts(es_client, index_name, doc_type, field):
        # Best effort: a missing document or index means "no interactions".
        try:
            source = es_client.get(index=index_name, doc_type=doc_type, id=center_uid)['_source']
        except Exception:
            return {}
        return json.loads(source[field]) if source else {}

    now_date_ts = datetime2ts(ts2datetime(time.time()))
    db_number = str(get_db_num(now_date_ts))

    retweet_uid_dict = _load_counts(es_retweet, retweet_index_name_pre + db_number,
                                    retweet_index_type, 'uid_retweet')
    be_retweet_uid_dict = _load_counts(es_retweet, be_retweet_index_name_pre + db_number,
                                       be_retweet_index_type, 'uid_be_retweet')
    comment_uid_dict = _load_counts(es_comment, comment_index_name_pre + db_number,
                                    comment_index_type, 'uid_comment')
    # The original referenced a misspelled index variable here; the resulting
    # NameError was swallowed by a bare except, so be-comment data was never
    # taken into account.
    be_comment_uid_dict = _load_counts(es_comment, be_comment_index_name_pre + db_number,
                                       be_comment_index_type, 'uid_be_comment')

    retweet_comment_result = union_dict(retweet_uid_dict, comment_uid_dict)
    be_retweet_comment_result = union_dict(be_retweet_uid_dict, be_comment_uid_dict)

    # Users present in both directions, excluding the center user itself.
    all_interaction_dict = {}
    for interaction_user in set(retweet_comment_result.keys()) & set(be_retweet_comment_result.keys()):
        if interaction_user != center_uid:
            all_interaction_dict[interaction_user] = (retweet_comment_result[interaction_user] +
                                                      be_retweet_comment_result[interaction_user])
    ranked = sorted(all_interaction_dict.items(), key=lambda pair: pair[1], reverse=True)
    interaction_uid_list = [pair[0] for pair in ranked]
    if not interaction_uid_list:
        return []

    try:
        out_user_result = es_user_profile.mget(index=profile_index_name, doc_type=profile_index_type,
                                               body={'ids': interaction_uid_list})['docs']
    except Exception:
        out_user_result = []
    try:
        bci_history_result = es_bci_history.mget(index=bci_history_index_name,
                                                 doc_type=bci_history_index_type,
                                                 body={'ids': interaction_uid_list},
                                                 fields=fields)['docs']
    except Exception:
        bci_history_result = []

    out_portrait_list = []
    for position, out_user_item in enumerate(out_user_result):
        user_id = out_user_item['_id']
        if out_user_item.get('found'):
            source = out_user_item['_source']
            uname = source['nick_name']
            photo_url = source['photo_url']
            if uname == '':
                uname = u'未知'
        else:
            uname = u'未知'
            photo_url = 'unknown'

        # bci-history docs are read at the same position as the profile docs.
        if position < len(bci_history_result):
            bci_history_item = bci_history_result[position]
        else:
            bci_history_item = {'found': False}
        if bci_history_item.get('found'):
            bci_fields = bci_history_item['fields']
            fansnum = bci_fields[fields[0]][0]
            user_weibo_count = bci_fields[fields[1]][0]
            user_friendsnum = bci_fields[fields[2]][0]
        else:
            fansnum = ''
            user_weibo_count = ''
            user_friendsnum = ''

        out_portrait_list.append({'uid': user_id, 'photo_url': photo_url, 'uname': uname,
                                  'count': int(all_interaction_dict[user_id]),
                                  'fansnum': fansnum, 'friendsnum': user_friendsnum,
                                  'weibo_count': user_weibo_count})
    return out_portrait_list
def get_positive_weibo_detail(ts, social_sensors, keywords_list, size, sentiment_type=1):
    """Return the latest weibo of one sentiment class inside the current window.

    The window is ``[ts - time_interval, ts)``.  A weibo can match through
    its ``root_mid`` or ``mid`` being in the mid list obtained from
    ``query_mid_list`` for the previous and the current period, or through
    its ``keywords_string``.

    Args:
        ts: end timestamp of the window.
        social_sensors: passed through to ``query_mid_list``.
        keywords_list: keywords used for the mid lookup, the keyword filter
            and the common-keyword column of the result.
        size: accepted for interface compatibility; the query always asks
            for 100 hits.
        sentiment_type: 0 or 1 filters on exactly that sentiment value; any
            other value selects sentiments "2" and "3".

    Returns:
        A list of rows ``[uid, nick_name, photo_url, text, sentiment, date,
        geo, common_keywords, message_type]`` ordered by timestamp,
        descending.
    """
    # Mids of the previous period followed by those of the current period.
    mid_list = []
    mid_list.extend(query_mid_list(ts - time_interval, keywords_list, time_segment, social_sensors))
    mid_list.extend(query_mid_list(ts, keywords_list, time_interval, social_sensors))

    must_filters = [
        {"range": {"timestamp": {"gte": ts - time_interval, "lt": ts}}},
    ]
    if int(sentiment_type) in (0, 1):
        must_filters.append({"term": {"sentiment": sentiment_type}})
    else:
        # Appended rather than assigned: replacing the whole "must" list
        # would drop the timestamp range and return weibo outside the window.
        must_filters.append({"terms": {"sentiment": ["2", "3"]}})

    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": must_filters,
                        "should": [
                            {"terms": {"root_mid": mid_list}},
                            {"terms": {"mid": mid_list}},
                            {"terms": {"keywords_string": keywords_list}}
                        ]
                    }
                }
            }
        },
        "sort": {"timestamp": {"order": "desc"}},
        "size": 100
    }

    # The window may start on the previous day; search every existing daily
    # index it touches and let the range filter select the documents.
    index_list = []
    for day in (ts2datetime(ts), ts2datetime(ts - time_interval)):
        index_name = flow_text_index_name_pre + day
        if index_name not in index_list and es_text.indices.exists(index_name):
            index_list.append(index_name)

    if not index_list:
        return []
    search_results = es_text.search(index=index_list, doc_type=flow_text_index_type,
                                    body=query_body)["hits"]["hits"]
    if not search_results:
        return []

    uid_list = [hit["_source"]['uid'] for hit in search_results]
    portrait_result = es_profile.mget(index=profile_index_name, doc_type=profile_index_type,
                                      body={"ids": uid_list},
                                      fields=['nick_name', 'photo_url'])["docs"]

    wanted_keywords = set(keywords_list)
    results = []
    # Profile docs are paired with the hits by position, one per requested id.
    for hit, portrait in zip(search_results, portrait_result):
        item = hit['_source']
        if portrait['found']:
            nick_name = portrait["fields"]["nick_name"][0]
            photo_url = portrait["fields"]["photo_url"][0]
        else:
            nick_name = "unknown"
            photo_url = ""
        common_keywords = wanted_keywords & set(item['keywords_string'].split('&'))
        results.append([
            item['uid'],
            nick_name,
            photo_url,
            item["text"],
            item["sentiment"],
            ts2date(item['timestamp']),
            item['geo'],
            list(common_keywords),
            item['message_type'],
        ])
    return results
def search_follower(uid, top_count):
    """Return the users that retweeted ``uid`` most often.

    Reads the ``uid_be_retweet`` JSON counter map of ``uid`` from the
    be-retweet index, keeps the 20 highest counts (the user itself is
    excluded afterwards) and decorates each user with profile and
    bci-history data.

    Args:
        uid: the center user id.
        top_count: accepted for interface compatibility; the cut-off is
            fixed at 20.

    Returns:
        ``None`` when the be-retweet record cannot be fetched or is empty;
        otherwise a list of dicts with keys ``uid``, ``photo_url``,
        ``count``, ``uname``, ``influence``, ``fansnum``, ``friendsnum`` and
        ``weibo_count``.
    """
    db_number = get_db_num(time.time())
    index_name = be_retweet_index_name_pre + str(db_number)
    try:
        retweet_result = es_retweet.get(index=index_name, doc_type=be_retweet_index_type,
                                        id=uid)['_source']
    except Exception:
        return None
    if not retweet_result:
        return None

    raw_counts = json.loads(retweet_result['uid_be_retweet'])
    retweet_dict = dict((key, int(value)) for key, value in raw_counts.items())
    top_pairs = sorted(retweet_dict.items(), key=lambda pair: pair[1], reverse=True)[:20]
    uid_list = [pair[0] for pair in top_pairs if pair[0] != uid]
    if not uid_list:
        return []

    try:
        user_result = es_user_profile.mget(index=profile_index_name, doc_type=profile_index_type,
                                           body={'ids': uid_list})['docs']
    except Exception:
        user_result = []
    try:
        bci_history_result = es_bci_history.mget(index=bci_history_index_name,
                                                 doc_type=bci_history_index_type,
                                                 body={'ids': uid_list}, fields=fields)['docs']
    except Exception:
        bci_history_result = []

    out_portrait_list = []
    for position, out_user_item in enumerate(user_result):
        user_id = out_user_item['_id']
        if out_user_item.get('found'):
            source = out_user_item['_source']
            uname = source['nick_name']
            photo_url = source['photo_url']
            if uname == '':
                uname = u'未知'
        else:
            uname = u'未知'
            photo_url = ''

        # bci-history docs are read at the same position as the profile docs.
        if position < len(bci_history_result):
            bci_history_item = bci_history_result[position]
        else:
            bci_history_item = {'found': False}
        if bci_history_item.get('found'):
            bci_fields = bci_history_item['fields']
            fansnum = bci_fields[fields[0]][0]
            user_weibo_count = bci_fields[fields[1]][0]
            user_friendsnum = bci_fields[fields[2]][0]
            influence = bci_fields[fields[3]][0]
        else:
            fansnum = ''
            user_weibo_count = ''
            user_friendsnum = ''
            influence = ''

        out_portrait_list.append({'uid': user_id, 'photo_url': photo_url,
                                  'count': retweet_dict[user_id], 'uname': uname,
                                  'influence': influence, 'fansnum': fansnum,
                                  'friendsnum': user_friendsnum,
                                  'weibo_count': user_weibo_count})
    return out_portrait_list
def get_origin_weibo_detail(ts, user, task_name, size, order, message_type=1):
    """Return ranked weibo rows of a sensing task, grouped with their duplicates.

    The task document (doc type ``user + '-' + task_name``, id ``ts``)
    provides the per-mid counters, ``mid_topic_value`` and
    ``duplicate_dict``.  Weibo text comes from the flow-text indices of the
    day of ``ts`` and of the day before; nick name / photo from the profile
    index.

    Args:
        ts: timestamp used as the task document id.
        user: owner of the task.
        task_name: name of the sensing task.
        size: accepted for interface compatibility; not used.
        order: "total", "retweeted" or "comment" keeps only the 10 weibo
            with the highest value of that counter; any other value keeps
            all of them.
        message_type: 1 reads ``origin_weibo_detail``, 2 reads
            ``retweeted_weibo_detail``, anything else
            ``sensitive_weibo_detail``.

    Returns:
        A list of groups.  Each group is a list of rows, the best-ranked row
        first, followed by the rows that ``duplicate_dict`` links to it.  A
        row is ``[uid, nick_name, photo_url, text, sentiment, date, geo,
        message_type, keywords, retweeted, comment, sensitive, timestamp,
        mid_value, mid]``.  Groups are ordered by (sensitive, mid_value,
        retweeted), descending.
    """
    doc_type = user + '-' + task_name
    task_detail = es_user_portrait.get(index=index_sensing_task, doc_type=doc_type, id=ts)['_source']
    mid_value = json.loads(task_detail['mid_topic_value'])
    duplicate_dict = json.loads(task_detail['duplicate_dict'])

    # duplicate_dict maps a mid to the mid it duplicates; invert it into
    # groups keyed by the referenced mid: [first_duplicate, referenced, ...].
    duplicate_groups = {}
    for dup_mid, ref_mid in duplicate_dict.items():
        if ref_mid in duplicate_groups:
            duplicate_groups[ref_mid].append(dup_mid)
        else:
            duplicate_groups[ref_mid] = [dup_mid, ref_mid]

    if message_type == 1:
        detail_field = 'origin_weibo_detail'
    elif message_type == 2:
        detail_field = 'retweeted_weibo_detail'
    else:
        detail_field = 'sensitive_weibo_detail'
    weibo_detail = json.loads(task_detail[detail_field]) or {}

    # One entry per weibo: [mid, total, retweeted, comment].  The first
    # counter is stored under the mid itself.
    weibo_detail_list = [
        [mid, counters[mid], counters['retweeted'], counters['comment']]
        for mid, counters in weibo_detail.items()
    ]
    mid_list = list(weibo_detail.keys())

    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "terms": {"mid": mid_list}
                }
            }
        },
        "size": 1000,
        "sort": {"timestamp": {"order": "desc"}}
    }

    # Only query the daily indices that actually exist.
    index_list = []
    for day in (ts2datetime(ts), ts2datetime(ts - DAY)):
        index_name = flow_text_index_name_pre + day
        if es_text.indices.exists(index_name):
            index_list.append(index_name)

    if index_list and mid_list:
        search_results = es_text.search(index=index_list, doc_type=flow_text_index_type,
                                        body=query_body)["hits"]["hits"]
    else:
        search_results = []

    uid_list = []
    text_dict = {}      # mid -> weibo source document (the hit _id is the mid)
    portrait_dict = {}  # uid -> {"nick_name": ..., "photo_url": ...}
    for hit in search_results:
        uid_list.append(hit["_source"]['uid'])
        text_dict[hit['_id']] = hit['_source']
    if uid_list:
        portrait_result = es_profile.mget(index=profile_index_name, doc_type=profile_index_type,
                                          body={"ids": uid_list},
                                          fields=['nick_name', 'photo_url'])["docs"]
        for doc in portrait_result:
            if doc['found']:
                portrait_dict[doc['_id']] = {"nick_name": doc["fields"]["nick_name"][0],
                                             "photo_url": doc["fields"]["photo_url"][0]}
            else:
                # Unknown profile: fall back to the uid as display name.
                portrait_dict[doc['_id']] = {"nick_name": doc['_id'], "photo_url": ""}

    # Column of weibo_detail_list to rank on; these orders keep the top 10.
    sort_column = {"total": 1, "retweeted": 2, "comment": 3}.get(order)
    if sort_column is not None:
        sorted_list = sorted(weibo_detail_list, key=lambda row: row[sort_column],
                             reverse=True)[:10]
    else:
        sorted_list = weibo_detail_list

    results = []
    for mid, _total, retweeted, comment in sorted_list:
        iter_text = text_dict.get(mid, {})
        if not iter_text:
            continue
        uid = iter_text['uid']
        iter_portrait = portrait_dict.get(uid, {})
        if iter_portrait:
            nick_name = iter_portrait['nick_name']
            photo_url = iter_portrait['photo_url']
        else:
            nick_name, photo_url = uid, ''

        if message_type == 1:
            row_message_type = 1
        elif message_type == 2:
            row_message_type = 3
        else:
            row_message_type = iter_text['message_type']

        # Keywords extracted from the text, highest weight first.
        f_key = get_weibo_single(iter_text['text'])
        keywords = sorted(f_key.items(), key=lambda pair: pair[1], reverse=True)

        results.append([
            uid,
            nick_name,
            photo_url,
            iter_text["text"],
            iter_text["sentiment"],
            ts2date(iter_text['timestamp']),
            iter_text['geo'],
            row_message_type,
            keywords,
            retweeted,
            comment,
            iter_text.get('sensitive', 0),
            iter_text['timestamp'],
            mid_value[mid],
            mid,
        ])

    # Rank by sensitive (-4), mid_value (-2), then retweeted (-6).
    results = sorted(results, key=operator.itemgetter(-4, -2, -6), reverse=True)

    sort_results = [[row] for row in results]
    mid_index_dict = dict((row[-1], position) for position, row in enumerate(results))

    # Fold every duplicate into the best-ranked member of its group.
    merged_away = set()
    for group in duplicate_groups.values():
        positions = []
        for mid in group:
            # Membership test, not truthiness: position 0 (the top-ranked
            # row) is a valid index and must take part in the merge.
            position = mid_index_dict.get(mid)
            if position is not None and position not in positions:
                positions.append(position)
        if len(positions) < 2:
            continue
        keep = min(positions)
        for position in positions:
            if position != keep:
                sort_results[keep].extend(sort_results[position])
                merged_away.add(position)

    return [group for position, group in enumerate(sort_results)
            if position not in merged_away]
def search_fans(uid,top_count):
    """Return the users that retweeted or commented on ``uid``, most active first.

    The ``uid_be_retweet`` and ``uid_be_comment`` counter maps of ``uid`` are
    merged with ``union_dict``; the user itself is dropped, the remaining
    users are ordered by merged count (descending) and at most the first
    1000 are decorated with profile and bci-history data.

    ``top_count`` is accepted but not used.

    Returns a list of dicts with keys ``uid``, ``photo_url``, ``uname``,
    ``count``, ``fansnum``, ``friendsnum`` and ``weibo_count``.
    """
    center_uid = uid
    day_start_ts = datetime2ts(ts2datetime(time.time()))
    db_suffix = str(get_db_num(day_start_ts))
    be_comment_index_name = be_comment_index_name_pre + db_suffix
    be_retweet_index_name = be_retweet_index_name_pre + db_suffix

    # Any failure while fetching a record is treated as "no data".
    try:
        retweet_source = es_retweet.get(index=be_retweet_index_name,
                                        doc_type=be_retweet_index_type, id=uid)['_source']
    except:
        retweet_source = {}
    be_retweet_uid_dict = json.loads(retweet_source['uid_be_retweet']) if retweet_source else {}

    try:
        comment_source = es_be_comment.get(index=be_comment_index_name,
                                           doc_type=be_comment_index_type, id=uid)['_source']
    except:
        comment_source = {}
    be_comment_uid_dict = json.loads(comment_source['uid_be_comment']) if comment_source else {}

    fans_result = union_dict(be_retweet_uid_dict, be_comment_uid_dict)
    all_fans_dict = {}
    for fan_uid in set(fans_result.keys()):
        if fan_uid == center_uid:
            continue
        all_fans_dict[fan_uid] = fans_result[fan_uid]

    ranked_fans = sorted(all_fans_dict.items(), key=lambda pair: pair[1], reverse=True)
    # Keep at most the 1000 most active fans.
    lookup_ids = [pair[0] for pair in ranked_fans][:1000]

    try:
        profile_docs = es_user_profile.mget(index=profile_index_name, doc_type=profile_index_type,
                                            body={'ids': lookup_ids})['docs']
    except:
        profile_docs = []
    try:
        bci_docs = es_bci_history.mget(index=bci_history_index_name,
                                       doc_type=bci_history_index_type,
                                       body={'ids': lookup_ids}, fields=fields)['docs']
    except:
        bci_docs = []

    fans_info = []
    for position, profile_doc in enumerate(profile_docs):
        fan_uid = profile_doc['_id']
        if profile_doc['found'] == True:
            profile = profile_doc['_source']
            uname = profile['nick_name']
            photo_url = profile['photo_url']
            if uname == '':
                uname = u'未知'
            # Read but not returned; the lookups are kept so a missing key
            # still raises exactly as before.
            location = profile['user_location']
            friendsnum = profile['friendsnum']
        else:
            uname = u'未知'
            location = ''
            friendsnum = ''
            photo_url = 'unknown'

        # bci-history docs are read at the same position as the profile docs.
        try:
            bci_doc = bci_docs[position]
        except:
            bci_doc = {'found': False}
        if bci_doc['found'] == True:
            bci_fields = bci_doc['fields']
            fansnum = bci_fields[fields[0]][0]
            user_weibo_count = bci_fields[fields[1]][0]
            user_friendsnum = bci_fields[fields[2]][0]
            influence = bci_fields[fields[3]][0]
        else:
            fansnum = ''
            user_weibo_count = ''
            user_friendsnum = ''

        fans_info.append({'uid': fan_uid, 'photo_url': photo_url, 'uname': uname,
                          'count': int(all_fans_dict[fan_uid]),
                          'fansnum': fansnum, 'friendsnum': user_friendsnum,
                          'weibo_count': user_weibo_count})
    return fans_info
def portrait_user_vary(es, number, active_index, active_type, portrait_index, portrait_type, field="vary"):
    """Return the first ``number`` ranked users that have a portrait.

    Pages through ``search_k(es, active_index, active_type, start, field,
    100)`` and keeps every user whose document is found in the portrait
    index, until ``number`` users are collected or more than 10000
    candidates have been checked.

    Args:
        es: Elasticsearch client holding ``active_index``.
        number: how many users to return (converted with ``int``).
        active_index: index the candidates are read from.
        active_type: doc type of the candidates.
        portrait_index: accepted for interface compatibility; the portrait
            lookup uses the fixed index "user_portrait".
        portrait_type: accepted for interface compatibility; the portrait
            lookup uses the fixed doc type "user".
        field: field passed to ``search_k``.

    Returns:
        The string "no active_index exist" when ``active_index`` is
        missing; otherwise a list of rows
        ``[rank, photo_url, uid, nick_name, vary, '1']``.  A ``RequestError``
        ends the paging and returns what was collected so far.
    """
    if not es.indices.exists(index=active_index):
        return "no active_index exist"

    page_size = 100
    max_checked = 10000
    wanted = int(number)

    return_list = []
    checked = 0
    start = 0
    rank = 1
    try:
        while True:
            user_list = search_k(es, active_index, active_type, start, field, page_size)
            if not user_list:
                # No more candidates: stop instead of issuing an empty mget.
                break
            start += page_size
            # NOTE(review): candidates may carry the id under "user" instead
            # of "uid" - confirm against search_k.
            search_list = [candidate.get('uid', '0') for candidate in user_list]

            portrait_docs = es_portrait.mget(index="user_portrait", doc_type="user",
                                             body={"ids": search_list}, _source=True)["docs"]
            profile_docs = es_profile.mget(index="weibo_user", doc_type="user",
                                           body={"ids": search_list}, _source=True)["docs"]

            # enumerate() gives each doc's own position; list.index() would
            # return the first equal doc and rescans the list on every hit.
            for position, portrait_doc in enumerate(portrait_docs):
                checked += 1
                if not portrait_doc["found"]:
                    continue
                info = ['', '', '', '', '', '1']
                info[0] = rank
                profile_doc = profile_docs[position]
                if profile_doc['found']:
                    profile = profile_doc['_source']
                    info[1] = profile.get('photo_url', '')
                    info[3] = profile.get('nick_name', '')
                info[2] = portrait_doc.get('_id', '')
                # NOTE(review): always reads 'vary', even when `field`
                # differs - confirm this is intended.
                info[4] = user_list[position]['vary']
                return_list.append(info)
                rank += 1
                if rank == wanted + 1:
                    return return_list
            if checked > max_checked:
                break
    except RequestError:
        print("timeout")
    return return_list