def co_search(es, user_list, bulk_action, count_n, tb):
    search_list = []
    for item in user_list:
        uid = item.get('uid', '0')  # obtain uid, notice "uid" or "user"
        search_list.append(uid)
    search_result = es.mget(index=index_destination, doc_type=index_destination_doctype, body={"ids": search_list}, _source=False)["docs"]
    search_list = []
    for item in search_result:
        if not item['found']:
            user_info = {}
            user_info['uid'] = item['_id']
            user_info['low_number'] = 0
            xdata = expand_index_action(user_info)
            bulk_action.extend([xdata[0], xdata[1]])
            count_n += 1
            if count_n % 1000 == 0:
                es.bulk(bulk_action, index=index_destination, doc_type=index_destination_doctype, timeout=30)
                bulk_action = []
                print count_n
            if count_n % 10000 == 0:
                ts = time.time()
                print "count_n %s per %s second" % (count_n, ts - tb)
                print "count %s " % count_n
                tb = ts
    return bulk_action, count_n, tb
def filter_in(top_user_set):
    results = []
    try:
        in_results = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type, body={'ids': list(top_user_set)})
    except Exception as e:
        print 'cron/recommend_in/recommend_in.py&error-2&'
        return results  # bail out: in_results is undefined if the mget failed
    filter_list = [item['_id'] for item in in_results['docs'] if item['found'] is True]
    results = set(top_user_set) - set(filter_list)
    return results
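The filter helpers in this file all lean on the same property of mget: the response carries one stub per requested id, in request order, each with a 'found' flag. A minimal standalone sketch of that partition, using a placeholder host, index, and doc_type rather than this project's configuration:

from elasticsearch import Elasticsearch

es = Elasticsearch(['127.0.0.1:9200'])  # placeholder host

def partition_by_found(es, index, doc_type, ids):
    # one doc stub comes back per requested id, in order, with a 'found' flag
    docs = es.mget(index=index, doc_type=doc_type, body={'ids': list(ids)})['docs']
    found = [doc['_id'] for doc in docs if doc['found']]
    missing = [doc['_id'] for doc in docs if not doc['found']]
    return found, missing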
def all_makeup_info(uid_list, sort_norm, time):
    es = es_user_profile
    field_bci, field_sen, field_weibo = get_all_filed(sort_norm, time)
    search_result = es.mget(index=WEBUSER_INDEX_NAME, doc_type=WEBUSER_INDEX_TYPE, body={"ids": uid_list})["docs"]
    current_ts = datetime2ts(ts2datetime(TIME.time() - DAY))
    bci_result = es.mget(index="bci_history", doc_type="bci", body={"ids": uid_list}, fields=[field_bci, "user_fansnum", field_weibo, "weibo_month_sum"])["docs"]
    sen_result = es.mget(index=SESHIS_INDEX_NAME, doc_type=SESHIS_INDEX_TYPE, body={"ids": uid_list}, fields=[field_sen])["docs"]
    in_portrait = es_user_portrait.mget(index=USER_INDEX_NAME, doc_type=USER_INDEX_TYPE, body={"ids": uid_list}, _source=False)["docs"]
    results = []
    #fans_result = es_user_profile.mget(index="bci_history", doc_type="bci", body={"ids": uid_list}, fields=["user_fansnum"], _source=False)["docs"]
    bci_max = get_max_value(es_user_profile, "bci_history", "bci", field_bci)
    sen_max = get_max_value(es_user_profile, "sensitive_history", "sensitive", field_sen)
    for i in range(len(uid_list)):
        tmp = dict()
        tmp['uid'] = uid_list[i]
        if search_result[i]['found']:
            iter_item = search_result[i]['_source']
            tmp['location'] = iter_item['user_location']
            tmp['uname'] = iter_item['nick_name']
            tmp['photo_url'] = iter_item['photo_url']
        else:
            tmp['location'] = None
            tmp['uname'] = tmp['uid']
            tmp['photo_url'] = 'unknown'
        if in_portrait[i]['found']:
            tmp['is_warehousing'] = True
        else:
            tmp['is_warehousing'] = False
        if bci_result[i]['found']:
            try:
                bci_value = bci_result[i]['fields'][field_bci][0]
                tmp['bci'] = math.log(bci_value / float(bci_max) * 9 + 1, 10) * 100
            except:
                tmp['bci'] = 0
            try:
                tmp['fans'] = bci_result[i]['fields']["user_fansnum"][0]
            except:
                tmp['fans'] = ''
            try:
                tmp["weibo_count"] = bci_result[i]['fields']["weibo_month_sum"][0]
            except:
                tmp["weibo_count"] = ''
        else:
            tmp['bci'] = None
            tmp['fans'] = None
            tmp["weibo_count"] = None
        if sen_result[i]['found']:
            try:
                sen_value = sen_result[i]['fields'][field_sen][0]
                tmp['sen'] = math.log(sen_value / float(sen_max) * 9 + 1, 10) * 100
            except:
                tmp['sen'] = 0
        else:
            tmp['sen'] = None
        results.append(tmp)
    return results
def in_makeup_info(uid_list, sort_norm, time):
    es = es_user_portrait
    search_results = []
    results = []
    ts = datetime2ts(ts2datetime(TIME.time() - DAY))
    field_bci, field_sen, field_imp, field_act = get_in_filed(sort_norm, time)
    field_dict = {"uid": "uid", "uname": "uname", "location": "location", "topic": "topic_string",
                  "domain": "domain", "fans": "fansnum", "act": "activeness", "imp": "importance",
                  "bci": "influence", "sen": "sensitive"}
    if uid_list:
        search_results = es_user_portrait.mget(index=USER_INDEX_NAME, doc_type=USER_INDEX_TYPE, body={"ids": uid_list}, _source=False, fields=["uid", "uname", "location", "topic_string", "domain", "fansnum", "influence", "importance", "activeness", "sensitive"])["docs"]
        bci_results = es.mget(index=BCI_INDEX_NAME, doc_type=BCI_INDEX_TYPE, body={"ids": uid_list}, _source=False, fields=[field_bci, "user_fansnum", "weibo_month_sum"])["docs"]
        imp_results = es.mget(index=IMP_INDEX_NAME, doc_type=IMP_INDEX_TYPE, body={"ids": uid_list}, _source=False, fields=[field_imp])["docs"]
        act_results = es.mget(index=ACT_INDEX_NAME, doc_type=ACT_INDEX_TYPE, body={"ids": uid_list}, _source=False, fields=[field_act])["docs"]
        sen_results = es.mget(index=SES_INDEX_NAME, doc_type=SES_INDEX_TYPE, body={"ids": uid_list}, _source=False, fields=[field_sen])["docs"]
        results = []
        for i in range(len(uid_list)):
            item = dict()
            if not search_results[i].get('found', 0):
                continue
            for k, v in field_dict.iteritems():
                item[k] = search_results[i]["fields"][v][0]
                if k == "uname" and not item[k]:
                    item[k] = uid_list[i]
            try:
                act_value = act_results[i]['fields'][field_act][0]
                item['act'] = act_value
            except:
                item['act'] = 0
            try:
                imp_value = imp_results[i]['fields'][field_imp][0]
                item['imp'] = imp_value
            except:
                item['imp'] = 0
            try:
                user_fansnum = bci_results[i]['fields']['user_fansnum'][0]
                item['fans'] = user_fansnum
            except:
                item['fans'] = 0
            try:
                bci_value = bci_results[i]['fields'][field_bci][0]
                item['bci'] = bci_value
            except:
                item['bci'] = 0
            try:
                sen_value = sen_results[i]['fields'][field_sen][0]
                item['sen'] = sen_value
            except:
                item['sen'] = 0
            results.append(item)
    return results
def filter_in(top_user_set):
    results = []
    try:
        in_results = es_user_portrait.mget(index='user_portrait', doc_type='user', body={'ids': list(top_user_set)})
    except Exception as e:
        raise e
    filter_list = [item['_id'] for item in in_results['docs'] if item['found'] is True]
    print 'before filter in:', len(top_user_set)
    print 'filter_list:', len(filter_list)
    results = set(top_user_set) - set(filter_list)
    print 'after filter in:', len(results)
    return results
def get_bci_detail():
    uid_list = []
    with open("uid_list_0520.txt", 'rb') as f:
        for item in f:
            uid_list.append(item.strip())
    print uid_list
    index_name = "bci_20160522"
    bci_results = es_user_portrait.mget(index=index_name, doc_type="bci", body={"ids": uid_list})["docs"]
    with open("bci_detail_0522.txt", "wb") as f:
        for item in bci_results:
            if item["found"]:
                f.write(json.dumps(item["_source"]) + "\n")
def get_temporal_rank(timestamp):
    index = get_queue_index(timestamp)
    index_ts = "influence_timestamp_" + str(index)
    uid_list = r.zrange(index_ts, 0, 10000, desc=True)
    user_info = []
    in_portrait = []  # users already stored in the portrait index
    if uid_list:
        search_result = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type, body={"ids": uid_list}, fields=SOCIAL_SENSOR_INFO)["docs"]
        for item in search_result:
            if item["found"]:
                temp = []
                in_portrait.append(item['_id'])
                for iter_key in SOCIAL_SENSOR_INFO:
                    """
def es_km_storage(uid_list):
    es_results = es_user_portrait.mget(index="user_portrait_1222", doc_type="user", body={"ids": uid_list})["docs"]
    in_list = []
    out_list = []
    bulk_action = []
    for item in es_results:
        if item["found"]:
            in_list.append(item["_id"])
            bulk_action.append(item["_source"])
        else:
            out_list.append(item["_id"])
    if bulk_action:
        es_km.bulk(bulk_action, index='user_portrait', doc_type="user", timeout=60)
    return in_list, out_list
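Note that es_km_storage hands es_km.bulk a list of bare _source dicts, whereas the co_search variants below pair every source with an {'index': {'_id': ...}} action line, which is the shape the low-level bulk call expects. A sketch of that pairing convention, with placeholder names, for illustration only:

def pair_bulk_actions(id_source_pairs):
    # build a bulk body as alternating action/metadata and document lines
    bulk_body = []
    for doc_id, source in id_source_pairs:
        bulk_body.append({'index': {'_id': doc_id}})
        bulk_body.append(source)
    return bulk_body

# e.g. es_km.bulk(pair_bulk_actions(pairs), index='user_portrait', doc_type='user', timeout=60)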
def get_forward_numerical_info(task_name, ts, create_by):
    results = []
    ts_series = []
    for i in range(1, forward_n + 1):
        ts_series.append(ts - i * time_interval)
    # check if detail es of task exists
    doctype = create_by + "-" + task_name
    index_exist = es_user_portrait.indices.exists_type(index_sensing_task, doctype)
    if not index_exist:
        print "new create task detail index"
        mappings_sensing_task(doctype)
    if ts_series:
        search_results = es_user_portrait.mget(index=index_sensing_task, doc_type=doctype, body={"ids": ts_series})["docs"]
        found_count = 0
        average_origin = []
        average_retweeted = []
        average_commet = []
        average_total = []
        average_negetive = []
        for item in search_results:
            if item["found"]:
                temp = item["_source"]
                sentiment_dict = json.loads(temp["sentiment_distribution"])
                average_total.append(int(temp["weibo_total_number"]))
                average_negetive.append(int(sentiment_dict["2"]) + int(sentiment_dict["3"]) + int(sentiment_dict["4"]) + int(sentiment_dict["5"]) + int(sentiment_dict["6"]))
                found_count += 1
        if found_count > initial_count:
            number_mean = np.mean(average_total)
            number_std = np.std(average_total)
            sentiment_mean = np.mean(average_negetive)
            sentiment_std = np.std(average_negetive)
            results = [1, number_mean, number_std, sentiment_mean, sentiment_std]
        else:
            results = [0]
    return results
def get_influence_value(date_time, field_name, uid_list):
    datename = ts2datetime(date_time - DAY)
    new_datetime = datename[0:4] + datename[5:7] + datename[8:10]
    bci_index_name = weibo_bci_index_name_pre + new_datetime
    index_value_list = []
    try:
        result = es_user_portrait.mget(index=bci_index_name, doc_type=weibo_bci_index_type, body={'ids': uid_list}, _source=True)['docs']
        for item in result:
            # print 'item_influence::', item
            # print 'item_type::', type(item)
            if item['found']:
                index_value_list.append(item['_source']['user_index'])
    except Exception as e:
        print 'influence lookup error::', e
    return index_value_list
def search_attention(uid):
    stat_results = dict()
    results = dict()
    for db_num in R_DICT:
        r = R_DICT[db_num]
        ruid_results = r.hgetall('retweet_' + str(uid))
        if ruid_results:
            for ruid in ruid_results:
                if ruid != uid:
                    try:
                        stat_results[ruid] += int(ruid_results[ruid])  # redis returns strings
                    except:
                        stat_results[ruid] = int(ruid_results[ruid])
    # print 'results:', stat_results
    if not stat_results:
        return [None, 0]
    try:
        sort_state_results = sorted(stat_results.items(), key=lambda x: x[1], reverse=True)[:20]
    except:
        return [None, 0]
    print 'sort_state_results:', sort_state_results
    uid_list = [item[0] for item in sort_state_results]
    es_profile_results = es_user_profile.mget(index='weibo_user', doc_type='user', body={'ids': uid_list})['docs']
    es_portrait_results = es_user_portrait.mget(index='user_portrait', doc_type='user', body={'ids': uid_list})['docs']
    result_list = dict()
    for i in range(len(es_profile_results)):
        item = es_profile_results[i]
        uid = item['_id']
        try:
            source = item['_source']
            uname = source['nick_name']
        except:
            uname = u'未知'  # "unknown"
        # identify uid is in the user_portrait
        portrait_item = es_portrait_results[i]
        try:
            source = portrait_item['_source']  # raises KeyError when the doc was not found
            in_status = 1
        except:
            in_status = 0
        result_list[uid] = [uid, [uname, stat_results[uid], in_status]]
    return [result_list, len(stat_results)]
def co_search(add_info, update_bci_key, former_bci_key, now_ts):
    uid_list = add_info.keys()
    evaluate_history_results = es_user_portrait.mget(index=COPY_USER_PORTRAIT_INFLUENCE, doc_type=COPY_USER_PORTRAIT_INFLUENCE_TYPE, body={'ids': uid_list})['docs']
    iter_count = 0
    bulk_action = []
    for uid in uid_list:
        item = evaluate_history_results[iter_count]
        if item['found']:
            user_history_item = item['_source']
            # merge in the newly computed fields
            user_history_item.update(add_info[uid])
            user_history_item['bci_day_change'] = user_history_item[update_bci_key] - user_history_item.get(former_bci_key, 0)
            user_history_item['bci_week_change'] = user_history_item[update_bci_key] - user_history_item.get('bci_week_ave', 0)
            user_history_item['bci_month_change'] = user_history_item[update_bci_key] - user_history_item.get('bci_month_ave', 0)
            user_history_item['bci_week_ave'], user_history_item['bci_week_var'], user_history_item['bci_week_sum'] = compute_week(user_history_item, now_ts)
            user_history_item['bci_month_ave'], user_history_item['bci_month_var'], user_history_item['bci_month_sum'] = compute_month(user_history_item, now_ts)
            if user_history_item[update_bci_key] < LOW_INFLUENCE_THRESHOULD:
                user_history_item['low_number'] += 1
            else:
                user_history_item['low_number'] = 0
        else:
            user_history_item = dict()
            user_history_item.update(add_info[uid])
            user_history_item["uid"] = uid
            user_history_item['bci_day_change'] = user_history_item[update_bci_key]
            user_history_item['bci_week_change'] = user_history_item[update_bci_key]
            user_history_item['bci_month_change'] = user_history_item[update_bci_key]
            user_history_item['bci_week_ave'], user_history_item['bci_week_var'], user_history_item['bci_week_sum'] = compute_week(user_history_item, now_ts)
            user_history_item['bci_month_ave'], user_history_item['bci_month_var'], user_history_item['bci_month_sum'] = compute_month(user_history_item, now_ts)
            if user_history_item[update_bci_key] < LOW_INFLUENCE_THRESHOULD:
                user_history_item['low_number'] = 1
        iter_count += 1
        try:
            user_history_item.pop(del_bci_key)
        except:
            pass
        action = {'index': {'_id': uid}}
        bulk_action.extend([action, user_history_item])
    if bulk_action:
        es_cluster.bulk(bulk_action, index=COPY_USER_PORTRAIT_INFLUENCE, doc_type=COPY_USER_PORTRAIT_INFLUENCE_TYPE, timeout=600)
    print iter_count
def es_km_storage(uid_list):
    es_results = es_user_portrait.mget(index=remote_portrait_name, doc_type=portrait_type, body={"ids": uid_list})["docs"]
    in_list = []
    out_list = []
    bulk_action = []
    for item in es_results:
        if item["found"]:
            in_list.append(item["_id"])
            bulk_action.append(item["_source"])
        else:
            out_list.append(item["_id"])
    if bulk_action:
        es_km.bulk(bulk_action, index=portrait_name, doc_type=portrait_type, timeout=60)
    return out_list
def filter_out(all_user_set):
    out_results = []
    all_user_list = list(all_user_set)
    all_count = len(all_user_set)
    out_count = 0
    iter_count = 0
    while out_count < RECOMMEND_IN_OUT_SIZE:
        iter_user_list = all_user_list[iter_count: iter_count + RECOMMEND_IN_ITER_COUNT]
        if iter_user_list == []:
            break
        # out of portrait
        try:
            in_portrait_result = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type, body={'ids': iter_user_list})['docs']
        except:
            in_portrait_result = []
        for in_item in in_portrait_result:
            if in_item['found'] == False:
                out_count += 1
                out_results.append(in_item['_id'])
        iter_count += RECOMMEND_IN_ITER_COUNT
    return out_results
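filter_out is the windowed-mget pattern: slice the candidate list into fixed-size chunks so each round trip stays bounded, and stop early once enough out-of-portrait users have accumulated. The slicing step in isolation, as a generic helper for illustration only:

def windowed(seq, size):
    # yield consecutive fixed-size slices of a list, mirroring the
    # iter_count / RECOMMEND_IN_ITER_COUNT arithmetic in filter_out
    for start in range(0, len(seq), size):
        yield seq[start:start + size]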
def co_search(add_info, update_bci_key, former_bci_key, now_ts):
    uid_list = add_info.keys()
    evaluate_history_results = es_user_portrait.mget(index=COPY_USER_PORTRAIT_SENSITIVE, doc_type=COPY_USER_PORTRAIT_SENSITIVE_TYPE, body={'ids': uid_list})['docs']
    iter_count = 0
    bulk_action = []
    for uid in uid_list:
        item = evaluate_history_results[iter_count]
        if item['found']:
            user_history_item = item['_source']
            # merge in the newly computed fields
            user_history_item.update(add_info[uid])
            user_history_item['sensitive_day_change'] = user_history_item[update_bci_key] - user_history_item.get(former_bci_key, 0)
            user_history_item['sensitive_week_change'] = user_history_item[update_bci_key] - user_history_item.get('sensitive_week_ave', 0)
            user_history_item['sensitive_month_change'] = user_history_item[update_bci_key] - user_history_item.get('sensitive_month_ave', 0)
            user_history_item['sensitive_week_ave'], user_history_item['sensitive_week_var'], user_history_item['sensitive_week_sum'] = compute_week(user_history_item, now_ts)
            user_history_item['sensitive_month_ave'], user_history_item['sensitive_month_var'], user_history_item['sensitive_month_sum'] = compute_month(user_history_item, now_ts)
        else:
            user_history_item = dict()
            user_history_item.update(add_info[uid])
            user_history_item["uid"] = uid
            user_history_item['sensitive_day_change'] = user_history_item[update_bci_key]
            user_history_item['sensitive_week_change'] = user_history_item[update_bci_key]
            user_history_item['sensitive_month_change'] = user_history_item[update_bci_key]
            user_history_item['sensitive_week_ave'], user_history_item['sensitive_week_var'], user_history_item['sensitive_week_sum'] = compute_week(user_history_item, now_ts)
            user_history_item['sensitive_month_ave'], user_history_item['sensitive_month_var'], user_history_item['sensitive_month_sum'] = compute_month(user_history_item, now_ts)
        iter_count += 1
        try:
            user_history_item.pop(del_bci_key)
        except:
            pass
        action = {'index': {'_id': uid}}
        bulk_action.extend([action, user_history_item])
    if bulk_action:
        es_cluster.bulk(bulk_action, index=COPY_USER_PORTRAIT_SENSITIVE, doc_type=COPY_USER_PORTRAIT_SENSITIVE_TYPE, timeout=600)
    print iter_count
def specific_keywords_burst_dection(task_detail):
    task_name = task_detail[0]
    social_sensors = task_detail[1]
    keywords_list = task_detail[2]
    sensitive_words = task_detail[3]
    stop_time = task_detail[4]
    forward_warning_status = task_detail[5]
    ts = int(task_detail[7])

    forward_result = get_forward_numerical_info(task_name, ts, keywords_list)
    # original weibo from the preceding time window
    forward_origin_weibo_list = query_mid_list(ts - time_interval, keywords_list, forward_time_range)
    # original weibo from the current window
    current_mid_list = query_mid_list(ts, keywords_list, time_interval)
    all_mid_list = []
    all_mid_list.extend(current_mid_list)
    all_mid_list.extend(forward_origin_weibo_list)
    print "all mid list: ", len(all_mid_list)
    # count retweets/comments of current and prior-12h originals within the current window, aggregated by message_type
    statistics_count = query_related_weibo(ts, all_mid_list, time_interval, keywords_list)
    current_total_count = statistics_count['total_count']  # total weibo in the current window
    print "current all weibo: ", statistics_count
    current_origin_count = statistics_count['origin']
    current_retweeted_count = statistics_count['retweeted']
    current_comment_count = statistics_count['comment']

    # Sensitive-weibo monitoring: given the sensors and the sensitive words, any sensor
    # weibo that mentions a sensitive word triggers a warning.
    # Aggregate the sentiment distribution over the current window
    # sentiment_dict = {"0": "neutral", "1": "positive", "2": "sad", "3": "anger"}
    sentiment_count = {"0": 0, "1": 0, "2": 0, "3": 0}
    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts - time_interval)
    if datetime != datetime_1:
        index_name = flow_text_index_name_pre + datetime_1
    else:
        index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    if exist_es:
        search_results = aggregation_sentiment_related_weibo(ts, all_mid_list, time_interval, keywords_list)
        sentiment_count = search_results
    print "sentiment_count: ", sentiment_count
    negetive_count = sentiment_count['2'] + sentiment_count['3']

    # aggregate the important users of the current window
    important_uid_list = []
    if exist_es:
        #search_results = es_text.search(index=index_name, doc_type=flow_text_index_type, body=aggregation_sensor_keywords(ts-time_interval, ts, [], "root_uid", size=IMPORTANT_USER_NUMBER))['aggregations']['all_keywords']['buckets']
        search_results = query_hot_weibo(ts, all_mid_list, time_interval, keywords_list, aggregation_field="root_uid", size=100)
        important_uid_list = search_results.keys()
        if datetime != datetime_1:
            index_name_1 = flow_text_index_name_pre + datetime_1
            if es_text.indices.exists(index_name_1):
                #search_results_1 = es_text.search(index=index_name_1, doc_type=flow_text_index_type, body=aggregation_sensor_keywords(ts-time_interval, ts, [], "root_uid", size=IMPORTANT_USER_NUMBER))['aggregations']['all_keywords']['buckets']
                search_results_1 = query_hot_weibo(ts, all_mid_list, time_interval, keywords_list, aggregation_field="root_uid", size=100)
                if search_results_1:
                    for item in search_results_1:
                        important_uid_list.append(item['key'])
    # with the uid list, match important users against the portrait index
    if important_uid_list:
        important_results = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type, body={"ids": important_uid_list})['docs']
    else:
        important_results = {}
    filter_important_list = []  # uid_list
    if important_results:
        for item in important_results:
            if item['found']:
                if item['_source']['importance'] > IMPORTANT_USER_THRESHOULD:
                    filter_important_list.append(item['_id'])
    print filter_important_list

    # 6. Sensitive-word detection: warn whenever a sensor weibo mentions one of the sensitive words
    sensitive_origin_weibo_number = 0
    sensitive_retweeted_weibo_number = 0
    sensitive_comment_weibo_number = 0
    sensitive_total_weibo_number = 0
    if sensitive_words:
        query_sensitive_body = {
            "query": {
                "filtered": {
                    "filter": {
                        "bool": {
                            "must": [
                                {"range": {"timestamp": {"gte": ts - time_interval, "lt": ts}}},
                                {"terms": {"keywords_string": sensitive_words}}
                            ]
                        }
                    }
                }
            },
            "aggs": {
                "all_list": {
                    "terms": {"field": "message_type"}
                }
            }
        }
        if social_sensors:
            query_sensitive_body['query']['filtered']['filter']['bool']['must'].append({"terms": {"uid": social_sensors}})
        sensitive_results = es_text.search(index=index_name, doc_type=flow_text_index_type, body=query_sensitive_body)['aggregations']['all_list']["buckets"]
        if sensitive_results:
            for item in sensitive_results:
                if int(item["key"]) == 1:
                    sensitive_origin_weibo_number = item['doc_count']
                elif int(item["key"]) == 2:
                    sensitive_comment_weibo_number = item['doc_count']
                elif int(item["key"]) == 3:
                    sensitive_retweeted_weibo_number = item["doc_count"]
                else:
                    pass
        sensitive_total_weibo_number = sensitive_origin_weibo_number + sensitive_comment_weibo_number + sensitive_retweeted_weibo_number

    burst_reason = signal_nothing_variation
    warning_status = signal_nothing
    finish = unfinish_signal  # "0"
    process_status = "1"

    if sensitive_total_weibo_number > WARNING_SENSITIVE_COUNT:  # abnormal number of sensitive weibo
        if forward_warning_status == signal_brust:  # an event is already known; switch to tracking
            warning_status = signal_track
        else:
            warning_status = signal_brust
        burst_reason = signal_sensitive_variation

    if forward_result[0]:
        # moving average decides whether an event has occurred
        mean_count = forward_result[1]
        std_count = forward_result[2]
        mean_sentiment = forward_result[3]
        std_sentiment = forward_result[4]
        if current_total_count > mean_count + 1.96 * std_count:  # anomaly detected
            if forward_warning_status == signal_brust:  # an event is already known; switch to tracking
                warning_status = signal_track
            else:
                warning_status = signal_brust
            burst_reason += signal_count_varition  # count anomaly
        if negetive_count > mean_sentiment + 1.96 * std_sentiment:
            warning_status = signal_brust
            burst_reason += signal_sentiment_varition  # negative-sentiment anomaly; "12" means both anomalies
            if forward_warning_status == signal_brust:  # an event is already known; switch to tracking
                warning_status = signal_track

    if int(stop_time) <= ts:  # check whether the task has finished
        finish = finish_signal
        process_status = "0"

    # 7. Perceived events, all_mid_list
    tmp_burst_reason = burst_reason
    topic_list = []
    # if sensitive weibo showed up, aggregate the sensitive weibo (replace); otherwise aggregate ordinary weibo
    if burst_reason:  # something happened
        text_list = []
        mid_set = set()
        if signal_sensitive_variation in burst_reason:
            query_sensitive_body = {
                "query": {
                    "filtered": {
                        "filter": {
                            "bool": {
                                "must": [
                                    {"range": {"timestamp": {"gte": ts - time_interval, "lt": ts}}},
                                    {"terms": {"keywords_string": sensitive_words}}
                                ]
                            }
                        }
                    }
                },
                "size": 10000
            }
            if social_sensors:
                query_sensitive_body['query']['filtered']['filter']['bool']['must'].append({"terms": {"uid": social_sensors}})
            sensitive_results = es_text.search(index=index_name, doc_type=flow_text_index_type, body=query_sensitive_body)['hits']['hits']
            if sensitive_results:
                for item in sensitive_results:
                    iter_mid = item['_source']['mid']
                    iter_text = item['_source']['text']
                    temp_dict = dict()
                    temp_dict["mid"] = iter_mid
                    temp_dict["text"] = iter_text
                    if iter_mid not in mid_set:
                        text_list.append(temp_dict)  # cleaned text: mid, text
                        mid_set.add(iter_mid)
            burst_reason = burst_reason.replace(signal_sensitive_variation, "")  # str.replace returns a new string
        current_origin_mid_list = query_mid_list(ts, keywords_list, time_interval, 1)
        print "current_origin_mid_list:", len(current_origin_mid_list)
        if burst_reason and current_mid_list:
            origin_sensing_text = es_text.mget(index=index_name, doc_type=flow_text_index_type, body={"ids": current_origin_mid_list}, fields=["mid", "text"])["docs"]
            if origin_sensing_text:
                for item in origin_sensing_text:
                    if item["found"]:
                        iter_mid = item["fields"]["mid"][0]
                        iter_text = item["fields"]["text"][0]
                        temp_dict = dict()
                        temp_dict["mid"] = iter_mid
                        temp_dict["text"] = iter_text
                        if iter_mid not in mid_set:
                            text_list.append(temp_dict)  # cleaned text: mid, text
                            mid_set.add(iter_mid)
        if len(text_list) == 1:
            top_word = freq_word(text_list[0])
            topic_list = [top_word.keys()]
        elif len(text_list) == 0:
            topic_list = []
            tmp_burst_reason = ""  # no related weibo; reset
        else:
            feature_words, input_word_dict = tfidf(text_list)  # build feature words and input data
            word_label, evaluation_results = kmeans(feature_words, text_list)  # clustering
            inputs = text_classify(text_list, word_label, feature_words)
            clustering_topic = cluster_evaluation(inputs)
            sorted_dict = sorted(clustering_topic.items(), key=lambda x: x[1], reverse=True)
            topic_list = []
            if sorted_dict:
                for item in sorted_dict:
                    topic_list.append(word_label[item[0]])
            print "topic_list, ", topic_list
    if not topic_list:
        warning_status = signal_nothing
        tmp_burst_reason = signal_nothing_variation

    results = dict()
    results['origin_weibo_number'] = current_origin_count
    results['retweeted_weibo_number'] = current_retweeted_count
    results['comment_weibo_number'] = current_comment_count
    results['weibo_total_number'] = current_total_count
    results['sensitive_origin_weibo_number'] = sensitive_origin_weibo_number
    results['sensitive_retweeted_weibo_number'] = sensitive_retweeted_weibo_number
    results['sensitive_comment_weibo_number'] = sensitive_comment_weibo_number
    results['sensitive_weibo_total_number'] = sensitive_total_weibo_number
    results['sentiment_distribution'] = json.dumps(sentiment_count)
    results['important_users'] = json.dumps(filter_important_list)
    results['burst_reason'] = tmp_burst_reason
    results['timestamp'] = ts
    if tmp_burst_reason:
        results['clustering_topic'] = json.dumps(topic_list)
    # store the current window's info in ES
    doctype = task_name
    es_user_portrait.index(index=index_sensing_task, doc_type=doctype, id=ts, body=results)

    # update the manage-social-sensing record in ES
    temporal_result = es_user_portrait.get(index=index_manage_social_task, doc_type=task_doc_type, id=task_name)['_source']
    temporal_result['warning_status'] = warning_status
    temporal_result['burst_reason'] = tmp_burst_reason
    temporal_result['finish'] = finish
    temporal_result['processing_status'] = process_status
    history_status = json.loads(temporal_result['history_status'])
    history_status.append([ts, ' '.join(keywords_list), warning_status])
    temporal_result['history_status'] = json.dumps(history_status)
    es_user_portrait.index(index=index_manage_social_task, doc_type=task_doc_type, id=task_name, body=temporal_result)
    return "1"
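Both anomaly checks above gate on mean + 1.96 * std over the trailing windows returned by get_forward_numerical_info, i.e. roughly the upper edge of a 95% band if the counts were normally distributed. A self-contained numeric illustration with invented values:

import numpy as np

history = [120, 135, 128, 140, 131]  # invented trailing-window weibo totals
mean_count = np.mean(history)
std_count = np.std(history)
current_total_count = 190
if current_total_count > mean_count + 1.96 * std_count:
    print "burst: current window falls outside the ~95% band"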
def social_sensing(task_detail):
    # task name, sensors, stop time, creator, timestamp
    task_name = task_detail[0]
    social_sensors = task_detail[1]
    stop_time = task_detail[2]
    create_by = task_detail[3]
    ts = int(task_detail[4])
    print ts2date(ts)

    # PART 1
    #forward_result = get_forward_numerical_info(task_name, ts, create_by)
    # original/retweeted mids from the preceding time window
    forward_origin_weibo_list = query_mid_list(ts - time_interval, social_sensors, forward_time_range)
    forward_retweeted_weibo_list = query_mid_list(ts - time_interval, social_sensors, forward_time_range, 3)
    # original mids from the current window
    current_mid_list = query_mid_list(ts, social_sensors, time_interval)
    current_retweeted_mid_list = query_mid_list(ts, social_sensors, time_interval, 3)
    all_mid_list = []
    all_mid_list.extend(current_mid_list)
    all_mid_list.extend(current_retweeted_mid_list)
    all_mid_list.extend(forward_origin_weibo_list)
    all_mid_list.extend(forward_retweeted_weibo_list)
    all_origin_list = []
    all_origin_list.extend(current_mid_list)
    all_origin_list.extend(forward_origin_weibo_list)
    all_retweeted_list = []
    all_retweeted_list.extend(current_retweeted_mid_list)
    all_retweeted_list.extend(forward_retweeted_weibo_list)  # mids/root-mids of retweeted weibo
    print "all mid list: ", len(all_mid_list)
    #print "all_origin_list", all_origin_list
    #print "all_retweeted_list", all_retweeted_list

    # count retweets/comments within the current window, aggregated by message_type
    statistics_count = query_related_weibo(ts, all_mid_list, time_interval)
    if all_origin_list:
        origin_weibo_detail = query_hot_weibo(ts, all_origin_list, time_interval)  # details of original weibo
    else:
        origin_weibo_detail = {}
    if all_retweeted_list:
        retweeted_weibo_detail = query_hot_weibo(ts, all_retweeted_list, time_interval)  # details of retweeted weibo
    else:
        retweeted_weibo_detail = {}
    current_total_count = statistics_count['total_count']  # total weibo in the current window
    current_retweeted_count = statistics_count['retweeted']
    current_comment_count = statistics_count['comment']

    # aggregate the important users of the current window
    important_uid_list = []
    datetime = ts2datetime(ts - time_interval)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    if exist_es:
        search_results = get_important_user(ts, all_mid_list, time_interval)
        important_uid_list = search_results
    # with the uid list, match important users against the portrait index
    if important_uid_list:
        important_results = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type, body={"ids": important_uid_list})['docs']
    else:
        important_results = []
    filter_important_list = []  # uid_list
    if important_results:
        for item in important_results:
            if item['found']:
                #if item['_source']['importance'] > IMPORTANT_USER_THRESHOULD:
                filter_important_list.append(item['_id'])
    print "filter_important_list", filter_important_list
    print "important_results", important_uid_list

    # sensing decision
    finish = unfinish_signal  # "0"
    process_status = "1"
    if int(stop_time) <= ts:  # check whether the task has finished
        finish = finish_signal
        process_status = "0"

    # perceived events, all_mid_list
    sensitive_text_list = []
    # start once an event occurs
    if 1:
        index_list = []
        important_words = []
        datetime_1 = ts2datetime(ts)
        index_name_1 = flow_text_index_name_pre + datetime_1
        exist_es = es_text.indices.exists(index=index_name_1)
        if exist_es:
            index_list.append(index_name_1)
        datetime_2 = ts2datetime(ts - DAY)
        index_name_2 = flow_text_index_name_pre + datetime_2
        exist_es = es_text.indices.exists(index=index_name_2)
        if exist_es:
            index_list.append(index_name_2)
        search_results = []
        if index_list and all_mid_list:
            query_body = {
                "query": {
                    "filtered": {
                        "filter": {
                            "terms": {"mid": all_mid_list}
                        }
                    }
                },
                "size": 5000
            }
            search_results = es_text.search(index=index_list, doc_type="text", body=query_body)['hits']['hits']
        tmp_sensitive_warning = ""
        text_dict = dict()            # text info
        mid_value = dict()            # per-mid topic value
        duplicate_dict = dict()       # duplicate map
        portrait_dict = dict()        # background info
        classify_text_dict = dict()   # texts to classify
        classify_uid_list = []
        duplicate_text_list = []
        sensitive_words_dict = dict()
        if search_results:
            for item in search_results:
                iter_uid = item['_source']['uid']
                iter_mid = item['_source']['mid']
                iter_text = item['_source']['text'].encode('utf-8', 'ignore')
                iter_sensitive = item['_source'].get('sensitive', 0)
                duplicate_text_list.append({"_id": iter_mid, "title": "", "content": iter_text})
                if iter_sensitive:
                    tmp_sensitive_warning = signal_sensitive_variation  # weibo that hit sensitive words
                    sensitive_words_dict[iter_mid] = iter_sensitive
                keywords_dict = json.loads(item['_source']['keywords_dict'])
                personal_keywords_dict = dict()
                for k, v in keywords_dict.iteritems():
                    k = k.encode('utf-8', 'ignore')
                    personal_keywords_dict[k] = v
                classify_text_dict[iter_mid] = personal_keywords_dict
                classify_uid_list.append(iter_uid)
        # deduplicate
        if duplicate_text_list:
            dup_results = duplicate(duplicate_text_list)
            for item in dup_results:
                if item['duplicate']:
                    duplicate_dict[item['_id']] = item['same_from']
        # classify
        if classify_text_dict:
            classify_results = topic_classfiy(classify_uid_list, classify_text_dict)
            mid_value = dict()
            #print "classify_results: ", classify_results
            for k, v in classify_results.iteritems():  # mid: value
                mid_value[k] = topic_value_dict[v[0]]

    sensitive_weibo_detail = {}
    if sensitive_words_dict:
        sensitive_mid_list = sensitive_words_dict.keys()
        sensitive_weibo_detail = query_hot_weibo(ts, sensitive_mid_list, time_interval)

    results = dict()
    results['mid_topic_value'] = json.dumps(mid_value)
    results['duplicate_dict'] = json.dumps(duplicate_dict)
    results['sensitive_words_dict'] = json.dumps(sensitive_words_dict)
    results['sensitive_weibo_detail'] = json.dumps(sensitive_weibo_detail)
    results['origin_weibo_number'] = len(all_origin_list)
    results['retweeted_weibo_number'] = len(all_retweeted_list)
    results['origin_weibo_detail'] = json.dumps(origin_weibo_detail)
    results['retweeted_weibo_detail'] = json.dumps(retweeted_weibo_detail)
    results['retweeted_weibo_count'] = current_retweeted_count
    results['comment_weibo_count'] = current_comment_count
    results['weibo_total_number'] = current_total_count
    results['important_users'] = json.dumps(filter_important_list)
    results['unfilter_users'] = json.dumps(important_uid_list)
    results['timestamp'] = ts
    #results['clustering_topic'] = json.dumps(topic_list)
    # store the current window's info in ES
    doctype = create_by + '-' + task_name
    es_user_portrait.index(index=index_sensing_task, doc_type=doctype, id=ts, body=results)

    # update the manage-social-sensing record in ES
    temporal_result = es_user_portrait.get(index=index_manage_social_task, doc_type=task_doc_type, id=doctype)['_source']
    temporal_result['finish'] = finish
    temporal_result['processing_status'] = process_status
    history_status = json.loads(temporal_result['history_status'])
    history_status.append(ts)
    temporal_result['history_status'] = json.dumps(history_status)
    es_user_portrait.index(index=index_manage_social_task, doc_type=task_doc_type, id=doctype, body=temporal_result)
    return "1"
def scan_index_history():
    s_re = scan(es_user_portrait, query={'query': {'match_all': {}}, 'size': 1000}, index=portrait_index_name, doc_type=portrait_index_type)
    bulk_action = []
    add_info = {}
    count = 0
    start_ts = time.time()
    now_date = ts2datetime(start_ts - DAY)
    now_date = '2013-09-06'
    #now_date_string = ''.join(now_date.split('-'))
    now_date_string = now_date
    activeness_key = 'activeness_' + now_date_string
    #influence_key = now_date_string
    influence_key = now_date_string
    importance_key = "importance_" + now_date_string
    del_date = ts2datetime(time.time() - DAY * 31)
    #del_date_string = ''.join(del_date.split('-'))
    del_date_string = del_date
    del_activeness_key = 'activeness_' + del_date_string
    #del_influence_key = del_date_string
    del_influence_key = del_date_string
    del_importance_key = "importance_" + del_date_string
    # get max value for importance and activeness
    max_activeness = get_max_index('activeness')
    max_influence = get_max_index('influence')
    max_importance = get_max_index('importance')
    while True:
        try:
            scan_re = s_re.next()['_source']
            count += 1
            uid = scan_re['uid']
            activeness_key = 'activeness_' + now_date_string
            influence_key = now_date_string
            importance_key = "importance_" + now_date_string
            # save to normal activeness and normal influence
            activeness_value = scan_re['activeness']
            influence_value = scan_re['influence']
            importance_value = scan_re['importance']
            normal_activeness = normal_index(activeness_value, max_activeness)
            normal_influence = normal_index(influence_value, max_influence)
            normal_importance = normal_index(importance_value, max_importance)
            add_info[uid] = {activeness_key: normal_activeness, influence_key: normal_influence, importance_key: normal_importance}
            if count % 1000 == 0:
                uid_list = add_info.keys()
                evaluate_history_results = es_user_portrait.mget(index=copy_portrait_index_name, doc_type=copy_portrait_index_type, body={'ids': uid_list})['docs']
                '''
                del_date = ts2datetime(time.time() - DAY*31)
                del_date_string = ''.join(s)
                del_activeness_key = 'activeness_'+del_date
                del_influence_key = del_date
                '''
                iter_count = 0
                for uid in uid_list:
                    try:
                        user_history_item = evaluate_history_results[iter_count]['_source']
                    except:
                        user_history_item = {}
                    try:
                        user_history_item.pop(del_activeness_key)
                        user_history_item.pop(del_influence_key)
                        user_history_item.pop(del_importance_key)
                    except:
                        pass
                    new_user_item = dict(user_history_item, **add_info[uid])
                    # yuankun-20151229
                    if add_info[uid][influence_key] < LOW_INFLUENCE_THRESHOULD:  # update activity status; user may be moved out of the warehouse
                        try:
                            new_user_item["low_number"] += 1
                        except:
                            new_user_item["low_number"] = 1
                    else:
                        new_user_item["low_number"] = 0
                    aver_activeness, aver_influence, aver_importance = average_value(new_user_item)
                    new_user_item['aver_activeness'] = aver_activeness
                    new_user_item['aver_influence'] = aver_influence
                    new_user_item['aver_importance'] = aver_importance
                    #print 'add_info:', add_info[uid]
                    #print 'user_history_item:', user_history_item
                    #print 'new_user_item:', new_user_item
                    action = {'index': {'_id': uid}}
                    #print 'action:', action
                    bulk_action.extend([action, new_user_item])
                    iter_count += 1
                es_user_portrait.bulk(bulk_action, index=copy_portrait_index_name, doc_type=copy_portrait_index_type)
                bulk_action = []
                add_info = {}
                iter_count = 0
                end_ts = time.time()
                print '%s sec count 1000' % (end_ts - start_ts)
        except StopIteration:
            print 'all done'
            if len(add_info) != 0:
                uid_list = add_info.keys()
                evaluate_history_results = es_user_portrait.mget(index=copy_portrait_index_name, doc_type=copy_portrait_index_type, body={'ids': uid_list})['docs']
                '''
                del_date = ts2datetime(time.time() - DAY*31)
                del_activeness_key = 'activeness_'+del_date
                del_influence_key = del_date
                '''
                iter_count = 0
                for uid in uid_list:
                    try:
                        user_history_item = evaluate_history_results[iter_count]['_source']
                    except:
                        user_history_item = {}
                    try:
                        user_history_item.pop(del_activeness_key)
                        user_history_item.pop(del_influence_key)
                        user_history_item.pop(del_importance_key)
                    except:
                        pass
                    new_user_item = dict(user_history_item, **add_info[uid])
                    if add_info[uid][influence_key] < LOW_INFLUENCE_THRESHOULD:
                        try:
                            new_user_item["low_number"] += 1
                        except:
                            new_user_item["low_number"] = 1
                    else:
                        new_user_item["low_number"] = 0
                    aver_activeness, aver_influence, aver_importance = average_value(new_user_item)
                    new_user_item['aver_activeness'] = aver_activeness
                    new_user_item['aver_influence'] = aver_influence
                    new_user_item['aver_importance'] = aver_importance
                    action = {'index': {'_id': uid}}
                    bulk_action.extend([action, new_user_item])
                    iter_count += 1
                es_user_portrait.bulk(bulk_action, index=copy_portrait_index_name, doc_type=copy_portrait_index_type)
                bulk_action = []
                add_info = {}
                iter_count = 0
            break
        except Exception as e:
            raise e
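scan_index_history keeps one key per day inside each history document (activeness_<date>, <date>, importance_<date>) and pops the keys from 31 days ago, so every document carries a sliding month of values. The key arithmetic in isolation, assuming dates format as YYYY-MM-DD the way the snippet's hard-coded '2013-09-06' does:

import time

DAY = 3600 * 24

def history_keys(now_ts):
    # keys to add for yesterday and keys to drop from 31 days ago
    now_date = time.strftime('%Y-%m-%d', time.localtime(now_ts - DAY))
    del_date = time.strftime('%Y-%m-%d', time.localtime(now_ts - DAY * 31))
    add_keys = ('activeness_' + now_date, now_date, 'importance_' + now_date)
    del_keys = ('activeness_' + del_date, del_date, 'importance_' + del_date)
    return add_keys, del_keys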
def key_words_search(task_id, search_type, pre, during, start_time, keyword_list, search_key='', sort_norm='', sort_scope='', time=7, isall=False, number=100):
    number = int(number)
    should = []
    for key in keyword_list:
        if search_type == "hashtag":
            should.append({"prefix": {"text": "#" + key + "#"}})
        else:
            should.append({"wildcard": {"text": "*" + key + "*"}})
    index_list = []
    date = ts2datetime(start_time)
    index_name = pre + date
    while during:
        if es_flow_text.indices.exists(index=index_name):
            index_list.append(index_name)
        start_time = start_time + DAY
        date = ts2datetime(start_time)
        index_name = pre + date
        during -= 1
    print index_list
    uid_set = set()
    text_results = []
    sorted_text_results = []
    query_body = {
        "query": {
            "bool": {
                "must": should
            }
        },
        "sort": {"user_fansnum": {"order": "desc"}},
        "size": 5000
    }
    results = es_flow_text.search(index=index_list, doc_type='text', body=query_body, _source=False, fields=["uid", "user_fansnum", "text", "message_type", "sentiment", "timestamp", "geo", "retweeted", "comment"])["hits"]["hits"]
    id_index = 0
    index_list = []
    un_uid_list = []
    for item in results:
        if item['fields']['uid'][0] not in uid_set:
            uid_set.add(item['fields']['uid'][0])
            un_uid_list.append(item['fields']['uid'][0])
            index_list.append(id_index)
        id_index += 1
    #get_all_filed(sort_norm, time)
    uid_list = []
    print "un_uid_list: ", len(un_uid_list)
    portrait_list = []
    count = 0
    in_index = 0
    if not isall and un_uid_list:  # in-warehouse users only
        portrait_results = es_user_portrait.mget(index=USER_INDEX_NAME, doc_type=USER_INDEX_TYPE, body={"ids": un_uid_list}, _source=False, fields=['uname'])["docs"]
        for item in portrait_results:
            if item["found"]:
                portrait_list.append(item['_id'])
                nick_name = item['fields']['uname'][0]
                if nick_name == 'unknown':
                    nick_name = item['_id']
                index = index_list[in_index]
                weibo_url = weiboinfo2url(results[index]['fields']['uid'][0], results[index]['_id'])
                text_results.append([results[index]['fields']['uid'][0], results[index]['fields']['user_fansnum'][0], results[index]['fields']['text'][0], results[index]['fields']['message_type'][0], results[index]['fields']['sentiment'][0], ts2date(results[index]['fields']['timestamp'][0]), results[index]['fields']['geo'][0], results[index]['fields']['retweeted'][0], results[index]['fields']['comment'][0], nick_name, weibo_url])
                count += 1
                if count == number:
                    break
                print "portrait_len, ", len(portrait_list)
            in_index += 1
        if portrait_list:
            uid_list = in_sort_filter(time, sort_norm, sort_scope, None, portrait_list, True, number)  # sort
            for iter_uid in uid_list:
                iter_index = portrait_list.index(iter_uid)
                sorted_text_results.append(text_results[iter_index])
    elif un_uid_list:
        profile_result = es_user_profile.mget(index="weibo_user", doc_type="user", body={"ids": un_uid_list}, fields=['nick_name'])["docs"]
        for i in range(len(profile_result)):
            index = index_list[i]
            try:
                nick_name = profile_result[i]['fields']['nick_name'][0]
            except:
                nick_name = un_uid_list[i]
            item = results[index]
            weibo_url = weiboinfo2url(item['fields']['uid'][0], results[index]['_id'])
            text_results.append([item['fields']['uid'][0], item['fields']['user_fansnum'][0], item['fields']['text'][0], item['fields']['message_type'][0], item['fields']['sentiment'][0], ts2date(item['fields']['timestamp'][0]), results[index]['fields']['geo'][0], results[index]['fields']['retweeted'][0], results[index]['fields']['comment'][0], nick_name, weibo_url])
            if i == number:
                break
        uid_list = all_sort_filter(un_uid_list[:number], sort_norm, time, True, number)
        sorted_text_results = []
        f = open("small.txt", "wb")
        for iter_uid in uid_list:
            iter_index = un_uid_list.index(iter_uid)
            f.write(str(iter_uid) + "\n")
            sorted_text_results.append(text_results[iter_index])
        f.close()
    print "filter_uid_list: ", len(uid_list)
    if uid_list:
        results = make_up_user_info(uid_list, isall, time, sort_norm)
    else:
        results = []
    print "results: ", len(results)
    # update task status
    task_detail = es_user_portrait.get(index=USER_RANK_KEYWORD_TASK_INDEX, doc_type=USER_RANK_KEYWORD_TASK_TYPE, id=task_id)
    item = task_detail['_source']
    item['status'] = 1
    item['result'] = json.dumps(results)
    item['text_results'] = json.dumps(sorted_text_results)
    item['number'] = len(results)
    es_user_portrait.index(index=USER_RANK_KEYWORD_TASK_INDEX, doc_type=USER_RANK_KEYWORD_TASK_TYPE, id=task_id, body=item)
    return "1"
def social_sensing(task_detail):
    # task detail: task name, sensor list, stop time, previous warning status, creator, timestamp, new flag
    task_name = task_detail[0]
    social_sensors = task_detail[1]
    stop_time = task_detail[2]
    forward_warning_status = task_detail[3]
    create_by = task_detail[4]
    ts = int(task_detail[5])
    new = int(task_detail[6])
    print ts2date(ts)

    # PART 1
    forward_result = get_forward_numerical_info(task_name, ts, create_by)
    # original/retweeted weibo mid lists from the previous time range
    forward_origin_weibo_list = query_mid_list(ts - time_interval, social_sensors, forward_time_range)
    forward_retweeted_weibo_list = query_mid_list(ts - time_interval, social_sensors, forward_time_range, 3)
    # original weibo mid list from the current interval
    current_mid_list = query_mid_list(ts, social_sensors, time_interval)
    current_retweeted_mid_list = query_mid_list(ts, social_sensors, time_interval, 3)
    all_mid_list = []
    all_mid_list.extend(current_mid_list)
    all_mid_list.extend(current_retweeted_mid_list)
    all_mid_list.extend(forward_origin_weibo_list)
    all_mid_list.extend(forward_retweeted_weibo_list)
    all_origin_list = []
    all_origin_list.extend(current_mid_list)
    all_origin_list.extend(forward_origin_weibo_list)
    all_retweeted_list = []
    all_retweeted_list.extend(current_retweeted_mid_list)
    all_retweeted_list.extend(forward_retweeted_weibo_list)  # mid/root-mid of retweeted weibo
    print "all mid list: ", len(all_mid_list)
    #print "all_origin_list", all_origin_list
    #print "all_retweeted_list", all_retweeted_list

    # count retweets/comments of these weibo in the current interval, aggregated by message_type
    statistics_count = query_related_weibo(ts, all_mid_list, time_interval)
    if all_origin_list:
        origin_weibo_detail = query_hot_weibo(ts, all_origin_list, time_interval)  # original weibo detail
    else:
        origin_weibo_detail = {}
    if all_retweeted_list:
        retweeted_weibo_detail = query_hot_weibo(ts, all_retweeted_list, time_interval)  # retweeted weibo detail
    else:
        retweeted_weibo_detail = {}
    current_total_count = statistics_count['total_count']  # total weibo count in the current interval
    current_retweeted_count = statistics_count['retweeted']
    current_comment_count = statistics_count['comment']

    # PART 2
    # aggregate the sentiment distribution (neutral/positive/sad/anger) in the current interval
    # sentiment_dict = {"0": "neutral", "1": "positive", "2": "sad", "3": "anger"}
    sentiment_count = {"0": 0, "1": 0, "2": 0, "3": 0}
    search_results = aggregation_sentiment_related_weibo(ts, all_mid_list, time_interval)
    sentiment_count = search_results
    print "sentiment_count: ", sentiment_count
    negetive_key = ["2", "3", "4", "5", "6"]
    negetive_count = 0
    for key in negetive_key:
        negetive_count += sentiment_count.get(key, 0)  # keys beyond 0-3 may be absent

    # aggregate the important users of the current interval
    important_uid_list = []
    datetime = ts2datetime(ts - time_interval)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    if exist_es:
        search_results = get_important_user(ts, all_mid_list, time_interval)
        important_uid_list = search_results
    # match the uid list against the portrait library
    if important_uid_list:
        important_results = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type,
                                                  body={"ids": important_uid_list})['docs']
    else:
        important_results = []
    filter_important_list = []  # uid_list
    if important_results:
        for item in important_results:
            if item['found']:
                #if item['_source']['importance'] > IMPORTANT_USER_THRESHOULD:
                filter_important_list.append(item['_id'])

    # sensing decision
    burst_reason = signal_nothing_variation
    warning_status = signal_nothing
    finish = unfinish_signal  # "0"
    process_status = "1"
    if forward_result[0]:
        # use the moving average to decide whether an event happened
        mean_count = forward_result[1]
        std_count = forward_result[2]
        mean_sentiment = forward_result[3]
        std_sentiment = forward_result[4]
        if mean_count >= MEAN_COUNT and current_total_count > mean_count + 1.96 * std_count or current_total_count >= len(all_mid_list) * AVERAGE_COUNT:
            # anomaly detected
            if forward_warning_status == signal_brust:
                # an event already exists: switch to tracking
                warning_status = signal_track
            else:
                warning_status = signal_brust
            burst_reason += signal_count_varition  # count anomaly
        if negetive_count > mean_sentiment + 1.96 * std_sentiment and mean_sentiment >= MEAN_COUNT or negetive_count >= len(all_mid_list) * AVERAGE_COUNT:
            warning_status = signal_brust
            burst_reason += signal_sentiment_varition  # negative-sentiment anomaly; "12" means both are anomalous
            if forward_warning_status == signal_brust:
                # an event already exists: switch to tracking
                warning_status = signal_track

    if int(stop_time) <= ts:
        # check whether the task has finished
        finish = finish_signal
        process_status = "0"

    # what was sensed: all_mid_list
    tmp_burst_reason = burst_reason
    topic_list = []
    sensitive_text_list = []
    tmp_sensitive_warning = ""
    sensitive_words_dict = dict()
    sensitive_weibo_detail = {}

    # start once an event happens
    #if warning_status:
    if 1:
        index_list = []
        important_words = []
        datetime_1 = ts2datetime(ts)
        index_name_1 = flow_text_index_name_pre + datetime_1
        exist_es = es_text.indices.exists(index=index_name_1)
        if exist_es:
            index_list.append(index_name_1)
        datetime_2 = ts2datetime(ts - DAY)
        index_name_2 = flow_text_index_name_pre + datetime_2
        exist_es = es_text.indices.exists(index=index_name_2)
        if exist_es:
            index_list.append(index_name_2)
        if index_list and all_mid_list:
            query_body = {
                "query": {
                    "filtered": {
                        "filter": {
                            "terms": {"mid": all_mid_list}
                        }
                    }
                },
                "size": 2000
            }
            search_results = es_text.search(index=index_list, doc_type="text", body=query_body)['hits']['hits']
            text_list = []
            if search_results:
                for item in search_results:
                    iter_mid = item['_source']['mid']
                    iter_text = item['_source']['text']
                    iter_sensitive = item['_source'].get('sensitive', 0)
                    if iter_sensitive:
                        tmp_sensitive_warning = signal_sensitive_variation  # weibo containing sensitive words
                        sensitive_words_dict[iter_mid] = iter_sensitive
                    temp_dict = dict()
                    temp_dict["mid"] = iter_mid
                    temp_dict["text"] = iter_text
                    text_list.append(temp_dict)
            if tmp_sensitive_warning:
                warning_status = signal_brust
                burst_reason += signal_sensitive_variation
            if sensitive_words_dict:
                sensitive_mid_list = sensitive_words_dict.keys()
                sensitive_weibo_detail = query_hot_weibo(ts, sensitive_mid_list, time_interval)
            """
            if len(text_list) == 1:
                top_word = freq_word(text_list[0])
                topic_list = [top_word.keys()]
            elif len(text_list) == 0:
                topic_list = []
                tmp_burst_reason = ""  # no related weibo: reset
                print "no relate weibo text"
            else:
                feature_words, input_word_dict = tfidf(text_list)  # generate feature words and input data
                word_label, evaluation_results = kmeans(feature_words, text_list)  # clustering
                inputs = text_classify(text_list, word_label, feature_words)
                clustering_topic = cluster_evaluation(inputs)
                print "clustering weibo topic"
                sorted_dict = sorted(clustering_topic.items(), key=lambda x: x[1], reverse=True)
                topic_list = []
                if sorted_dict:
                    for item in sorted_dict:
                        if item[0] != "other":
                            topic_list.append(word_label[item[0]])
                print "topic list: ", len(topic_list)
            """

    results = dict()
    if sensitive_weibo_detail:
        print "sensitive_weibo_detail: ", sensitive_weibo_detail
    results['sensitive_words_dict'] = json.dumps(sensitive_words_dict)
    results['sensitive_weibo_detail'] = json.dumps(sensitive_weibo_detail)
    results['origin_weibo_number'] = len(all_origin_list)
    results['retweeted_weibo_number'] = len(all_retweeted_list)
    results['origin_weibo_detail'] = json.dumps(origin_weibo_detail)
    results['retweeted_weibo_detail'] = json.dumps(retweeted_weibo_detail)
    results['retweeted_weibo_count'] = current_retweeted_count
    results['comment_weibo_count'] = current_comment_count
    results['weibo_total_number'] = current_total_count
    results['sentiment_distribution'] = json.dumps(sentiment_count)
    results['important_users'] = json.dumps(filter_important_list)
    results['unfilter_users'] = json.dumps(important_uid_list)
    results['burst_reason'] = tmp_burst_reason
    results['timestamp'] = ts
    #results['clustering_topic'] = json.dumps(topic_list)
    # store the current interval's information in es
    doctype = create_by + '-' + task_name
    es_user_portrait.index(index=index_sensing_task, doc_type=doctype, id=ts, body=results)

    # update the manage-social-sensing es record
    if not new:
        temporal_result = es_user_portrait.get(index=index_manage_social_task, doc_type=task_doc_type, id=doctype)['_source']
        temporal_result['warning_status'] = warning_status
        temporal_result['burst_reason'] = tmp_burst_reason
        temporal_result['finish'] = finish
        temporal_result['processing_status'] = process_status
        history_status = json.loads(temporal_result['history_status'])
        history_status.append([ts, task_name, warning_status])
        temporal_result['history_status'] = json.dumps(history_status)
        es_user_portrait.index(index=index_manage_social_task, doc_type=task_doc_type, id=doctype, body=temporal_result)
    else:
        print "test"
    return "1"
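# --- sketch: the moving-average burst test used in social_sensing above ---
# The anomaly condition compares the current interval's weibo count against the
# moving average returned by get_forward_numerical_info: burst when the count
# exceeds mean + 1.96*std (the 95% bound of a normal distribution) or an
# absolute floor. A minimal self-contained sketch; the helper name and the two
# default thresholds are illustrative stand-ins for the MEAN_COUNT and
# AVERAGE_COUNT config constants, not values from the source.
def is_burst_sketch(current_count, mean_count, std_count, mid_number,
                    mean_floor=100, average_count=10):
    # only trust the moving average once it is large enough to be meaningful
    over_confidence_bound = (mean_count >= mean_floor and
                             current_count > mean_count + 1.96 * std_count)
    # absolute-volume fallback, independent of the moving average
    over_absolute_floor = current_count >= mid_number * average_count
    return over_confidence_bound or over_absolute_floor

#print is_burst_sketch(500, 120, 30, 20)   # True: 500 > 120 + 1.96*30 = 178.8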
def get_scan_results():
    result_dict = {}
    gender_result = {'1': 0, '2': 0}
    verified_result = {'yes': 0, 'no': 0}
    location_result = {}
    activity_geo_result = {}
    keywords_result = {}
    hashtag_result = {}
    topic_result = {}
    online_pattern_result = {}
    domain_result = {}
    no_gender_count = 0
    no_verified_count = 0
    no_location_count = 0
    no_activity_geo_count = 0
    no_keywords_count = 0
    no_hashtag_count = 0
    no_topic_count = 0
    no_online_pattern_count = 0
    no_domain_count = 0
    s_re = scan(es, query={'query': {'match_all': {}}, 'size': 100}, index=index_name, doc_type=index_type)
    print 's_re:', s_re
    activity_count = 0
    portrait_uid_list = []
    while True:
        try:
            scan_re = s_re.next()['_source']
            portrait_uid_list.append(scan_re['uid'])
            #print 'portrait_uid_list:', len(portrait_uid_list)
            # gender ratio count
            try:
                gender_result[str(scan_re['gender'])] += 1
            except:
                no_gender_count += 1
            # verified ratio count
            try:
                verified_result[str(scan_re['verified'])] += 1
            except:
                no_verified_count += 1
            # location top
            try:
                location = scan_re['location']
                if len(location.split(' ')) > 1:
                    location = location.split(' ')[0]
                try:
                    location_result[location] += 1
                except:
                    location_result[location] = 1
            except:
                no_location_count += 1
            # activity geo
            try:
                activity_geo = scan_re['activity_geo_dict']
                if scan_re:
                    activity_geo_dict = json.loads(activity_geo)
                    for geo in activity_geo_dict:
                        geo_list = geo.split('\t')
                        if geo_list[0] == u'中国' and len(geo_list) >= 2:
                            province = geo_list[1]
                            try:
                                activity_geo_result[province] += activity_geo_dict[geo]
                            except:
                                activity_geo_result[province] = activity_geo_dict[geo]
            except:
                no_activity_geo_count += 1
            # keywords
            try:
                keywords = json.loads(scan_re['keywords'])
                if keywords:
                    for word in keywords:
                        try:
                            keywords_result[word] += keywords[word]
                        except:
                            keywords_result[word] = keywords[word]
            except:
                no_keywords_count += 1
            # hashtag top
            try:
                hashtag_dict = json.loads(scan_re['hashtag_dict'])
                if hashtag_dict:
                    for tag in hashtag_dict:
                        try:
                            hashtag_result[tag] += hashtag_dict[tag]
                        except:
                            hashtag_result[tag] = hashtag_dict[tag]
            except:
                no_hashtag_count += 1
            # topic top
            try:
                topic = json.loads(scan_re['topic'])
                if topic:
                    for item in topic:
                        try:
                            topic_result[item] += 1
                        except:
                            topic_result[item] = 1
            except:
                no_topic_count += 1
            # online pattern top
            try:
                online_pattern = json.loads(scan_re['online_pattern'])
                if online_pattern:
                    for item in online_pattern:
                        try:
                            online_pattern_result[item] += online_pattern[item]
                        except:
                            online_pattern_result[item] = online_pattern[item]
            except:
                no_online_pattern_count += 1
            # domain top
            try:
                domain = scan_re['domain']
                if domain:
                    domain_list = domain.split('_')
                    for item in domain_list:
                        try:
                            domain_result[item] += 1
                        except:
                            domain_result[item] = 1
            except:
                no_domain_count += 1
        except StopIteration:
            print 'all done'
            # gender ratio count
            count = sum(gender_result.values())
            gender_ratio = {'1': float(gender_result['1']) / count, '2': float(gender_result['2']) / count}
            #print 'gender ratio:', gender_ratio
            activity_result = es.mget(index='20130907', doc_type='bci', body={'ids': portrait_uid_list})['docs']
            for activity_item in activity_result:
                if activity_item['found']:
                    activity_count += 1
            #print 'activity_count:', activity_count
            result_dict['activity_count'] = float(activity_count) / count
            result_dict['gender_ratio'] = json.dumps(gender_ratio)
            # verified ratio count
            count = sum(verified_result.values())
            if count == 0:
                verified_ratio = {'yes': 0.5, 'no': 0.5}
            else:
                verified_ratio = {'yes': float(verified_result['yes']) / count, 'no': float(verified_result['no']) / count}
            #print 'verified ratio:', verified_ratio
            result_dict['verified_ratio'] = json.dumps(verified_ratio)
            # location top
            if location_result:
                sort_location = sorted(location_result.items(), key=lambda x: x[1], reverse=True)
                location_top = sort_location[:5]
            else:
                location_top = {}
            #print 'location top:', location_top
            result_dict['location_top'] = json.dumps(location_top)
            # activity geo top
            if activity_geo_result:
                sort_activity_geo = sorted(activity_geo_result.items(), key=lambda x: x[1], reverse=True)
                activity_geo_top = sort_activity_geo[:50]
            else:
                activity_geo_top = {}
            #print 'activity_geo_top:', activity_geo_top
            result_dict['activity_geo_top'] = json.dumps(activity_geo_top)
            # keywords top
            if keywords_result:
                sort_keywords = sorted(keywords_result.items(), key=lambda x: x[1], reverse=True)
                keywords_top = sort_keywords[:50]
            else:
                keywords_top = {}
            #print 'keywords_top:', keywords_top
            result_dict['keywords_top'] = json.dumps(keywords_top)
            # hashtag top
            if hashtag_result:
                sort_hashtag = sorted(hashtag_result.items(), key=lambda x: x[1], reverse=True)
                hashtag_top = sort_hashtag[:50]
            else:
                hashtag_top = {}
            #print 'hashtag top:', hashtag_top
            result_dict['hashtag_top'] = json.dumps(hashtag_top)
            # topic top
            if topic_result:
                sort_topic = sorted(topic_result.items(), key=lambda x: x[1], reverse=True)
                topic_top = sort_topic[:50]
            else:
                topic_top = {}
            #print 'topic top:', topic_top
            result_dict['topic_top'] = json.dumps(topic_top)
            # online_pattern top
            if online_pattern_result:
                sort_online_pattern = sorted(online_pattern_result.items(), key=lambda x: x[1], reverse=True)
                online_pattern_top = sort_online_pattern[:50]
            else:
                online_pattern_top = {}
            #print 'online pattern top:', online_pattern_top
            result_dict['online_pattern_top'] = json.dumps(online_pattern_top)
            # domain top
            if domain_result:
                sort_domain = sorted(domain_result.items(), key=lambda x: x[1], reverse=True)
                domain_top = sort_domain[:20]
                #test: domain_top = [('education',50), ('art', 40), ('lawyer', 30), ('student', 20), ('media', 10), ('oversea',1)]
            else:
                domain_top = {}
            #print 'domain top:', domain_top
            result_dict['domain_top'] = json.dumps(domain_top)
            #test: need to add domain top user
            domain_top = [[u'媒体', 1], [u'法律人士', 1], [u'政府机构人士', 1], [u'活跃人士', 1], [u'媒体人士', 1], [u'商业人士', 1], \
                          [u'高校微博', 1], [u'境内机构', 1], [u'境外机构', 1], [u'民间组织', 1], [u'草根', 1], [u'其他', 1]]
            result_dict['domain_top_user'] = json.dumps(get_domain_top_user(domain_top))
            #test: need to add topic top user
            topic_top = [[u'军事', 1], [u'政治', 1], [u'体育', 1], [u'计算机', 1], [u'民生', 1], [u'生活', 1], \
                         [u'娱乐', 1], [u'健康', 1], [u'交通', 1], [u'经济', 1], [u'教育', 1], [u'自然', 1]]
            result_dict['topic_top_user'] = json.dumps(get_topic_top_user(topic_top))
            return result_dict
        except Exception, r:
            print Exception, r
            return result_dict
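# --- sketch: the top-N pattern behind every *_top field above ---
# get_scan_results accumulates {key: count} dicts while scanning, then sorts
# each one by count and slices the head. The idiom in isolation (the helper
# name is illustrative, not from the source):
def top_n_sketch(counter_dict, n):
    # sort a {key: count} dict by count, descending, and keep the first n pairs
    return sorted(counter_dict.items(), key=lambda x: x[1], reverse=True)[:n]

#print top_n_sketch({'beijing': 30, 'shanghai': 12, 'xian': 7}, 2)
# -> [('beijing', 30), ('shanghai', 12)]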
def get_structure_user(seed_uid_list, structure_dict, filter_dict):
    structure_user_dict = {}
    retweet_mark = int(structure_dict['retweet'])
    comment_mark = int(structure_dict['comment'])
    hop = int(structure_dict['hop'])
    retweet_user_dict = {}
    comment_user_dict = {}
    #get retweet/comment es db_number
    now_ts = time.time()
    db_number = get_db_num(now_ts)
    #iterate to expand the seed uids' retweet/be_retweet/comment/be_comment user set, hop by hop
    iter_hop_user_list = seed_uid_list
    iter_count = 0
    all_union_result = dict()
    while iter_count < hop:  # hop number control
        iter_count += 1
        search_user_count = len(iter_hop_user_list)
        hop_union_result = dict()
        iter_search_count = 0
        while iter_search_count < search_user_count:
            iter_search_user_list = iter_hop_user_list[iter_search_count: iter_search_count + DETECT_ITER_COUNT]
            #step1: mget retweet and be_retweet
            if retweet_mark == 1:
                retweet_index_name = retweet_index_name_pre + str(db_number)
                be_retweet_index_name = be_retweet_index_name_pre + str(db_number)
                #mget retweet
                try:
                    retweet_result = es_retweet.mget(index=retweet_index_name, doc_type=retweet_index_type, \
                            body={'ids': iter_search_user_list}, _source=True)['docs']
                except:
                    retweet_result = []
                #mget be_retweet
                try:
                    be_retweet_result = es_retweet.mget(index=be_retweet_index_name, doc_type=be_retweet_type, \
                            body={'ids': iter_search_user_list}, _source=True)['docs']
                except:
                    be_retweet_result = []
            #step2: mget comment and be_comment
            if comment_mark == 1:
                comment_index_name = comment_index_name_pre + str(db_number)
                be_comment_index_name = be_comment_index_name_pre + str(db_number)
                #mget comment
                try:
                    comment_result = es_comment.mget(index=comment_index_name, doc_type=comment_index_type, \
                            body={'ids': iter_search_user_list}, _source=True)['docs']
                except:
                    comment_result = []
                #mget be_comment
                try:
                    be_comment_result = es_comment.mget(index=be_comment_index_name, doc_type=be_comment_index_type, \
                            body={'ids': iter_search_user_list}, _source=True)['docs']
                except:
                    be_comment_result = []
            #step3: union retweet/be_retweet/comment/be_comment result
            union_count = 0
            for iter_search_uid in iter_search_user_list:
                try:
                    uid_retweet_dict = json.loads(retweet_result[union_count]['_source']['uid_retweet'])
                except:
                    uid_retweet_dict = {}
                try:
                    uid_be_retweet_dict = json.loads(be_retweet_result[union_count]['_source']['uid_be_retweet'])
                except:
                    uid_be_retweet_dict = {}
                try:
                    uid_comment_dict = json.loads(comment_result[union_count]['_source']['uid_comment'])
                except:
                    uid_comment_dict = {}
                try:
                    uid_be_comment_dict = json.loads(be_comment_result[union_count]['_source']['uid_be_comment'])
                except:
                    uid_be_comment_dict = {}
                #union the four types of user sets
                union_result = union_dict(uid_retweet_dict, uid_be_retweet_dict, uid_comment_dict, uid_be_comment_dict)
                hop_union_result = union_dict(hop_union_result, union_result)
                union_count += 1  # advance in step with the mget result lists
            #step4: add iter search count
            iter_search_count += DETECT_ITER_COUNT
        #pop the seed uids themselves
        for iter_hop_user_item in iter_hop_user_list:
            try:
                hop_union_result.pop(iter_hop_user_item)
            except:
                pass
        #get new iter_hop_user_list
        iter_hop_user_list = hop_union_result.keys()
        #get all union result
        all_union_result = union_dict(all_union_result, hop_union_result)
    #step5: identify who is in user_portrait
    sort_all_union_result = sorted(all_union_result.items(), key=lambda x: x[1], reverse=True)
    iter_count = 0
    all_count = len(sort_all_union_result)
    in_portrait_result = []
    filter_importance_from = filter_dict['importance']['gte']
    filter_importance_to = filter_dict['importance']['lt']
    filter_influence_from = filter_dict['influence']['gte']
    filter_influence_to = filter_dict['influence']['lt']
    while iter_count < all_count:
        iter_user_list = [item[0] for item in sort_all_union_result[iter_count: iter_count + DETECT_ITER_COUNT]]
        try:
            portrait_result = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type, \
                    body={'ids': iter_user_list}, _source=True)['docs']
        except:
            portrait_result = []
        for portrait_item in portrait_result:
            if portrait_item['found'] == True:
                if filter_importance_from <= portrait_item['_source']['importance'] <= filter_importance_to:
                    if filter_influence_from <= portrait_item['_source']['influence'] <= filter_influence_to:
                        uid = portrait_item['_id']
                        in_portrait_result.append(uid)
        if len(in_portrait_result) > (filter_dict['count'] * DETECT_COUNT_EXPAND):
            break
        iter_count += DETECT_ITER_COUNT
    return in_portrait_result
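# --- sketch: the DETECT_ITER_COUNT batching used in get_structure_user ---
# Both the hop expansion and the portrait filtering above page through uid
# lists in fixed-size slices so each mget stays small. The slicing idiom in
# isolation; the helper name and the default batch size of 100 are assumptions
# mirroring (not taken from) the config constant.
def iter_batches_sketch(id_list, batch_size=100):
    # yield consecutive slices of id_list; the last slice may be shorter
    for start in range(0, len(id_list), batch_size):
        yield id_list[start:start + batch_size]

#for batch in iter_batches_sketch([str(i) for i in range(250)]):
#    print len(batch)   # 100, 100, 50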
def get_attr_social(uid_list):
    #test
    '''
    uid_list = ['1514608170', '2729648295', '3288875501', '1660612723', '1785934112',\
                '2397686502', '1748065927', '2699434042', '1886419032', '1830325932']
    '''
    result = {}
    union_dict = {}
    union_edge_count = 0
    union_weibo_count = 0
    union_user_set = set()
    group_user_set = set(uid_list)
    be_retweeted_out = 0
    be_retweeted_count_out = 0
    retweet_relation = []
    out_beretweet_relation = []
    for uid in uid_list:
        in_stat_results = dict()
        out_stat_results = dict()
        for db_num in r_dict:
            r_db = r_dict[db_num]
            ruid_results = r_db.hgetall('retweet_' + str(uid))
            #print 'len ruid_result:', len(ruid_results)
            if ruid_results:
                for ruid in ruid_results:
                    try:
                        in_stat_results[ruid] += ruid_results[ruid]
                    except:
                        in_stat_results[ruid] = ruid_results[ruid]
            br_uid_results = r_db.hgetall('be_retweet_' + str(uid))
            #print 'len br_uid_results:', len(br_uid_results)
            if br_uid_results:
                for br_uid in br_uid_results:
                    try:
                        out_stat_results[br_uid] += br_uid_results[br_uid]
                    except:
                        out_stat_results[br_uid] = br_uid_results[br_uid]
        # record the retweet relations inside the group
        uid_retweet_relation = [[uid, user, int(in_stat_results[user])] for user in in_stat_results if user in uid_list and user != uid]
        retweet_relation.extend(uid_retweet_relation)
        # record the be_retweet relations outside the group but inside user_portrait
        uid_beretweet_relation = []
        uid_beretweet = [user for user in out_stat_results if user not in uid_list]
        es_portrait_result = es.mget(index='user_portrait', doc_type='user', body={'ids': uid_beretweet})['docs']
        for be_retweet_item in es_portrait_result:
            br_uid = be_retweet_item['_id']
            beretweet_count = int(out_stat_results[br_uid])
            try:
                be_retweet_source = be_retweet_item['_source']
                if be_retweet_source['influence'] >= 900:
                    uid_beretweet_relation.append([uid, br_uid, be_retweet_source['uname'], beretweet_count, be_retweet_source['influence']])
            except:
                continue
        out_beretweet_relation.extend(uid_beretweet_relation)
        retweet_user_set = set(in_stat_results.keys())
        union_set = retweet_user_set & (group_user_set - set([uid]))
        union_edge_count += len(union_set)  # count the retweet edge number
        if union_set:
            for ruid in union_set:
                union_weibo_count += int(in_stat_results[ruid])
            union_user_set = union_user_set | union_set
        # count the be_retweeted by users outside the group
        be_retweeted_user_set = set(out_stat_results.keys())
        subtract_set = be_retweeted_user_set - set(uid_list)
        be_retweeted_out += len(subtract_set)
        be_retweeted_count_out_list = [int(out_stat_results[br_uid]) for br_uid in subtract_set]
        #print 'be_retweeted_count_out_list:', be_retweeted_count_out_list
        be_retweeted_count_out += sum(be_retweeted_count_out_list)
    result['density'] = float(union_edge_count) / (len(uid_list) * (len(uid_list) - 1))
    result['retweet_weibo_count'] = float(union_weibo_count) / len(uid_list)
    result['retweet_user_count'] = float(len(union_user_set)) / len(uid_list)
    result['be_retweeted_count_out'] = be_retweeted_count_out
    result['be_retweeted_out'] = be_retweeted_out
    if retweet_relation != []:
        sort_retweet_relation = sorted(retweet_relation, key=lambda x: x[2], reverse=True)
    else:
        sort_retweet_relation = []
    result['retweet_relation'] = json.dumps(sort_retweet_relation)
    if out_beretweet_relation != []:
        sort_out_beretweet_relation = sorted(out_beretweet_relation, key=lambda x: x[4], reverse=True)
    else:
        sort_out_beretweet_relation = []
    result['out_beretweet_relation'] = json.dumps(sort_out_beretweet_relation)
    #print 'be_retweeted_out, be_retweeted_count_out:', be_retweeted_out, be_retweeted_count_out
    #print 'result:', result
    #print 'out_beretweet_relation:', sort_out_beretweet_relation
    return result
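# --- sketch: the density formula used in get_attr_social ---
# result['density'] above is ordinary directed-graph density: observed retweet
# edges inside the group divided by the n*(n-1) possible ordered pairs among
# n users. A worked check with illustrative numbers:
def group_density_sketch(edge_count, n):
    # directed density: edges present / edges possible between n distinct users
    return float(edge_count) / (n * (n - 1))

#print group_density_sketch(18, 10)   # 18 edges among 10 users -> 0.2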
def social_sensing(task_detail):
    # task detail: task name, sensor list, stop time, previous warning status, creator, timestamp, new flag
    task_name = task_detail[0]
    social_sensors = task_detail[1]
    stop_time = task_detail[2]
    forward_warning_status = task_detail[3]
    create_by = task_detail[4]
    ts = int(task_detail[5])
    new = int(task_detail[6])
    print ts2date(ts)

    # PART 1
    forward_result = get_forward_numerical_info(task_name, ts, create_by)
    # original/retweeted weibo mid lists from the previous time range
    forward_origin_weibo_list = query_mid_list(ts - time_interval, social_sensors, forward_time_range)
    forward_retweeted_weibo_list = query_mid_list(ts - time_interval, social_sensors, forward_time_range, 3)
    # original weibo mid list from the current interval
    current_mid_list = query_mid_list(ts, social_sensors, time_interval)
    current_retweeted_mid_list = query_mid_list(ts, social_sensors, time_interval, 3)
    all_mid_list = []
    all_mid_list.extend(current_mid_list)
    all_mid_list.extend(current_retweeted_mid_list)
    all_mid_list.extend(forward_origin_weibo_list)
    all_mid_list.extend(forward_retweeted_weibo_list)
    all_origin_list = []
    all_origin_list.extend(current_mid_list)
    all_origin_list.extend(forward_origin_weibo_list)
    all_retweeted_list = []
    all_retweeted_list.extend(current_retweeted_mid_list)
    all_retweeted_list.extend(forward_retweeted_weibo_list)  # mid/root-mid of retweeted weibo
    print "all mid list: ", len(all_mid_list)
    #print "all_origin_list", all_origin_list
    #print "all_retweeted_list", all_retweeted_list

    # count retweets/comments of these weibo in the current interval, aggregated by message_type
    statistics_count = query_related_weibo(ts, all_mid_list, time_interval)
    if all_origin_list:
        origin_weibo_detail = query_hot_weibo(ts, all_origin_list, time_interval)  # original weibo detail
    else:
        origin_weibo_detail = {}
    if all_retweeted_list:
        retweeted_weibo_detail = query_hot_weibo(ts, all_retweeted_list, time_interval)  # retweeted weibo detail
    else:
        retweeted_weibo_detail = {}
    current_total_count = statistics_count['total_count']  # total weibo count in the current interval
    current_retweeted_count = statistics_count['retweeted']
    current_comment_count = statistics_count['comment']

    # PART 2
    # aggregate the sentiment distribution (neutral/positive/sad/anger) in the current interval
    # sentiment_dict = {"0": "neutral", "1": "positive", "2": "sad", "3": "anger"}
    sentiment_count = {"0": 0, "1": 0, "2": 0, "3": 0}
    search_results = aggregation_sentiment_related_weibo(ts, all_mid_list, time_interval)
    sentiment_count = search_results
    print "sentiment_count: ", sentiment_count
    negetive_key = ["2", "3", "4", "5", "6"]
    negetive_count = 0
    for key in negetive_key:
        negetive_count += sentiment_count.get(key, 0)  # keys beyond 0-3 may be absent

    # aggregate the important users of the current interval
    important_uid_list = []
    datetime = ts2datetime(ts - time_interval)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    if exist_es:
        search_results = get_important_user(ts, all_mid_list, time_interval)
        important_uid_list = search_results
    # match the uid list against the portrait library
    if important_uid_list:
        important_results = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type,
                                                  body={"ids": important_uid_list})['docs']
    else:
        important_results = []
    filter_important_list = []  # uid_list
    if important_results:
        for item in important_results:
            if item['found']:
                #if item['_source']['importance'] > IMPORTANT_USER_THRESHOULD:
                filter_important_list.append(item['_id'])

    # sensing decision
    burst_reason = signal_nothing_variation
    warning_status = signal_nothing
    finish = unfinish_signal  # "0"
    process_status = "1"
    if forward_result[0]:
        # use the moving average to decide whether an event happened
        mean_count = forward_result[1]
        std_count = forward_result[2]
        mean_sentiment = forward_result[3]
        std_sentiment = forward_result[4]
        if mean_count >= MEAN_COUNT and current_total_count > mean_count + 1.96 * std_count or current_total_count >= len(all_mid_list) * AVERAGE_COUNT:
            # anomaly detected
            if forward_warning_status == signal_brust:
                # an event already exists: switch to tracking
                warning_status = signal_track
            else:
                warning_status = signal_brust
            burst_reason += signal_count_varition  # count anomaly
        if negetive_count > mean_sentiment + 1.96 * std_sentiment and mean_sentiment >= MEAN_COUNT or negetive_count >= len(all_mid_list) * AVERAGE_COUNT:
            warning_status = signal_brust
            burst_reason += signal_sentiment_varition  # negative-sentiment anomaly; "12" means both are anomalous
            if forward_warning_status == signal_brust:
                # an event already exists: switch to tracking
                warning_status = signal_track

    if int(stop_time) <= ts:
        # check whether the task has finished
        finish = finish_signal
        process_status = "0"

    # what was sensed: all_mid_list
    tmp_burst_reason = burst_reason
    topic_list = []
    sensitive_text_list = []
    tmp_sensitive_warning = ""
    mid_value = dict()        # mid -> topic value
    duplicate_dict = dict()   # duplicate map
    sensitive_words_dict = dict()
    sensitive_weibo_detail = {}

    # start once an event happens
    #if warning_status:
    if 1:
        index_list = []
        important_words = []
        datetime_1 = ts2datetime(ts)
        index_name_1 = flow_text_index_name_pre + datetime_1
        exist_es = es_text.indices.exists(index=index_name_1)
        if exist_es:
            index_list.append(index_name_1)
        datetime_2 = ts2datetime(ts - DAY)
        index_name_2 = flow_text_index_name_pre + datetime_2
        exist_es = es_text.indices.exists(index=index_name_2)
        if exist_es:
            index_list.append(index_name_2)
        if index_list and all_mid_list:
            query_body = {
                "query": {
                    "filtered": {
                        "filter": {
                            "terms": {"mid": all_mid_list}
                        }
                    }
                },
                "size": 5000
            }
            search_results = es_text.search(index=index_list, doc_type="text", body=query_body)['hits']['hits']
            text_dict = dict()           # text info
            portrait_dict = dict()       # background info
            classify_text_dict = dict()  # texts for classification
            classify_uid_list = []
            duplicate_text_list = []
            if search_results:
                for item in search_results:
                    iter_uid = item['_source']['uid']
                    iter_mid = item['_source']['mid']
                    iter_text = item['_source']['text'].encode('utf-8', 'ignore')
                    iter_sensitive = item['_source'].get('sensitive', 0)
                    duplicate_text_list.append({"_id": iter_mid, "title": "", "content": iter_text})
                    if iter_sensitive:
                        tmp_sensitive_warning = signal_sensitive_variation  # weibo containing sensitive words
                        sensitive_words_dict[iter_mid] = iter_sensitive
                    keywords_dict = json.loads(item['_source']['keywords_dict'])
                    personal_keywords_dict = dict()
                    for k, v in keywords_dict.iteritems():
                        k = k.encode('utf-8', 'ignore')
                        personal_keywords_dict[k] = v
                    classify_text_dict[iter_mid] = personal_keywords_dict
                    classify_uid_list.append(iter_uid)
                # deduplicate
                if duplicate_text_list:
                    dup_results = duplicate(duplicate_text_list)
                    for item in dup_results:
                        if item['duplicate']:
                            duplicate_dict[item['_id']] = item['same_from']
                # classify
                if classify_text_dict:
                    classify_results = topic_classfiy(classify_uid_list, classify_text_dict)
                    mid_value = dict()
                    #print "classify_results: ", classify_results
                    for k, v in classify_results.iteritems():  # mid: value
                        mid_value[k] = topic_value_dict[v[0]]
            if tmp_sensitive_warning:
                warning_status = signal_brust
                burst_reason += signal_sensitive_variation
            if sensitive_words_dict:
                sensitive_mid_list = sensitive_words_dict.keys()
                sensitive_weibo_detail = query_hot_weibo(ts, sensitive_mid_list, time_interval)

    results = dict()
    results['mid_topic_value'] = json.dumps(mid_value)
    results['duplicate_dict'] = json.dumps(duplicate_dict)
    results['sensitive_words_dict'] = json.dumps(sensitive_words_dict)
    results['sensitive_weibo_detail'] = json.dumps(sensitive_weibo_detail)
    results['origin_weibo_number'] = len(all_origin_list)
    results['retweeted_weibo_number'] = len(all_retweeted_list)
    results['origin_weibo_detail'] = json.dumps(origin_weibo_detail)
    results['retweeted_weibo_detail'] = json.dumps(retweeted_weibo_detail)
    results['retweeted_weibo_count'] = current_retweeted_count
    results['comment_weibo_count'] = current_comment_count
    results['weibo_total_number'] = current_total_count
    results['sentiment_distribution'] = json.dumps(sentiment_count)
    results['important_users'] = json.dumps(filter_important_list)
    results['unfilter_users'] = json.dumps(important_uid_list)
    results['burst_reason'] = tmp_burst_reason
    results['timestamp'] = ts
    #results['clustering_topic'] = json.dumps(topic_list)
    # store the current interval's information in es
    doctype = create_by + '-' + task_name
    es_user_portrait.index(index=index_sensing_task, doc_type=doctype, id=ts, body=results)

    # update the manage-social-sensing es record
    if not new:
        temporal_result = es_user_portrait.get(index=index_manage_social_task, doc_type=task_doc_type, id=doctype)['_source']
        temporal_result['warning_status'] = warning_status
        temporal_result['burst_reason'] = tmp_burst_reason
        temporal_result['finish'] = finish
        temporal_result['processing_status'] = process_status
        history_status = json.loads(temporal_result['history_status'])
        history_status.append([ts, task_name, warning_status])
        temporal_result['history_status'] = json.dumps(history_status)
        es_user_portrait.index(index=index_manage_social_task, doc_type=task_doc_type, id=doctype, body=temporal_result)
    else:
        print "test"
    return "1"
def get_seed_user_attribute(seed_user_list, attribute_list):
    results = {}
    attribute_query_list = []
    #step1: mget user result from user_portrait
    try:
        seed_user_portrait = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type, \
                body={'ids': seed_user_list}, _source=True)['docs']
    except:
        seed_user_portrait = []
    #init results dict---result={'location':{}, 'domain':{}, ...}
    for attribute_item in attribute_list:
        results[attribute_item] = {}
    #step2: compute attribute result about attribute_list
    for seed_user_item in seed_user_portrait:
        uid = seed_user_item['_id']
        if seed_user_item['found'] == True:
            source = seed_user_item['_source']
            #count the attributes
            #step2.1: location
            if 'location' in attribute_list:
                location_value = source['location']
                try:
                    results['location'][location_value] += 1
                except:
                    results['location'][location_value] = 1
            #step2.2: domain
            if 'domain' in attribute_list:
                domain_value = source['domain']
                try:
                    results['domain'][domain_value] += 1
                except:
                    results['domain'][domain_value] = 1
            #step2.3: topic_string
            if 'topic_string' in attribute_list:
                topic_value_string = source['topic_string']
                topic_value_list = topic_value_string.split('&')
                for topic_item in topic_value_list:
                    try:
                        results['topic_string'][topic_item] += 1
                    except:
                        results['topic_string'][topic_item] = 1
            #step2.4: keywords_string
            if 'keywords_string' in attribute_list:
                keywords_value_string = source['keywords_string']
                keywords_value_list = keywords_value_string.split('&')
                for keywords_item in keywords_value_list:
                    try:
                        results['keywords_string'][keywords_item] += 1
                    except:
                        results['keywords_string'][keywords_item] = 1
            #step2.5: hashtag
            if 'hashtag' in attribute_list:
                hashtag_value_string = source['hashtag']
                hashtag_value_list = hashtag_value_string.split('&')
                for hashtag_item in hashtag_value_list:
                    try:
                        results['hashtag'][hashtag_item] += 1
                    except:
                        results['hashtag'][hashtag_item] = 1
            #step2.6: activity_geo
            if 'activity_geo' in attribute_list:
                activity_geo_dict = json.loads(source['activity_geo_dict'])[-1]
                for activity_geo_item in activity_geo_dict:
                    try:
                        results['activity_geo'][activity_geo_item] += 1
                    except:
                        results['activity_geo'][activity_geo_item] = 1
            #step2.7: tendency
            #step2.8: tag
            #step2.9: remark
    #step3: get search attribute value -- new attribute query condition
    new_attribute_query_condition = []
    for item in results:
        iter_dict = results[item]
        sort_item_dict = sorted(iter_dict.items(), key=lambda x: x[1], reverse=True)
        nest_body_list = []
        for query_item in sort_item_dict[:3]:
            item_value = query_item[0]
            nest_body_list.append({'wildcard': {item: '*' + item_value + '*'}})
        new_attribute_query_condition.append({'bool': {'should': nest_body_list}})
    return new_attribute_query_condition
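# --- sketch: the wildcard bool/should query built in step3 above ---
# Each attribute contributes one bool query whose should-list holds a wildcard
# clause per dominant value, so a candidate matches if it shares any of the
# top values. The field and values below are placeholders, not from the source.
def wildcard_should_sketch(field, values):
    # one bool-should clause per attribute: match any of the top values
    return {'bool': {'should': [{'wildcard': {field: '*' + value + '*'}}
                                for value in values]}}

#print json.dumps(wildcard_should_sketch('domain', ['media', 'lawyer']))
# -> {"bool": {"should": [{"wildcard": {"domain": "*media*"}},
#                         {"wildcard": {"domain": "*lawyer*"}}]}}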
def get_tweets_distribute(xnr_user_no):
    topic_distribute_dict = {}
    topic_distribute_dict['radar'] = {}
    uid = xnr_user_no2uid(xnr_user_no)
    if xnr_user_no:
        es_results = es.get(index=weibo_xnr_fans_followers_index_name, doc_type=weibo_xnr_fans_followers_index_type, \
                id=xnr_user_no)["_source"]
        followers_list = es_results['followers_list']
    if S_TYPE == 'test':
        uid = PORTRAI_UID
        followers_list = PORTRAIT_UID_LIST
    # topic distribution of the followers
    results = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type, \
            body={'ids': followers_list})['docs']
    topic_list_followers = []
    for result in results:
        if result['found'] == True:
            result = result['_source']
            topic_string_first = result['topic_string'].split('&')
            topic_list_followers.extend(topic_string_first)
    topic_list_followers_count = Counter(topic_list_followers)
    #topic_distribute_dict['topic_follower'] = topic_list_followers_count
    # topic distribution of the xnr (virtual user)
    try:
        xnr_results = es_user_portrait.get(index=portrait_index_name, doc_type=portrait_index_type, \
                id=uid)['_source']
        topic_string = xnr_results['topic_string'].split('&')
        topic_xnr_count = Counter(topic_string)
        #topic_distribute_dict['topic_xnr'] = topic_xnr_count
    except:
        topic_xnr_count = {}
        #topic_distribute_dict['topic_xnr'] = topic_xnr_count
    # assemble the radar-chart data
    # if topic_xnr_count:
    #     for topic, value in topic_xnr_count.iteritems():
    #         try:
    #             topic_value = float(value)/(topic_list_followers_count[topic])
    #         except:
    #             continue
    #         topic_distribute_dict['radar'][topic] = topic_value
    if topic_xnr_count:
        for topic, value in topic_list_followers_count.iteritems():
            try:
                topic_value = float(topic_xnr_count[topic]) / value
            except:
                continue
            topic_distribute_dict['radar'][topic] = topic_value
    # assemble the dashboard data
    mark = 0
    if topic_xnr_count:
        n_topic = len(topic_list_followers_count.keys())
        for topic, value in topic_xnr_count.iteritems():
            try:
                mark += float(value) / (topic_list_followers_count[topic] * n_topic)
                print topic
                print mark
            except:
                continue
    topic_distribute_dict['mark'] = mark
    return topic_distribute_dict
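# --- sketch: the dashboard mark computed in get_tweets_distribute ---
# mark sums, over the xnr's topics, xnr_count / (follower_count * n_topic),
# i.e. the mean coverage ratio across the followers' n_topic topics; topics
# missing from the followers' distribution are skipped, as in the try/except
# above. The helper name and the example dicts are illustrative assumptions.
def coverage_mark_sketch(xnr_count, followers_count):
    n_topic = len(followers_count)
    mark = 0.0
    for topic, value in xnr_count.iteritems():
        if topic in followers_count:  # skip topics the followers never show
            mark += float(value) / (followers_count[topic] * n_topic)
    return mark

#print coverage_mark_sketch({'sports': 2}, {'sports': 4, 'politics': 4})
# -> 0.25, i.e. 2/(4*2)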
def get_follow_group_distribute(xnr_user_no):
    domain_distribute_dict = {}
    domain_distribute_dict['radar'] = {}
    if S_TYPE == 'test':
        followers_list = PORTRAIT_UID_LIST
        followers_list_today = FOLLOWERS_TODAY
    else:
        # get all followers
        es_results = es.get(index=weibo_xnr_fans_followers_index_name, doc_type=weibo_xnr_fans_followers_index_type, \
                id=xnr_user_no)["_source"]
        followers_list = es_results['followers_list']
        # get today's followers
        current_time = int(time.time() - DAY)
        current_date = ts2datetime(current_time)
        r_uid_list_datetime_index_name = r_followers_uid_list_datetime_pre + current_date
        followers_results = r_fans_followers.hget(r_uid_list_datetime_index_name, xnr_user_no)
        followers_list_today = json.loads(followers_results)
    # domain distribution of all followers
    results = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type, \
            body={'ids': followers_list})['docs']
    domain_list_followers = []
    for result in results:
        if result['found'] == True:
            result = result['_source']
            domain_name = result['domain']
            domain_list_followers.append(domain_name)
    domain_list_followers_count = Counter(domain_list_followers)
    #domain_distribute_dict['domain_follower'] = domain_list_followers_count
    # today's followers
    try:
        today_results = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type, \
                body={'ids': followers_list_today})['docs']
        domain_list_followers_today = []
        for result in today_results:
            if result['found'] == True:
                result = result['_source']
                domain_name = result['domain']
                domain_list_followers_today.append(domain_name)
        domain_list_followers_today_count = Counter(domain_list_followers_today)
    except:
        domain_list_followers_today_count = {}
    # assemble the radar-chart data
    # if domain_list_followers_today_count:
    #     for domain, value in domain_list_followers_today_count.iteritems():
    #         try:
    #             domain_value = float(value)/(domain_list_followers_count[domain])
    #         except:
    #             continue
    #         domain_distribute_dict['radar'][domain] = domain_value
    if domain_list_followers_today_count:
        for domain, value in domain_list_followers_today_count.iteritems():
            try:
                # today's count over the全体 followers' count for the same domain
                domain_value = float(value) / domain_list_followers_count[domain]
            except:
                continue
            domain_distribute_dict['radar'][domain] = domain_value
    # assemble the dashboard data
    mark = 0
    print 'domain_list_followers_today_count::', domain_list_followers_today_count
    print 'domain_distribute_dict::', domain_distribute_dict
    if domain_list_followers_today_count:
        n_domain = len(domain_list_followers_count.keys())
        for domain, value in domain_list_followers_today_count.iteritems():
            try:
                mark += float(value) / (domain_list_followers_count[domain] * n_domain)
            except:
                continue
    domain_distribute_dict['mark'] = mark
    return domain_distribute_dict
def scan_index_history():
    s_re = scan(es_user_portrait, query={'query': {'match_all': {}}, 'size': 1000},
                index=portrait_index_name, doc_type=portrait_index_type)
    bulk_action = []
    add_info = {}
    count = 0
    start_ts = time.time()
    now_date = ts2datetime(start_ts - DAY)
    now_date = '2013-09-06'
    #now_date_string = ''.join(now_date.split('-'))
    now_date_string = now_date
    activeness_key = 'activeness_' + now_date_string
    #influence_key = now_date_string
    influence_key = now_date_string
    importance_key = "importance_" + now_date_string
    del_date = ts2datetime(time.time() - DAY * 31)
    #del_date_string = ''.join(del_date.split('-'))
    del_date_string = del_date
    del_activeness_key = 'activeness_' + del_date_string
    #del_influence_key = del_date_string
    del_influence_key = del_date_string
    del_importance_key = "importance_" + del_date_string
    #get max value for importance and activeness
    max_activeness = get_max_index('activeness')
    max_influence = get_max_index('influence')
    max_importance = get_max_index('importance')
    while True:
        try:
            scan_re = s_re.next()['_source']
            count += 1
            uid = scan_re['uid']
            activeness_key = 'activeness_' + now_date_string
            influence_key = now_date_string
            importance_key = "importance_" + now_date_string
            #save the normalized activeness/influence/importance
            activeness_value = scan_re['activeness']
            influence_value = scan_re['influence']
            importance_value = scan_re['importance']
            normal_activeness = normal_index(activeness_value, max_activeness)
            normal_influence = normal_index(influence_value, max_influence)
            normal_importance = normal_index(importance_value, max_importance)
            add_info[uid] = {
                activeness_key: normal_activeness,
                influence_key: normal_influence,
                importance_key: normal_importance
            }
            if count % 1000 == 0:
                uid_list = add_info.keys()
                evaluate_history_results = es_user_portrait.mget(
                    index=copy_portrait_index_name,
                    doc_type=copy_portrait_index_type,
                    body={'ids': uid_list})['docs']
                iter_count = 0
                for uid in uid_list:
                    try:
                        user_history_item = evaluate_history_results[iter_count]['_source']
                    except:
                        user_history_item = {}
                    try:
                        user_history_item.pop(del_activeness_key)
                        user_history_item.pop(del_influence_key)
                        user_history_item.pop(del_importance_key)
                    except:
                        pass
                    new_user_item = dict(user_history_item, **add_info[uid])
                    # yuankun-20151229: track low-influence streaks for moving users out of the library
                    if add_info[uid][influence_key] < LOW_INFLUENCE_THRESHOULD:
                        try:
                            new_user_item["low_number"] += 1
                        except:
                            new_user_item["low_number"] = 1
                    else:
                        new_user_item["low_number"] = 0
                    aver_activeness, aver_influence, aver_importance = average_value(new_user_item)
                    new_user_item['aver_activeness'] = aver_activeness
                    new_user_item['aver_influence'] = aver_influence
                    new_user_item['aver_importance'] = aver_importance
                    #print 'add_info:', add_info[uid]
                    #print 'user_history_item:', user_history_item
                    #print 'new_user_item:', new_user_item
                    action = {'index': {'_id': uid}}
                    #print 'action:', action
                    bulk_action.extend([action, new_user_item])
                    iter_count += 1
                es_user_portrait.bulk(bulk_action,
                                      index=copy_portrait_index_name,
                                      doc_type=copy_portrait_index_type)
                bulk_action = []
                add_info = {}
                iter_count = 0
                end_ts = time.time()
                print '%s sec count 1000' % (end_ts - start_ts)
        except StopIteration:
            print 'all done'
            if len(add_info) != 0:
                uid_list = add_info.keys()
                evaluate_history_results = es_user_portrait.mget(
                    index=copy_portrait_index_name,
                    doc_type=copy_portrait_index_type,
                    body={'ids': uid_list})['docs']
                iter_count = 0
                for uid in uid_list:
                    try:
                        user_history_item = evaluate_history_results[iter_count]['_source']
                    except:
                        user_history_item = {}
                    try:
                        user_history_item.pop(del_activeness_key)
                        user_history_item.pop(del_influence_key)
                        user_history_item.pop(del_importance_key)
                    except:
                        pass
                    new_user_item = dict(user_history_item, **add_info[uid])
                    if add_info[uid][influence_key] < LOW_INFLUENCE_THRESHOULD:
                        try:
                            new_user_item["low_number"] += 1
                        except:
                            new_user_item["low_number"] = 1
                    else:
                        new_user_item["low_number"] = 0
                    aver_activeness, aver_influence, aver_importance = average_value(new_user_item)
                    new_user_item['aver_activeness'] = aver_activeness
                    new_user_item['aver_influence'] = aver_influence
                    new_user_item['aver_importance'] = aver_importance
                    action = {'index': {'_id': uid}}
                    bulk_action.extend([action, new_user_item])
                    iter_count += 1
                es_user_portrait.bulk(bulk_action,
                                      index=copy_portrait_index_name,
                                      doc_type=copy_portrait_index_type)
                bulk_action = []
                add_info = {}
                iter_count = 0
            break
        except Exception, e:
            raise e
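# --- sketch: the ES bulk format assembled in scan_index_history ---
# Each write above pushes an action line ({'index': {'_id': uid}}) followed by
# the document body into one flat list, flushed every 1000 documents. Building
# that flat list in isolation (helper name and example doc are illustrative):
def build_bulk_actions_sketch(docs):
    # docs: {uid: document body}; returns [action, body, action, body, ...]
    bulk_action = []
    for uid, body in docs.items():
        bulk_action.append({'index': {'_id': uid}})
        bulk_action.append(body)
    return bulk_action

#print build_bulk_actions_sketch({'123': {'activeness_2013-09-06': 0.7}})
# -> [{'index': {'_id': '123'}}, {'activeness_2013-09-06': 0.7}]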
def get_attr_bci(uid_list):
    results = []
    now_ts = time.time()
    now_date = ts2datetime(now_ts - 24 * 3600)
    ts = datetime2ts(now_date)
    #test
    ts = datetime2ts('2013-09-07')
    user_results = {}  # {'uid': {'origin_max..': [], ...}}
    total_weibo_number = 0
    fans_number = 0
    origin_weibo_number = 0
    retweeted_weibo_number = 0
    origin_weibo_retweeted_total_number = 0
    origin_weibo_comment_total_number = 0
    retweeted_weibo_retweeted_total_number = 0
    retweeted_weibo_comment_total_number = 0
    origin_weibo_retweeted_top = 0
    origin_weibo_comment_top = 0
    retweeted_weibo_retweeted_top = 0
    retweeted_weibo_comment_top = 0
    influence_dict = {}
    for i in range(0, 7):
        timestamp = ts - i * 24 * 3600
        date = ts2datetime(timestamp)
        hash_key = ''.join(date.split('-'))
        es_user_results = es_cluster.mget(index=hash_key, doc_type='bci', body={'ids': uid_list})['docs']
        for user_dict in es_user_results:
            try:
                user_item = user_dict['_source']
            except:
                continue
            uid = user_item['user']
            total_weibo_number += user_item['origin_weibo_number']
            total_weibo_number += user_item['retweeted_weibo_number']
            # yuankun revise
            origin_weibo_number += user_item['origin_weibo_number']
            retweeted_weibo_number += user_item['retweeted_weibo_number']
            origin_weibo_retweeted_top += user_item['origin_weibo_retweeted_top_number']
            origin_weibo_comment_top += user_item['origin_weibo_comment_top_number']
            retweeted_weibo_retweeted_top += user_item['retweeted_weibo_retweeted_top_number']
            retweeted_weibo_comment_top += user_item['retweeted_weibo_comment_top_number']
            #print 'user_item:', user_item
            if uid in user_results:
                try:
                    user_results[uid]['origin_weibo_retweeted_top'].append([user_item['origin_weibo_retweeted_top_number'], user_item['origin_weibo_top_retweeted_id']])
                    user_results[uid]['origin_weibo_comment_top'].append([user_item['origin_weibo_comment_top_number'], user_item['origin_weibo_top_comment_id']])
                    user_results[uid]['retweeted_weibo_retweeted_top'].append([user_item['retweeted_weibo_retweeted_top_number'], user_item['retweeted_weibo_top_retweeted_id']])
                    user_results[uid]['retweeted_weibo_comment_top'].append([user_item['retweeted_weibo_comment_top_number'], user_item['retweeted_weibo_top_comment_id']])
                except:
                    user_results[uid]['origin_weibo_retweeted_top'] = [[user_item['origin_weibo_retweeted_top_number'], user_item['origin_weibo_top_retweeted_id']]]
                    user_results[uid]['origin_weibo_comment_top'] = [[user_item['origin_weibo_comment_top_number'], user_item['origin_weibo_top_comment_id']]]
                    user_results[uid]['retweeted_weibo_retweeted_top'] = [[user_item['retweeted_weibo_retweeted_top_number'], user_item['retweeted_weibo_top_retweeted_id']]]
                    user_results[uid]['retweeted_weibo_comment_top'] = [[user_item['retweeted_weibo_comment_top_number'], user_item['retweeted_weibo_top_comment_id']]]
            else:
                # first day seen for this uid: one dict holding all four top lists
                user_results[uid] = {
                    'origin_weibo_retweeted_top': [[user_item['origin_weibo_retweeted_top_number'], user_item['origin_weibo_top_retweeted_id']]],
                    'origin_weibo_comment_top': [[user_item['origin_weibo_comment_top_number'], user_item['origin_weibo_top_comment_id']]],
                    'retweeted_weibo_retweeted_top': [[user_item['retweeted_weibo_retweeted_top_number'], user_item['retweeted_weibo_top_retweeted_id']]],
                    'retweeted_weibo_comment_top': [[user_item['retweeted_weibo_comment_top_number'], user_item['retweeted_weibo_top_comment_id']]]
                }
            # yuankun need
            #print 'fan_num:', user_item['user_fansnum'], type(user_item['user_fansnum']), type(fans_number)
            fans_number += int(user_item['user_fansnum'])
            origin_weibo_retweeted_total_number += user_item['origin_weibo_retweeted_total_number']
            origin_weibo_comment_total_number += user_item['origin_weibo_comment_total_number']
            retweeted_weibo_retweeted_total_number += user_item['retweeted_weibo_retweeted_total_number']
            retweeted_weibo_comment_total_number += user_item['retweeted_weibo_comment_total_number']
    user_portrait_result = es.mget(index='user_portrait', doc_type='user', body={'ids': uid_list})['docs']
    #print 'user_portrait_result:', user_portrait_result[0]
    # get activeness/importance/influence max values to normalize
    evaluate_max_result = get_evaluate_max()
    for user_portrait in user_portrait_result:
        #print 'user_portrait:', user_portrait
        uid = user_portrait['_id']
        try:
            user_portrait_dict = user_portrait['_source']
            #print 'user_portrait_dict:', user_portrait_dict
            uname = user_portrait_dict['uname']
            importance = user_portrait_dict['importance']
            normal_importance = math.log((importance / evaluate_max_result['importance']) * 9 + 1, 10) * 100
            activeness = user_portrait_dict['activeness']
            normal_activeness = math.log(activeness / evaluate_max_result['activeness'] * 9 + 1, 10) * 100
            influence = user_portrait_dict['influence']
            normal_influence = math.log(influence / evaluate_max_result['influence'] * 9 + 1, 10) * 100
        except:
            uname = ''
            normal_importance = ''
            normal_activeness = ''
            normal_influence = ''
        user_item_dict = user_results[uid]
        origin_weibo_retweeted_top_item = sorted(user_item_dict['origin_weibo_retweeted_top'], key=lambda x: x[0], reverse=True)[0]
        origin_weibo_comment_top_item = sorted(user_item_dict['origin_weibo_comment_top'], key=lambda x: x[0], reverse=True)[0]
        retweeted_weibo_retweeted_top_item = sorted(user_item_dict['retweeted_weibo_retweeted_top'], key=lambda x: x[0], reverse=True)[0]
        retweeted_weibo_comment_top_item = sorted(user_item_dict['retweeted_weibo_comment_top'], key=lambda x: x[0], reverse=True)[0]
        results.append([uid, uname, normal_activeness, normal_importance, normal_influence, origin_weibo_retweeted_top_item, \
                origin_weibo_comment_top_item, retweeted_weibo_retweeted_top_item, \
                retweeted_weibo_comment_top_item])
    #yuankun need
    influence_dict['origin_weibo_retweeted_average_number'] = origin_weibo_retweeted_total_number / origin_weibo_number / 7
    influence_dict['origin_weibo_comment_average_number'] = origin_weibo_comment_total_number / origin_weibo_number / 7
    influence_dict['retweeted_weibo_retweeted_average_number'] = retweeted_weibo_retweeted_total_number / retweeted_weibo_number / 7
    influence_dict['retweeted_weibo_comment_average_number'] = retweeted_weibo_comment_total_number / retweeted_weibo_number / 7
    influence_dict['origin_weibo_retweeted_top_number'] = origin_weibo_retweeted_top / len(uid_list) / 7
    influence_dict['origin_weibo_comment_top_number'] = origin_weibo_comment_top / len(uid_list) / 7
    influence_dict['retweeted_weibo_retweeted_top_number'] = retweeted_weibo_retweeted_top / len(uid_list) / 7
    influence_dict['retweeted_weibo_comment_top_number'] = retweeted_weibo_comment_top / len(uid_list) / 7
    influence_dict['fans_number'] = fans_number
    influence_dict['total_weibo_number'] = total_weibo_number
    #print 'results:', results
    return {'user_influence_list': json.dumps(results), 'total_weibo_number': total_weibo_number}, influence_dict
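# --- sketch: the log rescaling shared by bci/sen/importance/activeness/influence ---
# Several functions in this file map a raw value into [0, 100] with
# log10(value/max * 9 + 1) * 100, so 0 maps to 0 and the maximum maps to 100.
# A standalone sketch of that formula (the helper name is illustrative):
import math

def log_normalize_sketch(value, max_value):
    # value/max * 9 + 1 runs from 1 to 10, so log10 of it runs from 0 to 1
    return math.log(float(value) / max_value * 9 + 1, 10) * 100

#print log_normalize_sketch(0, 500)    # 0.0
#print log_normalize_sketch(500, 500)  # 100.0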
def social_sensing(task_detail):
    # task fields: task name, sensors, stop time, previous status, creator, timestamp
    task_name = task_detail[0]
    social_sensors = task_detail[1]
    stop_time = task_detail[2]
    forward_warning_status = task_detail[3]
    create_by = task_detail[4]
    ts = int(task_detail[5])

    # PART 1
    forward_result = get_forward_numerical_info(task_name, ts, create_by)
    # origin/retweeted weibo mid lists from the preceding time window
    forward_origin_weibo_list = query_mid_list(ts - time_interval, social_sensors, forward_time_range)
    forward_retweeted_weibo_list = query_mid_list(ts - time_interval, social_sensors, forward_time_range, 3)
    # origin/retweeted weibo mid lists from the current window
    current_mid_list = query_mid_list(ts, social_sensors, time_interval)
    current_retweeted_mid_list = query_mid_list(ts, social_sensors, time_interval, 3)
    all_mid_list = []
    all_mid_list.extend(current_mid_list)
    all_mid_list.extend(current_retweeted_mid_list)
    all_mid_list.extend(forward_origin_weibo_list)
    all_mid_list.extend(forward_retweeted_weibo_list)
    all_origin_list = []
    all_origin_list.extend(current_mid_list)
    all_origin_list.extend(forward_origin_weibo_list)
    all_retweeted_list = []
    all_retweeted_list.extend(current_retweeted_mid_list)
    all_retweeted_list.extend(forward_retweeted_weibo_list)  # mid/root-mid of retweeted weibo

    print "all mid list: ", len(all_mid_list)
    print "all_origin_list", all_origin_list
    print "all_retweeted_list", all_retweeted_list

    # count retweets and comments of these weibo in the current window, aggregated by message_type
    statistics_count = query_related_weibo(ts, all_mid_list, time_interval)
    if all_origin_list:
        origin_weibo_detail = query_hot_weibo(ts, all_origin_list, time_interval)  # detail of origin weibo
    else:
        origin_weibo_detail = {}
    if all_retweeted_list:
        retweeted_weibo_detail = query_hot_weibo(ts, all_retweeted_list, time_interval)  # detail of retweeted weibo
    else:
        retweeted_weibo_detail = {}
    current_total_count = statistics_count['total_count']  # total weibo count in the current window
    current_retweeted_count = statistics_count['retweeted']
    current_comment_count = statistics_count['comment']

    # PART 2
    # aggregate the sentiment distribution in the current window
    # sentiment_dict = {"0": "neutral", "1": "positive", "2": "sad", "3": "anger"}
    sentiment_count = aggregation_sentiment_related_weibo(ts, all_mid_list, time_interval)
    print "sentiment_count: ", sentiment_count
    negative_key = ["2", "3", "4", "5", "6"]
    negative_count = 0
    for key in negative_key:
        negative_count += sentiment_count.get(key, 0)  # guard against missing sentiment buckets

    # aggregate important users in the current window
    important_uid_list = []
    datetime = ts2datetime(ts - time_interval)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    if exist_es:
        search_results = get_important_user(ts, all_mid_list, time_interval)
        important_uid_list = search_results.keys()
    # match the obtained uid_list against the user portrait library
    if important_uid_list:
        important_results = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type, body={"ids": important_uid_list})['docs']
    else:
        important_results = {}
    filter_important_list = []  # uid_list
    if important_results:
        for item in important_results:
            if item['found']:
                #if item['_source']['importance'] > IMPORTANT_USER_THRESHOULD:
                filter_important_list.append(item['_id'])
    print filter_important_list

    # sensing decision
    burst_reason = signal_nothing_variation
    warning_status = signal_nothing
    finish = unfinish_signal  # "0"
    process_status = "1"
    if forward_result[0]:
        # use the moving average to decide whether an event has occurred
        mean_count = forward_result[1]
        std_count = forward_result[2]
        mean_sentiment = forward_result[3]
        std_sentiment = forward_result[4]
        if (mean_count >= MEAN_COUNT and current_total_count > mean_count + 1.96 * std_count) or current_total_count >= len(social_sensors) * 0.2 * AVERAGE_COUNT:
            # anomaly detected
            print "====================================================="
            if forward_warning_status == signal_brust:  # an event already exists, switch to tracking
                warning_status = signal_track
            else:
                warning_status = signal_brust
            burst_reason += signal_count_varition  # count anomaly
        if (negative_count > mean_sentiment + 1.96 * std_sentiment and mean_sentiment >= MEAN_COUNT) or negative_count >= len(social_sensors) * 0.2 * AVERAGE_COUNT:
            warning_status = signal_brust
            burst_reason += signal_sentiment_varition  # negative-sentiment anomaly; "12" means both anomalies
            if forward_warning_status == signal_brust:  # an event already exists, switch to tracking
                warning_status = signal_track

    if int(stop_time) <= ts:  # check whether the task has finished
        finish = finish_signal
        process_status = "0"

    # the sensed event covers all_mid_list
    tmp_burst_reason = burst_reason
    topic_list = []
    # start only when an event has occurred
    if warning_status:
        index_list = []
        important_words = []
        datetime_1 = ts2datetime(ts)
        index_name_1 = flow_text_index_name_pre + datetime_1
        exist_es = es_text.indices.exists(index=index_name_1)
        if exist_es:
            index_list.append(index_name_1)
        datetime_2 = ts2datetime(ts - DAY)
        index_name_2 = flow_text_index_name_pre + datetime_2
        exist_es = es_text.indices.exists(index=index_name_2)
        if exist_es:
            index_list.append(index_name_2)
        if index_list and all_mid_list:
            query_body = {
                "query": {
                    "filtered": {
                        "filter": {
                            "terms": {"mid": all_mid_list}
                        }
                    }
                },
                "size": 2000
            }
            search_results = es_text.search(index=index_list, doc_type="text", body=query_body)['hits']['hits']
            text_list = []
            if search_results:
                for item in search_results:
                    iter_mid = item['_source']['mid']
                    iter_text = item['_source']['text']
                    temp_dict = dict()
                    temp_dict["mid"] = iter_mid
                    temp_dict["text"] = iter_text
                    text_list.append(temp_dict)
                for item in text_list:
                    print item['text']
            if len(text_list) == 1:
                top_word = freq_word(text_list[0])
                topic_list = [top_word.keys()]
            elif len(text_list) == 0:
                topic_list = []
                tmp_burst_reason = ""  # no related weibo, reset to empty
                print "***********************************"
            else:
                feature_words, input_word_dict = tfidf(text_list)  # build feature words and input data
                word_label, evaluation_results = kmeans(feature_words, text_list)  # clustering
                inputs = text_classify(text_list, word_label, feature_words)
                clustering_topic = cluster_evaluation(inputs)
                print "==============================================================="
                sorted_dict = sorted(clustering_topic.items(), key=lambda x: x[1], reverse=True)
                topic_list = []
                if sorted_dict:
                    for item in sorted_dict:
                        topic_list.append(word_label[item[0]])
                print "topic_list, ", topic_list
    #if not topic_list:
    #    warning_status = signal_nothing
    #    tmp_burst_reason = signal_nothing_variation

    results = dict()
    results['origin_weibo_number'] = len(all_origin_list)
    results['retweeted_weibo_number'] = len(all_retweeted_list)
    results['origin_weibo_detail'] = json.dumps(origin_weibo_detail)
    results['retweeted_weibo_detail'] = json.dumps(retweeted_weibo_detail)
    results['retweeted_weibo_count'] = current_retweeted_count
    results['comment_weibo_count'] = current_comment_count
    results['weibo_total_number'] = current_total_count
    results['sentiment_distribution'] = json.dumps(sentiment_count)
    results['important_users'] = json.dumps(filter_important_list)
    results['unfilter_users'] = json.dumps(important_uid_list)
    results['burst_reason'] = tmp_burst_reason
    results['timestamp'] = ts
    if tmp_burst_reason:
        results['clustering_topic'] = json.dumps(topic_list)
    # store this window's information in es
    doctype = create_by + '-' + task_name
    es_user_portrait.index(index=index_sensing_task, doc_type=doctype, id=ts, body=results)

    # update the es record that manages social sensing tasks
    temporal_result = es_user_portrait.get(index=index_manage_social_task, doc_type=task_doc_type, id=doctype)['_source']
    temporal_result['warning_status'] = warning_status
    temporal_result['burst_reason'] = tmp_burst_reason
    temporal_result['finish'] = finish
    temporal_result['processing_status'] = process_status
    history_status = json.loads(temporal_result['history_status'])
    history_status.append([ts, task_name, warning_status])
    temporal_result['history_status'] = json.dumps(history_status)
    es_user_portrait.index(index=index_manage_social_task, doc_type=task_doc_type, id=doctype, body=temporal_result)
    return "1"
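# A minimal invocation sketch for social_sensing. The task values below are
# hypothetical; in production the tuple comes from the task scheduler, which
# is not shown in this module.
if __name__ == '__main__':
    now_ts = int(time.time())
    demo_task = ['demo_task',           # task_name (made up)
                 ['1234567890'],        # social_sensors: sensor uid list (made up)
                 str(now_ts + 3600),    # stop_time, one hour from now
                 '0',                   # forward_warning_status: no prior warning
                 'admin',               # create_by (made up)
                 str(now_ts)]           # current timestamp
    print social_sensing(demo_task)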
def get_attr_portrait(uid_list):
    result = {}
    index_name = 'user_portrait'
    index_type = 'user'
    user_dict_list = es.mget(index=index_name, doc_type=index_type, body={'ids': uid_list})['docs']
    #print 'user_dict:', user_dict_list
    gender_ratio = dict()
    verified_ratio = dict()
    online_pattern_ratio = dict()
    domain_ratio = dict()
    topic_ratio = dict()
    emoticon_ratio = dict()
    keyword_ratio = dict()
    importance_list = []
    activeness_list = []
    influence_list = []
    psycho_status_ratio = dict()
    psycho_feature_ratio = dict()
    hashtag_ratio = dict()
    activity_geo_ratio = dict()
    for user_dict in user_dict_list:
        if not user_dict.get('found', False):  # skip uids missing from the portrait index
            continue
        user_dict = user_dict['_source']
        #attr1 gender ratio
        gender = user_dict['gender']
        if gender:
            try:
                gender_ratio[gender] += 1
            except KeyError:
                gender_ratio[gender] = 1
        #attr2 verified ratio
        verified = user_dict['verified']
        if verified:
            try:
                verified_ratio[verified] += 1
            except KeyError:
                verified_ratio[verified] = 1
        #attr3 online pattern
        online_pattern = user_dict['online_pattern']
        if online_pattern:
            online_pattern = json.loads(online_pattern)
            for pattern in online_pattern:
                try:
                    online_pattern_ratio[pattern] += 1
                except KeyError:
                    online_pattern_ratio[pattern] = 1
        #attr4 domain
        domain_string = user_dict['domain']
        if domain_string:
            domain_list = domain_string.split('_')
            for domain in domain_list:
                try:
                    domain_ratio[domain] += 1
                except KeyError:
                    domain_ratio[domain] = 1
        #attr5 topic
        topic_string = user_dict['topic']
        if topic_string:
            topic_dict = json.loads(topic_string)
            for topic in topic_dict:
                try:
                    topic_ratio[topic] += 1
                except KeyError:
                    topic_ratio[topic] = 1
        #attr6 emoticon
        emoticon_string = user_dict['emoticon']
        if emoticon_string:
            emoticon_dict = json.loads(emoticon_string)
            for emoticon in emoticon_dict:
                try:
                    emoticon_ratio[emoticon] += 1
                except KeyError:
                    emoticon_ratio[emoticon] = 1
        #attr7 keywords
        keyword_string = user_dict['keywords']
        if keyword_string:
            keyword_dict = json.loads(keyword_string)
            for keyword in keyword_dict:
                try:
                    keyword_ratio[keyword] += keyword_dict[keyword]
                except KeyError:
                    keyword_ratio[keyword] = keyword_dict[keyword]
        #attr8 importance distribution
        importance = user_dict['importance']
        importance_rank = get_index_rank(importance, 'importance')
        importance_list.append(int(importance_rank))
        #attr9 activeness distribution
        activeness = user_dict['activeness']
        activeness_rank = get_index_rank(activeness, 'activeness')
        activeness_list.append(int(activeness_rank))
        #attr10 influence distribution
        influence = user_dict['influence']
        influence_rank = get_index_rank(influence, 'influence')
        influence_list.append(int(influence_rank))
        #attr11 psycho_status ratio
        psycho_status_string = user_dict['psycho_status']
        if psycho_status_string:
            psycho_status_dict = json.loads(psycho_status_string)
            for psycho_status in psycho_status_dict:
                try:
                    psycho_status_ratio[psycho_status] += psycho_status_dict[psycho_status]
                except KeyError:
                    psycho_status_ratio[psycho_status] = psycho_status_dict[psycho_status]
        #attr12 psycho_feature ratio
        psycho_feature_string = user_dict['psycho_feature']
        if psycho_feature_string:
            psycho_feature_list = psycho_feature_string.split('_')
            for psycho_feature in psycho_feature_list:
                try:
                    psycho_feature_ratio[psycho_feature] += 1
                except KeyError:
                    psycho_feature_ratio[psycho_feature] = 1
        #attr13 activity geo ratio
        activity_geo_string = user_dict['activity_geo_dict']
        if activity_geo_string:
            activity_geo_dict = json.loads(activity_geo_string)
            for activity_geo in activity_geo_dict:
                city_list = activity_geo.split('\t')
                city = city_list[-1]  # the most specific place name is the last field
                try:
                    activity_geo_ratio[city] += activity_geo_dict[activity_geo]
                except KeyError:
                    activity_geo_ratio[city] = activity_geo_dict[activity_geo]
        #attr14 hashtag
        hashtag_string = user_dict['hashtag_dict']
        if hashtag_string:
            hashtag_dict = json.loads(hashtag_string)
            for hashtag in hashtag_dict:
                try:
                    hashtag_ratio[hashtag] += hashtag_dict[hashtag]
                except KeyError:
                    hashtag_ratio[hashtag] = hashtag_dict[hashtag]
    #print 'importance_list:', importance_list
    p, t = np.histogram(importance_list, bins=5, normed=False)
    importance_his = [p.tolist(), t.tolist()]
    #print 'importance_his:', importance_his
    p, t = np.histogram(activeness_list, bins=5, normed=False)
    activeness_his = [p.tolist(), t.tolist()]
    p, t = np.histogram(influence_list, bins=5, normed=False)
    influence_his = [p.tolist(), t.tolist()]
    result['gender'] = json.dumps(gender_ratio)
    result['verified'] = json.dumps(verified_ratio)
    result['online_pattern'] = json.dumps(online_pattern_ratio)
    result['domain'] = json.dumps(domain_ratio)
    result['topic'] = json.dumps(topic_ratio)
    result['psycho_status'] = json.dumps(psycho_status_ratio)
    result['psycho_feature'] = json.dumps(psycho_feature_ratio)
    result['emoticon'] = json.dumps(emoticon_ratio)
    result['keywords'] = json.dumps(keyword_ratio)
    result['hashtag'] = json.dumps(hashtag_ratio)
    result['activity_geo'] = json.dumps(activity_geo_ratio)
    result['importance_his'] = json.dumps(importance_his)
    result['activeness_his'] = json.dumps(activeness_his)
    result['influence_his'] = json.dumps(influence_his)
    return result
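# The try/except counting pattern above repeats for every attribute. A minimal
# sketch of the same tallying logic with collections.defaultdict, which removes
# the per-key exception handling (illustration only; not wired into the code above):
from collections import defaultdict

def count_ratio(items, weights=None):
    # Tally items into a plain dict; if weights is given it maps item -> increment.
    ratio = defaultdict(int)
    for item in items:
        ratio[item] += weights[item] if weights else 1
    return dict(ratio)

# e.g. domain_ratio could become count_ratio(domain_string.split('_')),
# and keyword_ratio accumulation count_ratio(keyword_dict, keyword_dict).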
def key_words_search(task_id, search_type, pre, during, start_time, keyword_list, search_key='', sort_norm='', sort_scope='', time=1, isall=False, number=100):
    number = int(number)
    should = []
    for key in keyword_list:
        if search_type == "hashtag":
            should.append({"prefix": {"text": "#" + key + "#"}})
        else:
            should.append({"wildcard": {"text": "*" + key + "*"}})
    # collect the flow-text indices that exist for the requested day range
    index_list = []
    date = ts2datetime(start_time)
    index_name = pre + date
    while during:
        if es_flow_text.indices.exists(index=index_name):
            index_list.append(index_name)
        start_time = start_time + DAY
        date = ts2datetime(start_time)
        index_name = pre + date
        during -= 1
    print index_list
    uid_set = set()
    text_results = []
    # NOTE: "must" requires every keyword clause to match; the variable name
    # suggests OR semantics ("should") may have been intended.
    query_body = {
        "query": {
            "bool": {
                "must": should
            }
        },
        "sort": {"user_fansnum": {"order": "desc"}},
        "size": 5000
    }
    results = es_flow_text.search(index=index_list, doc_type='text', body=query_body, _source=False,
                                  fields=["uid", "user_fansnum", "text", "message_type", "sentiment",
                                          "timestamp", "geo", "retweeted", "comment"])["hits"]["hits"]
    # deduplicate by uid, remembering each unique uid's first hit position
    id_index = 0
    index_list = []
    un_uid_list = []
    for item in results:
        if item['fields']['uid'][0] not in uid_set:
            uid_set.add(item['fields']['uid'][0])
            un_uid_list.append(item['fields']['uid'][0])
            index_list.append(id_index)
        id_index += 1
    uid_list = []
    print "un_uid_list: ", len(un_uid_list)
    portrait_list = []
    count = 0
    in_index = 0
    if not isall and un_uid_list:  # restrict to users already in the portrait library
        portrait_results = es_user_portrait.mget(index=USER_INDEX_NAME, doc_type=USER_INDEX_TYPE, body={"ids": un_uid_list}, _source=False, fields=['uname'])["docs"]
        for item in portrait_results:
            if item["found"]:
                portrait_list.append(item['_id'])
                nick_name = item['fields']['uname'][0]
                if nick_name == 'unknown':
                    nick_name = item['_id']
                index = index_list[in_index]
                weibo_url = weiboinfo2url(results[index]['fields']['uid'][0], results[index]['_id'])
                # append (not extend) so each weibo stays one row
                text_results.append([results[index]['fields']['uid'][0],
                                     results[index]['fields']['user_fansnum'][0],
                                     results[index]['fields']['text'][0],
                                     results[index]['fields']['message_type'][0],
                                     results[index]['fields']['sentiment'][0],
                                     ts2date(results[index]['fields']['timestamp'][0]),
                                     results[index]['fields']['geo'][0],
                                     results[index]['fields']['retweeted'][0],
                                     results[index]['fields']['comment'][0],
                                     nick_name, weibo_url])
                count += 1
                if count == number:
                    break
            print "portrait_len, ", len(portrait_list)
            in_index += 1
        if portrait_list:
            uid_list = in_sort_filter(time, sort_norm, sort_scope, None, portrait_list, True, number)  # sort
    elif un_uid_list:
        profile_result = es_user_profile.mget(index="weibo_user", doc_type="user", body={"ids": un_uid_list}, fields=['nick_name'])["docs"]
        for i in range(len(profile_result)):
            index = index_list[i]
            try:
                nick_name = profile_result[i]['fields']['nick_name'][0]
            except:
                nick_name = un_uid_list[i]
            item = results[index]
            weibo_url = weiboinfo2url(item['fields']['uid'][0], results[index]['_id'])
            text_results.append([item['fields']['uid'][0],
                                 item['fields']['user_fansnum'][0],
                                 item['fields']['text'][0],
                                 item['fields']['message_type'][0],
                                 item['fields']['sentiment'][0],
                                 ts2date(item['fields']['timestamp'][0]),
                                 results[index]['fields']['geo'][0],
                                 results[index]['fields']['retweeted'][0],
                                 results[index]['fields']['comment'][0],
                                 nick_name, weibo_url])
            if i == number:
                break
        uid_list = all_sort_filter(un_uid_list[:number], sort_norm, time, True, number)
    print "filter_uid_list: ", len(uid_list)
    if uid_list:
        results = make_up_user_info(uid_list, isall, time, sort_norm)
    else:
        results = []
    print "results: ", len(results)
    # update task status
    task_detail = es_user_portrait.get(index=USER_RANK_KEYWORD_TASK_INDEX, doc_type=USER_RANK_KEYWORD_TASK_TYPE, id=task_id)
    item = task_detail['_source']
    item['status'] = 1
    item['result'] = json.dumps(results)
    item['text_results'] = json.dumps(text_results)
    item['number'] = len(results)
    es_user_portrait.index(index=USER_RANK_KEYWORD_TASK_INDEX, doc_type=USER_RANK_KEYWORD_TASK_TYPE, id=task_id, body=item)
    return "1"
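# Hypothetical call of key_words_search: scan three days of flow-text indices
# for one keyword and rank the in-portrait authors. The task id, keyword,
# sort_norm and sort_scope values are illustrative assumptions.
if __name__ == '__main__':
    start_ts = datetime2ts('2013-09-01')
    print key_words_search('demo_task_id', 'keyword', flow_text_index_name_pre,
                           3, start_ts, [u'地震'], sort_norm='bci',
                           sort_scope='in_limit_keyword', time=1, isall=False,
                           number=50)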
            # Fragment: tail of a copy-to-history routine; the enclosing loop and the
            # definitions of add_info/action/new_user_item precede this excerpt.
            bulk_action.extend([action, new_user_item])
            iter_count += 1
        es_user_portrait.bulk(bulk_action, index=copy_portrait_index_name, doc_type=copy_portrait_index_type)
        bulk_action = []
        add_info = {}
        iter_count = 0
        break
    except Exception, e:
        raise e

if len(add_info) != 0:
    uid_list = add_info.keys()
    evaluate_history_results = es_user_portrait.mget(index=copy_portrait_index_name,
                                                     doc_type=copy_portrait_index_type,
                                                     body={'ids': uid_list})['docs']
    '''
    del_date = ts2datetime(time.time() - DAY*31)
    del_activeness_key = 'activeness_' + del_date
    del_influence_key = del_date
    '''
    iter_count = 0
    for uid in uid_list:
        try:
            user_history_item = evaluate_history_results[iter_count]['_source']
        except:
            user_history_item = {}
        try:
            # drop the month-old activeness field (del_activeness_key is defined
            # earlier in the source file; see the commented block above)
            user_history_item.pop(del_activeness_key)
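# Sketch of the [action, document] pair format that bulk_action accumulates
# above; the helper name make_bulk_pair is an assumption for illustration,
# mirroring how expand_index_action is used elsewhere in this module.
def make_bulk_pair(uid, doc):
    action = {'index': {'_id': uid}}  # old-style ES bulk: action line, then source line
    return [action, doc]

# usage: bulk_action.extend(make_bulk_pair(uid, new_user_item))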
def get_scan_results():
    result_dict = {}
    gender_result = {'1': 0, '2': 0}
    verified_result = {'yes': 0, 'no': 0}
    location_result = {}
    activity_geo_result = {}
    keywords_result = {}
    hashtag_result = {}
    topic_result = {}
    online_pattern_result = {}
    domain_result = {}
    no_gender_count = 0
    no_verified_count = 0
    no_location_count = 0
    no_activity_geo_count = 0
    no_keywords_count = 0
    no_hashtag_count = 0
    no_topic_count = 0
    no_online_pattern_count = 0
    no_domain_count = 0
    s_re = scan(es_user_portrait,
                query={'query': {'match_all': {}}, 'size': 100},
                index=portrait_index_name,
                doc_type=portrait_index_type)
    print 's_re:', s_re
    activity_count = 0
    portrait_uid_list = []
    # iterate the scan generator; aggregate results are computed and returned on StopIteration
    while True:
        try:
            scan_re = s_re.next()['_source']
            portrait_uid_list.append(scan_re['uid'])
            # gender ratio count
            try:
                gender_result[str(scan_re['gender'])] += 1
            except:
                no_gender_count += 1
            # verified ratio count
            try:
                verified_result[str(scan_re['verified'])] += 1
            except:
                no_verified_count += 1
            # location top
            try:
                location = scan_re['location']
                if len(location.split(' ')) > 1:
                    location = location.split(' ')[0]
                try:
                    location_result[location] += 1
                except KeyError:
                    location_result[location] = 1
            except:
                no_location_count += 1
            # activity geo
            try:
                activity_geo = scan_re['activity_geo_dict']
                if activity_geo:
                    activity_geo_dict = json.loads(activity_geo)[-1]
                    for geo in activity_geo_dict:
                        geo_list = geo.split('\t')
                        if geo_list[0] == u'中国' and len(geo_list) >= 2:
                            province = geo_list[1]
                            try:
                                activity_geo_result[province] += activity_geo_dict[geo]
                            except KeyError:
                                activity_geo_result[province] = activity_geo_dict[geo]
            except:
                no_activity_geo_count += 1
            # keywords
            try:
                keywords = json.loads(scan_re['keywords'])
                if keywords:
                    for word in keywords:
                        try:
                            keywords_result[word] += keywords[word]
                        except KeyError:
                            keywords_result[word] = keywords[word]
            except:
                no_keywords_count += 1
            # hashtag top
            try:
                hashtag_dict = json.loads(scan_re['hashtag_dict'])
                if hashtag_dict:
                    for tag in hashtag_dict:
                        try:
                            hashtag_result[tag] += hashtag_dict[tag]
                        except KeyError:
                            hashtag_result[tag] = hashtag_dict[tag]
            except:
                no_hashtag_count += 1
            # topic top
            try:
                topic = scan_re['topic_string']
                if topic:
                    topic_list = topic.split('&')
                    for item in topic_list:
                        try:
                            topic_result[item] += 1
                        except KeyError:
                            topic_result[item] = 1
            except:
                no_topic_count += 1
            # online pattern top
            try:
                online_pattern = json.loads(scan_re['online_pattern'])
                if online_pattern:
                    for item in online_pattern:
                        try:
                            online_pattern_result[item] += online_pattern[item]
                        except KeyError:
                            online_pattern_result[item] = online_pattern[item]
            except:
                no_online_pattern_count += 1
            # domain top
            try:
                domain = scan_re['domain']
                if domain:
                    try:
                        domain_result[domain] += 1
                    except KeyError:
                        domain_result[domain] = 1
            except:
                no_domain_count += 1
        except StopIteration:
            print 'all done'
            now_ts = time.time()
            now_date = ts2datetime(now_ts - DAY)
            index_time = ''.join(now_date.split('-'))
            # test override of the bci index date
            index_time = '20130907'
            # gender ratio count
            #count = sum(gender_result.values())
            all_count = es_user_portrait.count(index=portrait_index_name, doc_type=portrait_index_type,
                                               body={'query': {'match_all': {}}})['count']
            count = all_count
            print "count:", count
            gender_ratio = {
                '1': float(gender_result['1']) / count,
                '2': float(gender_result['2']) / count
            }
            #print 'gender ratio:', gender_ratio
            activity_result = es_user_portrait.mget(index='bci_' + index_time, doc_type='bci',
                                                    body={'ids': portrait_uid_list})['docs']
            for activity_item in activity_result:
                if activity_item['found']:
                    activity_count += 1
            #print 'activity_count:', activity_count
            result_dict['activity_count'] = float(activity_count) / count
            result_dict['gender_ratio'] = json.dumps(gender_ratio)
            # verified ratio count
            count = sum(verified_result.values())
            if count == 0:
                verified_ratio = {'yes': 0.5, 'no': 0.5}
            else:
                verified_ratio = {
                    'yes': float(verified_result['yes']) / count,
                    'no': float(verified_result['no']) / count
                }
            #print 'verified ratio:', verified_ratio
            result_dict['verified_ratio'] = json.dumps(verified_ratio)
            # location top
            if location_result:
                sort_location = sorted(location_result.items(), key=lambda x: x[1], reverse=True)
                location_top = sort_location[:5]
            else:
                location_top = {}
            result_dict['location_top'] = json.dumps(location_top)
            # activity geo top
            if activity_geo_result:
                sort_activity_geo = sorted(activity_geo_result.items(), key=lambda x: x[1], reverse=True)
                activity_geo_top = sort_activity_geo[:50]
            else:
                activity_geo_top = {}
            result_dict['activity_geo_top'] = json.dumps(activity_geo_top)
            # keywords top
            if keywords_result:
                sort_keywords = sorted(keywords_result.items(), key=lambda x: x[1], reverse=True)
                keywords_top = sort_keywords[:50]
            else:
                keywords_top = {}
            result_dict['keywords_top'] = json.dumps(keywords_top)
            # hashtag top
            if hashtag_result:
                sort_hashtag = sorted(hashtag_result.items(), key=lambda x: x[1], reverse=True)
                hashtag_top = sort_hashtag[:50]
            else:
                hashtag_top = {}
            result_dict['hashtag_top'] = json.dumps(hashtag_top)
            # topic top
            if topic_result:
                sort_topic = sorted(topic_result.items(), key=lambda x: x[1], reverse=True)
                topic_top = sort_topic[:50]
            else:
                topic_top = {}
            result_dict['topic_top'] = json.dumps(topic_top)
            # online_pattern top
            if online_pattern_result:
                sort_online_pattern = sorted(online_pattern_result.items(), key=lambda x: x[1], reverse=True)
                online_pattern_top = sort_online_pattern[:50]
            else:
                online_pattern_top = {}
            result_dict['online_pattern_top'] = json.dumps(online_pattern_top)
            # domain top
            if domain_result:
                sort_domain = sorted(domain_result.items(), key=lambda x: x[1], reverse=True)
                domain_top = sort_domain[:20]
            else:
                domain_top = {}
            result_dict['domain_top'] = json.dumps(domain_top)
            result_dict['domain_top_user'] = json.dumps(get_domain_top_user(domain_top))
            result_dict['topic_top_user'] = json.dumps(get_topic_top_user(topic_top))
            return result_dict
        except Exception, r:
            print Exception, r
            return result_dict
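# The manual s_re.next()/StopIteration loop above can equivalently be written
# as a for-loop, since elasticsearch.helpers.scan returns a generator. A
# minimal sketch over the same index and query:
def iter_portrait_sources():
    for hit in scan(es_user_portrait,
                    query={'query': {'match_all': {}}, 'size': 100},
                    index=portrait_index_name,
                    doc_type=portrait_index_type):
        yield hit['_source']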
def get_influence_vary_top():
    result = []
    query_body = {
        'query': {'match_all': {}},
        'size': 10000,
        'sort': [{'vary': {'order': 'desc'}}]
    }
    try:
        es_result = es.search(index='vary', doc_type='bci', body=query_body)['hits']['hits']
    except Exception, e:
        raise e
    uid_list = [user_dict['_id'] for user_dict in es_result]
    #print 'uid_list:', uid_list
    portrait_result = es.mget(index='user_portrait', doc_type='user', body={'ids': uid_list}, _source=True)['docs']
    #print 'portrait_result:', portrait_result
    count = 0
    for i in range(len(portrait_result)):
        if count >= 100:
            break
        if portrait_result[i]['found']:
            uid = portrait_result[i]['_source']['uid']
            uname = portrait_result[i]['_source']['uname']
            vary = es_result[i]['_source']['vary']
            result.append([uid, uname, vary])
            count += 1
        else:
            continue  # was the bare expression `next`, a no-op; skip users without a portrait
    #print 'result:', result
    return result
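# Hypothetical caller printing the top influence-variation users returned above.
if __name__ == '__main__':
    for uid, uname, vary in get_influence_vary_top():
        print uid, uname, vary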