def count_text_num(uid_list, fb_flow_text_index_list):
    # Count posts per user. The QQ side appears to count per user the same way:
    # https://github.com/huxiaoqian/xnr1/blob/82ff9704792c84dddc3e2e0f265c46f3233a786f/xnr/qq_xnr_manage/qq_history_count_timer.py
    count_result = {}
    for uid in uid_list:
        textnum_query_body = {
            'query': {
                'filtered': {
                    'filter': {
                        'bool': {
                            'must': [
                                {'term': {'uid': uid}},
                            ]
                        }
                    }
                }
            }
        }
        text_num = 0
        for index_name in fb_flow_text_index_list:
            result = es.count(index=index_name, doc_type=flow_text_index_type,
                              body=textnum_query_body)
            if result['_shards']['successful'] != 0:
                text_num += result['count']
        count_result[uid] = text_num
    return count_result
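# A minimal usage sketch for count_text_num; the uids and daily index names
# below are illustrative, while `es` and `flow_text_index_type` come from this
# module's ES configuration:
#
#   fb_indices = ['fb_flow_text_2017-10-01', 'fb_flow_text_2017-10-02']
#   post_counts = count_text_num(['uid_001', 'uid_002'], fb_indices)
#   # e.g. {'uid_001': 42, 'uid_002': 0}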
def qq_history_count(xnr_user_no, qq_number, current_time):
    current_date = ts2datetime(current_time)
    timestamp = datetime2ts(current_date)
    last_date = ts2datetime(current_time - DAY)
    group_message_index_name = group_message_index_name_pre + current_date

    # Today's post count: group messages spoken by the XNR itself.
    query_body = {
        'query': {
            'bool': {
                'must': [
                    {'term': {'speaker_qq_number': qq_number}},
                    {'term': {'xnr_qq_number': qq_number}}
                ]
            }
        }
    }
    count_result = es.count(index=group_message_index_name,
                            doc_type=group_message_index_type,
                            body=query_body)
    if count_result['_shards']['successful'] != 0:
        today_count = count_result['count']
    else:
        print 'es index rank error'
        today_count = 0

    # Yesterday's running total. Use the same '_'-separated id format as
    # _id_today below (the original concatenated without the separator,
    # so the lookup could never hit).
    _id_last = xnr_user_no + '_' + last_date
    try:
        get_result = es.get(index=qq_xnr_history_count_index_name,
                            doc_type=qq_xnr_history_count_index_type,
                            id=_id_last)['_source']
        total_count_history = get_result['total_post_num']
    except:
        total_count_history = 0
    total_count_today = total_count_history + today_count

    _id_today = xnr_user_no + '_' + current_date
    item_dict = dict()
    item_dict['date_time'] = current_date
    item_dict['xnr_user_no'] = xnr_user_no
    item_dict['total_post_num'] = total_count_today
    item_dict['daily_post_num'] = today_count
    item_dict['qq_number'] = qq_number
    item_dict['timestamp'] = timestamp
    es.index(index=qq_xnr_history_count_index_name,
             doc_type=qq_xnr_history_count_index_type,
             id=_id_today, body=item_dict)
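# qq_history_count is meant to run once per day (e.g. from a timer): each run
# reads yesterday's doc by its '_'-separated id and indexes today's running
# total. A hedged backfill sketch, assuming DAY is the 86400-second constant
# from the global config and the ids/dates are illustrative:
#
#   start_ts = datetime2ts('2017-10-01')
#   for day_offset in range(7):
#       qq_history_count('WXNR0004', '3401880000', start_ts + day_offset * DAY)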
def influence_active(uid, index_name):
    # Activeness: how many posts this uid produced in the given daily index.
    query_body = {'query': {'term': {'uid': uid}}}
    #index_name = facebook_flow_text_index_name_pre + ts2datetime(current_time)
    es_count = es.count(index=index_name,
                        doc_type=facebook_flow_text_index_type,
                        body=query_body)
    if es_count['_shards']['successful'] != 0:
        active_num = es_count['count']
    else:
        active_num = 0
    return active_num
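# A minimal call sketch, following the index-naming hint commented inside the
# function (assumes `uid` and `current_time` are in scope):
#
#   index_name = facebook_flow_text_index_name_pre + ts2datetime(current_time)
#   active_num = influence_active(uid, index_name)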
def qq_history_count(xnr_user_no, qq_number, current_time):
    if S_TYPE == 'test':
        current_time = datetime2ts(QQ_S_DATE_ASSESSMENT)
    current_date = ts2datetime(current_time)
    last_date = ts2datetime(current_time - DAY)
    group_message_index_name = group_message_index_name_pre + current_date
    qq_xnr_history_count_index_name = qq_xnr_history_count_index_name_pre + last_date

    # Get today's post count
    query_body = {
        'query': {
            'bool': {
                'must': [
                    {'term': {'speaker_qq_number': qq_number}},
                    {'term': {'xnr_qq_number': qq_number}}
                ]
            }
        }
    }
    count_result = es.count(index=group_message_index_name,
                            doc_type=group_message_index_type,
                            body=query_body)
    if count_result['_shards']['successful'] != 0:
        today_count = count_result['count']
    else:
        print 'es index rank error'
        today_count = 0

    # Get the historical total number of posts
    try:
        get_result = es.get(index=qq_xnr_history_count_index_name,
                            doc_type=qq_xnr_history_count_index_type,
                            id=xnr_user_no)['_source']
        total_count_history = get_result['total_post_num']
    except:
        total_count_history = 0
    total_count_today = total_count_history + today_count

    item_dict = dict()
    item_dict['total_post_num'] = total_count_today
    item_dict['daily_post_num'] = today_count

    # The most active speaker today in the groups the XNR belongs to
    query_body_total_day = {
        'query': {
            'filtered': {
                'filter': {
                    'term': {'xnr_qq_number': qq_number}
                }
            }
        },
        'aggs': {
            'all_speakers': {
                'terms': {
                    'field': 'speaker_qq_number',
                    'order': {'_count': 'desc'}
                }
            }
        }
    }
    try:
        results_total_day = es_xnr.search(index=group_message_index_name,
                                          doc_type=group_message_index_type,
                                          body=query_body_total_day)['aggregations']['all_speakers']['buckets']
        speaker_max = results_total_day[0]['doc_count']
    except:
        speaker_max = today_count

    # Score the XNR's activeness against the busiest speaker, on a 0-100 scale.
    safe = (float(math.log(today_count + 1)) / (math.log(speaker_max + 1) + 1)) * 100
    safe = round(safe, 2)  # keep two decimal places
    item_dict['mark'] = safe
    return item_dict
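# The mark above compares the XNR's daily post count with the busiest group
# speaker on a log scale: mark = 100 * ln(today + 1) / (ln(max + 1) + 1).
# A quick worked check (pure Python, no ES needed; counts illustrative):
#
#   >>> import math
#   >>> round((math.log(10 + 1) / (math.log(50 + 1) + 1)) * 100, 2)
#   48.62   # 10 posts vs. a top speaker with 50 posts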
def get_influence_at_num(xnr_user_no, qq_number, current_time):
    item_dict = {}
    if S_TYPE == 'test':
        current_time = datetime2ts(QQ_S_DATE_ASSESSMENT)
    current_date = ts2datetime(current_time)
    group_message_index_name = group_message_index_name_pre + current_date

    # Number of times the XNR itself was @-mentioned today
    query_body_xnr = {
        'query': {
            'bool': {
                'must': [
                    {'term': {'xnr_qq_number': qq_number}},
                    {'wildcard': {'text': '*@ME*'}}
                ]
            }
        }
    }
    try:
        results_xnr = es_xnr.count(index=group_message_index_name,
                                   doc_type=group_message_index_type,
                                   body=query_body_xnr)
        if results_xnr['_shards']['successful'] != 0:
            at_num_xnr = results_xnr['count']
        else:
            print 'es index rank error'
            at_num_xnr = 0
    except:
        at_num_xnr = 0

    # Get the historical total
    current_time_last = current_time - DAY
    current_date_last = ts2datetime(current_time_last)
    qq_xnr_history_count_index_name = qq_xnr_history_count_index_name_pre + current_date_last
    try:
        result_last = es_xnr.get(index=qq_xnr_history_count_index_name,
                                 doc_type=qq_xnr_history_be_at_index_type,
                                 id=xnr_user_no)['_source']
        total_be_at_num_last = result_last['total_be_at_num']
    except:
        total_be_at_num_last = 0
    item_dict['daily_be_at_num'] = at_num_xnr
    item_dict['total_be_at_num'] = at_num_xnr + total_be_at_num_last

    # Total number of @-mentions today (any target)
    query_body_total_day = {
        'query': {
            'bool': {
                'must': [
                    {'term': {'xnr_qq_number': qq_number}},
                    {'wildcard': {'text': '*@*'}}
                ]
            }
        }
    }
    try:
        results_total_day = es_xnr.count(index=group_message_index_name,
                                         doc_type=group_message_index_type,
                                         body=query_body_total_day)
        if results_total_day['_shards']['successful'] != 0:
            at_num_total_day = results_total_day['count']
        else:
            print 'es index rank error'
            at_num_total_day = 0
    except:
        at_num_total_day = 0

    # Influence: the XNR's mentions relative to all mentions, on a 0-100 scale.
    influence = (float(math.log(at_num_xnr + 1)) / (math.log(at_num_total_day + 1) + 1)) * 100
    influence = round(influence, 2)  # keep two decimal places
    mark = influence
    item_dict['mark'] = mark
    # es_xnr.index(index=qq_xnr_history_count_index_name,
    #              doc_type=qq_xnr_history_be_at_index_type,
    #              body=item_dict)
    return item_dict
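# The influence mark mirrors the activeness mark: @ME mentions of the XNR are
# compared with all @-mentions seen in its groups on a log scale.
# A quick worked check (illustrative counts):
#
#   >>> import math
#   >>> round((math.log(3 + 1) / (math.log(20 + 1) + 1)) * 100, 2)
#   34.28   # @ME 3 times while the groups saw 20 @-mentions in total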
def social_sensing(task_detail):
    '''
    with open("prediction_uid.pkl", "r") as f:
        uid_model = pickle.load(f)
    with open("prediction_weibo.pkl", "r") as f:
        weibo_model = pickle.load(f)
    '''
    # task_detail: task name, sensors, end time, previous status, creator, time
    task_name = task_detail[0]
    social_sensors = task_detail[1]
    #ts = int(task_detail[2])
    ts = float(task_detail[2])
    xnr_user_no = task_detail[3]
    print ts2date(ts)

    # Collect the flow-text indices that exist for today and the two days before.
    index_list = []
    important_words = []
    datetime_1 = ts2datetime(ts)
    index_name_1 = flow_text_index_name_pre + datetime_1
    exist_es = es_text.indices.exists(index=index_name_1)
    if exist_es:
        index_list.append(index_name_1)
    datetime_2 = ts2datetime(ts - DAY)
    index_name_2 = flow_text_index_name_pre + datetime_2
    exist_es = es_text.indices.exists(index=index_name_2)
    if exist_es:
        index_list.append(index_name_2)
    if es_text.indices.exists(index=flow_text_index_name_pre + ts2datetime(ts - 2 * DAY)):
        index_list.append(flow_text_index_name_pre + ts2datetime(ts - 2 * DAY))

    # PART 1
    #forward_result = get_forward_numerical_info(task_name, ts, create_by)
    # Original / retweeted weibo mid lists from the previous time window
    forward_origin_weibo_list, forward_1 = query_mid_list(
        ts - time_interval, social_sensors, forward_time_range)
    forward_retweeted_weibo_list, forward_3 = query_mid_list(
        ts - time_interval, social_sensors, forward_time_range, 3)
    # Original weibo mid list in the current window
    current_mid_list, current_1 = query_mid_list(ts, social_sensors, time_interval)
    current_retweeted_mid_list, current_3 = query_mid_list(
        ts, social_sensors, time_interval, 3)

    all_mid_list = []
    all_mid_list.extend(current_mid_list)
    all_mid_list.extend(current_retweeted_mid_list)
    all_mid_list.extend(forward_origin_weibo_list)
    all_mid_list.extend(forward_retweeted_weibo_list)

    all_origin_list = []
    all_origin_list.extend(current_mid_list)
    all_origin_list.extend(forward_origin_weibo_list)
    all_origin_list = list(set(all_origin_list))

    all_retweeted_list = []
    all_retweeted_list.extend(current_retweeted_mid_list)
    all_retweeted_list.extend(forward_retweeted_weibo_list)  # mid/root-mid of retweeted weibos
    all_retweeted_list = list(set(all_retweeted_list))

    all_mid_list = filter_mid(all_mid_list)
    all_origin_list = filter_mid(all_origin_list)
    all_retweeted_list = filter_mid(all_retweeted_list)

    print "all mid list: ", len(all_mid_list)
    print "all_origin_list", len(all_origin_list)
    print "all_retweeted_list", len(all_retweeted_list)

    # Query each weibo's retweet and comment counts in the current window,
    # aggregated by message_type
    #statistics_count = query_related_weibo(ts, all_mid_list, time_interval)
    if all_origin_list:
        #origin_weibo_detail = query_hot_weibo(ts, all_origin_list, time_interval)  # details of original weibos
        origin_weibo_detail = dict()
        for mid in all_origin_list:
            # Note: the original code issued the identical `fid` query for both
            # counts; adding message_type (3 = retweet, 2 = comment) mirrors the
            # retweeted branch below and is the likely intent.
            retweet_count = es_text.count(
                index=index_list, doc_type="text",
                body={"query": {"bool": {"must": [
                    {"term": {"fid": mid}},
                    {"term": {"message_type": 3}}
                ]}}})["count"]
            comment_count = es_text.count(
                index=index_list, doc_type="text",
                body={"query": {"bool": {"must": [
                    {"term": {"fid": mid}},
                    {"term": {"message_type": 2}}
                ]}}})["count"]
            tmp = dict()
            tmp["retweeted"] = retweet_count
            tmp["comment"] = comment_count
            origin_weibo_detail[mid] = tmp
    else:
        origin_weibo_detail = {}
    print "len(origin_weibo_detail): ", len(origin_weibo_detail)

    if all_retweeted_list:
        retweeted_weibo_detail = dict()
        for mid in all_retweeted_list:
            retweet_count = es_text.count(
                index=index_list, doc_type="text",
                body={"query": {"bool": {"must": [
                    {"term": {"root_mid": mid}},
                    {"term": {"message_type": 3}}
                ]}}})["count"]
            comment_count = es_text.count(
                index=index_list, doc_type="text",
                body={"query": {"bool": {"must": [
                    {"term": {"root_mid": mid}},
                    {"term": {"message_type": 2}}
                ]}}})["count"]
            tmp = dict()
            tmp["retweeted"] = retweet_count
            tmp["comment"] = comment_count
            retweeted_weibo_detail[mid] = tmp
        #retweeted_weibo_detail = query_hot_weibo(ts, all_retweeted_list, time_interval)  # details of retweeted weibos
    else:
        retweeted_weibo_detail = {}
    print "len(retweeted_weibo_detail): ", len(retweeted_weibo_detail)

    #current_total_count = statistics_count['total_count']  # total weibo count in the current window
    #current_retweeted_count = statistics_count['retweeted']
    #current_comment_count = statistics_count['comment']
    #all_mid_list = list(set(all_origin_list[:100]) | set(all_retweeted_list[:100]))

    # Sensed events, keyed by all_mid_list
    sensitive_text_list = []
    tmp_sensitive_warning = ""
    text_dict = dict()  # text content
    mid_value = dict()  # text value assignment
    duplicate_dict = dict()  # duplication map
    portrait_dict = dict()  # background info
    classify_text_dict = dict()  # texts to classify
    classify_uid_list = []
    duplicate_text_list = []
    sensitive_words_dict = dict()
    sensitive_weibo_detail = {}
    trendline_dict = dict()
    all_text_dict = dict()

    # Start when an event occurs
    if 1:
        print "index_list:", index_list
        if index_list and all_mid_list:
            query_body = {
                "query": {
                    "filtered": {
                        "filter": {
                            "terms": {"mid": all_mid_list}
                        }
                    }
                },
                "size": 5000
            }
            search_results = es_text.search(index=index_list, doc_type="text",
                                            body=query_body)['hits']['hits']
            print "search mid len: ", len(search_results)
            tmp_sensitive_warning = ""
            text_dict = dict()  # text content
            mid_value = dict()  # text value assignment
            duplicate_dict = dict()  # duplication map
            portrait_dict = dict()  # background info
            classify_text_dict = dict()  # texts to classify
            #classify_uid_list = []
            classify_mid_list = []
            duplicate_text_list = []
            sensitive_words_dict = dict()
            mid_ts_dict = dict()  # text publish time
            uid_prediction_dict = dict()
            weibo_prediction_dict = dict()
            trendline_dict = dict()
            feature_prediction_list = []  # feature lists
            mid_prediction_list = []  # corresponding mids
            if search_results:
                for item in search_results:
                    iter_uid = item['_source']['uid']
                    iter_mid = item['_source']['mid']
                    mid_ts_dict[iter_mid] = item["_source"]["timestamp"]
                    iter_text = item['_source']['text'].encode('utf-8', 'ignore')
                    iter_sensitive = item['_source'].get('sensitive', 0)

                    tmp_text = get_weibo(item['_source'])
                    all_text_dict[iter_mid] = tmp_text

                    duplicate_text_list.append({
                        "_id": iter_mid,
                        "title": "",
                        "content": iter_text.decode("utf-8", 'ignore')
                    })

                    if iter_sensitive:
                        tmp_sensitive_warning = signal_sensitive_variation  # weibos containing sensitive words
                        sensitive_words_dict[iter_mid] = iter_sensitive

                    keywords_dict = json.loads(item['_source']['keywords_dict'])
                    personal_keywords_dict = dict()
                    for k, v in keywords_dict.iteritems():
                        k = k.encode('utf-8', 'ignore')
                        personal_keywords_dict[k] = v
                    classify_text_dict[iter_mid] = personal_keywords_dict
                    #classify_uid_list.append(iter_uid)
                    classify_mid_list.append(iter_mid)

                # Deduplicate
                print "start duplicate"
                if duplicate_text_list:
                    dup_results = duplicate(duplicate_text_list)
                    for item in dup_results:
                        if item['duplicate']:
                            duplicate_dict[item['_id']] = item['same_from']

                # Classify
                print "start classify"
                mid_value = dict()
                if classify_text_dict:
                    #classify_results = topic_classfiy(classify_uid_list, classify_text_dict)
                    classify_results = topic_classfiy(classify_mid_list, classify_text_dict)
                    #print "classify_results: ", classify_results
                    for k, v in classify_results.iteritems():  # mid:value
                        #mid_value[k] = topic_value_dict[v[0]]
                        mid_value[k] = v[0]
                        #feature_list = organize_feature(k, mid_ts_dict[k])
                        #feature_prediction_list.append(feature_list)  # feature list
                        #mid_prediction_list.append(k)  # corresponding mid

            # prediction
            """
            print "start prediction"
            weibo_prediction_result = weibo_model.predict(feature_prediction_list)
            uid_prediction_result = uid_model.predict(feature_prediction_list)
            for i in range(len(mid_prediction_list)):
                if i % 100 == 0:
                    print i
                uid_prediction_dict[mid_prediction_list[i]] = uid_prediction_result[i]
                weibo_prediction_dict[mid_prediction_list[i]] = weibo_prediction_result[i]
                tmp_trendline = trendline_list(mid_prediction_list[i], weibo_prediction_result[i], mid_ts_dict[mid_prediction_list[i]])
                trendline_dict[mid_prediction_list[i]] = tmp_trendline
            """

            # organize data
            mid_list = all_text_dict.keys()
            print "final mid:", len(mid_list)
            print "intersection: ", len(set(mid_list) & set(all_mid_list))
            bulk_action = []
            count = 0
            for mid in mid_list:
                iter_dict = dict()
                if origin_weibo_detail.has_key(mid):
                    iter_dict.update(origin_weibo_detail[mid])
                    iter_dict["type"] = 1
                elif retweeted_weibo_detail.has_key(mid):
                    iter_dict.update(retweeted_weibo_detail[mid])
                    iter_dict["type"] = 3
                else:
                    iter_dict["retweeted"] = 0
                    iter_dict["comment"] = 0
                    print "mid in all_mid_list: ", mid in set(all_mid_list)
                #iter_dict["trendline"] = json.dumps(trendline_dict[mid])
                if duplicate_dict.has_key(mid):
                    iter_dict["duplicate"] = duplicate_dict[mid]
                else:
                    iter_dict["duplicate"] = ""
                #iter_dict["uid_prediction"] = uid_prediction_dict[mid]
                #iter_dict["weibo_prediction"] = weibo_prediction_dict[mid]
                iter_dict["compute_status"] = 0  # not yet computed
                iter_dict["topic_field"] = mid_value[mid]
                iter_dict["detect_ts"] = ts
                iter_dict["xnr_user_no"] = xnr_user_no
                iter_dict.update(all_text_dict[mid])
                count += 1
                print 'iter_dict:::', iter_dict
                _id = xnr_user_no + '_' + mid
                bulk_action.extend([{"index": {"_id": _id}}, iter_dict])
                if count % 500 == 0:
                    es_xnr.bulk(bulk_action, index="social_sensing_text",
                                doc_type="text", timeout=600)
                    bulk_action = []

            if bulk_action:
                es_xnr.bulk(bulk_action, index="social_sensing_text",
                            doc_type="text", timeout=600)

    return "1"
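# A hedged invocation sketch for social_sensing. The task_detail layout
# ([task_name, social_sensors, end_ts, xnr_user_no]) follows the unpacking at
# the top of the function; the sensor uid, timestamp, and XNR id below are
# illustrative:
#
#   import time
#   task_detail = ['demo_task', ['1234567890'], str(time.time()), 'WXNR0004']
#   social_sensing(task_detail)  # writes sensed weibos into social_sensing_text/text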