def get_sensitive_info(timestamp, mid):
    """Return the 'sensitive' score stored on weibo `mid` for the day of `timestamp`.

    Looks the document up in the daily flow-text index; returns 0 when the
    document is missing or has no 'sensitive' field.
    """
    index_name = flow_text_index_name_pre + ts2datetime(timestamp)
    try:
        item_result = es_flow_text.get(index=index_name,
                                       doc_type=flow_text_index_type,
                                       id=mid)['_source']
        sensitive_info = item_result['sensitive']
    except Exception:
        # Document not found or field absent: treat as non-sensitive.
        # (Was a bare `except:`, which also swallowed KeyboardInterrupt/SystemExit.)
        sensitive_info = 0
    return sensitive_info
def tw_uid2nick_name_photo(uid):
    """Resolve a Twitter uid to (nick_name, photo_url).

    Falls back to (uid, '') when the user document is missing or lacks the
    expected fields.
    """
    try:
        user = es_flow_text.get(index=twitter_user_index_name,
                                doc_type=twitter_user_index_type,
                                id=uid)['_source']
        nick_name = user['username']
        photo_url = user['profile_image_url_https']
    except Exception:
        # Unknown user: show the raw uid and no avatar.
        nick_name = uid
        photo_url = ''
    return nick_name, photo_url
def fb_uid2nick_name_photo(uid):
    """Resolve a Facebook uid to (nick_name, photo_url).

    photo_url is always '' — the index apparently has no avatar field yet
    (original code had `user['photo_url']` commented out). Falls back to
    (uid, '') when the user document is missing.
    """
    try:
        user = es_flow_text.get(index=facebook_user_index_name,
                                doc_type=facebook_user_index_type,
                                id=uid)['_source']
        nick_name = user['name']
        photo_url = ''  # no avatar field populated in the Facebook user index
    except Exception:
        nick_name = uid
        photo_url = ''
    return nick_name, photo_url
def organize_feature(mid, ts):
    """Build a 7-element feature vector for root weibo `mid` posted near `ts`.

    Features, counted over the first 10 hours after the root weibo's own
    timestamp:
      [log(fans + 1), comment + retweet, retweet, comment,
       retweet ratio, comment ratio, distinct participating uids]
    Returns all zeros when the root weibo cannot be found.
    """
    single_index = "flow_text_" + ts2datetime(ts)
    index_list = [single_index]
    # Diffusion may spill into the next day's index, if it exists.
    next_index = "flow_text_" + ts2datetime(ts + 24 * 3600)
    if es.indices.exists(next_index):
        index_list.append(next_index)

    result = es.get(index=single_index, doc_type="text", id=mid)["_source"]
    if not result:
        return [0, 0, 0, 0, 0, 0, 0]
    # Use the weibo's own timestamp from here on; the caller's `ts` was only
    # needed to locate the daily index.
    ts = result["timestamp"]

    feature_list = [math.log(result["user_fansnum"] + 1)]

    # Count comments (message_type == 2) and retweets (== 3) in the window.
    query_body_ts = {
        "query": {
            "bool": {
                "must": [
                    {"term": {"root_mid": mid}},
                    {"range": {"timestamp": {"lt": ts + 3600 * 10}}}
                ]
            }
        },
        "aggs": {
            "weibo_type": {
                "terms": {"field": "message_type"}
            }
        }
    }
    comment = 0
    retweet = 0
    tmp_count = es.search(index=index_list, doc_type="text",
                          body=query_body_ts)['aggregations']["weibo_type"]["buckets"]
    if tmp_count:
        for item in tmp_count:
            if int(item["key"]) == 2:
                comment = item["doc_count"]
            elif int(item["key"]) == 3:
                retweet = item["doc_count"]
    feature_list.append(comment + retweet)
    feature_list.append(retweet)
    feature_list.append(comment)
    # +1 in the denominator avoids division by zero for zero-diffusion weibos.
    feature_list.append(retweet / float(comment + retweet + 1))
    feature_list.append(comment / float(comment + retweet + 1))

    # Number of distinct participants in the same 10-hour window.
    query_body_uid = {
        "query": {
            "bool": {
                "must": [
                    {"term": {"root_mid": mid}},
                    {"range": {"timestamp": {"lt": ts + 3600 * 10}}}
                ]
            }
        },
        "aggs": {
            "uid_count": {
                "cardinality": {"field": "uid"}
            }
        }
    }
    uid_count = es.search(index=index_list, doc_type="text",
                          body=query_body_uid)['aggregations']["uid_count"]["value"]
    feature_list.append(uid_count)
    return feature_list
def trendline_list(mid, total_value):
    """Predict the diffusion trendline of weibo `mid`.

    Counts actual diffusion per `diffusion_time_interval` slot since the
    weibo was posted, fits a Bass-style one- or two-peak curve, then
    extrapolates rising/falling trends for the remaining slots.

    Returns a dict with keys "climax" ([timestamp, peak value]),
    "rise_trend", "fall_trend" and "exist_trend" ([[timestamp, count], ...]),
    or [] when the weibo cannot be found in the recent daily indices.
    """
    if RUN_TYPE:
        ts = time.time()
    else:
        ts = datetime2ts("2016-11-19")

    # Slots per day; floor division is intended — nn is used as a range bound
    # below (plain `/` would produce a float on Python 3).
    nn = 24 * 3600 // diffusion_time_interval

    # Search the last `diffusion_time` daily indices for the root weibo.
    index_list = []
    for i in range(diffusion_time):
        index_list.append("flow_text_" + ts2datetime(ts - i * 24 * 3600))
    result = dict()
    for iter_index in index_list:
        if not es.indices.exists(index=iter_index):
            continue
        try:
            result = es.get(index=iter_index, doc_type="text", id=mid)["_source"]
            break
        except Exception:
            # Not in this day's index; keep looking.
            pass
    if not result:
        return []

    current_list = []     # observed diffusion count per elapsed slot
    exist_time_list = []  # slot start timestamps already observed
    total_time_list = []  # all slot start timestamps over the full horizon

    timestamp = result["timestamp"]
    timestamp = datehour2ts(ts2datehour(timestamp))  # align to the hour
    for i in range(diffusion_time * nn):
        total_time_list.append(timestamp + i * diffusion_time_interval)

    # Count actual diffusion in each slot from posting time up to now.
    while True:
        query_body = {
            "query": {
                "bool": {
                    "must": [
                        {"term": {"root_mid": mid}},
                        {"range": {
                            "timestamp": {
                                "gte": timestamp,
                                "lt": timestamp + diffusion_time_interval
                            }
                        }}
                    ]
                }
            }
        }
        index_name = "flow_text_" + ts2datetime(timestamp)
        count = es.count(index=index_name, doc_type="text",
                         body=query_body)["count"]
        current_list.append(count)
        exist_time_list.append(timestamp)
        timestamp += diffusion_time_interval
        if timestamp >= ts:
            break

    # Future slots still to be predicted.
    left_set = set(total_time_list) - set(exist_time_list)
    left_list = sorted(left_set)

    max_value = max(current_list)
    index_exist = len(current_list)
    value = current_list

    # Heuristic cap for the predicted peak height.
    expected_value = total_value * 0.8 / (0.2 * nn * diffusion_time)
    if expected_value <= max_value:
        top_value = (max_value + total_value) / 2
    else:
        top_value = expected_value

    # Fit a Bass diffusion curve: detect peaks, then fit one- or two-peak form.
    k = 5
    h = 0.5
    peak = spd(value, h, k)
    flag = judge(peak, value)
    if len(flag) == 2:
        paras = getTwoBeauties(value, flag[0], flag[1])
        paras[-1] = diffusion_time * nn
        series = bassTwoPeaks(paras)
    else:
        paras = getSingleBeauty(value)
        paras[-1] = diffusion_time * nn
        series = bassOnePeak(paras)

    # Locate the predicted climax position.
    predict_climax = series.index(max(series))
    if predict_climax > index_exist:
        # Peak is still ahead: rise toward top_value, then fall.
        predict_climax_left = predict_climax - len(current_list)
        rise_trend, fall_trend = get_trend(left_list, predict_climax_left,
                                           value[-1], top_value)
        true_climax = exist_time_list[0] + \
            (exist_time_list[1] - exist_time_list[0]) * predict_climax
    else:
        # Peak already happened: only a falling tail remains.
        top_value = value[-1]
        rise_trend, fall_trend = get_trend(left_list, 0, value[-1], 1)
        true_climax = exist_time_list[value.index(max(value))]
        top_value = max(value)

    results = dict()
    results["climax"] = [true_climax, top_value]
    results["rise_trend"] = rise_trend
    results["fall_trend"] = fall_trend
    new_list = []
    for i in range(len(exist_time_list)):
        new_list.append([exist_time_list[i], value[i]])
    results["exist_trend"] = new_list
    return results