def task_list():
    create_task()
    while 1:
        task_detail = r_micro.rpop(task_micro_prediction)
        if not task_detail:
            break
        task_detail = json.loads(task_detail)
        task_name = task_detail[0]
        start_ts = task_detail[1]
        end_ts = task_detail[2]
        during = task_detail[3]
        mappings_micro_task("micro_prediction_" + task_name)
        while 1:
            es_result = es_prediction.get(index=index_manage_prediction_task,
                                          doc_type=type_manage_prediction_task,
                                          id=task_name)["_source"]
            if int(es_result["scan_text_processing"]) == 2:
                break
            else:
                time.sleep(10)
                r_micro.lpush(task_micro_prediction, json.dumps(task_detail))
        organize_feature(task_name, task_name, start_ts, end_ts)
        dispose_data(task_name, end_ts)
def start_task():
    while 1:
        detail = r_stimulation.rpop(task_stimulation)
        print "detail: ", detail
        if not detail:
            break
        task_detail = json.loads(detail)
        task_name = task_detail[0]
        stop_time = task_detail[1]
        scan_text_finish = task_detail[2]
        ts = task_detail[3]
        if RUN_TYPE == 1:
            while 1:
                if float(scan_text_finish) != float(1):
                    time.sleep(60)
                    scan_text_finish = es_prediction.get(index=index_manage_interfere_task,
                                                         doc_type=type_manage_interfere_task,
                                                         id=task_name)["_source"]["scan_text_finish"]
                else:
                    print "begin work"
                    break
        predict_user_influence(task_name, stop_time, ts)
def prediction_task(task_name, current_ts, during=3600):
    # how many prediction runs have been done
    exist_count = 0
    task_detail = es_prediction.get(index=index_manage_prediction_task,
                                    doc_type=type_manage_prediction_task,
                                    id=task_name)["_source"]
    start_time = int(task_detail["start_time"])
    origin_task_name = task_name
    task_name = "micro_prediction_" + task_name
    exist_work = dict()  # initialize so the except branch never hits an unbound name
    while 1:
        start_time += during
        try:
            exist_work = es_prediction.get(index=task_name, doc_type="micro_task",
                                           id=start_time)["_source"]
            if exist_work["prediction_value"]:
                pass
        except:
            if exist_work:
                update_time = start_time
            else:
                update_time = start_time - during
            break
        exist_count += 1
    """
    if exist_count == 0:
        update_time = start_time
    else:
        update_time = start_time - during
    """
    if update_time > current_ts:
        return True
    else:
        while 1:
            if update_time > current_ts:
                print "update time: ", update_time
                print "current ts: ", current_ts
                break
            else:
                print "update time: ", update_time
                dispose_data(origin_task_name, update_time, during)
                update_time += during
def organize_network(task_name):
    count = 0
    es_results = es_prediction.get(index=index_manage_interfere_task,
                                   doc_type=type_manage_interfere_task,
                                   id=task_name)["_source"]
    start_time = es_results["start_time"]
    stop_time = es_results["stop_time"]
    user_set = set()
    query_body = {
        "query": {
            "range": {
                "timestamp": {
                    "gte": start_time,
                    "lt": stop_time
                }
            }
        }
    }
    if RUN_TYPE == 0:
        query_body = {
            "query": {
                "range": {
                    "timestamp": {
                        "gte": 1482681602,
                        "lt": 1482681602 + 10 * 2400
                    }
                }
            }
        }
    es_scan = scan(es_prediction, query=query_body, index=task_name, doc_type="text", size=3000)
    while 1:
        try:
            re_es = es_scan.next()
            count += 1
            if count % 3000 == 0:
                print count
            detail = re_es["_source"]
            if int(detail["message_type"]) != 2:
                user_set.add(detail["uid"])
            if int(detail["message_type"]) == 3 or int(detail["message_type"]) == 2:
                if detail["directed_uid"]:
                    user_set.add(str(detail["directed_uid"]))
                    user_set.add(detail["root_uid"])
        except StopIteration:
            print "finish"
            break
    print len(user_set)
    return list(user_set)
def organize_network(task_name, ts):
    count = 0
    es_results = es_prediction.get(index=index_manage_interfere_task,
                                   doc_type=type_manage_interfere_task,
                                   id=task_name)["_source"]
    start_time = es_results["start_time"]
    stop_time = es_results["stop_time"]
    user_set = set()
    query_body = {
        "query": {
            "bool": {
                "must": [
                    {"range": {
                        "timestamp": {
                            "gte": start_time,
                            "lt": ts
                        }
                    }},
                    {"range": {
                        "user_fansnum": {
                            "gte": 10000
                        }
                    }}
                ]
            }
        }
    }
    es_scan = scan(es_prediction, query=query_body, index=task_name, doc_type="text", size=3000)
    while 1:
        try:
            re_es = es_scan.next()
            count += 1
            if count % 3000 == 0:
                print "search participators: ", count
            detail = re_es["_source"]
            if int(detail["message_type"]) != 2:
                user_set.add(detail["uid"])
            if int(detail["message_type"]) == 3 or int(detail["message_type"]) == 2:
                if detail["directed_uid"]:
                    user_set.add(str(detail["directed_uid"]))
                    user_set.add(detail["root_uid"])
        except StopIteration:
            print "finish"
            break
    print "current participators: ", len(user_set)
    return list(user_set)
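# organize_network() above drives the scroll by calling es_scan.next() and catching
# StopIteration by hand. elasticsearch.helpers.scan() returns an ordinary generator, so the
# same traversal can be written as a plain for loop; a minimal sketch under the same
# assumptions (es_prediction client, per-task "text" doc type, message_type semantics):
def _collect_participants_sketch(task_name, query_body):
    user_set = set()
    for hit in scan(es_prediction, query=query_body, index=task_name,
                    doc_type="text", size=3000):
        detail = hit["_source"]
        if int(detail["message_type"]) != 2:
            user_set.add(detail["uid"])
    return list(user_set)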
def dispose_results(task_name, ts, future_total, current_total):
    index_name = "stimulation_" + task_name
    index_type = "stimulation_results"
    results = es.get(index=index_name, doc_type=index_type, id=ts)["_source"]
    future_results = json.loads(results["future_results"])
    future_list = []
    # future diffusion paths
    diffusion_path = dict()
    # future diffusion values
    diffusion_value = dict()
    for start_uid, end_dict in future_results.iteritems():
        diffusion_path[start_uid] = end_dict.keys()
        future_list.extend(end_dict.keys())
        diffusion_value.update(end_dict)
    # info of future diffusers
    # uid, nick_name, photo_url, fans_num, weibo_num, prediction_value
    future_list = list(set(future_list))
    future_user_info = get_future_user(future_list)
    #print future_user_info
    for i in range(len(future_list)):
        uid = future_user_info[i][0]
        future_user_info[i].append(int(diffusion_value[uid]))
    # current hot weibos and user info
    current_hot_mid = search_hot_mid(task_name, ts)
    # current potential hot weibos
    potential_mid, t1, t2 = potential_user(task_name, ts)
    future_total += t1
    current_total += t2
    ratio = float(current_total) / future_total
    update_dict = dict()
    update_dict["diffusion_path"] = json.dumps(diffusion_path)
    update_dict["future_user_info"] = json.dumps(future_user_info)
    update_dict["current_hot_weibo"] = json.dumps(current_hot_mid)
    update_dict["potential_hot_weibo"] = json.dumps(potential_mid)
    update_dict["ratio"] = ratio
    es.update(index=index_name, doc_type=index_type, id=ts, body={"doc": update_dict})
    return True
def create_task():
    ts = time.time()
    current_ts = datehour2ts(ts2datehour(ts))
    query_body = {
        "query": {
            "term": {"finish": "0"}
        },
        "size": 10000
    }
    results = es_prediction.search(index=index_manage_prediction_task,
                                   doc_type=type_manage_prediction_task,
                                   body=query_body)["hits"]["hits"]
    for item in results:
        task_name = item["_source"]["pinyin_task_name"]
        print "push task_name: ", task_name
        task_detail = es_prediction.get(index=index_manage_prediction_task,
                                        doc_type=type_manage_prediction_task,
                                        id=task_name)["_source"]
        # update_time lives in the task document, not at the top level of the search hit
        update_time = task_detail["update_time"]
        stop_time = item["_source"]["stop_time"]
        if current_ts > stop_time:
            es_prediction.update(index=index_manage_prediction_task,
                                 doc_type=type_manage_prediction_task,
                                 id=task_name, body={"doc": {"finish": "1"}})
        during = item["_source"]["micro_during"]
        if current_ts - update_time >= during:
            r_micro.lpush(task_micro_prediction,
                          json.dumps([task_name, update_time, current_ts, during]))
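# A minimal sketch of the Redis hand-off between create_task() and task_list(): the payload is
# a JSON list [task_name, update_time, current_ts, during]. The task name and timestamps below
# are made-up values; r_micro and task_micro_prediction are assumed to be the same client and
# queue key configured elsewhere in the project.
def _seed_micro_queue_sketch():
    r_micro.lpush(task_micro_prediction,
                  json.dumps(["test_task", 1482861600, 1482865200, 3600]))
    # what task_list() unpacks after rpop:
    task_name, update_time, current_ts, during = json.loads(r_micro.rpop(task_micro_prediction))
    print "popped: ", task_name, update_time, current_ts, during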
def dispose_data(task_name, current_ts, during=3600):
    K = 2  ########
    task_detail = es_prediction.get(index=index_manage_prediction_task,
                                    doc_type=type_manage_prediction_task,
                                    id=task_name)["_source"]
    start_time = int(task_detail["start_time"])
    origin_task_name = task_name
    task_name = "micro_prediction_" + task_name
    query_body = {
        "query": {
            "range": {
                "update_time": {"lte": current_ts}
            }
        },
        "size": K,
        "sort": {"update_time": {"order": "desc"}}
    }
    sort_query_body = {
        "query": {
            "range": {
                "update_time": {"lte": current_ts}
            }
        }
    }
    total_count = []
    total_fans_list = []
    total_origin_list = []
    total_retweet_list = []
    total_comment_list = []
    total_uid_list = []
    total_positive_list = []
    total_negetive_list = []
    average_origin_ts = []
    average_retweet_ts = []
    feature_list = []
    results = es_prediction.search(index=task_name, doc_type=index_type_prediction_task,
                                   body=query_body)["hits"]["hits"]
    location = es_prediction.count(index=task_name, doc_type=index_type_prediction_task,
                                   body=sort_query_body)["count"]
    if len(results) != K:
        short_len = K - len(results)
        results.extend([[]] * short_len)
    print "former result: ", len(results), K
    results.reverse()
    for item in results:
        if item:
            item = item["_source"]
            #total_fans_list.append(item["total_fans_number"])
            total_origin_list.append(item["origin_weibo_number"])
            total_retweet_list.append(item["retweeted_weibo_number"])
            total_comment_list.append(item["comment_weibo_number"])
            total_count.append(item["total_count"])
            total_uid_list.append(item["total_uid_count"])
            total_positive_list.append(item["positive_count"])
            total_negetive_list.append(item["negetive_count"])
            average_origin_ts.append(item["average_origin_ts"])
            average_retweet_ts.append(item["average_retweet_ts"])
        else:
            #total_fans_list.append(0)
            total_origin_list.append(0)
            total_retweet_list.append(0)
            total_comment_list.append(0)
            total_uid_list.append(0)
            total_count.append(0)
            total_positive_list.append(0)
            total_negetive_list.append(0)
            average_origin_ts.append(0)
            average_retweet_ts.append(0)
    print "total_count: ", total_count

    feature_list = []
    feature_list.append(math.log(int(total_retweet_list[-1] + 1)))
    feature_list.append(math.log(int(total_comment_list[-1] + 1)))
    feature_list.append(math.log(int(total_positive_list[-1] + 1)))
    feature_list.append(math.log(int(total_negetive_list[-2] + 1)))
    feature_list.append(math.log(int(total_negetive_list[-1] + 1)))
    feature_list.append(math.log(int(total_count[-1] + 1)))
    feature_list.append(math.log(int(total_uid_list[-1] + 1)))
    if int(during) == 3 * 3600:
        feature_list.append(average_origin_ts[-1])
        feature_list.append(average_retweet_ts[-1])

    # load model and predict
    if int(during) == 3600:
        if total_count[-1] - total_count[-2] >= -0.2 * total_count[-2]:
            with open("model-up.pkl", "r") as f:
                gbdt = pickle.load(f)
        else:
            with open("model-down.pkl", "r") as f:
                gbdt = pickle.load(f)
    elif int(during) == 3 * 3600:
        with open("model-3.pkl", "r") as f:
            gbdt = pickle.load(f)
    print "feature_list: ", feature_list
    pred = gbdt.predict(feature_list)
    for item in pred:
        prediction_value = item
    prediction_value = math.exp(prediction_value)
    print "prediction_value: ", prediction_value

    # update scan processing
    #es_prediction.update(index=index_manage_prediction_task, doc_type=type_manage_prediction_task, \
    #        id=origin_task_name, body={"doc": {"scan_text_processing": "0"}})

    # update prediction value in es
    task_detail = es_prediction.get(index=index_manage_prediction_task,
                                    doc_type=type_manage_prediction_task,
                                    id=origin_task_name)["_source"]
    if current_ts >= int(task_detail["stop_time"]):
        task_detail["finish"] = "1"
        task_detail["processing_status"] = "0"
        # update task info
        es_prediction.index(index=index_manage_prediction_task,
                            doc_type=type_manage_prediction_task,
                            id=origin_task_name, body=task_detail)

    # update prediction
    es_prediction.update(index=task_name, doc_type=index_type_prediction_task,
                         id=current_ts, body={"doc": {"prediction_value": prediction_value}})

    return True
def dispose_data(task_name, current_ts):
    es_result = es_prediction.get(index=index_manage_prediction_task,
                                  doc_type=type_manage_prediction_task,
                                  id=task_name)["_source"]
    macro_during = es_result['macro_during']
    start_ts = datehour2ts(ts2datehour(es_result["submit_time"]))
    task_start_ts = start_ts
    end_ts = datehour2ts(ts2datehour(es_result["stop_time"]))
    index_micro = "micro_prediction_" + task_name
    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "range": {
                        "update_time": {"lte": current_ts}
                    }
                }
            }
        },
        "size": 10000,
        "sort": {"update_time": {"order": "asc"}}
    }
    micro_results = es_prediction.search(index=index_micro, doc_type="micro_task",
                                         body=query_body)["hits"]["hits"]
    total_list = []
    for item in micro_results:
        total_list.append(item["_source"]["total_count"])

    # weibo volume in each time interval
    total_len = (end_ts - start_ts) / macro_during
    times = int(macro_during) / 3600
    lenth = len(total_list) / times
    adjust_list = []
    time_list = []
    count = 0
    i = 0
    for item in total_list:
        count += item
        i += 1
        if i % times == 0:
            if start_ts <= current_ts:
                adjust_list.append(count)
                count = 0
                time_list.append(start_ts)
            else:
                break
        start_ts += 3600

    # overall timeline of the trend
    total_time_list = []
    for i in range(total_len):
        total_time_list.append(task_start_ts + i * macro_during)
    left_time = list(set(total_time_list) - set(time_list))
    left_time = sorted(left_time)
    return adjust_list, total_len, time_list, left_time
def organize_feature(task_name, mid, ts):
    result = dict()
    try:
        result = es.get(index=task_name, doc_type="text", id=mid)["_source"]
    except:
        pass
    if not result:
        return [0, 0, 0, 0, 0, 0, 0]
    ts = result["timestamp"]
    query_body = {"query": {"term": {"root_mid": mid}}}
    #total_weibo
    #count = es.count(index=index_list, doc_type="text", body=query_body)["count"]
    query_body_uid = {
        "query": {
            "term": {"root_mid": mid}
        },
        "aggs": {
            "uid_count": {
                "cardinality": {"field": "uid"}
            }
        }
    }
    # total_uid
    #total_uid_count = es.search(index=index_list, doc_type="text", body=query_body_uid)['aggregations']["uid_count"]["value"]
    feature_list = []
    feature_list.append(math.log(result["user_fansnum"] + 1))
    query_body_ts = {
        "query": {
            "bool": {
                "must": [
                    {"term": {"root_mid": mid}},
                    {"range": {"timestamp": {"lt": ts + 3600 * 10}}}
                ]
            }
        },
        "aggs": {
            "weibo_type": {
                "terms": {"field": "message_type"}
            }
        }
    }
    comment = 0
    retweet = 0
    tmp_count = es.search(index=task_name, doc_type="text",
                          body=query_body_ts)['aggregations']["weibo_type"]["buckets"]
    if tmp_count:
        for item in tmp_count:
            if int(item["key"]) == 2:
                comment = item["doc_count"]
            elif int(item["key"]) == 3:
                retweet = item["doc_count"]
    feature_list.append(comment + retweet)
    feature_list.append(retweet)
    feature_list.append(comment)
    feature_list.append(retweet / float(comment + retweet + 1))
    feature_list.append(comment / float(comment + retweet + 1))
    query_body_uid = {
        "query": {
            "bool": {
                "must": [
                    {"term": {"root_mid": mid}},
                    {"range": {"timestamp": {"lt": ts + 3600 * 10}}}
                ]
            }
        },
        "aggs": {
            "uid_count": {
                "cardinality": {"field": "uid"}
            }
        }
    }
    uid_count = es.search(index=task_name, doc_type="text",
                          body=query_body_uid)['aggregations']["uid_count"]["value"]
    feature_list.append(uid_count)
    #feature_list.append(topic_field_dict[topic])
    return feature_list
def potential_user(task_name, ts):
    index_name = "stimulation_" + task_name
    index_type = "stimulation_results"
    # query current root_mid
    query_body = {
        "query": {
            "bool": {
                "must": [
                    {"range": {
                        "timestamp": {"lt": ts}
                    }},
                    {"term": {"message_type": 1}},
                    {"range": {
                        "user_fansnum": {"gte": 10000}
                    }}
                ]
            }
        },
        "size": 10000
    }
    es_results = es.search(index=task_name, doc_type="text", body=query_body)["hits"]["hits"]
    mid_list = []
    uid_list = []
    feature_list = []
    prediction_uid = []
    prediction_weibo = []
    with open("prediction_uid.pkl", "r") as f:
        uid_model = pickle.load(f)
    with open("prediction_weibo.pkl", "r") as f:
        weibo_model = pickle.load(f)
    for item in es_results:
        mid_list.append(item["_id"])
        uid_list.append(item["_source"]["uid"])
        tmp_feature_list = organize_feature(task_name, item["_id"], ts)
        feature_list.append(tmp_feature_list)
    weibo_prediction_result = weibo_model.predict(feature_list)
    uid_prediction_result = uid_model.predict(feature_list)

    future_total = 0
    current_total = 0
    results_dict = dict()
    in_potential_list = []
    for i in range(len(mid_list)):
        mid = mid_list[i]
        uid = uid_list[i]
        iter_count = es.count(index=task_name, doc_type="text",
                              body={"query": {"term": {"root_mid": mid}}})["count"]
        pre_count = weibo_prediction_result[i]
        future_total += abs(pre_count - iter_count)
        if pre_count >= 500 and iter_count <= 500:
            current_total += abs(pre_count - iter_count)
            if not results_dict.has_key(uid):
                results_dict[uid] = dict()
            tmp = dict()
            tmp["mid"] = mid
            tmp["current_count"] = iter_count
            tmp["prediction_count"] = int(pre_count)
            weibo_detail = es.get(index=task_name, doc_type="text", id=mid)["_source"]
            tmp.update(weibo_detail)
            retweet, comment = search_retweet_comment(task_name, mid)
            tmp["retweeted"] = retweet
            tmp["comment"] = comment
            results_dict[uid][mid] = tmp

    # user profile
    tmp_in_list = results_dict.keys()
    if tmp_in_list:
        profile_results = es_user_profile.mget(index=profile_index_name,
                                               doc_type=profile_index_type,
                                               body={"ids": tmp_in_list})["docs"]
        for i in range(len(tmp_in_list)):
            detail = profile_results[i]
            tmp = []
            uid = tmp_in_list[i]
            if detail["found"]:
                tmp.append(detail["_source"]["nick_name"])
                tmp.append(detail["_source"]["photo_url"])
                tmp.append(detail["_source"]["fansnum"])
                tmp.append(detail["_source"]["statusnum"])
            else:
                tmp.append(detail["_id"])
                tmp.extend(["", "", ""])
            results_dict[uid]["user_profile"] = tmp
    return results_dict, future_total, current_total
def get_origin_weibo_detail(ts, size, order, message_type=1):
    #print r.get("topic_value_dict")
    # error: the stored topic_value_dict is empty
    #topic_value_dict = json.loads(r.get("topic_value_dict"))
    task_detail = es_prediction.get(index=index_sensing_task, doc_type=_id, id=ts)['_source']

    mid_value = json.loads(task_detail['mid_topic_value'])
    duplicate_dict = json.loads(task_detail['duplicate_dict'])
    tmp_duplicate_dict = dict()
    for k, v in duplicate_dict.iteritems():
        try:
            tmp_duplicate_dict[v].append(k)
        except:
            tmp_duplicate_dict[v] = [k, v]

    if message_type == 1:
        weibo_detail = json.loads(task_detail['origin_weibo_detail'])
    elif message_type == 2:
        weibo_detail = json.loads(task_detail['retweeted_weibo_detail'])
    else:
        weibo_detail = json.loads(task_detail['sensitive_weibo_detail'])
    weibo_detail_list = []
    if weibo_detail:
        for iter_mid, item in weibo_detail.iteritems():
            tmp = []
            tmp.append(iter_mid)
            tmp.append(item[iter_mid])
            tmp.append(item['retweeted'])
            tmp.append(item['comment'])
            weibo_detail_list.append(tmp)
    mid_list = weibo_detail.keys()

    results = []
    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "terms": {"mid": mid_list}
                }
            }
        },
        "size": 1000,
        "sort": {"timestamp": {"order": "desc"}}
    }

    index_list = []
    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts - DAY)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    if exist_es:
        index_list.append(index_name)
    index_name_1 = flow_text_index_name_pre + datetime_1
    exist_es_1 = es_text.indices.exists(index_name_1)
    if exist_es_1:
        index_list.append(index_name_1)

    if index_list and mid_list:
        search_results = es_text.search(index=index_list, doc_type=flow_text_index_type,
                                        body=query_body)["hits"]["hits"]
    else:
        search_results = []

    uid_list = []
    text_dict = dict()      # text information
    portrait_dict = dict()  # profile information
    sort_results = []
    if search_results:
        for item in search_results:
            uid_list.append(item["_source"]['uid'])
            text_dict[item['_id']] = item['_source']  # the _id is the mid
        if uid_list:
            portrait_result = es_profile.mget(index=profile_index_name,
                                              doc_type=profile_index_type,
                                              body={"ids": uid_list},
                                              fields=['nick_name', 'photo_url'])["docs"]
            for item in portrait_result:
                if item['found']:
                    portrait_dict[item['_id']] = {"nick_name": item["fields"]["nick_name"][0],
                                                  "photo_url": item["fields"]["photo_url"][0]}
                else:
                    portrait_dict[item['_id']] = {"nick_name": item['_id'], "photo_url": ""}

    if order == "total":
        sorted_list = sorted(weibo_detail_list, key=lambda x: x[1], reverse=True)
    elif order == "retweeted":
        sorted_list = sorted(weibo_detail_list, key=lambda x: x[2], reverse=True)
    elif order == "comment":
        sorted_list = sorted(weibo_detail_list, key=lambda x: x[3], reverse=True)
    else:
        sorted_list = weibo_detail_list

    count_n = 0
    results_dict = dict()
    mid_index_dict = dict()
    for item in sorted_list:  # size
        mid = item[0]
        iter_text = text_dict.get(mid, {})
        temp = []
        # uid, nick_name, photo_url, text, sentiment, timestamp, geo, common_keywords, message_type
        if iter_text:
            uid = iter_text['uid']
            temp.append(uid)
            iter_portrait = portrait_dict.get(uid, {})
            if iter_portrait:
                temp.append(iter_portrait['nick_name'])
                temp.append(iter_portrait['photo_url'])
            else:
                temp.extend([uid, ''])
            temp.append(iter_text["text"])
            temp.append(iter_text["sentiment"])
            temp.append(ts2date(iter_text['timestamp']))
            temp.append(iter_text['geo'])
            if message_type == 1:
                temp.append(1)
            elif message_type == 2:
                temp.append(3)
            else:
                temp.append(iter_text['message_type'])
            temp.append(item[2])
            temp.append(item[3])
            temp.append(iter_text.get('sensitive', 0))
            temp.append(iter_text['timestamp'])
            temp.append(mid_value[mid])
            temp.append(mid)
            results.append(temp)
        count_n += 1

    results = sorted(results, key=operator.itemgetter(-4, -2, -6), reverse=True)  # -4 -2 -3
    sort_results = []
    count = 0
    for item in results:
        sort_results.append([item])
        mid_index_dict[item[-1]] = count
        count += 1

    if tmp_duplicate_dict:
        remove_list = []
        value_list = tmp_duplicate_dict.values()  # [[mid, mid], ]
        for item in value_list:
            tmp = []
            for mid in item:
                if mid_index_dict.get(mid, 0):
                    tmp.append(mid_index_dict[mid])
            if len(tmp) > 1:
                tmp_min = min(tmp)
            else:
                continue
            tmp.remove(tmp_min)
            for iter_count in tmp:
                sort_results[tmp_min].extend(sort_results[iter_count])
                remove_list.append(sort_results[iter_count])
        if remove_list:
            for item in remove_list:
                sort_results.remove(item)

    return sort_results
def get_retweet_weibo_detail(ts, size, text_type, type_value):
    task_detail = es_prediction.get(index=index_sensing_task, doc_type=_id, id=ts)['_source']
    origin_weibo_detail = json.loads(task_detail['origin_weibo_detail'])
    retweeted_weibo_detail = json.loads(task_detail['retweeted_weibo_detail'])
    mid_list = []
    mid_list.extend(origin_weibo_detail.keys())
    mid_list.extend(retweeted_weibo_detail.keys())

    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [
                            {"range": {
                                "timestamp": {
                                    "gte": ts - time_interval,
                                    "lt": ts
                                }
                            }},
                            {"terms": {"root_mid": mid_list}}
                        ]
                    }
                }
            }
        },
        "sort": {"timestamp": {"order": "desc"}},
        "size": 100
    }

    if text_type == "message_type":
        query_body['query']['filtered']['filter']['bool']['must'].append({"term": {text_type: type_value}})
    if text_type == "sentiment":
        #if isinstance(type_value, str):
        if len(type_value) == 1:
            query_body['query']['filtered']['filter']['bool']['must'].append({"term": {text_type: type_value}})
        else:
            query_body['query']['filtered']['filter']['bool']['must'].append({"terms": {text_type: type_value}})

    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts - time_interval)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    index_name_1 = flow_text_index_name_pre + datetime_1
    exist_es_1 = es_text.indices.exists(index_name_1)

    # 1. query the weibos
    if datetime == datetime_1 and exist_es:
        search_results = es_text.search(index=index_name, doc_type=flow_text_index_type,
                                        body=query_body)["hits"]["hits"]
    elif datetime != datetime_1 and exist_es_1:
        search_results = es_text.search(index=index_name_1, doc_type=flow_text_index_type,
                                        body=query_body)["hits"]["hits"]
    else:
        search_results = []
    #print search_results

    # 2. collect weibo-related information
    results = []
    uid_list = []
    if search_results:
        for item in search_results:
            uid_list.append(item["_source"]['uid'])
        if uid_list:
            portrait_result = es_profile.mget(index=profile_index_name,
                                              doc_type=profile_index_type,
                                              body={"ids": uid_list},
                                              fields=['nick_name', 'photo_url'])["docs"]
            for i in range(len(uid_list)):
                item = search_results[i]['_source']
                temp = []
                # uid, nick_name, photo_url, text, sentiment, timestamp, geo, common_keywords, message_type
                temp.append(item['uid'])
                if portrait_result[i]['found']:
                    temp.append(portrait_result[i]["fields"]["nick_name"][0])
                    temp.append(portrait_result[i]["fields"]["photo_url"][0])
                else:
                    temp.append(item['uid'])
                    temp.append("")
                temp.append(item["text"])
                #print item['text']
                temp.append(item["sentiment"])
                temp.append(ts2date(item['timestamp']))
                temp.append(item['geo'])
                temp.append(item["message_type"])
                results.append(temp)

    return results
def task_list():
    create_task()
    if RUN_TYPE:
        current_ts = datehour2ts(ts2datehour(time.time()))
    else:
        current_ts = 1482861600
    while 1:
        task_detail = r_trendline.rpop(task_trendline)
        print task_detail
        if not task_detail:
            break
        task_name = task_detail
        while 1:
            micro_index = "micro_prediction_" + task_name
            es_exist = es_prediction.exists(index=micro_index, doc_type="micro_task", id=current_ts)
            if not es_exist:
                time.sleep(60)
            else:
                break

        # obtain time series
        value, total_len, time_list, left_list = dispose_data(task_name, current_ts)

        # macro prediction result
        try:
            es_macro_result = es_prediction.get(index=index_macro_feature_result,
                                                doc_type=type_macro_feature_result,
                                                id=task_name)["_source"]
            prediction_total_value = es_macro_result["predict_weibo_value"]
            top_value = prediction_total_value * 0.8 / (0.2 * total_len)
        except:
            top_value = 0

        # the known maximum value and its position
        max_exist = max(value)
        index_exist = len(value)
        if top_value < max_exist:
            top_value = 2 * max_exist

        # weibo prediction
        k = 5
        h = 0.5
        peak = spd(value, h, k)
        flag = judge(peak, value)
        if len(flag) == 2:
            print("Two peaks:")
            paras = getTwoBeauties(value, flag[0], flag[1])
            paras[-1] = total_len
            series = bassTwoPeaks(paras)
        else:
            print("Single peak:")
            paras = getSingleBeauty(value)
            paras[-1] = total_len
            series = bassOnePeak(paras)

        # predicted position of the peak
        predict_climax = series.index(max(series))
        if predict_climax > index_exist:
            predict_climax_left = predict_climax - len(value)
            # remaining trend: climax position, boundary values, maximum value
            rise_trend, fall_trend = get_trend(left_list, predict_climax_left, value[-1], top_value)
            true_climax = time_list[0] + (time_list[1] - time_list[0]) * predict_climax
        else:
            top_value = value[-1]
            rise_trend, fall_trend = get_trend(left_list, 0, value[-1], 1)
            true_climax = time_list[value.index(max(value))]

        results = dict()
        results["climax"] = [true_climax, top_value]
        results["rise_trend"] = rise_trend
        results["fall_trend"] = fall_trend
        new_list = []
        for i in range(len(time_list)):
            new_list.append([time_list[i], value[i]])
        results["exist_trend"] = new_list

        r_trendline.set("trendline_" + task_name, json.dumps(results))
        print results
def rank_predict(event, start_ts, end_ts):
    feature_list = feature_compute(event, start_ts, end_ts)
    print 'feature_list:::::', feature_list
    feature_list_gbdt = []
    for i in range(len(feature_list)):
        # split the compound features into separate values (extend)
        if i == 15:
            feature_list[i] = json.loads(json.dumps(feature_list[i]))
            print 'type::', type(feature_list[i])
            print 'feature_list[i][at_0]::', feature_list[i]['at_0']
            feature_list_gbdt.append(feature_list[i]['at_0'])
            feature_list_gbdt.append(feature_list[i]['at_1'])
            feature_list_gbdt.append(feature_list[i]['at_2'])
            feature_list_gbdt.append(feature_list[i]['at>3'])
        elif i == 16:
            feature_list[i] = json.loads(json.dumps(feature_list[i]))
            print 'feature_list[i]:::', feature_list[i]
            feature_list_gbdt.append(feature_list[i][0])
            feature_list_gbdt.append(feature_list[i][1])
            feature_list_gbdt.append(feature_list[i][2])
            feature_list_gbdt.append(feature_list[i][3])
        else:
            feature_list_gbdt.append(feature_list[i])
    print 'feature_list_gbdt:::::', feature_list_gbdt

    # load the weibo-volume model
    with open("0305_macro-prediction-weibos-value.pkl", "rb") as f:
        gbdt = pickle.load(f)
    pred = gbdt.predict(feature_list_gbdt)
    for item in pred:
        predict_weibo_value = item

    # load the user-volume model
    with open("0305_macro-prediction-uids-value.pkl", "rb") as f:
        gbdt = pickle.load(f)
    pred = gbdt.predict(feature_list_gbdt)
    for item in pred:
        predict_user_value = item
    predict_rank = get_rank(predict_user_value)

    ## store into the event info table
    #for i in range(len(feature_list)):
    feature_results = {}
    feature_results['event'] = event
    '''
    feature_results['topic_field'] = feature_list[0]
    feature_results['total_num'] = feature_list[1]
    feature_results['total_user_fans'] = feature_list[2]
    feature_results['total_comment'] = feature_list[3]
    feature_results['total_retweet'] = feature_list[4]
    feature_results['total_sensitive'] = feature_list[5]
    feature_results['total_sensitive_ratio'] = feature_list[6]
    feature_results['total_negtive'] = feature_list[7]
    feature_results['total_important_user'] = feature_list[8]
    feature_results['total_origin_type'] = feature_list[9]
    feature_results['origin_ratio'] = feature_list[10]
    feature_results['total_retweet_type'] = feature_list[11]
    feature_results['retweet_ratio'] = feature_list[12]
    feature_results['total_comment_type'] = feature_list[13]
    feature_results['comment_ratio'] = feature_list[14]
    feature_results['at_count'] = feature_list[15]
    feature_results['event_uid_count'] = feature_list[16]
    feature_results['event_trend_delta'] = feature_list[17]
    feature_results['predict_weibo_value'] = predict_weibo_value
    feature_results['predict_user_value'] = predict_user_value
    feature_results['predict_rank'] = predict_rank
    feature_results['update_time'] = time.time()
    '''
    #feature_results['topic_field'] = feature_list[0]
    feature_results['uid_count'] = feature_list[0]
    feature_results['total_num'] = feature_list[1]
    feature_results['total_user_fans'] = feature_list[2]
    feature_results['total_comment'] = feature_list[3]
    feature_results['total_retweet'] = feature_list[4]
    feature_results['total_sensitive'] = feature_list[5]
    feature_results['total_sensitive_ratio'] = feature_list[6]
    feature_results['total_negtive'] = feature_list[7]
    feature_results['total_important_user'] = feature_list[8]
    feature_results['total_origin_type'] = feature_list[9]
    feature_results['origin_ratio'] = feature_list[10]
    feature_results['total_retweet_type'] = feature_list[11]
    feature_results['retweet_ratio'] = feature_list[12]
    feature_results['total_comment_type'] = feature_list[13]
    feature_results['comment_ratio'] = feature_list[14]
    feature_results['at_count'] = json.dumps(feature_list[15])
    feature_results['event_trend_delta'] = json.dumps(feature_list[16])
    feature_results['predict_weibo_value'] = predict_weibo_value
    feature_results['predict_user_value'] = predict_user_value
    feature_results['predict_rank'] = predict_rank
    feature_results['update_time'] = time.time()
    '''
    save_event_info_results(event,topic_field,total_num,total_user_fans,\
        total_comment,total_retweet,total_sensitive,\
        total_sensitive_ratio,total_negtive,total_important_user,\
        total_origin_type,origin_ratio,total_retweet_type,retweet_ratio,\
        total_comment_type,comment_ratio,at_count,event_uid_count,\
        event_trend_delta,predict_value,predict_rank,update_time)
    '''
    # update macro features & results
    # keep feature_results as a dict: the update API needs "doc" to be an object,
    # not a JSON-encoded string
    try:
        item_exists = es_prediction.get(index=index_macro_feature_result,
                                        doc_type=type_macro_feature_result,
                                        id=event)['_source']
        es_prediction.update(index=index_macro_feature_result,
                             doc_type=type_macro_feature_result,
                             id=event, body={'doc': feature_results})
    except:
        es_prediction.index(index=index_macro_feature_result,
                            doc_type=type_macro_feature_result,
                            id=event, body=feature_results)

    # update task info -- "macro_value_finish"
    task_detail = es_prediction.get(index=index_manage_prediction_task,
                                    doc_type=type_manage_prediction_task, id=event)["_source"]
    task_detail["macro_value_finish"] = '1'
    es_prediction.index(index=index_manage_prediction_task,
                        doc_type=type_manage_prediction_task, id=event, body=task_detail)

    print 'feature_results::::', feature_results
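# The try/except around get()/update()/index() in rank_predict() is an "update if present,
# otherwise create" pattern. Elasticsearch's update API can express this in one call with
# doc_as_upsert, which also avoids the bare except swallowing unrelated errors; a sketch
# using the same client and index names as above:
def _upsert_macro_result_sketch(event, feature_results):
    es_prediction.update(index=index_macro_feature_result,
                         doc_type=type_macro_feature_result,
                         id=event,
                         body={"doc": feature_results, "doc_as_upsert": True})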