Example #1
def task_list():
    create_task()
    while 1:
        task_detail = r_micro.rpop(task_micro_prediction)
        if not task_detail:
            break

        task_detail = json.loads(task_detail)
        task_name = task_detail[0]
        start_ts = task_detail[1]
        end_ts = task_detail[2]
        during = task_detail[3]

        mappings_micro_task("micro_prediction_"+task_name)

        # wait for the text-scan stage to mark this task done (status 2)
        es_result = es_prediction.get(index=index_manage_prediction_task, doc_type=type_manage_prediction_task, id=task_name)["_source"]
        if int(es_result["scan_text_processing"]) != 2:
            # not ready: re-queue the task once and move on, instead of
            # pushing a duplicate copy onto the queue every 10 seconds
            time.sleep(10)
            r_micro.lpush(task_micro_prediction, json.dumps(task_detail))
            continue

        organize_feature(task_name, task_name, start_ts, end_ts)
        dispose_data(task_name, end_ts)
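The producer/consumer pair above communicates through a Redis list. A minimal, self-contained sketch of that round-trip, assuming a local Redis instance; the queue key and the four-element payload layout mirror the code above, everything else is placeholder:

import json
import redis

r_micro = redis.StrictRedis(host="localhost", port=6379, db=0)  # placeholder connection
task_micro_prediction = "task_micro_prediction"  # assumed queue key

# producer side: the payload create_task() is expected to push
r_micro.lpush(task_micro_prediction,
              json.dumps(["example_task", 1482681600, 1482685200, 3600]))

# consumer side: the rpop/json.loads round-trip used by task_list()
raw = r_micro.rpop(task_micro_prediction)
if raw:
    task_name, start_ts, end_ts, during = json.loads(raw)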
Example #2
def start_task():
    while 1:
        detail = r_stimulation.rpop(task_stimulation)
        print "detail: ", detail
        if not detail:
            break

        task_detail = json.loads(detail)

        task_name = task_detail[0]
        stop_time = task_detail[1]
        scan_text_finish = task_detail[2]
        ts = task_detail[3]

        if RUN_TYPE == 1:
            while 1:
                if float(scan_text_finish) != float(1):
                    time.sleep(60)
                    scan_text_finish = es_prediction.get(index=index_manage_interfere_task,doc_type=\
                        type_manage_interfere_task,id=task_name)["_source"]["scan_text_finish"]
                else:
                    print "begin work"
                    break

        predict_user_influence(task_name, stop_time, ts)
def prediction_task(task_name, current_ts, during=3600):
    # count how many prediction windows have already been produced
    exist_count = 0
    exist_work = None  # doc from the previous window; None until the first get succeeds
    task_detail = es_prediction.get(index=index_manage_prediction_task,
                                    doc_type=type_manage_prediction_task,
                                    id=task_name)["_source"]
    start_time = int(task_detail["start_time"])
    origin_task_name = task_name
    task_name = "micro_prediction_" + task_name

    while 1:
        start_time += during
        try:
            exist_work = es_prediction.get(index=task_name,
                                           doc_type="micro_task",
                                           id=start_time)["_source"]
            if exist_work["prediction_value"]:
                pass
        except Exception:
            # the first missing/incomplete window marks where to resume
            if exist_work:
                update_time = start_time
            else:
                update_time = start_time - during
            break
        exist_count += 1
    """
    if exist_count == 0:
        update_time = start_time
    else:
        update_time = start_time -during
    """

    if update_time > current_ts:
        return True

    else:
        while 1:
            if update_time > current_ts:
                print "update time: ", update_time
                print "current ts: ", current_ts
                break
            else:
                print "update time: ", update_time
                dispose_data(origin_task_name, update_time, during)
            update_time += during
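Stripped of the Elasticsearch calls, prediction_task() is plain slot arithmetic: advance window by window until the first slot without a prediction_value, then backfill every remaining slot up to current_ts. A self-contained sketch of that loop, with the index lookups replaced by an in-memory set (all values are made up):

during = 3600
start_time = 1482681600
current_ts = start_time + 5 * during

# slots that already hold a prediction_value
done_slots = set([start_time + during, start_time + 2 * during])

# find the first slot that still needs work
update_time = start_time + during
while update_time in done_slots:
    update_time += during

# backfill every missing slot up to current_ts, where the real code
# calls dispose_data(origin_task_name, update_time, during)
pending = []
while update_time <= current_ts:
    pending.append(update_time)
    update_time += during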
Example #4
def organize_network(task_name):
    count = 0
    es_results = es_prediction.get(index=index_manage_interfere_task, \
            doc_type=type_manage_interfere_task, id=task_name)["_source"]
    start_time = es_results["start_time"]
    stop_time = es_results["stop_time"]
    user_set = set()
    query_body = {
        "query": {
            "range": {
                "timestamp": {
                    "gte": start_time,
                    "lt": stop_time
                }
            }
        }
    }

    if RUN_TYPE == 0:
        query_body = {
            "query": {
                "range": {
                    "timestamp": {
                        "gte": 1482681602,
                        "lt": 1482681602 + 10 * 2400
                    }
                }
            }
        }

    es_scan = scan(es_prediction,
                   query=query_body,
                   index=task_name,
                   doc_type="text",
                   size=3000)
    while 1:
        try:
            re_es = es_scan.next()
            count += 1
            if count % 3000 == 0:
                print count
            detail = re_es["_source"]
            if int(detail["message_type"]) != 2:
                user_set.add(detail["uid"])
            if int(detail["message_type"]) == 3 or int(
                    detail["message_type"]) == 2:
                if detail["directed_uid"]:
                    user_set.add(str(detail["directed_uid"]))
                user_set.add(detail["root_uid"])
        except StopIteration:
            print "finish"
            break

    print len(user_set)
    return list(user_set)
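elasticsearch.helpers.scan returns a generator, so the while 1 / .next() / StopIteration pattern above can also be written as a plain for loop. A sketch of the same traversal, with placeholder connection, index, and query:

from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan

es = Elasticsearch(["localhost:9200"])  # placeholder connection
user_set = set()
for hit in scan(es, query={"query": {"match_all": {}}},
                index="some_task", doc_type="text", size=3000):
    detail = hit["_source"]
    user_set.add(detail.get("uid"))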
Example #5
def organize_network(task_name, ts):
    count = 0
    es_results = es_prediction.get(index=index_manage_interfere_task, \
            doc_type=type_manage_interfere_task, id=task_name)["_source"]
    start_time = es_results["start_time"]
    stop_time = es_results["stop_time"]
    user_set = set()
    query_body = {
        "query": {
            "bool": {
                "must": [{
                    "range": {
                        "timestamp": {
                            "gte": start_time,
                            "lt": ts
                        }
                    }
                }, {
                    "range": {
                        "user_fansnum": {
                            "gte": 10000
                        }
                    }
                }]
            }
        }
    }

    es_scan = scan(es_prediction,
                   query=query_body,
                   index=task_name,
                   doc_type="text",
                   size=3000)
    while 1:
        try:
            re_es = es_scan.next()
            count += 1
            if count % 3000 == 0:
                print "search participators: ", count
            detail = re_es["_source"]
            if int(detail["message_type"]) != 2:
                user_set.add(detail["uid"])
            if int(detail["message_type"]) == 3 or int(
                    detail["message_type"]) == 2:
                if detail["directed_uid"]:
                    user_set.add(str(detail["directed_uid"]))
                user_set.add(detail["root_uid"])
        except StopIteration:
            print "finish"
            break

    print "current participators: ", len(user_set)
    return list(user_set)
def dispose_results(task_name, ts, future_total, current_total):
    index_name = "stimulation_"+task_name
    index_type = "stimulation_results"
    results = es.get(index=index_name, doc_type=index_type, id=ts)["_source"]
    future_results = json.loads(results["future_results"])

    future_list = []
    # future diffusion paths
    diffusion_path = dict()
    # future diffusion values
    diffusion_value = dict()
    for start_uid, end_dict in future_results.iteritems():
        diffusion_path[start_uid] = end_dict.keys()
        future_list.extend(end_dict.keys())
        diffusion_value.update(end_dict)

    # future spreader info
    # uid nick_name, photo_url, fans_num, weibo_num, prediction_value
    future_list = list(set(future_list))
    future_user_info = get_future_user(future_list)
    #print future_user_info
    for i in range(len(future_list)):
        uid = future_user_info[i][0]
        future_user_info[i].append(int(diffusion_value[uid]))


    # current hot weibo and user info
    current_hot_mid = search_hot_mid(task_name, ts)

    # current potential hot weibo
    potential_mid, t1, t2 = potential_user(task_name, ts)

    future_total += t1
    current_total += t2
    ratio = float(current_total)/future_total

    update_dict = dict()
    update_dict["diffusion_path"] = json.dumps(diffusion_path)
    update_dict["future_user_info"] = json.dumps(future_user_info)
    update_dict["current_hot_weibo"] = json.dumps(current_hot_mid)
    update_dict["potential_hot_weibo"] = json.dumps(potential_mid)
    update_dict["ratio"] = ratio
    es.update(index=index_name, doc_type=index_type, id=ts, body={"doc":update_dict})

    return True
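dispose_results() assumes future_results decodes to a two-level mapping {start_uid: {end_uid: prediction_value}}. A worked example of the flattening it performs, with made-up data:

future_results = {
    "u1": {"u2": 10, "u3": 4},
    "u4": {"u3": 7},
}

diffusion_path = dict()   # start_uid -> end uids
diffusion_value = dict()  # end_uid -> predicted value
future_list = []
for start_uid, end_dict in future_results.items():
    diffusion_path[start_uid] = list(end_dict.keys())
    future_list.extend(end_dict.keys())
    diffusion_value.update(end_dict)

future_list = list(set(future_list))  # deduplicated end users
# diffusion_path  == {"u1": ["u2", "u3"], "u4": ["u3"]}
# diffusion_value == {"u2": 10, "u3": 7}: when two start users reach the
# same end user, the later update() overwrites the earlier value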
Example #7
def create_task():
    ts = time.time()
    current_ts = datehour2ts(ts2datehour(ts))
    query_body = {
        "query": {
            "term":{"finish":"0"}
        },
        "size":10000
    }

    results = es_prediction.search(index=index_manage_prediction_task,\
            doc_type=type_manage_prediction_task, body=query_body)["hits"]["hits"]
    for item in results:
        task_name = item["_source"]["pinyin_task_name"]
        print "push task_name: ", task_name
        task_detail = es_prediction.get(index=index_manage_prediction_task, doc_type=type_manage_prediction_task, id=task_name)["_source"]
        update_time = task_detail["update_time"]
        stop_time = int(item["_source"]["stop_time"])
        if current_ts > stop_time:
            # past stop time: mark the task finished and skip scheduling
            es_prediction.update(index=index_manage_prediction_task,doc_type=type_manage_prediction_task,\
                    id=task_name, body={"doc":{"finish":"1"}})
            continue
        during = item["_source"]["micro_during"]
        if current_ts - update_time >= during:
            r_micro.lpush(task_micro_prediction, json.dumps([task_name, update_time, current_ts, during]))
def dispose_data(task_name, current_ts, during=3600):
    K = 2  # number of most recent time windows used as feature history

    task_detail = es_prediction.get(index=index_manage_prediction_task,
                                    doc_type=type_manage_prediction_task,
                                    id=task_name)["_source"]
    start_time = int(task_detail["start_time"])

    origin_task_name = task_name
    task_name = "micro_prediction_" + task_name

    query_body = {
        "query": {
            "range": {
                "update_time": {
                    "lte": current_ts
                }
            }
        },
        "size": K,
        "sort": {
            "update_time": {
                "order": "desc"
            }
        }
    }

    sort_query_body = {
        "query": {
            "range": {
                "update_time": {
                    "lte": current_ts
                }
            }
        }
    }

    total_count = []
    total_fans_list = []
    total_origin_list = []
    total_retweet_list = []
    total_comment_list = []
    total_uid_list = []
    total_positive_list = []
    total_negetive_list = []
    average_origin_ts = []
    average_retweet_ts = []

    feature_list = []
    results = es_prediction.search(index=task_name,
                                   doc_type=index_type_prediction_task,
                                   body=query_body)["hits"]["hits"]
    location = es_prediction.count(index=task_name,
                                   doc_type=index_type_prediction_task,
                                   body=sort_query_body)["count"]

    if len(results) != K:
        short_len = K - len(results)
        results.extend([[]] * short_len)
    print "former result: ", len(results), K
    results.reverse()
    for item in results:
        if item:
            item = item["_source"]
            #total_fans_list.append(item["total_fans_number"])
            total_origin_list.append(item["origin_weibo_number"])
            total_retweet_list.append(item["retweeted_weibo_number"])
            total_comment_list.append(item["comment_weibo_number"])
            total_count.append(item["total_count"])
            total_uid_list.append(item["total_uid_count"])
            total_positive_list.append(item["positive_count"])
            total_negetive_list.append(item["negetive_count"])
            average_origin_ts.append(item["average_origin_ts"])
            average_retweet_ts.append(item["average_retweet_ts"])
        else:
            #total_fans_list.append(0)
            total_origin_list.append(0)
            total_retweet_list.append(0)
            total_comment_list.append(0)
            total_uid_list.append(0)
            total_count.append(0)
            total_positive_list.append(0)
            total_negetive_list.append(0)
            average_origin_ts.append(0)
            average_retweet_ts.append(0)
    print "total_count: ", total_count

    feature_list = []
    feature_list.append(math.log(int(total_retweet_list[-1] + 1)))
    feature_list.append(math.log(int(total_comment_list[-1] + 1)))
    feature_list.append(math.log(int(total_positive_list[-1] + 1)))
    feature_list.append(math.log(int(total_negetive_list[-2] + 1)))
    feature_list.append(math.log(int(total_negetive_list[-1] + 1)))
    feature_list.append(math.log(int(total_count[-1] + 1)))
    feature_list.append(math.log(int(total_uid_list[-1] + 1)))
    if int(during) == 3 * 3600:
        feature_list.append(average_origin_ts[-1])
        feature_list.append(average_retweet_ts[-1])

    # load the model matching the window length and trend, then predict
    if int(during) == 3600:
        if total_count[-1] - total_count[-2] >= -0.2 * total_count[-2]:
            with open("model-up.pkl", "rb") as f:
                gbdt = pickle.load(f)
        else:
            with open("model-down.pkl", "rb") as f:
                gbdt = pickle.load(f)
    elif int(during) == 3 * 3600:
        with open("model-3.pkl", "rb") as f:
            gbdt = pickle.load(f)

    print "feature_list: ", feature_list
    pred = gbdt.predict(feature_list)
    for item in pred:
        prediction_value = item
        prediction_value = math.exp(prediction_value)
        print "prediction_valie: ", prediction_value

    # update scan processing
    #es_prediction.update(index=index_manage_prediction_task,doc_type=type_manage_prediction_task, \
    #        id=origin_task_name, body={"doc":{"scan_text_processing":"0"}})

    # update prediction value in es
    task_detail = es_prediction.get(index=index_manage_prediction_task, \
            doc_type=type_manage_prediction_task, id=origin_task_name)["_source"]
    if current_ts >= int(task_detail["stop_time"]):
        task_detail["finish"] = "1"
        task_detail["processing_status"] = "0"

        # update task info
        es_prediction.index(index=index_manage_prediction_task, \
            doc_type=type_manage_prediction_task, id=origin_task_name, body=task_detail)

    # update prediction
    es_prediction.update(index=task_name,
                         doc_type=index_type_prediction_task,
                         id=current_ts,
                         body={"doc": {
                             "prediction_value": prediction_value
                         }})

    return True
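The model files loaded above are pickled regressors. A hedged sketch of how a file like model-up.pkl could be produced, assuming scikit-learn's GradientBoostingRegressor; the training data here is random placeholder, not the project's real features:

import pickle
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor

X = np.random.rand(100, 7)  # placeholder: 7 features per sample, as built above
y = np.random.rand(100)     # placeholder: log-scale target values
gbdt = GradientBoostingRegressor().fit(X, y)

with open("model-up.pkl", "wb") as f:  # binary mode, matching the "rb" loads
    pickle.dump(gbdt, f)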
Example #9
def dispose_data(task_name, current_ts):
    es_result = es_prediction.get(index=index_manage_prediction_task,
                                  doc_type=type_manage_prediction_task,
                                  id=task_name)["_source"]
    macro_during = es_result['macro_during']
    start_ts = datehour2ts(ts2datehour(es_result["submit_time"]))
    task_start_ts = start_ts
    end_ts = datehour2ts(ts2datehour(es_result["stop_time"]))

    index_micro = "micro_prediction_" + task_name
    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "range": {
                        "update_time": {
                            "lte": current_ts
                        }
                    }
                }
            }
        },
        "size": 10000,
        "sort": {
            "update_time": {
                "order": "asc"
            }
        }
    }
    micro_results = es_prediction.search(index=index_micro,
                                         doc_type="micro_task",
                                         body=query_body)["hits"]["hits"]
    total_list = []

    for item in micro_results:
        total_list.append(item["_source"]["total_count"])
    # total_list: weibo count in each hourly window

    total_len = (end_ts - start_ts) / macro_during
    times = int(macro_during) / 3600
    length = len(total_list) / times  # complete macro windows available (currently unused)
    adjust_list = []
    time_list = []
    count = 0
    i = 0
    for item in total_list:
        count += item
        i += 1
        if i % times == 0:
            if start_ts <= current_ts:
                adjust_list.append(count)
                count = 0
                time_list.append(start_ts)
            else:
                break
        start_ts += 3600

    # full time axis for the overall trend
    total_time_list = []
    for i in range(total_len):
        total_time_list.append(task_start_ts + i * macro_during)

    left_time = list(set(total_time_list) - set(time_list))
    left_time = sorted(left_time)

    return adjust_list, total_len, time_list, left_time
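The re-bucketing above folds hourly counts into macro_during-sized windows: every times-th hourly point closes one bucket. A worked example with made-up counts, assuming macro_during = 3 * 3600:

macro_during = 3 * 3600
times = macro_during // 3600        # 3 hourly points per macro bucket
total_list = [5, 7, 3, 10, 2, 9]    # hourly weibo counts (sample data)

adjust_list = []
count = 0
i = 0
for item in total_list:
    count += item
    i += 1
    if i % times == 0:              # close a bucket every `times` points
        adjust_list.append(count)
        count = 0
# adjust_list == [15, 21]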
Example #10
def organize_feature(task_name, mid, ts):

    result = dict()
    try:
        result = es.get(index=task_name, doc_type="text", id=mid)["_source"]
    except Exception:
        pass
    if not result:
        return [0, 0, 0, 0, 0, 0, 0]

    ts = result["timestamp"]

    query_body = {"query": {"term": {"root_mid": mid}}}
    #total_weibo
    #count = es.count(index=index_list, doc_type="text", body=query_body)["count"]

    query_body_uid = {
        "query": {
            "term": {
                "root_mid": mid
            }
        },
        "aggs": {
            "uid_count": {
                "cardinality": {
                    "field": "uid"
                }
            }
        }
    }
    # total_uid
    #total_uid_count = es.search(index=index_list, doc_type="text", body=query_body_uid)['aggregations']["uid_count"]["value"]

    feature_list = []
    feature_list.append(math.log(result["user_fansnum"] + 1))
    query_body_ts = {
        "query": {
            "bool": {
                "must": [{
                    "term": {
                        "root_mid": mid
                    }
                }, {
                    "range": {
                        "timestamp": {
                            "lt": ts + 3600 * 10
                        }
                    }
                }]
            }
        },
        "aggs": {
            "weibo_type": {
                "terms": {
                    "field": "message_type"
                }
            }
        }
    }
    comment = 0
    retweet = 0
    tmp_count = es.search(
        index=task_name, doc_type="text",
        body=query_body_ts)['aggregations']["weibo_type"]["buckets"]
    if tmp_count:
        for item in tmp_count:
            if int(item["key"]) == 2:
                comment = item["doc_count"]
            elif int(item["key"]) == 3:
                retweet = item["doc_count"]
    feature_list.append(comment + retweet)
    feature_list.append(retweet)
    feature_list.append(comment)
    feature_list.append(retweet / float(comment + retweet + 1))
    feature_list.append(comment / float(comment + retweet + 1))
    query_body_uid = {
        "query": {
            "bool": {
                "must": [{
                    "term": {
                        "root_mid": mid
                    }
                }, {
                    "range": {
                        "timestamp": {
                            "lt": ts + 3600 * 10
                        }
                    }
                }]
            }
        },
        "aggs": {
            "uid_count": {
                "cardinality": {
                    "field": "uid"
                }
            }
        }
    }
    uid_count = es.search(
        index=task_name, doc_type="text",
        body=query_body_uid)['aggregations']["uid_count"]["value"]
    feature_list.append(uid_count)
    #feature_list.append(topic_field_dict[topic])

    return feature_list
def potential_user(task_name, ts):
    index_name = "stimulation_"+task_name
    index_type = "stimulation_results"

    # query current original weibo (root mids)
    query_body = {
        "query": {
            "bool":{
                "must":[
                    {"range":{
                        "timestamp":{
                            "lt": ts
                        }
                    }},
                    {"term":{"message_type":1}},
                    {"range":{
                        "user_fansnum":{
                            "gte": 10000
                        }
                    }}
                ]
            }
        },
        "size": 10000
    }

    es_results = es.search(index=task_name, doc_type="text", body=query_body)["hits"]["hits"]

    mid_list = []
    uid_list = []
    feature_list = []
    prediction_uid = []
    prediction_weibo = []
    with open("prediction_uid.pkl", "r") as f:
        uid_model = pickle.load(f)
    with open("prediction_weibo.pkl", "r") as f:
        weibo_model = pickle.load(f)

    for item in es_results:
        mid_list.append(item["_id"])
        uid_list.append(item["_source"]["uid"])
        tmp_feature_list = organize_feature(task_name, item["_id"], ts)
        feature_list.append(tmp_feature_list)

    # predict once over the full feature matrix instead of re-predicting
    # the growing list on every loop iteration
    weibo_prediction_result = []
    uid_prediction_result = []
    if feature_list:
        weibo_prediction_result = weibo_model.predict(feature_list)
        uid_prediction_result = uid_model.predict(feature_list)

    future_total = 0
    current_total = 0

    results_dict = dict()
    in_potential_list = []
    for i in range(len(mid_list)):
        mid = mid_list[i]
        uid = uid_list[i]
        iter_count = es.count(index=task_name, doc_type="text", body={"query":{"term":{"root_mid":mid}}})["count"]
        pre_count = weibo_prediction_result[i]
        future_total += abs(pre_count-iter_count)
        if pre_count >= 500 and iter_count <= 500:
            current_total += abs(pre_count-iter_count)
            if uid not in results_dict:
                results_dict[uid] = dict()
            tmp = dict()
            tmp["mid"] = mid
            tmp["current_count"] = iter_count
            tmp["prediction_count"] = int(pre_count)
            weibo_detail = es.get(index=task_name, doc_type="text", id=mid)["_source"]
            tmp.update(weibo_detail)
            retweet, comment = search_retweet_comment(task_name, mid)
            tmp["retweeted"] = retweet
            tmp["comment"] = comment
            results_dict[uid][mid] = tmp


    # user profile
    tmp_in_list = results_dict.keys()
    if tmp_in_list:
        profile_results = es_user_profile.mget(index=profile_index_name, doc_type=profile_index_type, body={"ids":tmp_in_list})["docs"]
        for i in range(len(tmp_in_list)):
            detail = profile_results[i]
            tmp = []
            uid = tmp_in_list[i]
            if detail["found"]:
                tmp.append(detail["_source"]["nick_name"])
                tmp.append(detail["_source"]["photo_url"])
                tmp.append(detail["_source"]["fansnum"])
                tmp.append(detail["_source"]["statusnum"])
            else:
                tmp.append(detail["_id"])
                tmp.extend(["","",""])
            results_dict[uid]["user_profile"] = tmp


    return results_dict, future_total, current_total
def get_origin_weibo_detail(ts, size, order, message_type=1):
    #print r.get("topic_value_dict")
    #error:topic_value_dict里存的为空
    #topic_value_dict = json.loads(r.get("topic_value_dict"))
    task_detail = es_prediction.get(index=index_sensing_task, doc_type=_id, id=ts)['_source']

    mid_value = json.loads(task_detail['mid_topic_value'])
    duplicate_dict = json.loads(task_detail['duplicate_dict'])
    tmp_duplicate_dict = dict()
    for k,v in duplicate_dict.iteritems():
        try:
            tmp_duplicate_dict[v].append(k)
        except KeyError:
            tmp_duplicate_dict[v] = [k, v]


    if message_type == 1:
        weibo_detail = json.loads(task_detail['origin_weibo_detail'])
    elif message_type == 2:
        weibo_detail = json.loads(task_detail['retweeted_weibo_detail'])
    else:
        weibo_detail = json.loads(task_detail['sensitive_weibo_detail'])
    weibo_detail_list = []
    if weibo_detail:
        for iter_mid, item in weibo_detail.iteritems():
            tmp = []
            tmp.append(iter_mid)
            tmp.append(item[iter_mid])
            tmp.append(item['retweeted'])
            tmp.append(item['comment'])
            weibo_detail_list.append(tmp)
    mid_list = weibo_detail.keys()

    results = []
    query_body = {
        "query":{
            "filtered":{
                "filter":{
                    "terms":{"mid": mid_list}
                }
            }
        },
        "size": 1000,
        "sort": {"timestamp": {"order": "desc"}}
    }


    index_list = []
    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts-DAY)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    if exist_es:
        index_list.append(index_name)
    index_name_1 = flow_text_index_name_pre + datetime_1
    exist_es_1 = es_text.indices.exists(index_name_1)
    if exist_es_1:
        index_list.append(index_name_1)

    if index_list and mid_list:
        search_results = es_text.search(index=index_list, doc_type=flow_text_index_type, body=query_body)["hits"]["hits"]
    else:
        search_results = []

    uid_list = []
    text_dict = dict() # text content by mid
    portrait_dict = dict() # user profile info
    sort_results = []
    if search_results:
        for item in search_results:
            uid_list.append(item["_source"]['uid'])
            text_dict[item['_id']] = item['_source'] # the _id is the mid
        if uid_list:
            portrait_result = es_profile.mget(index=profile_index_name, doc_type=profile_index_type, body={"ids":uid_list}, fields=['nick_name', 'photo_url'])["docs"]
            for item in portrait_result:
                if item['found']:
                    portrait_dict[item['_id']] = {"nick_name": item["fields"]["nick_name"][0], "photo_url": item["fields"]["photo_url"][0]}
                else:
                    portrait_dict[item['_id']] = {"nick_name": item['_id'], "photo_url":""}

        if order == "total":
            sorted_list = sorted(weibo_detail_list, key=lambda x:x[1], reverse=True)
        elif order == "retweeted":
            sorted_list = sorted(weibo_detail_list, key=lambda x:x[2], reverse=True)
        elif order == "comment":
            sorted_list = sorted(weibo_detail_list, key=lambda x:x[3], reverse=True)
        else:
            sorted_list = weibo_detail_list

        count_n = 0
        results_dict = dict()
        mid_index_dict = dict()
        for item in sorted_list:
            if count_n >= size:  # honor the requested page size
                break
            mid = item[0]
            iter_text = text_dict.get(mid, {})
            temp = []
            # uid, nick_name, photo_url, text, sentiment, timestamp, geo, common_keywords, message_type
            if iter_text:
                uid = iter_text['uid']
                temp.append(uid)
                iter_portrait = portrait_dict.get(uid, {})
                if iter_portrait:
                    temp.append(iter_portrait['nick_name'])
                    temp.append(iter_portrait['photo_url'])
                else:
                    temp.extend([uid,''])
                temp.append(iter_text["text"])
                temp.append(iter_text["sentiment"])
                temp.append(ts2date(iter_text['timestamp']))
                temp.append(iter_text['geo'])
                if message_type == 1:
                    temp.append(1)
                elif message_type == 2:
                    temp.append(3)
                else:
                    temp.append(iter_text['message_type'])
                temp.append(item[2])
                temp.append(item[3])
                temp.append(iter_text.get('sensitive', 0))
                temp.append(iter_text['timestamp'])
                temp.append(mid_value[mid])
                temp.append(mid)
                results.append(temp)
            count_n += 1

        results = sorted(results, key=operator.itemgetter(-4, -2, -6), reverse=True) # sensitive flag, topic value, retweet count
        sort_results = []
        count = 0
        for item in results:
            sort_results.append([item])
            mid_index_dict[item[-1]] = count
            count += 1

        
        if tmp_duplicate_dict:
            remove_list = []
            value_list = tmp_duplicate_dict.values() # [[mid, mid], ]
            for item in value_list:
                tmp = []
                for mid in item:
                    if mid in mid_index_dict:  # .get(mid, 0) would wrongly skip index 0
                        tmp.append(mid_index_dict[mid])
                if len(tmp) > 1:
                    tmp_min = min(tmp)
                else:
                    continue
                tmp.remove(tmp_min)
                for iter_count in tmp:
                    sort_results[tmp_min].extend(sort_results[iter_count])
                    remove_list.append(sort_results[iter_count])
            if remove_list:
                for item in remove_list:
                    sort_results.remove(item)
        

    return sort_results
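The duplicate handling in get_origin_weibo_detail() is easiest to see with toy data: duplicate_dict maps each duplicate mid to a representative mid, the inversion groups all duplicates under that representative, and the final pass merges their entries into the best-ranked slot. A sketch of the inversion step (mids are made up):

duplicate_dict = {"m2": "m1", "m3": "m1", "m5": "m4"}

tmp_duplicate_dict = dict()
for k, v in duplicate_dict.items():
    try:
        tmp_duplicate_dict[v].append(k)
    except KeyError:
        tmp_duplicate_dict[v] = [k, v]  # seed the group with duplicate and representative
# tmp_duplicate_dict groups every mid by its representative, e.g.
# {"m1": ["m2", "m1", "m3"], "m4": ["m5", "m4"]} (list order follows dict iteration)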
def get_retweet_weibo_detail(ts, size, text_type, type_value):
    task_detail = es_prediction.get(index=index_sensing_task, doc_type=_id, id=ts)['_source']
    origin_weibo_detail = json.loads(task_detail['origin_weibo_detail'])
    retweeted_weibo_detail = json.loads(task_detail['retweeted_weibo_detail'])

    mid_list = []
    mid_list.extend(origin_weibo_detail.keys())
    mid_list.extend(retweeted_weibo_detail.keys())

    query_body = {
        "query":{
            "filtered":{
                "filter":{
                    "bool":{
                        "must":[
                            {"range":{
                                "timestamp":{
                                    "gte": ts - time_interval,
                                    "lt": ts
                                }
                            }},
                            {"terms": {"root_mid": mid_list}}
                        ]
                    }
                }
            }
        },
        "sort": {"timestamp": {"order": "desc"}},
        "size": 100
    }

    if text_type == "message_type":
        query_body['query']['filtered']['filter']['bool']['must'].append({"term":{text_type: type_value}})
    if text_type == "sentiment":
        #if isinstance(type_value, str):
        if len(type_value) == 1:
            query_body['query']['filtered']['filter']['bool']['must'].append({"term":{text_type: type_value}})
        else:
            query_body['query']['filtered']['filter']['bool']['must'].append({"terms":{text_type: type_value}})

    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts-time_interval)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    index_name_1 = flow_text_index_name_pre + datetime_1
    exist_es_1 = es_text.indices.exists(index_name_1)

    # 1. query the weibo texts
    if datetime == datetime_1 and exist_es:
        search_results = es_text.search(index=index_name, doc_type=flow_text_index_type, body=query_body)["hits"]["hits"]
    elif datetime != datetime_1 and exist_es_1:
        search_results = es_text.search(index=index_name_1, doc_type=flow_text_index_type, body=query_body)["hits"]["hits"]
    else:
        search_results = []
    #print search_results
    # 2. fetch related user info for each weibo
    results = []
    uid_list = []
    if search_results:
        for item in search_results:
            uid_list.append(item["_source"]['uid'])
        if uid_list:
            portrait_result = es_profile.mget(index=profile_index_name, doc_type=profile_index_type, body={"ids":uid_list}, fields=['nick_name', 'photo_url'])["docs"]

        for i in range(len(uid_list)):
            item = search_results[i]['_source']
            temp = []
            # uid, nick_name, photo_url, text, sentiment, timestamp, geo, common_keywords, message_type
            temp.append(item['uid'])
            if portrait_result[i]['found']:
                temp.append(portrait_result[i]["fields"]["nick_name"][0])
                temp.append(portrait_result[i]["fields"]["photo_url"][0])
            else:
                temp.append(item['uid'])
                temp.append("")
            temp.append(item["text"])
            #print item['text']
            temp.append(item["sentiment"])
            temp.append(ts2date(item['timestamp']))
            temp.append(item['geo'])
            temp.append(item["message_type"])
            results.append(temp)

    return results
Example #14
def task_list():
    create_task()
    if RUN_TYPE:
        current_ts = datehour2ts(ts2datehour(time.time()))
    else:
        current_ts = 1482861600
    while 1:
        task_detail = r_trendline.rpop(task_trendline)
        print task_detail
        if not task_detail:
            break

        task_name = task_detail
        while 1:
            micro_index = "micro_prediction_" + task_name
            es_exist = es_prediction.exists(index=micro_index,
                                            doc_type="micro_task",
                                            id=current_ts)
            if not es_exist:
                time.sleep(60)
            else:
                break

        # obtain time series
        value, total_len, time_list, left_list = dispose_data(
            task_name, current_ts)

        # macro prediction result
        try:
            es_macro_result = es_prediction.get(index=index_macro_feature_result,\
                doc_type=type_macro_feature_result,id=task_name)["_source"]
            prediction_total_value = es_macro_result["predict_weibo_value"]
            top_value = prediction_total_value * 0.8 / (0.2 * total_len)
        except Exception:
            top_value = 0

        # known maximum value and its position
        max_exist = max(value)
        index_exist = len(value)

        if top_value < max_exist:
            top_value = 2 * max_exist

        # weibo prediction
        k = 5
        h = 0.5
        peak = spd(value, h, k)
        flag = judge(peak, value)
        if len(flag) == 2:
            print("Two peaks:")
            paras = getTwoBeauties(value, flag[0], flag[1])
            paras[-1] = total_len
            series = bassTwoPeaks(paras)
        else:
            print("Single peak:")
            paras = getSingleBeauty(value)
            paras[-1] = total_len
            series = bassOnePeak(paras)

        # predicted peak position
        predict_climax = series.index(max(series))

        if predict_climax > index_exist:
            predict_climax_left = predict_climax - len(value)
            # remaining trend: climax position, endpoint values, maximum
            rise_trend, fall_trend = get_trend(left_list, predict_climax_left,
                                               value[-1], top_value)
            true_climax = time_list[0] + (time_list[1] -
                                          time_list[0]) * predict_climax
        else:
            top_value = value[-1]
            rise_trend, fall_trend = get_trend(left_list, 0, value[-1], 1)
            true_climax = time_list[value.index(max(value))]

        results = dict()
        results["climax"] = [true_climax, top_value]
        results["rise_trend"] = rise_trend
        results["fall_trend"] = fall_trend
        new_list = []
        for i in range(len(time_list)):
            new_list.append([time_list[i], value[i]])
        results["exist_trend"] = new_list
        r_trendline.set("trendline_" + task_name, json.dumps(results))
        print results
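spd(), judge(), getSingleBeauty(), and bassOnePeak()/bassTwoPeaks() are defined elsewhere; the names suggest a Bass-style diffusion fit. For orientation, a minimal sketch of the standard Bass curve a single-peak series could be generated from; p, q, m are illustrative coefficients, not values from this project:

import math

def bass_series(m, p, q, n_points):
    # per-period increments of cumulative Bass adoption
    # F(t) = (1 - exp(-(p+q)t)) / (1 + (q/p) * exp(-(p+q)t))
    series = []
    prev = 0.0
    for t in range(1, n_points + 1):
        e = math.exp(-(p + q) * t)
        cumulative = m * (1.0 - e) / (1.0 + (q / p) * e)
        series.append(cumulative - prev)
        prev = cumulative
    return series

series = bass_series(m=10000, p=0.03, q=0.38, n_points=48)
predict_climax = series.index(max(series))  # peak position, as task_list() computes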
def rank_predict(event, start_ts, end_ts):

    feature_list = feature_compute(event, start_ts, end_ts)
    print 'feature_list:::::', feature_list
    feature_list_gbdt = []
    for i in range(len(feature_list)):
        # split multi-valued features into separate columns (extend)

        if i == 15:
            feature_list[i] = json.loads(json.dumps(feature_list[i]))
            print 'type::', type(feature_list[i])
            print 'feature_list[i][at_0]::', feature_list[i]['at_0']

            feature_list_gbdt.append(feature_list[i]['at_0'])
            feature_list_gbdt.append(feature_list[i]['at_1'])
            feature_list_gbdt.append(feature_list[i]['at_2'])
            feature_list_gbdt.append(feature_list[i]['at>3'])
        elif i == 16:
            feature_list[i] = json.loads(json.dumps(feature_list[i]))

            print 'feature_list[i]:::', feature_list[i]
            feature_list_gbdt.append(feature_list[i][0])
            feature_list_gbdt.append(feature_list[i][1])
            feature_list_gbdt.append(feature_list[i][2])
            feature_list_gbdt.append(feature_list[i][3])
        else:
            feature_list_gbdt.append(feature_list[i])

    print 'feature_list_gbdt:::::', feature_list_gbdt

    # load the weibo-count model
    with open("0305_macro-prediction-weibos-value.pkl", "rb") as f:
        gbdt = pickle.load(f)

    pred = gbdt.predict([feature_list_gbdt])  # 2-D input: one sample

    for item in pred:
        predict_weibo_value = item

    # load the user-count model
    with open("0305_macro-prediction-uids-value.pkl", "rb") as f:
        gbdt = pickle.load(f)

    pred = gbdt.predict([feature_list_gbdt])  # 2-D input: one sample

    for item in pred:
        predict_user_value = item

    predict_rank = get_rank(predict_user_value)

    ## save to the event info table
    #for i in range(len(feature_list)):
    feature_results = {}
    feature_results['event'] = event
    '''
    feature_results['topic_field'] = feature_list[0]
    feature_results['total_num'] = feature_list[1]
    feature_results['total_user_fans'] = feature_list[2]
    feature_results['total_comment'] = feature_list[3]
    feature_results['total_retweet'] = feature_list[4]
    feature_results['total_sensitive'] = feature_list[5]
    feature_results['total_sensitive_ratio'] = feature_list[6]
    feature_results['total_negtive'] = feature_list[7]
    feature_results['total_important_user'] = feature_list[8]   
    feature_results['total_origin_type'] = feature_list[9]
    feature_results['origin_ratio'] = feature_list[10]
    feature_results['total_retweet_type'] = feature_list[11]
    feature_results['retweet_ratio'] = feature_list[12]
    feature_results['total_comment_type'] = feature_list[13]
    feature_results['comment_ratio'] = feature_list[14]
    feature_results['at_count'] = feature_list[15]
    feature_results['event_uid_count'] = feature_list[16]
    feature_results['event_trend_delta'] = feature_list[17]
    feature_results['predict_weibo_value'] = predict_weibo_value
    feature_results['predict_user_value'] = predict_user_value
    feature_results['predict_rank'] = predict_rank
    feature_results['update_time'] = time.time()
    '''
    #feature_results['topic_field'] = feature_list[0]
    feature_results['uid_count'] = feature_list[0]
    feature_results['total_num'] = feature_list[1]
    feature_results['total_user_fans'] = feature_list[2]
    feature_results['total_comment'] = feature_list[3]
    feature_results['total_retweet'] = feature_list[4]
    feature_results['total_sensitive'] = feature_list[5]
    feature_results['total_sensitive_ratio'] = feature_list[6]
    feature_results['total_negtive'] = feature_list[7]
    feature_results['total_important_user'] = feature_list[8]
    feature_results['total_origin_type'] = feature_list[9]
    feature_results['origin_ratio'] = feature_list[10]
    feature_results['total_retweet_type'] = feature_list[11]
    feature_results['retweet_ratio'] = feature_list[12]
    feature_results['total_comment_type'] = feature_list[13]
    feature_results['comment_ratio'] = feature_list[14]
    feature_results['at_count'] = json.dumps(feature_list[15])

    feature_results['event_trend_delta'] = json.dumps(feature_list[16])
    feature_results['predict_weibo_value'] = predict_weibo_value
    feature_results['predict_user_value'] = predict_user_value
    feature_results['predict_rank'] = predict_rank
    feature_results['update_time'] = time.time()
    '''
    save_event_info_results(event,topic_field,total_num,total_user_fans,\
                                total_comment,total_retweet,total_sensitive,\
                                total_sensitive_ratio,total_negtive,total_important_user,\
                                total_origin_type,origin_ratio,total_retweet_type,retweet_ratio,\
                                total_comment_type,comment_ratio,at_count,event_uid_count,\
                                event_trend_delta,predict_value,predict_rank,update_time)
    '''
    #update macro features & results

    feature_results = json.dumps(feature_results)
    try:
        item_exists = es_prediction.get(index=index_macro_feature_result,doc_type= type_macro_feature_result,\
                                        id=event)['_source']
        es_prediction.update(index=index_macro_feature_result,doc_type=type_macro_feature_result,\
                            id=event,body={'doc':feature_results})
    except:
        es_prediction.index(index=index_macro_feature_result,doc_type=type_macro_feature_result,\
                                id=event,body=feature_results)

    # update task info —— "macro_value_finish"
    task_detail = es_prediction.get(index=index_manage_prediction_task, \
            doc_type=type_manage_prediction_task, id=event)["_source"]
    task_detail["macro_value_finish"] = '1'
    es_prediction.index(index=index_manage_prediction_task, \
            doc_type=type_manage_prediction_task, id=event, body=task_detail)
    print 'feature_results::::', feature_results
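As a side note, the get-then-update-or-index sequence above can usually be collapsed into a single call, since the Elasticsearch update API accepts doc_as_upsert; a sketch reusing the client and names from the code above:

# upsert: patch the stored doc if it exists, otherwise index feature_results as-is
es_prediction.update(index=index_macro_feature_result,
                     doc_type=type_macro_feature_result,
                     id=event,
                     body={"doc": feature_results, "doc_as_upsert": True})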