def create_task():
    ts = time.time()
    current_ts = datehour2ts(ts2datehour(ts))
    query_body = {"query": {"term": {"finish": "0"}}, "size": 10000}

    results = es_prediction.search(index=index_manage_prediction_task,\
            doc_type=type_manage_prediction_task, body=query_body)["hits"]["hits"]
    for item in results:
        task_name = item["_source"]["pinyin_task_name"]
        print "push task_name: ", task_name
        update_time = item["_source"]["scan_text_time"]
        stop_time = item["_source"]["stop_time"]
        if current_ts > stop_time:
            es_prediction.update(index=index_manage_prediction_task,doc_type=type_manage_prediction_task,\
                    id=task_name, body={"doc":{"finish":"1"}})
        during = item["_source"]["micro_during"]
        if current_ts - update_time >= during:
            r_micro.lpush(
                task_micro_prediction,
                json.dumps([
                    task_name, item["_source"]["scan_text_time"], current_ts,
                    during
                ]))

    return True
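
create_task above only enqueues work; a separate worker is expected to pop entries off the task_micro_prediction list. A minimal consumer sketch, assuming r_micro and task_micro_prediction are the same Redis client and key used above and that dispose_data (shown later in this listing) handles one prediction round:

import json
import time

def micro_prediction_worker():
    # pop [task_name, scan_text_time, current_ts, during] payloads pushed by create_task
    while 1:
        payload = r_micro.rpop(task_micro_prediction)
        if not payload:
            time.sleep(10)  # queue drained, wait before polling again
            continue
        task_name, scan_text_time, current_ts, during = json.loads(payload)
        dispose_data(task_name, current_ts, during=during)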
Example #2
def create_task():
    ts = time.time()
    if RUN_TYPE:
        current_ts = datehour2ts(ts2datehour(ts))
    else:
        current_ts = 1482861600
    query_body = {
        "query": {
            "term":{"finish":"0"}
        },
        "size":10000
    }

    results = es_prediction.search(index=index_manage_prediction_task,\
            doc_type=type_manage_prediction_task, body=query_body)["hits"]["hits"]
    for item in results:
        print item
        task_name = item["_source"]["pinyin_task_name"]
        stop_time = item["_source"]["stop_time"]
        print stop_time, current_ts
        if stop_time < current_ts:
            es_prediction.update(index=index_manage_prediction_task,\
                    doc_type=type_manage_prediction_task, id=task_name,  body={"doc":{"macro_trendline_finish":"1", "finish": "1"}})
        else:
            r_trendline.lpush(task_trendline, task_name)
def extend_network(task_name):
    file_name = task_name + ".txt"
    f = open(file_name, "w")
    line = 0
    user2number_dict = dict()  # mapping: uid -> number
    number2user_dict = dict()  # mapping: number -> uid
    count = 0
    user_list = organize_network(task_name)
    list_len = len(user_list)
    len_1000 = list_len / 1000
    for i in range(len_1000 + 1):
        tmp_uid = user_list[i * 1000:(i + 1) * 1000]
        es_results = es_retweet.mget(index=index_be_retweet,
                                     doc_type=index_type_be_retweet,
                                     body={"ids": tmp_uid})["docs"]
        for item in es_results:
            if item["found"]:
                print count
                uid_be_retweet = json.loads(item["_source"]["uid_be_retweet"])
                be_retweet_list = uid_be_retweet.keys()
                uid = item["_id"]
                if user2number_dict.has_key(uid):
                    uid_count = user2number_dict[uid]
                else:
                    count += 1
                    uid_count = count
                    user2number_dict[uid] = count
                    number2user_dict[count] = uid
                for each in be_retweet_list:
                    if user2number_dict.has_key(each):
                        each_number = user2number_dict[each]
                    else:
                        count += 1
                        user2number_dict[each] = count
                        number2user_dict[count] = each
                        each_number = count
                    if each_number != uid_count:
                        f.write(str(uid_count) + " " + str(each_number) + "\n")
                        line += 1

    f.close()
    # prepend "<node count> <edge count>" as the first line of the edge-list file (in-place sed)
    cmd = 'sed -i "" -e "1i %s %s" %s' % (count, line, file_name)
    p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)

    es_prediction.update(index=index_manage_interfere_task, doc_type=type_manage_interfere_task,\
            id=task_name, body={"doc":{"network_exist": "1"}})
    print "finish: ", count

    file_user = open("user_" + task_name + ".txt", "w")
    for uid in user2number_dict.keys():
        file_user.write(str(uid) + '\n')
    file_user.close()
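
extend_network writes the edge list first and then prepends the "<node count> <edge count>" header by shelling out to sed, whose in-place flag is platform dependent. A portable sketch of the same step done in Python, assuming the edges are buffered in a list of "a b" strings (a hypothetical helper, not part of the original code):

def write_edge_file(file_name, node_count, edges):
    # write "<node count> <edge count>" first, then one edge per line
    with open(file_name, "w") as f:
        f.write("%s %s\n" % (node_count, len(edges)))
        for edge in edges:
            f.write(edge + "\n")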
def dispose_results(task_name, ts, future_total, current_total):
    index_name = "stimulation_"+task_name
    index_type = "stimulation_results"
    results = es.get(index=index_name, doc_type=index_type, id=ts)["_source"]
    future_results = json.loads(results["future_results"])

    future_list = []
    # future diffusion paths
    diffusion_path = dict()
    # future diffusion values
    diffusion_value = dict()
    for start_uid, end_dict in future_results.iteritems():
        diffusion_path[start_uid] = end_dict.keys()
        future_list.extend(end_dict.keys())
        diffusion_value.update(end_dict)

    # future spreader info:
    # uid, nick_name, photo_url, fans_num, weibo_num, prediction_value
    future_list = list(set(future_list))
    future_user_info = get_future_user(future_list)
    #print future_user_info
    for i in range(len(future_list)):
        uid = future_user_info[i][0]
        future_user_info[i].append(int(diffusion_value[uid]))


    # current hot weibo and user info
    current_hot_mid = search_hot_mid(task_name, ts)

    # current potential hot weibo
    potential_mid, t1, t2 = potential_user(task_name, ts)

    future_total += t1
    current_total += t2
    ratio = float(current_total) / future_total if future_total else 0.0

    update_dict = dict()
    update_dict["diffusion_path"] = json.dumps(diffusion_path)
    update_dict["future_user_info"] = json.dumps(future_user_info)
    update_dict["current_hot_weibo"] = json.dumps(current_hot_mid)
    update_dict["potential_hot_weibo"] = json.dumps(potential_mid)
    update_dict["ratio"] = ratio
    es.update(index=index_name, doc_type=index_type, id=ts, body={"doc":update_dict})

    return True
Example #5
def create_task():
    query_body = {
        "query": {
            "bool":{
                "must":[
                    {"term": {"finish": "0"}}
                ]
            }
        },
        "size": 10000
    }

    es_results = es_prediction.search(index=index_manage_interfere_task, doc_type=type_manage_interfere_task,\
            body=query_body)["hits"]["hits"]

    task_list = []
    if int(RUN_TYPE) == 1:
        current_ts = datehour2ts(ts2datehour(time.time()))
    else:
        current_ts = 1482681600 + 18*3600
    for item in es_results:
        tmp = []
        task_detail = item["_source"]
        task_name = task_detail['pinyin_task_name']
        update_time = task_detail["update_time"]
        sti_during = task_detail["stimulation_during"]
        stop_time =  task_detail["stop_time"]
        if RUN_TYPE == 1:
            # mark the task finished once its stop_time has passed
            if stop_time <= current_ts:
                es_prediction.update(index=index_manage_interfere_task, doc_type=type_manage_interfere_task,\
                    id=task_name, body={"doc":{"finish": "1"}})
        tmp.append(task_name)
        tmp.append(task_detail["stop_time"])
        tmp.append(task_detail["scan_text_finish"])
        tmp.append(current_ts)
        if current_ts - update_time >= sti_during:
            r_stimulation.lpush(task_stimulation, json.dumps(tmp))

            # update processing status once the task is queued
            es_prediction.update(index=index_manage_interfere_task,doc_type=type_manage_interfere_task,\
                id=task_name, body={"doc":{"stimulation_processing_status":"1"}})


    return True
def scan_flow_text():
    while 1:
        task_detail = r_scan_text.rpop(task_scan_text)
        if not task_detail:
            break

        task_detail = json.loads(task_detail)
        task_name = task_detail[0]
        must_keys = task_detail[1]
        should_keys = task_detail[2]
        end_ts = task_detail[3]
        start_ts = task_detail[4]
        source = task_detail[5]

        scan_event_text(task_name, must_keys, should_keys, end_ts, start_ts)
        scan_weibo(task_name, start_ts, end_ts)

        if source == "prediction":
            es_prediction.update(index=index_manage_prediction_task,doc_type=type_manage_prediction_task, \
                id=task_name, body={"doc":{"scan_text_processing":"2"}})
        elif source == "analysis":
            es_prediction.update(index=index_event_analysis ,doc_type=type_event_analysis, \
                id=task_name, body={"doc":{"scan_text_processing":"0"}})
        elif source == "interfere":
            es_prediction.update(index=index_manage_interfere_task ,doc_type=type_manage_interfere_task, \
                id=task_name, body={"doc":{"scan_text_processing":"1"}})
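
scan_flow_text shows only the consumer side of the scan-text queue; whoever enqueues work has to use the same field order. A hedged producer sketch, assuming r_scan_text and task_scan_text are the Redis client and key used above:

import json

def push_scan_text_task(task_name, must_keys, should_keys, end_ts, start_ts, source):
    # field order mirrors the consumer: [task_name, must_keys, should_keys, end_ts, start_ts, source]
    payload = [task_name, must_keys, should_keys, end_ts, start_ts, source]
    r_scan_text.lpush(task_scan_text, json.dumps(payload))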
Example #7
def create_task():
    #ts = time.time()
    #current_ts = datehour2ts(ts2datehour(ts))
    index_name = index_manage_event_analysis
    index_type = type_manage_event_analysis

    query_body = {
        "query": {
            "term": {
                "event_value_finish": "0"
            }
        },
        "size": 10000
    }

    results = es.search(index=index_name, doc_type=index_type,
                        body=query_body)["hits"]["hits"]

    item_finish_status = {}

    for item in results:
        topic = item["_source"]["task_name"]
        en_name = item["_source"]["pinyin_task_name"]
        start_ts = item['_source']['start_ts']
        end_ts = item['_source']['end_ts']

        print "push task_name: ", en_name

        r_event_analysis.lpush(task_event_analysis,
                               json.dumps([topic, en_name, start_ts, end_ts]))

        # mark the task as queued but not yet computed
        item_finish_status['event_value_finish'] = 1

        es.update(index=index_name,
                  doc_type=index_type,
                  id=en_name,
                  body={'doc': item_finish_status})
def dispose_data(task_name, current_ts, during=3600):
    K = 2  # number of most recent time-window snapshots used to build features

    task_detail = es_prediction.get(index=index_manage_prediction_task,
                                    doc_type=type_manage_prediction_task,
                                    id=task_name)["_source"]
    start_time = int(task_detail["start_time"])

    origin_task_name = task_name
    task_name = "micro_prediction_" + task_name

    query_body = {
        "query": {
            "range": {
                "update_time": {
                    "lte": current_ts
                }
            }
        },
        "size": K,
        "sort": {
            "update_time": {
                "order": "desc"
            }
        }
    }

    sort_query_body = {
        "query": {
            "range": {
                "update_time": {
                    "lte": current_ts
                }
            }
        }
    }

    total_count = []
    total_fans_list = []
    total_origin_list = []
    total_retweet_list = []
    total_comment_list = []
    total_uid_list = []
    total_positive_list = []
    total_negetive_list = []
    average_origin_ts = []
    average_retweet_ts = []

    feature_list = []
    results = es_prediction.search(index=task_name,
                                   doc_type=index_type_prediction_task,
                                   body=query_body)["hits"]["hits"]
    location = es_prediction.count(index=task_name,
                                   doc_type=index_type_prediction_task,
                                   body=sort_query_body)["count"]

    if len(results) != K:
        short_len = K - len(results)
        results.extend([[]] * short_len)
    print "former result: ", len(results), K
    results.reverse()
    for item in results:
        if item:
            item = item["_source"]
            #total_fans_list.append(item["total_fans_number"])
            total_origin_list.append(item["origin_weibo_number"])
            total_retweet_list.append(item["retweeted_weibo_number"])
            total_comment_list.append(item["comment_weibo_number"])
            total_count.append(item["total_count"])
            total_uid_list.append(item["total_uid_count"])
            total_positive_list.append(item["positive_count"])
            total_negetive_list.append(item["negetive_count"])
            average_origin_ts.append(item["average_origin_ts"])
            average_retweet_ts.append(item["average_retweet_ts"])
        else:
            #total_fans_list.append(0)
            total_origin_list.append(0)
            total_retweet_list.append(0)
            total_comment_list.append(0)
            total_uid_list.append(0)
            total_count.append(0)
            total_positive_list.append(0)
            total_negetive_list.append(0)
            average_origin_ts.append(0)
            average_retweet_ts.append(0)
    print "total_count: ", total_count

    feature_list = []
    feature_list.append(math.log(int(total_retweet_list[-1] + 1)))
    feature_list.append(math.log(int(total_comment_list[-1] + 1)))
    feature_list.append(math.log(int(total_positive_list[-1] + 1)))
    feature_list.append(math.log(int(total_negetive_list[-2] + 1)))
    feature_list.append(math.log(int(total_negetive_list[-1] + 1)))
    feature_list.append(math.log(int(total_count[-1] + 1)))
    feature_list.append(math.log(int(total_uid_list[-1] + 1)))
    if int(during) == 3 * 3600:
        feature_list.append(average_origin_ts[-1])
        feature_list.append(average_retweet_ts[-1])

    # load model and prediction
    if int(during) == 3600:
        if total_count[-1] - total_count[-2] >= -0.2 * total_count[-2]:
            with open("model-up.pkl", "r") as f:
                gbdt = pickle.load(f)
        else:
            with open("model-down.pkl", "r") as f:
                gbdt = pickle.load(f)
    elif int(during) == 3 * 3600:
        with open("model-3.pkl", "r") as f:
            gbdt = pickle.load(f)

    print "feature_list: ", feature_list
    pred = gbdt.predict(feature_list)
    for item in pred:
        prediction_value = item
        prediction_value = math.exp(prediction_value)
        print "prediction_valie: ", prediction_value

    # update scan processing
    #es_prediction.update(index=index_manage_prediction_task,doc_type=type_manage_prediction_task, \
    #        id=origin_task_name, body={"doc":{"scan_text_processing":"0"}})

    # update prediction value in es
    task_detail = es_prediction.get(index=index_manage_prediction_task, \
            doc_type=type_manage_prediction_task, id=origin_task_name)["_source"]
    if current_ts >= int(task_detail["stop_time"]):
        task_detail["finish"] = "1"
        task_detail["processing_status"] = "0"

        # update task info
        es_prediction.index(index=index_manage_prediction_task, \
            doc_type=type_manage_prediction_task, id=origin_task_name, body=task_detail)

    # update prediction
    es_prediction.update(index=task_name,
                         doc_type=index_type_prediction_task,
                         id=current_ts,
                         body={"doc": {
                             "prediction_value": prediction_value
                         }})

    return True
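
gbdt.predict(feature_list) is called with a flat list, which only works on the old scikit-learn release this code targets; newer releases require a 2-D array of shape (n_samples, n_features) for a single sample. A small version-tolerant sketch:

import numpy as np

def predict_single(model, feature_list):
    # reshape one sample to (1, n_features) so the call also works on current scikit-learn
    X = np.asarray(feature_list, dtype=float).reshape(1, -1)
    return float(model.predict(X)[0])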
Example #9
                    },
                    "future_user_info": {
                        "type": "string",
                        "index": "no"
                    },
                    "current_hot_weibo": {
                        "type": "string",
                        "index": "no"
                    },
                    "potential_hot_weibo": {
                        "type": "string",
                        "index": "no"
                    }
                }
            }
        }
    }

    index_name = "stimulation_" + task_name

    exist_bool = es.indices.exists(index=index_name)
    if not exist_bool:
        es.indices.create(index=index_name, body=index_info, ignore=400)

    return "1"


if __name__ == "__main__":
    es.update(index="manage_interfere_task", doc_type="interfere_task", id=\
            "mao_ze_dong_dan_chen_ji_nian_ri", body={"doc":{"scan_text_finish":"1"}})
                    }
                }
            }
        }
    }

    if not es.indices.exists(index=index_manage_interfere_task):

        es.indices.create(index=index_manage_interfere_task,
                          body=index_info,
                          ignore=400)


if __name__ == "__main__":
    #manage_interfere_task()

    es.indices.put_mapping(
        index=index_manage_interfere_task,
        doc_type=type_manage_interfere_task,
        body={"properties": {
            "update_time": {
                "type": "long"
            }
        }})
    es.update(index=index_manage_interfere_task,
              doc_type=type_manage_interfere_task,
              id="mao_ze_dong_dan_chen_ji_nian_ri",
              body={"doc": {
                  "update_time": 1482724800
              }})
                    "timestamp": {
                        "type": "long",
                    },
                    # data update time
                    "update_time": {
                        "type": "long"
                    },
                    "prediction_value": {
                        "type": "double"
                    },
                    "create_by": {
                        "type": "string",
                        "index": "not_analyzed"
                    }
                }
            }
        }
    }

    if not es.indices.exists(index=task_name):

        es.indices.create(index=task_name, body=index_info, ignore=400)

    return "1"


if __name__ == "__main__":
    #es.indices.create(index="micro_prediction_task", ignore=400)
    es.update(index="manage_prediction_task", doc_type="prediction_task", id="mao_ze_dong_dan_chen_ji_nian_ri",\
            body={"doc":{"finish":"0"}})
Example #12
def predict_user_influence(task_name, stop_time, ts):
    future_total = 0   # total future diffusion volume
    current_total = 0  # controllable scope
    uid_count, in_user_list, in_user_info, all_user_dict = extend_network(task_name, ts)

    with open("gbdt.pkl", "r") as f:
        gbdt = pickle.load(f)

    # threshold for important users who have already appeared in the network
    try:
        in_user_threshold = float(r_stimulation.get("in_user_threshold"))
    except (TypeError, ValueError):
        # key missing or malformed: fall back to the default threshold
        r_stimulation.set("in_user_threshold", 1000)
        in_user_threshold = 1000


    in_results = gbdt.predict(in_user_info)
    print "len(in_user_list): ", len(in_user_list)
    prediction_in = dict()
    for i in range(len(in_user_list)):
        if math.exp(in_results[i]) > in_user_threshold: # 1000
            prediction_in[in_user_list[i]] = math.exp(in_results[i])


    future_dict = dict()
    count = 0
    for k,v in all_user_dict.iteritems():
        uid = k
        print "k: ", k
        print "v: ", len(v)
        tmp_prediction_list = [] # tmp storage
        tmp_uid_list = []
        if 1:
            user_list = v
            list_len = len(user_list)
            len_1000 = list_len/1000
            for i in range(len_1000+1):
                tmp_uid = user_list[i*1000: (i+1)*1000]
                if not tmp_uid:
                    continue
                es_results = es_retweet.mget(index=index_be_retweet,doc_type=index_type_be_retweet, body={"ids":tmp_uid})["docs"]
                for item in es_results:
                    if item["found"]:
                        count += 1
                        uid_be_retweet = json.loads(item["_source"]["uid_be_retweet"])
                        retweet_count = len(uid_be_retweet)
                        if retweet_count < 1000:
                            continue
                        tmp = []
                        tmp.append(math.log(retweet_count+1))
                        tmp.append(math.log(uid_count+1))
                        tmp_prediction_list.append(tmp)
                        tmp_uid_list.append(item["_id"])
                        if count % 1000 == 0:
                            iter_prediction_list, t1, t2 = prediction_model(uid,gbdt, tmp_prediction_list, tmp_uid_list, future_dict)
                            future_dict = iter_prediction_list
                            tmp_prediction_list = []
                            tmp_uid_list = []
                            future_total += t1
                            current_total += t2
                            print "iter prediction: ", count

        if tmp_prediction_list:
            iter_prediction_list, t1, t2 = prediction_model(uid,gbdt, tmp_prediction_list, tmp_uid_list, future_dict)
            future_dict = iter_prediction_list
            future_total += t1
            current_total += t2
            print "future_dict: ", future_dict

    # storage
    save_results(task_name, ts, prediction_in, future_dict)

    # do left things
    dispose_results(task_name, ts, future_total, current_total)


    # update processing state
    es_prediction.update(index=index_manage_interfere_task,doc_type=type_manage_interfere_task,\
            id=task_name, body={"doc":{"stimulation_processing_status":"0", "update_time": ts, "scan_text_finish":"0"}})

    # stop task
    if ts >= stop_time:
        es_prediction.update(index=index_manage_interfere_task,doc_type=\
                type_manage_interfere_task,id=task_name,body={"doc":{"finish":"1"}})
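
Both extend_network and predict_user_influence page long uid lists through es_retweet.mget in slices of 1000. A small generator that keeps the slicing in one place, sketched under the assumption that es_retweet, index_be_retweet and index_type_be_retweet are the same objects used above:

def iter_be_retweet_docs(uid_list, size=1000):
    # yield only the mget documents that were actually found
    for i in range(0, len(uid_list), size):
        chunk = uid_list[i:i + size]
        docs = es_retweet.mget(index=index_be_retweet,
                               doc_type=index_type_be_retweet,
                               body={"ids": chunk})["docs"]
        for doc in docs:
            if doc["found"]:
                yield doc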
def rank_predict(event, start_ts, end_ts):

    feature_list = feature_compute(event, start_ts, end_ts)
    print 'feature_list:::::', feature_list
    feature_list_gbdt = []
    for i in range(len(feature_list)):
        # split the compound feature into separate values (extend)

        if i == 15:
            feature_list[i] = json.loads(json.dumps(feature_list[i]))
            print 'type::', type(feature_list[i])
            print 'feature_list[i][at_0]::', feature_list[i]['at_0']

            feature_list_gbdt.append(feature_list[i]['at_0'])
            feature_list_gbdt.append(feature_list[i]['at_1'])
            feature_list_gbdt.append(feature_list[i]['at_2'])
            feature_list_gbdt.append(feature_list[i]['at>3'])
        elif i == 16:
            feature_list[i] = json.loads(json.dumps(feature_list[i]))

            print 'feature_list[i]:::', feature_list[i]
            feature_list_gbdt.append(feature_list[i][0])
            feature_list_gbdt.append(feature_list[i][1])
            feature_list_gbdt.append(feature_list[i][2])
            feature_list_gbdt.append(feature_list[i][3])
        else:
            feature_list_gbdt.append(feature_list[i])

    print 'feature_list_gbdt:::::', feature_list_gbdt

    # load the weibo model
    with open("0305_macro-prediction-weibos-value.pkl", "rb") as f:
        gbdt = pickle.load(f)

    pred = gbdt.predict(feature_list_gbdt)

    for item in pred:
        predict_weibo_value = item

    # load the user model
    with open("0305_macro-prediction-uids-value.pkl", "rb") as f:
        gbdt = pickle.load(f)

    pred = gbdt.predict(feature_list_gbdt)

    for item in pred:
        predict_user_value = item

    predict_rank = get_rank(predict_user_value)

    ## store into the event info table
    #for i in range(len(feature_list)):
    feature_results = {}
    feature_results['event'] = event
    '''
    feature_results['topic_field'] = feature_list[0]
    feature_results['total_num'] = feature_list[1]
    feature_results['total_user_fans'] = feature_list[2]
    feature_results['total_comment'] = feature_list[3]
    feature_results['total_retweet'] = feature_list[4]
    feature_results['total_sensitive'] = feature_list[5]
    feature_results['total_sensitive_ratio'] = feature_list[6]
    feature_results['total_negtive'] = feature_list[7]
    feature_results['total_important_user'] = feature_list[8]   
    feature_results['total_origin_type'] = feature_list[9]
    feature_results['origin_ratio'] = feature_list[10]
    feature_results['total_retweet_type'] = feature_list[11]
    feature_results['retweet_ratio'] = feature_list[12]
    feature_results['total_comment_type'] = feature_list[13]
    feature_results['comment_ratio'] = feature_list[14]
    feature_results['at_count'] = feature_list[15]
    feature_results['event_uid_count'] = feature_list[16]
    feature_results['event_trend_delta'] = feature_list[17]
    feature_results['predict_weibo_value'] = predict_weibo_value
    feature_results['predict_user_value'] = predict_user_value
    feature_results['predict_rank'] = predict_rank
    feature_results['update_time'] = time.time()
    '''
    #feature_results['topic_field'] = feature_list[0]
    feature_results['uid_count'] = feature_list[0]
    feature_results['total_num'] = feature_list[1]
    feature_results['total_user_fans'] = feature_list[2]
    feature_results['total_comment'] = feature_list[3]
    feature_results['total_retweet'] = feature_list[4]
    feature_results['total_sensitive'] = feature_list[5]
    feature_results['total_sensitive_ratio'] = feature_list[6]
    feature_results['total_negtive'] = feature_list[7]
    feature_results['total_important_user'] = feature_list[8]
    feature_results['total_origin_type'] = feature_list[9]
    feature_results['origin_ratio'] = feature_list[10]
    feature_results['total_retweet_type'] = feature_list[11]
    feature_results['retweet_ratio'] = feature_list[12]
    feature_results['total_comment_type'] = feature_list[13]
    feature_results['comment_ratio'] = feature_list[14]
    feature_results['at_count'] = json.dumps(feature_list[15])

    feature_results['event_trend_delta'] = json.dumps(feature_list[16])
    feature_results['predict_weibo_value'] = predict_weibo_value
    feature_results['predict_user_value'] = predict_user_value
    feature_results['predict_rank'] = predict_rank
    feature_results['update_time'] = time.time()
    '''
    save_event_info_results(event,topic_field,total_num,total_user_fans,\
                                total_comment,total_retweet,total_sensitive,\
                                total_sensitive_ratio,total_negtive,total_important_user,\
                                total_origin_type,origin_ratio,total_retweet_type,retweet_ratio,\
                                total_comment_type,comment_ratio,at_count,event_uid_count,\
                                event_trend_delta,predict_value,predict_rank,update_time)
    '''
    #update macro features & results

    # keep feature_results as a dict: elasticsearch-py accepts dict bodies, and the
    # partial-update payload ({"doc": ...}) must be a JSON object, not a string
    try:
        es_prediction.get(index=index_macro_feature_result, doc_type=type_macro_feature_result,\
                          id=event)['_source']
        es_prediction.update(index=index_macro_feature_result, doc_type=type_macro_feature_result,\
                             id=event, body={'doc': feature_results})
    except:
        es_prediction.index(index=index_macro_feature_result, doc_type=type_macro_feature_result,\
                            id=event, body=feature_results)

    # update task info: "macro_value_finish"
    task_detail = es_prediction.get(index=index_manage_prediction_task, \
            doc_type=type_manage_prediction_task, id=event)["_source"]
    task_detail["macro_value_finish"] = '1'
    es_prediction.index(index=index_manage_prediction_task, \
            doc_type=type_manage_prediction_task, id=event, body=task_detail)
    print 'feature_results::::', feature_results
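
The get/update/index fallback near the end of rank_predict can be collapsed into one call: Elasticsearch's update API accepts doc_as_upsert, which indexes the document when the id does not yet exist. A minimal sketch, assuming the same elasticsearch-py client:

def save_macro_features(es_client, index_name, doc_type, event, feature_results):
    # single-call upsert: update if the doc exists, otherwise create it
    es_client.update(index=index_name, doc_type=doc_type, id=event,
                     body={"doc": feature_results, "doc_as_upsert": True})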