def get_event_uid_count(task_name,start_ts,end_ts):

    event_uid_set = set()
    query_body = {
        'query':{
            'filtered':{
                'filter':{
                    'range':{
                        'timestamp':{'gte': start_ts, 'lt':end_ts}
                    }
                }
            }
        },
        'size':99999999
    }

    es_results = es.search(index=task_name,doc_type='text',body=query_body)['hits']['hits']
    for result in es_results:
        #print 'result:::',result
        event_uid_set.add(result['_source']['uid'])
        # root_uid / directed_uid are optional fields; a missing one should not skip the rest
        try:
            event_uid_set.add(result['_source']['root_uid'])
        except KeyError:
            pass
        try:
            event_uid_set.add(result['_source']['directed_uid'])
        except KeyError:
            pass

    uids_count = len(event_uid_set)
    
    return uids_count
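
A minimal usage sketch, assuming the `es` client configured elsewhere in the project, 'test_task' as a placeholder index name, and a fixed one-hour window (the timestamp is the same test value reused later in this listing):

start_ts = 1482681600        # test timestamp, same value as the RUN_TYPE == 0 branches below
end_ts = start_ts + 3600     # one-hour window
print get_event_uid_count('test_task', start_ts, end_ts)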
Example #2
def create_task_list(given_ts):
    # 1. search from manage_sensing_task
    # 2. push to redis list-----task_work

    # print start info
    current_path = os.getcwd()
    file_path = os.path.join(current_path, 'task_list.py')
    now_ts = datehour2ts(ts2datehour(time.time() - 3600))
    print_log = "&".join([file_path, "start", ts2date(now_ts)])
    print print_log
    #ts = ts - 3600

    query_body = {"query": {"match_all": {}}}

    search_results = es.search(index=index_sensing,
                               doc_type=type_sensing,
                               body=query_body)['hits']['hits']

    count = 0
    if search_results:
        for iter_item in search_results:
            _id = iter_item['_id']
            item = iter_item['_source']
            task = []
            task.append(item['task_name'])  # task_name
            task.append(json.loads(item['social_sensors']))  # social sensors
            #task.append(now_ts)
            task.append(given_ts)
            r.lpush('task_name', json.dumps(task))
            count += 1

    print count
    print_log = "&".join([file_path, "end", ts2date(time.time())])
    print print_log
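
create_task_list is the producer half of a Redis work queue; a worker on the other side would pop and unpack the same JSON payload. A minimal consumer sketch, assuming the same `r` client and a hypothetical handle_task function for the downstream work:

def consume_task_list():
    # blocking pop from the same list that create_task_list pushes to
    while True:
        item = r.brpop('task_name', timeout=60)
        if item is None:
            break  # queue drained
        _, payload = item
        task_name, social_sensors, window_ts = json.loads(payload)
        handle_task(task_name, social_sensors, window_ts)  # hypothetical worker function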
def search_times(task_name, uid, ts):
    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [
                            {"range": {"timestamp": {"lt": ts}}}
                        ],
                        "should": [
                            {"term": {"directed_uid": int(uid)}},
                            {"term": {"root_uid": str(uid)}}
                        ]
                    }
                }
            }
        },
        "aggs": {
            "uid_count": {
                "cardinality": {"field": "uid"}
            }
        }
    }

    count = es.search(index=task_name, doc_type="text", body=query_body)["aggregations"]["uid_count"]["value"]

    return count
Example #4
def create_task():
    #ts = time.time()
    #current_ts = datehour2ts(ts2datehour(ts))
    index_name = index_manage_event_analysis
    index_type = type_manage_event_analysis

    query_body = {
        "query": {
            "term": {
                "event_value_finish": "0"
            }
        },
        "size": 10000
    }

    results = es.search(index=index_name, doc_type=index_type,
                        body=query_body)["hits"]["hits"]

    #item_finish_status = {}

    for item in results:
        topic = item["_source"]["task_name"]
        en_name = item["_source"]["pinyin_task_name"]
        start_ts = item['_source']['start_time']
        end_ts = item['_source']['stop_time']

        print "push task_name: ", en_name

        r_event_analysis.lpush(task_event_analysis,
                               json.dumps([topic, en_name, start_ts, end_ts]))
def create_task():
    ts = time.time()
    current_ts = datehour2ts(ts2datehour(ts))
    query_body = {"query": {"term": {"finish": "0"}}, "size": 10000}

    results = es_prediction.search(index=index_manage_prediction_task,\
            doc_type=type_manage_prediction_task, body=query_body)["hits"]["hits"]
    for item in results:
        task_name = item["_source"]["pinyin_task_name"]
        print "push task_name: ", task_name
        update_time = item["_source"]["scan_text_time"]
        stop_time = item["_source"]["stop_time"]
        if current_ts > stop_time:
            es_prediction.update(index=index_manage_prediction_task,doc_type=type_manage_prediction_task,\
                    id=task_name, body={"doc":{"finish":"1"}})
        during = item["_source"]["micro_during"]
        if current_ts - update_time >= during:
            r_micro.lpush(
                task_micro_prediction,
                json.dumps([
                    task_name, item["_source"]["scan_text_time"], current_ts,
                    during
                ]))

    return True
Example #6
def create_task():
    ts = time.time()
    if RUN_TYPE:
        current_ts = datehour2ts(ts2datehour(ts))
    else:
        current_ts = 1482861600
    query_body = {
        "query": {
            "term":{"finish":"0"}
        },
        "size":10000
    }

    results = es_prediction.search(index=index_manage_prediction_task,\
            doc_type=type_manage_prediction_task, body=query_body)["hits"]["hits"]
    for item in results:
        print item
        task_name = item["_source"]["pinyin_task_name"]
        stop_time = item["_source"]["stop_time"]
        print stop_time, current_ts
        if stop_time < current_ts:
            es_prediction.update(index=index_manage_prediction_task,\
                    doc_type=type_manage_prediction_task, id=task_name,  body={"doc":{"macro_trendline_finish":"1", "finish": "1"}})
        else:
            r_trendline.lpush(task_trendline, task_name)
Example #7
def extend_network(task_name, ts):

    index_name = task_name
    # number of users that have participated so far
    query_uid = {
        "query":{
            "filtered":{
                "filter":{
                    "range":{
                        "timestamp":{
                            "lt": ts
                        }
                    }
                }
            }
        },
        "aggs":{
            "uid_count":{"cardinality":{"field": "uid"}}
        }
    }
    uid_count = es_prediction.search(index=index_name, doc_type="text", \
            body=query_uid)["aggregations"]["uid_count"]["value"]

    try:
        extend_retweet_threshold = float(r_stimulation.get("extend_retweet_threshold"))
    except (TypeError, ValueError):
        # key missing or malformed: fall back to the default threshold and store it
        r_stimulation.set("extend_retweet_threshold", 10000)
        extend_retweet_threshold = 10000

    user_list = organize_network(task_name, ts)
    exist_user_set = set(user_list)
    in_user_list = list()   # users that already exist in the network
    in_user_info = []
    count = 0
    all_user_dict = dict()  # participating user -> list of extended users
    list_len = len(user_list)
    len_1000 = list_len/1000
    for i in range(len_1000+1):
        tmp_uid = user_list[i*1000: (i+1)*1000]
        es_results = es_retweet.mget(index=index_be_retweet,doc_type=index_type_be_retweet, body={"ids":tmp_uid})["docs"]
        for item in es_results:
            if item["found"]:
                count +=1
                if count % 1000 == 0:
                    print "extend network: ", count
                uid_be_retweet = json.loads(item["_source"]["uid_be_retweet"])
                retweet_count = len(uid_be_retweet)
                if retweet_count < extend_retweet_threshold:  # threshold for extending outward
                    continue
                uid_retweet_list = uid_be_retweet.keys()
                uid_retweet_list = list(set(uid_retweet_list) - exist_user_set)
                all_user_dict[item["_id"]] = uid_retweet_list  # newly extended users
                in_user_list.append(item["_id"])
                in_user_info.append([math.log(retweet_count+1), math.log(uid_count+1)])

    return uid_count,in_user_list, in_user_info, all_user_dict
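
extend_network depends on an organize_network helper that is not shown in this file. A hedged sketch of what it might look like, assuming it only collects the distinct uids already active in the task index before ts (field names follow the queries above; the real implementation may differ):

def organize_network(task_name, ts):
    # gather uids that have already posted in the task index before ts (sketch only)
    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "range": {"timestamp": {"lt": ts}}
                }
            }
        },
        "aggs": {
            "all_uid": {"terms": {"field": "uid", "size": 100000}}
        }
    }
    buckets = es_prediction.search(index=task_name, doc_type="text",
                                   body=query_body)["aggregations"]["all_uid"]["buckets"]
    return [item["key"] for item in buckets]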
def search_hot_mid(task_name, ts):
    query_body = {
        "query": {
            "range":{
                "timestamp":{
                    "lt": ts
                }
            }
        },
        "aggs":{
            "hot_mid":{
                "terms":{"field": "root_mid", "size": 100}
            }
        }
    }

    mid_list = []
    return_list = [] # return hot mid
    uid_list = []
    es_results = es.search(index=task_name, doc_type="text", body=query_body)["aggregations"]["hot_mid"]["buckets"]
    for item in es_results:
        if item["doc_count"] >= 500:
            mid_list.append(item["key"])

    if mid_list:
        weibo_results = es.mget(index=task_name, doc_type="text", body={"ids":mid_list})["docs"]
        for item in weibo_results:
            if item["found"]:
                mid = item["_id"]
                retweet, comment = search_retweet_comment(task_name, mid)
                detail = item["_source"]
                detail["retweet"] = retweet
                detail["comment"] = comment
                uid_list.append(detail["uid"])
                return_list.append(detail)
        if uid_list:
            profile_results = es_user_profile.mget(index=profile_index_name, doc_type=profile_index_type, body={"ids":uid_list})["docs"]
            for i in range(len(uid_list)):
                detail = profile_results[i]
                if detail["found"]:
                    return_list[i]["uname"] = detail["_source"]["nick_name"]
                    return_list[i]["photo_url"] = detail["_source"]["photo_url"]
                    return_list[i]["fansnum"] = detail["_source"]["fansnum"]
                    return_list[i]["statusnum"] = detail["_source"]["statusnum"]
                else:
                    return_list[i]["uname"] = detail["_id"]
                    return_list[i]["photo_url"] = ""
                    return_list[i]["fansnum"] = ""
                    return_list[i]["statusnum"] = ""
    return return_list
Example #9
def create_task():
    query_body = {
        "query": {
            "bool":{
                "must":[
                    {"term": {"finish": "0"}}
                ]
            }
        },
        "size": 10000
    }

    es_results = es_prediction.search(index=index_manage_interfere_task, doc_type=type_manage_interfere_task,\
            body=query_body)["hits"]["hits"]

    task_list = []
    if int(RUN_TYPE) == 1:
        current_ts = datehour2ts(ts2datehour(time.time()))
    else:
        current_ts = 1482681600 + 18*3600
    for item in es_results:
        tmp = []
        task_detail = item["_source"]
        task_name = task_detail['pinyin_task_name']
        update_time = task_detail["update_time"]
        sti_during = task_detail["stimulation_during"]
        stop_time =  task_detail["stop_time"]
        if RUN_TYPE == 1:
            if stop_time < current_ts:  # the task is past its stop time, so mark it finished
                es_prediction.update(index=index_manage_interfere_task, doc_type=type_manage_interfere_task,\
                    id=task_name, body={"doc":{"finish": "1"}})
        tmp.append(task_name)
        tmp.append(task_detail["stop_time"])
        tmp.append(task_detail["scan_text_finish"])
        tmp.append(current_ts)
        if current_ts - update_time >= sti_during:
            r_stimulation.lpush(task_stimulation, json.dumps(tmp))

            # update: processing status
            es_prediction.update(index=index_manage_interfere_task,doc_type=type_manage_interfere_task,\
                id=task_name, body={"doc":{"stimulation_processing_status":"1"}})


    return True
def dispose_data(task_name, current_ts, during=3600):
    K = 2  ########

    task_detail = es_prediction.get(index=index_manage_prediction_task,
                                    doc_type=type_manage_prediction_task,
                                    id=task_name)["_source"]
    start_time = int(task_detail["start_time"])

    origin_task_name = task_name
    task_name = "micro_prediction_" + task_name

    query_body = {
        "query": {
            "range": {
                "update_time": {
                    "lte": current_ts
                }
            }
        },
        "size": K,
        "sort": {
            "update_time": {
                "order": "desc"
            }
        }
    }

    sort_query_body = {
        "query": {
            "range": {
                "update_time": {
                    "lte": current_ts
                }
            }
        }
    }

    total_count = []
    total_fans_list = []
    total_origin_list = []
    total_retweet_list = []
    total_comment_list = []
    total_uid_list = []
    total_positive_list = []
    total_negetive_list = []
    average_origin_ts = []
    average_retweet_ts = []

    feature_list = []
    results = es_prediction.search(index=task_name,
                                   doc_type=index_type_prediction_task,
                                   body=query_body)["hits"]["hits"]
    location = es_prediction.count(index=task_name,
                                   doc_type=index_type_prediction_task,
                                   body=sort_query_body)["count"]

    if len(results) != K:
        short_len = K - len(results)
        results.extend([[]] * short_len)
    print "former result: ", len(results), K
    results.reverse()
    for item in results:
        if item:
            item = item["_source"]
            #total_fans_list.append(item["total_fans_number"])
            total_origin_list.append(item["origin_weibo_number"])
            total_retweet_list.append(item["retweeted_weibo_number"])
            total_comment_list.append(item["comment_weibo_number"])
            total_count.append(item["total_count"])
            total_uid_list.append(item["total_uid_count"])
            total_positive_list.append(item["positive_count"])
            total_negetive_list.append(item["negetive_count"])
            average_origin_ts.append(item["average_origin_ts"])
            average_retweet_ts.append(item["average_retweet_ts"])
        else:
            #total_fans_list.append(0)
            total_origin_list.append(0)
            total_retweet_list.append(0)
            total_comment_list.append(0)
            total_uid_list.append(0)
            total_count.append(0)
            total_positive_list.append(0)
            total_negetive_list.append(0)
            average_origin_ts.append(0)
            average_retweet_ts.append(0)
    print "total_count: ", total_count

    feature_list = []
    feature_list.append(math.log(int(total_retweet_list[-1] + 1)))
    feature_list.append(math.log(int(total_comment_list[-1] + 1)))
    feature_list.append(math.log(int(total_positive_list[-1] + 1)))
    feature_list.append(math.log(int(total_negetive_list[-2] + 1)))
    feature_list.append(math.log(int(total_negetive_list[-1] + 1)))
    feature_list.append(math.log(int(total_count[-1] + 1)))
    feature_list.append(math.log(int(total_uid_list[-1] + 1)))
    if int(during) == 3 * 3600:
        feature_list.append(average_origin_ts[-1])
        feature_list.append(average_retweet_ts[-1])

    # load model and prediction
    if int(during) == 3600:
        if total_count[-1] - total_count[-2] >= -0.2 * total_count[-2]:
            with open("model-up.pkl", "r") as f:
                gbdt = pickle.load(f)
        else:
            with open("model-down.pkl", "r") as f:
                gbdt = pickle.load(f)
    elif int(during) == 3 * 3600:
        with open("model-3.pkl", "r") as f:
            gbdt = pickle.load(f)

    print "feature_list: ", feature_list
    pred = gbdt.predict(feature_list)
    for item in pred:
        prediction_value = item
        prediction_value = math.exp(prediction_value)
        print "prediction_valie: ", prediction_value

    # update scan processing
    #es_prediction.update(index=index_manage_prediction_task,doc_type=type_manage_prediction_task, \
    #        id=origin_task_name, body={"doc":{"scan_text_processing":"0"}})

    # update prediction value in es
    task_detail = es_prediction.get(index=index_manage_prediction_task, \
            doc_type=type_manage_prediction_task, id=origin_task_name)["_source"]
    if current_ts >= int(task_detail["stop_time"]):
        task_detail["finish"] = "1"
        task_detail["processing_status"] = "0"

        # update task info
        es_prediction.index(index=index_manage_prediction_task, \
            doc_type=type_manage_prediction_task, id=origin_task_name, body=task_detail)

    # update prediction
    es_prediction.update(index=task_name,
                         doc_type=index_type_prediction_task,
                         id=current_ts,
                         body={"doc": {
                             "prediction_value": prediction_value
                         }})

    return True
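
The pickled regressors (model-up.pkl, model-down.pkl, model-3.pkl) are only loaded here, never built. A hedged sketch of how such a model could be produced offline with scikit-learn, assuming a feature matrix X shaped like feature_list above and log-scaled future counts y (the hyperparameters are illustrative):

import pickle
from sklearn.ensemble import GradientBoostingRegressor

def train_and_dump(X, y, path="model-up.pkl"):
    # fit a gradient-boosted regressor and pickle it the way dispose_data expects to load it
    gbdt = GradientBoostingRegressor(n_estimators=200, max_depth=3, learning_rate=0.05)
    gbdt.fit(X, y)
    with open(path, "w") as f:   # text-mode pickle, matching the "r" mode used when loading
        pickle.dump(gbdt, f)
    return gbdt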
Example #11
def dispose_data(task_name, current_ts):
    es_result = es_prediction.get(index=index_manage_prediction_task,
                                  doc_type=type_manage_prediction_task,
                                  id=task_name)["_source"]
    macro_during = es_result['macro_during']
    start_ts = datehour2ts(ts2datehour(es_result["submit_time"]))
    task_start_ts = start_ts
    end_ts = datehour2ts(ts2datehour(es_result["stop_time"]))

    index_micro = "micro_prediction_" + task_name
    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "range": {
                        "update_time": {
                            "lte": current_ts
                        }
                    }
                }
            }
        },
        "size": 10000,
        "sort": {
            "update_time": {
                "order": "asc"
            }
        }
    }
    micro_results = es_prediction.search(index=index_micro,
                                         doc_type="micro_task",
                                         body=query_body)["hits"]["hits"]
    total_list = []

    for item in micro_results:
        total_list.append(item["_source"]["total_count"])
    # weibo count within each micro time slot

    total_len = (end_ts - start_ts) / macro_during
    times = int(macro_during) / 3600
    lenth = len(total_list) / times
    adjust_list = []
    time_list = []
    count = 0
    i = 0
    for item in total_list:
        count += item
        i += 1
        if i % times == 0:
            if start_ts <= current_ts:
                adjust_list.append(count)
                count = 0
                time_list.append(start_ts)
            else:
                break
        start_ts += 3600

    # overall time axis of the macro trend
    total_time_list = []
    for i in range(total_len):
        total_time_list.append(task_start_ts + i * macro_during)

    left_time = list(set(total_time_list) - set(time_list))
    left_time = sorted(left_time)

    return adjust_list, total_len, time_list, left_time
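
A worked example of the bucketing above: with macro_during = 3 * 3600 and hourly micro counts [5, 7, 9, 4, 6, 8], times is 3, so adjust_list comes out as [5+7+9, 4+6+8] = [21, 18]; time_list records start_ts as it stood when each bucket closed (S + 7200 and S + 18000 for an initial start_ts of S), and left_time keeps the macro timestamps not yet covered by the data.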
Example #12
def organize_feature(task_name, mid, ts):

    result = dict()
    try:
        result = es.get(index=task_name, doc_type="text", id=mid)["_source"]
    except:
        pass
    if not result:
        return [0, 0, 0, 0, 0, 0, 0]

    ts = result["timestamp"]

    query_body = {"query": {"term": {"root_mid": mid}}}
    #total_weibo
    #count = es.count(index=index_list, doc_type="text", body=query_body)["count"]

    query_body_uid = {
        "query": {
            "term": {
                "root_mid": mid
            }
        },
        "aggs": {
            "uid_count": {
                "cardinality": {
                    "field": "uid"
                }
            }
        }
    }
    # total_uid
    #total_uid_count = es.search(index=index_list, doc_type="text", body=query_body_uid)['aggregations']["uid_count"]["value"]

    feature_list = []
    feature_list.append(math.log(result["user_fansnum"] + 1))
    query_body_ts = {
        "query": {
            "bool": {
                "must": [{
                    "term": {
                        "root_mid": mid
                    }
                }, {
                    "range": {
                        "timestamp": {
                            "lt": ts + 3600 * 10
                        }
                    }
                }]
            }
        },
        "aggs": {
            "weibo_type": {
                "terms": {
                    "field": "message_type"
                }
            }
        }
    }
    comment = 0
    retweet = 0
    tmp_count = es.search(
        index=task_name, doc_type="text",
        body=query_body_ts)['aggregations']["weibo_type"]["buckets"]
    if tmp_count:
        for item in tmp_count:
            if int(item["key"]) == 2:
                comment = item["doc_count"]
            elif int(item["key"]) == 3:
                retweet = item["doc_count"]
    feature_list.append(comment + retweet)
    feature_list.append(retweet)
    feature_list.append(comment)
    feature_list.append(retweet / float(comment + retweet + 1))
    feature_list.append(comment / float(comment + retweet + 1))
    query_body_uid = {
        "query": {
            "bool": {
                "must": [{
                    "term": {
                        "root_mid": mid
                    }
                }, {
                    "range": {
                        "timestamp": {
                            "lt": ts + 3600 * 10
                        }
                    }
                }]
            }
        },
        "aggs": {
            "uid_count": {
                "cardinality": {
                    "field": "uid"
                }
            }
        }
    }
    uid_count = es.search(
        index=task_name, doc_type="text",
        body=query_body_uid)['aggregations']["uid_count"]["value"]
    feature_list.append(uid_count)
    #feature_list.append(topic_field_dict[topic])

    return feature_list
Example #13
def update_prediction(ts):  # current ts
    query_body = {
        "query": {
            "range": {
                "timestamp": {
                    "gte": ts - 10 * 3600,
                    "lte": ts
                }
            }
        },
        "size": 20000,
        "sort": {
            "timestamp": {
                "order": "asc"
            }
        }
    }

    es_results = es_prediction.search(index="social_sensing_text",doc_type="text",\
            body=query_body, _source=False,fields=["mid","timestamp"])["hits"]["hits"]
    print "get results lenth: ", len(es_results)
    mid_list = []
    mid_ts_list = []
    feature_list = []
    count = 0
    bulk_action = []
    with open("prediction_uid.pkl", "r") as f:
        uid_model = pickle.load(f)
    with open("prediction_weibo.pkl", "r") as f:
        weibo_model = pickle.load(f)

    print "finish loading"
    for item in es_results:
        mid = item["fields"]["mid"][0]
        mid_ts = item["fields"]["timestamp"][0]
        iter_feature = organize_feature(mid, mid_ts)
        feature_list.append(iter_feature)
        mid_list.append(mid)
        mid_ts_list.append(mid_ts)
        count += 1
        if count % 100 == 0:
            """
            weibo_prediction_result = weibo_model.predict(feature_list)
            uid_prediction_result = uid_model.predict(feature_list)
            print "finish prediction"
            for i in range(len(mid_list)):
                iter_dict = dict()
                iter_dict["mid"] = mid_list[i]
                iter_dict["uid_prediction"] = uid_prediction_result[i]
                iter_dict["weibo_prediction"] = weibo_prediction_result[i]
                tmp_trendline = trendline_list(mid_list[i], weibo_prediction_result[i],mid_ts_list[i])
                iter_dict["trendline"] = json.dumps(tmp_trendline)
                bulk_action.extend([{"update":{"_id":mid_list[i]}}, {"doc":iter_dict}])
                print uid_prediction_result[i], weibo_prediction_result[i],mid_list[i]
            print es_prediction.bulk(bulk_action,index="social_sensing_text",doc_type="text",timeout=600)
            """
            bulk_action = []
            mid_list = []
            mid_ts_list = []
            feature_list = []
            print "iter count: ", count
    if mid_list:
        weibo_prediction_result = weibo_model.predict(feature_list)
        uid_prediction_result = uid_model.predict(feature_list)
        print "finish prediction"
        for i in range(len(mid_list)):
            iter_dict = dict()
            iter_dict["mid"] = mid_list[i]
            iter_dict["uid_prediction"] = uid_prediction_result[i]
            iter_dict["weibo_prediction"] = weibo_prediction_result[i]
            tmp_trendline = trendline_list(mid_list[i],
                                           weibo_prediction_result[i],
                                           mid_ts_list[i])
            iter_dict["trendline"] = json.dumps(tmp_trendline)
            bulk_action.extend([{
                "update": {
                    "_id": mid_list[i]
                }
            }, {
                "doc": iter_dict
            }])
            print uid_prediction_result[i], weibo_prediction_result[
                i], mid_list[i]
        es_prediction.bulk(bulk_action,
                           index="social_sensing_text",
                           doc_type="text",
                           timeout=600)
def potential_user(task_name, ts):
    index_name = "stimulation_"+task_name
    index_type = "stimulation_results"

    # query the current root_mid posts
    query_body = {
        "query": {
            "bool":{
                "must":[
                    {"range":{
                        "timestamp":{
                            "lt": ts
                        }
                    }},
                    {"term":{"message_type":1}},
                    {"range":{
                        "user_fansnum":{
                            "gte": 10000
                        }
                    }}
                ]
            }
        },
        "size": 10000
    }

    es_results = es.search(index=task_name, doc_type="text", body=query_body)["hits"]["hits"]

    mid_list = []
    uid_list = []
    feature_list = []
    prediction_uid = []
    prediction_weibo = []
    with open("prediction_uid.pkl", "r") as f:
        uid_model = pickle.load(f)
    with open("prediction_weibo.pkl", "r") as f:
        weibo_model = pickle.load(f)

    for item in es_results:
        mid_list.append(item["_id"])
        uid_list.append(item["_source"]["uid"])
        tmp_feature_list = organize_feature(task_name, item["_id"], ts)
        feature_list.append(tmp_feature_list)

    # predict once over the whole feature matrix rather than on every loop iteration
    weibo_prediction_result = []
    uid_prediction_result = []
    if feature_list:
        weibo_prediction_result = weibo_model.predict(feature_list)
        uid_prediction_result = uid_model.predict(feature_list)

    future_total = 0
    current_total = 0

    results_dict = dict()
    in_potential_list = []
    for i in range(len(mid_list)):
        mid = mid_list[i]
        uid = uid_list[i]
        iter_count = es.count(index=task_name, doc_type="text", body={"query":{"term":{"root_mid":mid}}})["count"]
        pre_count = weibo_prediction_result[i]
        future_total += abs(pre_count-iter_count)
        if pre_count >= 500 and iter_count <= 500:
            current_total += abs(pre_count-iter_count)
            if not results_dict.has_key(uid):
                results_dict[uid] = dict()
            tmp = dict()
            tmp["mid"] = mid
            tmp["current_count"] = iter_count
            tmp["prediction_count"] = int(pre_count)
            weibo_detail = es.get(index=task_name, doc_type="text", id=mid)["_source"]
            tmp.update(weibo_detail)
            retweet, comment = search_retweet_comment(task_name, mid)
            tmp["retweeted"] = retweet
            tmp["comment"] = comment
            results_dict[uid][mid] = tmp


    # user profile
    tmp_in_list = results_dict.keys()
    if tmp_in_list:
        profile_results = es_user_profile.mget(index=profile_index_name, doc_type=profile_index_type, body={"ids":tmp_in_list})["docs"]
        for i in range(len(tmp_in_list)):
            detail = profile_results[i]
            tmp = []
            uid = tmp_in_list[i]
            if detail["found"]:
                tmp.append(detail["_source"]["nick_name"])
                tmp.append(detail["_source"]["photo_url"])
                tmp.append(detail["_source"]["fansnum"])
                tmp.append(detail["_source"]["statusnum"])
            else:
                tmp.append(detail["_id"])
                tmp.extend(["","",""])
            results_dict[uid]["user_profile"] = tmp


    return results_dict, future_total, current_total
def get_event_trend(task_name,start_ts,end_ts):

    #trend_end_ts = end_ts
    '''
    if end_ts - start_ts < 10800:
        if end_ts - start_ts < 7200:
            trend_start_ts = 

    else:
        trend_start_ts = trend_end_ts - 10800
    '''
    '''
    trend_middle_ts = trend_start_ts + 36000


    #事件趋势统计
    #event_trend = defaultdict(list)
    event_trend_delta_list = []
    #trend_input_list = []

    if trend_end_ts != trend_start_ts:

        trend_middle_ts = trend_start_ts + 3600

        query_body = {
            "query":{
                "bool":{
                    "must":[
                        {"range":{
                            "timestamp":{
                                "gte":trend_start_ts,
                                "lt":trend_middle_ts
                                }
                            }
                        }
                    ]
                }
            },
            "size":99999999
        }

        es_result = es.count(index=task_name,doc_type='text',body=query_body)
        weibo_count = es_result['count']

        
    '''

    event_trend = []
    trend_input_list = []
    event_trend_delta = []
    i = 0 
    while i<10: 
        end_ts = start_ts + 2*3600
        trend_input_list.append((task_name,start_ts,end_ts))
        start_ts = end_ts + 1
        i = i + 2
    #print 'input_list::::',input_list
    for item in trend_input_list:
        trend_event = item[0]
        trend_start_ts = item[1]
        trend_end_ts = item[2]
        
        query_body = {
            "query":{
                "bool":{
                "must":[

                    {"range":{
                    "timestamp":{
                        "gte":trend_start_ts,
                        "lt":trend_end_ts
                        }
                    }
                    }
                ]
                }
            },
            "size":99999999,
            "sort":{"user_fansnum":"desc"}
        }
        
        es_results = es.search(index=task_name,doc_type='text',body=query_body)["hits"]["hits"] 

        # trend of weibo counts over the preceding hours
        event_trend.append(len(es_results))

    for i in range(len(event_trend)-1):
        #print 'event_trend[event][i+1]::',event_trend[event][i+1]
        #print 'event_trend[event][i]::',event_trend[event]
        delta = event_trend[i+1]-event_trend[i]
        print 'delta::::',delta
        event_trend_delta.append(delta)

            #fo_trend.write(str(delta)+'\t')
            #fo_trend.write('\n')   
    print 'event_trend_delta:::',event_trend_delta 
    return event_trend_delta
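
A worked example of the windowing above: since i steps by 2, the while loop runs five times, so for an initial start_ts of T the query windows are (T, T+7200), (T+7201, T+14401), (T+14402, T+21602), (T+21603, T+28803) and (T+28804, T+36004). Note that the end_ts argument is overwritten inside the loop, and event_trend_delta ends up holding the four differences between consecutive window counts.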
def feature_compute(task_name,start_ts,end_ts):

    query_body = {
        "query":{
            "bool":{
                "must":[
                    {"range":{
                        "timestamp":{
                            "gte":start_ts,
                            "lt":end_ts
                            }
                        }
                    }
                ]
            }
        },
        "size":99999999,
        "sort":{"user_fansnum":"desc"}
    }


    es_results = es.search(index=task_name,doc_type='text',body=query_body)["hits"]["hits"] 

    # trend of weibo counts over the preceding hours
    #event_trend[event].append(len(es_results))
     
    # topic domain
    #print 'len(es_re)',len(es_results)
    #field_multi = topic_field(es_results)    

    # totals and averages of participating users' fans, retweets and comments
    total_user_fans = 0
    average_user_fans = 0
 
    total_comment = 0
    average_comment = 0

    total_retweet = 0
    average_retweet = 0

    # count and ratio of sensitive weibos
    total_sensitive = 0
    total_sensitive_ratio = 0
 
    # count and ratio of negative-sentiment weibos
    total_negtive = 0
    total_negtive_ratio = 0

    # count and ratio of important users (fans count > 100000)
    total_important_user = 0
    total_important_user_ratio = 0
    
    # total number of weibos
    total_num = len(es_results)

    # statistics on '@' forwarding depth
    #at_count = defaultdict(int)
    at_count = {}
    at_count['at_0'] = 0
    at_count['at_1'] = 0
    at_count['at_2'] = 0
    at_count['at>3'] = 0


    for result in es_results:
        
        total_user_fans += result['_source']['user_fansnum']
        total_comment += result['_source']['comment']
        total_retweet += result['_source']['retweeted']

        if result['_source']['sensitive'] > 0:
            total_sensitive += 1
        if result['_source']['sentiment'] > 1:
            total_negtive += 1
        if result['_source']['user_fansnum'] > 10000:
            total_important_user += 1
        
        text = result['_source']['text']
        at_list = re.findall('//@',text)
        #print 'at_list:::',at_list
        if len(at_list) == 0:
            at_count['at_0'] += 1
        elif len(at_list) == 1:
            at_count['at_1'] += 1
        elif len(at_list) == 2:
            at_count['at_2'] += 1
        else:
            at_count['at>3'] += 1
    
    #print 'at+count::',at_count        
    average_user_fans = float(total_user_fans)/total_num
    average_comment = float(total_comment)/total_num
    average_retweet = float(total_retweet)/total_num
    
    total_sensitive_ratio = float(total_sensitive)/total_num
    total_negtive_ratio = float(total_negtive)/total_num
    total_important_user_ratio = float(total_important_user)/total_num

 
    query_body_type_count = {
        'query':{
            'bool':{
                'must':[
                    {'range':{
                        'timestamp':{
                            'gte':start_ts,
                            'lt':end_ts
                        }
                    }}
                ]
            }
        },
        'size':999999999,
        'aggs':{
            'all_weibo':{
                'terms':{'field':'message_type'}
            }
        }
    }

    # count weibos of each message type
    
    es_weibo_type_count = es.search(index=task_name,doc_type='text',body=query_body_type_count,request_timeout=999999)['aggregations']['all_weibo']['buckets']

    total_origin_type = 0
    total_retweet_type = 0
    total_comment_type = 0
    total_type = 0
    #print 'es_weibo_type_count:::',es_weibo_type_count
    weibo_type_count = dict()
    for item in es_weibo_type_count:
        if item['key'] == 1:
            total_origin_type = item['doc_count']
        elif item['key'] == 2:
            total_retweet_type = item['doc_count']
        elif item['key'] == 3:
            total_comment_type = item['doc_count']

    total_type = total_origin_type + total_retweet_type + total_comment_type

    origin_ratio = float(total_origin_type)/total_type
    retweet_ratio = float(total_retweet_type)/total_type
    comment_ratio = float(total_comment_type)/total_type
    
    uids_count = get_event_uid_count(task_name,start_ts,end_ts)
    
    event_trend_delta_list = get_event_trend(task_name,start_ts,end_ts)

    feature_list = [uids_count,total_num,total_user_fans,total_comment,total_retweet,\
                    total_sensitive,total_sensitive_ratio,total_negtive,total_important_user,total_origin_type,\
                    origin_ratio,total_retweet_type,retweet_ratio,total_comment_type,comment_ratio,\
                    at_count,event_trend_delta_list]

    return feature_list
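
A minimal driver sketch for the feature pipeline above, assuming the same global `es` client and time helpers, with 'test_task' as a placeholder index name:

import time

end_ts = datehour2ts(ts2datehour(time.time()))   # current hour boundary
start_ts = end_ts - 12 * 3600                    # look back twelve hours
features = feature_compute('test_task', start_ts, end_ts)
print 'feature vector length:', len(features)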