Example #1
def test():
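    # Build a sample event-analysis task document and index it into ES, keyed by its pinyin task name.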
    item = {}
    '''
    item['task_name'] = '天津老太摆射击摊被判刑' #'毛泽东诞辰纪念日'
    item['pinyin_task_name'] = 'tian_jin_lao_tai_she_ji_qiang_bei_pan_xing' #"mao_ze_dong_dan_chen_ji_nian_ri"
    item['start_time'] = 1482768502 #1482681600
    item['stop_time'] = 1483455435 #1483113600
    item['submit_user'] = '******' 
    item['submit_time'] = time.time() 
    item['must_keywords'] = ['射击','判刑']
    item['should_keywords'] = ['天津','老太']
    item['event_value_finish'] = 0
    item['scan_text_finish'] = 0
    '''

    item['task_name'] = '毛泽东诞辰纪念日'
    item['pinyin_task_name'] = "mao_ze_dong_dan_chen_ji_nian_ri"
    item['start_time'] = 1482681600
    item['stop_time'] = 1483113600
    item['submit_user'] = '******'
    item['submit_time'] = time.time()
    item['must_keywords'] = ['毛泽东']
    item['should_keywords'] = ['诞辰', '纪念日']
    item['event_value_finish'] = 0
    item['scan_text_finish'] = 0

    mappings_event_analysis_task()

    es.index(index=index_manage_event_analysis,
             doc_type=type_manage_event_analysis,
             id=item['pinyin_task_name'],
             body=item)
Example #2
def save_results(task_name, ts, prediction_in, future_dict):
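    # Ensure the per-task "stimulation_" index mapping exists, then store the current
    # and future prediction results as JSON, keyed by the timestamp.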
    mappings_stimulation(task_name)

    work_index = "stimulation_"+task_name
    work_type = "stimulation_results"

    update_body = {"update_time": ts, "in_results":json.dumps(prediction_in), "future_results": json.dumps(future_dict)}

    es_prediction.index(index=work_index, doc_type=work_type, id=ts, body=update_body)
    return True
Example #3
def test():
    item = {}
    item['task_name'] = '毛泽东诞辰纪念日'
    item['pinyin_task_name'] = "mao_ze_dong_dan_chen_ji_nian_ri"
    item['start_ts'] = 1482681600
    item['end_ts'] = 1483113600
    item['event_value_finish'] = 0
    item['scan_text_finish'] = 0

    mappings_event_analysis_task()

    es.index(index=index_manage_event_analysis,
             doc_type=type_manage_event_analysis,
             id=item['pinyin_task_name'],
             body=item)
def organize_feature(task_name, event, start_ts, end_ts, during=3600):
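    # Walk the [start_ts, end_ts) range in windows of `during` seconds, compute the
    # per-window statistics via user_fansnum(), and index one feature document per
    # window into the per-task "micro_prediction_" index.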
    data = []
    index_list = []
    task_name = "micro_prediction_" + task_name

    while 1:
        data_dict = dict()
        if start_ts >= end_ts:
            break
        results_list = user_fansnum(event, start_ts, start_ts + during)
        for i in range(len(data_order)):
            data_dict[data_order[i]] = results_list[i]
        data_dict["update_time"] = start_ts + minimal_time_interval
        start_ts += during
        print "start timestamp: ", start_ts
        es_prediction.index(index=task_name,
                            doc_type=index_type_prediction_task,
                            id=start_ts,
                            body=data_dict)
def dispose_data(task_name, current_ts, during=3600):
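    # Fetch the most recent feature windows for the task, build a log-scaled feature
    # vector, pick a GBDT model according to the window length and recent trend,
    # predict the next value, and write it back to the per-task index.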
    K = 2  # number of most recent feature windows used to build the features

    task_detail = es_prediction.get(index=index_manage_prediction_task,
                                    doc_type=type_manage_prediction_task,
                                    id=task_name)["_source"]
    start_time = int(task_detail["start_time"])

    origin_task_name = task_name
    task_name = "micro_prediction_" + task_name

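    # query the K most recent feature windows up to current_ts, newest first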
    query_body = {
        "query": {
            "range": {
                "update_time": {
                    "lte": current_ts
                }
            }
        },
        "size": K,
        "sort": {
            "update_time": {
                "order": "desc"
            }
        }
    }

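    # count all feature windows up to current_ts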
    sort_query_body = {
        "query": {
            "range": {
                "update_time": {
                    "lte": current_ts
                }
            }
        }
    }

    total_count = []
    total_fans_list = []
    total_origin_list = []
    total_retweet_list = []
    total_comment_list = []
    total_uid_list = []
    total_positive_list = []
    total_negetive_list = []
    average_origin_ts = []
    average_retweet_ts = []

    feature_list = []
    results = es_prediction.search(index=task_name,
                                   doc_type=index_type_prediction_task,
                                   body=query_body)["hits"]["hits"]
    location = es_prediction.count(index=task_name,
                                   doc_type=index_type_prediction_task,
                                   body=sort_query_body)["count"]

    if len(results) != K:
        short_len = K - len(results)
        results.extend([[]] * short_len)
    print "former result: ", len(results), K
    results.reverse()
    for item in results:
        if item:
            item = item["_source"]
            #total_fans_list.append(item["total_fans_number"])
            total_origin_list.append(item["origin_weibo_number"])
            total_retweet_list.append(item["retweeted_weibo_number"])
            total_comment_list.append(item["comment_weibo_number"])
            total_count.append(item["total_count"])
            total_uid_list.append(item["total_uid_count"])
            total_positive_list.append(item["positive_count"])
            total_negetive_list.append(item["negetive_count"])
            average_origin_ts.append(item["average_origin_ts"])
            average_retweet_ts.append(item["average_retweet_ts"])
        else:
            #total_fans_list.append(0)
            total_origin_list.append(0)
            total_retweet_list.append(0)
            total_comment_list.append(0)
            total_uid_list.append(0)
            total_count.append(0)
            total_positive_list.append(0)
            total_negetive_list.append(0)
            average_origin_ts.append(0)
            average_retweet_ts.append(0)
    print "total_count: ", total_count

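    # build the GBDT feature vector from log-scaled counts of the most recent windows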
    feature_list = []
    feature_list.append(math.log(int(total_retweet_list[-1] + 1)))
    feature_list.append(math.log(int(total_comment_list[-1] + 1)))
    feature_list.append(math.log(int(total_positive_list[-1] + 1)))
    feature_list.append(math.log(int(total_negetive_list[-2] + 1)))
    feature_list.append(math.log(int(total_negetive_list[-1] + 1)))
    feature_list.append(math.log(int(total_count[-1] + 1)))
    feature_list.append(math.log(int(total_uid_list[-1] + 1)))
    if int(during) == 3 * 3600:
        feature_list.append(average_origin_ts[-1])
        feature_list.append(average_retweet_ts[-1])

    # load model and prediction
    if int(during) == 3600:
        if total_count[-1] - total_count[-2] >= -0.2 * total_count[-2]:
            with open("model-up.pkl", "rb") as f:
                gbdt = pickle.load(f)
        else:
            with open("model-down.pkl", "rb") as f:
                gbdt = pickle.load(f)
    elif int(during) == 3 * 3600:
        with open("model-3.pkl", "rb") as f:
            gbdt = pickle.load(f)

    print "feature_list: ", feature_list
    pred = gbdt.predict(feature_list)
    for item in pred:
        prediction_value = item
        prediction_value = math.exp(prediction_value)
        print "prediction_valie: ", prediction_value

    # update scan processing
    #es_prediction.update(index=index_manage_prediction_task,doc_type=type_manage_prediction_task, \
    #        id=origin_task_name, body={"doc":{"scan_text_processing":"0"}})

    # update prediction value in es
    task_detail = es_prediction.get(index=index_manage_prediction_task, \
            doc_type=type_manage_prediction_task, id=origin_task_name)["_source"]
    if current_ts >= int(task_detail["stop_time"]):
        task_detail["finish"] = "1"
        task_detail["processing_status"] = "0"

        # update task info
        es_prediction.index(index=index_manage_prediction_task, \
            doc_type=type_manage_prediction_task, id=origin_task_name, body=task_detail)

    # update prediction
    es_prediction.update(index=task_name,
                         doc_type=index_type_prediction_task,
                         id=current_ts,
                         body={"doc": {
                             "prediction_value": prediction_value
                         }})

    return True
from elasticsearch import Elasticsearch


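# UID list of the social-sensor accounts used to seed hot-event sensing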
social_sensors = ["1738004582", "1784473157", "2286908003", "1717833412", "1314608344", "1644114654",\
        "1686546714", "1656737654", "2028810631", "1677991972", "3881380517", "1847582585", "1651428902",\
        "1420157965", "1913382117", "1884334303", "1734530730", "1893278624", "1720962692", "1700648435",\
        "3288875501", "1672519561", "2034347300", "1688864597", "2615417307", "1191965271", "1643971635", \
        "1778758223", "1216431741", "1698823241", "1977460817", "1644729004", "1231759973", "1231759973",\
        "1315591982", "1656831930", "1926909715", "1699432410", "1660452532", "1722628512", "1267454277",\
        "1640601392", "2443459455", "3921730119", "1867571077", "1718493627", "1653460650", "1737737970",\
        "2616293707", "3271121353", "1642591402", "1326410461", "1645705403", "1985593262", "1654164742",\
        "1638781994", "2993049293", "1653944045", "5977555696", "1992613670", "1726393244", "1216431741",\
        "1724367710", "1880087643", "2827102952", "1974808274", "1700720163", "3164957712", "3266943013",\
        "2127460165", "2083844833", "5305757517", "2803301701", "2656274875", "1618051664", "1974576991", \
        "1642512402", "1649173367", "1658388624", "1697601814", "1703371307", "1638782947", "1402977920", \
        "1893801487", "2108053230", "1649469284", "1975995305", "2810373291", "1749990115", "1663937380", \
        "1497087080", "1652484947", "2162541102", "2462605080", "1650111241", "1265998927", "1698857957", \
        "1887790981", "1698233740", "3712035812", "5044281310", "1701401324", "1571497285", "1635764393"]

user = "******"

task_detail = dict()
task_detail["task_name"] = id_sensing
task_detail["remark"] = "感知热门事件"
task_detail["social_sensors"] = json.dumps(list(social_sensors))
task_detail["history_status"] = json.dumps([])
print es.index(index=index_sensing,
               doc_type=type_sensing,
               id=id_sensing,
               body=task_detail)
Example #7
def social_sensing(task_detail):
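    # Aggregate weibo activity from the sensor accounts for the current window,
    # deduplicate and classify the texts, flag sensitive content, and store the
    # per-window statistics into the sensing-task index.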
    # task name, sensors, end timestamp, previous status, creator, time
    task_name = task_detail[0]
    social_sensors = task_detail[1]
    ts = int(task_detail[2])
    wb = Workbook()
    ws = wb.create_sheet()


    print ts2date(ts)
    # PART 1
    
    #forward_result = get_forward_numerical_info(task_name, ts, create_by)
    # original / retweeted weibo mid lists from the previous time window
    forward_origin_weibo_list = query_mid_list(ts-time_interval, social_sensors, forward_time_range)
    forward_retweeted_weibo_list = query_mid_list(ts-time_interval, social_sensors, forward_time_range, 3)
    # original weibo mid list in the current window
    current_mid_list = query_mid_list(ts, social_sensors, time_interval)
    current_retweeted_mid_list = query_mid_list(ts, social_sensors, time_interval, 3)
    all_mid_list = []
    all_mid_list.extend(current_mid_list)
    all_mid_list.extend(current_retweeted_mid_list)
    all_mid_list.extend(forward_origin_weibo_list)
    all_mid_list.extend(forward_retweeted_weibo_list)
    all_origin_list = []
    all_origin_list.extend(current_mid_list)
    all_origin_list.extend(forward_origin_weibo_list)
    all_origin_list = list(set(all_origin_list))
    all_retweeted_list = []
    all_retweeted_list.extend(current_retweeted_mid_list)
    all_retweeted_list.extend(forward_retweeted_weibo_list) # mid/root-mid of the retweeted weibo
    all_retweeted_list = list(set(all_retweeted_list))
    print "all mid list: ", len(all_mid_list)
    #print "all_origin_list", all_origin_list
    #print "all_retweeted_list", all_retweeted_list

    # query retweet and comment counts for these weibo in the current window, aggregated by message_type
    statistics_count = query_related_weibo(ts, all_mid_list, time_interval)
    if all_origin_list:
        origin_weibo_detail = query_hot_weibo(ts, all_origin_list, time_interval) # details of original weibo
    else:
        origin_weibo_detail = {}
    if all_retweeted_list:
        retweeted_weibo_detail = query_hot_weibo(ts, all_retweeted_list, time_interval) # details of retweeted weibo
    else:
        retweeted_weibo_detail = {}
    current_total_count = statistics_count['total_count'] # total number of weibo in the current window

    current_retweeted_count = statistics_count['retweeted']
    current_comment_count = statistics_count['comment']


    """
    # aggregate the important users in the current window
    important_uid_list = []
    datetime = ts2datetime(ts-time_interval)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    if exist_es:
        search_results = get_important_user(ts, all_mid_list, time_interval)
        important_uid_list = search_results
    # match important users from the portrait library based on the obtained uid_list
    if important_uid_list:
        important_results = es_user_portrait.mget(index=portrait_index_name,doc_type=portrait_index_type, body={"ids": important_uid_list})['docs']
    else:
        important_results = []
    filter_important_list = [] # uid_list
    if important_results:
        for item in important_results:
            if item['found']:
                #if item['_source']['importance'] > IMPORTANT_USER_THRESHOULD:
                filter_important_list.append(item['_id'])

    print "filter_important_list", filter_important_list
    print "important_results", important_uid_list
    """

    # decide whether an event has been sensed



    # sensed events, all_mid_list
    sensitive_text_list = []
    tmp_sensitive_warning = ""
    text_dict = dict() # text information
    mid_value = dict() # text value assignment
    duplicate_dict = dict() # duplication dictionary
    portrait_dict = dict() # background (portrait) information
    classify_text_dict = dict() # texts for classification
    classify_uid_list = []
    duplicate_text_list = []
    sensitive_words_dict = dict()
    sensitive_weibo_detail = {}

    # start once an event occurs
    if 1:
        index_list = []
        important_words = []
        datetime_1 = ts2datetime(ts)
        index_name_1 = flow_text_index_name_pre + datetime_1
        exist_es = es_text.indices.exists(index=index_name_1)
        if exist_es:
            index_list.append(index_name_1)
        datetime_2 = ts2datetime(ts-DAY)
        index_name_2 = flow_text_index_name_pre + datetime_2
        exist_es = es_text.indices.exists(index=index_name_2)
        if exist_es:
            index_list.append(index_name_2)
        if index_list and all_mid_list:
            query_body = {
                "query":{
                    "filtered":{
                        "filter":{
                            "terms":{"mid": all_mid_list}
                        }
                    }
                },
                "size": 5000
            }
            search_results = es_text.search(index=index_list, doc_type="text", body=query_body)['hits']['hits']
            tmp_sensitive_warning = ""
            text_dict = dict() # text information
            mid_value = dict() # text value assignment
            duplicate_dict = dict() # duplication dictionary
            portrait_dict = dict() # background (portrait) information
            classify_text_dict = dict() # texts for classification
            classify_uid_list = []
            duplicate_text_list = []
            sensitive_words_dict = dict()
            if search_results:
                for item in search_results:
                    iter_uid = item['_source']['uid']
                    iter_mid = item['_source']['mid']
                    iter_text = item['_source']['text'].encode('utf-8', 'ignore')
                    iter_sensitive = item['_source'].get('sensitive', 0)

                    duplicate_text_list.append({"_id":iter_mid, "title": "", "content":iter_text.decode("utf-8",'ignore')})

                    if iter_sensitive:
                        tmp_sensitive_warning = signal_sensitive_variation # weibo containing sensitive words
                        sensitive_words_dict[iter_mid] = iter_sensitive

                    keywords_dict = json.loads(item['_source']['keywords_dict'])
                    personal_keywords_dict = dict()
                    for k, v in keywords_dict.iteritems():
                        k = k.encode('utf-8', 'ignore')
                        personal_keywords_dict[k] = v
                    classify_text_dict[iter_mid] = personal_keywords_dict
                    classify_uid_list.append(iter_uid)

                # deduplicate
                if duplicate_text_list:
                    dup_results = duplicate(duplicate_text_list)
                    for item in dup_results:
                        if item['duplicate']:
                            duplicate_dict[item['_id']] = item['same_from']

                # classify
                mid_value = dict()
                if classify_text_dict:
                    classify_results = topic_classfiy(classify_uid_list, classify_text_dict)
                    #print "classify_results: ", classify_results
                    for k, v in classify_results.iteritems(): # mid: topic value
                        mid_value[k] = topic_value_dict[v[0]]

            if sensitive_words_dict:
                sensitive_mid_list = sensitive_words_dict.keys()
                sensitive_weibo_detail = query_hot_weibo(ts, sensitive_mid_list, time_interval)


    results = dict()
    results['mid_topic_value'] = json.dumps(mid_value)
    results['duplicate_dict'] = json.dumps(duplicate_dict)
    results['sensitive_words_dict'] = json.dumps(sensitive_words_dict)
    results['sensitive_weibo_detail'] = json.dumps(sensitive_weibo_detail)
    results['origin_weibo_number'] = len(all_origin_list)
    results['retweeted_weibo_number'] = len(all_retweeted_list)
    results['origin_weibo_detail'] = json.dumps(origin_weibo_detail)
    results['retweeted_weibo_detail'] = json.dumps(retweeted_weibo_detail)
    results['retweeted_weibo_count'] = current_retweeted_count
    results['comment_weibo_count'] = current_comment_count
    results['weibo_total_number'] = current_total_count
    results['timestamp'] = ts
    # store the information for this time window in ES
    es_prediction.index(index=index_sensing_task, doc_type=type_sensing_task, id=ts, body=results)
    #print results
    #temp_titles = list(results.keys())
    #temp_results = list(results.values())
    #ws.append(temp_titles)
    #ws.append(temp_results)
    #wb.save('./temp/temp'+str(ts)+'.xlsx')
    #find and display economics-related weibo
    #eco_mid_list = get_economics_mids(mid_value)
    #size = 10
    #get_origin_weibo_detail(ts,size,'retweeted')
    #print eco_mid_list
    #eco_weibos = get_weibo_content(index_list,eco_mid_list)
    #print eco_weibos
    #eco_content = eco_weibos['_source']['text']
    #weibo_content = ''
    #for aaa in eco_weibos:
        #weibo_content += aaa['_source']['text']+'\n'
    #save_results(weibo_content,ts)
    return "1"
def rank_predict(event, start_ts, end_ts):
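    # Compute macro features for the event, flatten the nested features, run the
    # weibo-value and user-value GBDT models, derive a rank from the user value,
    # and store both the features and the predictions in ES.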

    feature_list = feature_compute(event, start_ts, end_ts)
    print 'feature_list:::::', feature_list
    feature_list_gbdt = []
    for i in range(len(feature_list)):
        #split the combined values apart (extend)

        if i == 15:
            feature_list[i] = json.loads(json.dumps(feature_list[i]))
            print 'type::', type(feature_list[i])
            print 'feature_list[i][at_0]::', feature_list[i]['at_0']

            feature_list_gbdt.append(feature_list[i]['at_0'])
            feature_list_gbdt.append(feature_list[i]['at_1'])
            feature_list_gbdt.append(feature_list[i]['at_2'])
            feature_list_gbdt.append(feature_list[i]['at>3'])
        elif i == 16:
            feature_list[i] = json.loads(json.dumps(feature_list[i]))

            print 'feature_list[i]:::', feature_list[i]
            feature_list_gbdt.append(feature_list[i][0])
            feature_list_gbdt.append(feature_list[i][1])
            feature_list_gbdt.append(feature_list[i][2])
            feature_list_gbdt.append(feature_list[i][3])
        else:
            feature_list_gbdt.append(feature_list[i])

    print 'feature_list_gbdt:::::', feature_list_gbdt

    #load the weibo model
    with open("0305_macro-prediction-weibos-value.pkl", "rb") as f:
        gbdt = pickle.load(f)

    pred = gbdt.predict(feature_list_gbdt)

    for item in pred:
        predict_weibo_value = item

    #load the user model
    with open("0305_macro-prediction-uids-value.pkl", "rb") as f:
        gbdt = pickle.load(f)

    pred = gbdt.predict(feature_list_gbdt)

    for item in pred:
        predict_user_value = item

    predict_rank = get_rank(predict_user_value)

    ## store into the event info table
    #for i in range(len(feature_list)):
    feature_results = {}
    feature_results['event'] = event
    '''
    feature_results['topic_field'] = feature_list[0]
    feature_results['total_num'] = feature_list[1]
    feature_results['total_user_fans'] = feature_list[2]
    feature_results['total_comment'] = feature_list[3]
    feature_results['total_retweet'] = feature_list[4]
    feature_results['total_sensitive'] = feature_list[5]
    feature_results['total_sensitive_ratio'] = feature_list[6]
    feature_results['total_negtive'] = feature_list[7]
    feature_results['total_important_user'] = feature_list[8]   
    feature_results['total_origin_type'] = feature_list[9]
    feature_results['origin_ratio'] = feature_list[10]
    feature_results['total_retweet_type'] = feature_list[11]
    feature_results['retweet_ratio'] = feature_list[12]
    feature_results['total_comment_type'] = feature_list[13]
    feature_results['comment_ratio'] = feature_list[14]
    feature_results['at_count'] = feature_list[15]
    feature_results['event_uid_count'] = feature_list[16]
    feature_results['event_trend_delta'] = feature_list[17]
    feature_results['predict_weibo_value'] = predict_weibo_value
    feature_results['predict_user_value'] = predict_user_value
    feature_results['predict_rank'] = predict_rank
    feature_results['update_time'] = time.time()
    '''
    #feature_results['topic_field'] = feature_list[0]
    feature_results['uid_count'] = feature_list[0]
    feature_results['total_num'] = feature_list[1]
    feature_results['total_user_fans'] = feature_list[2]
    feature_results['total_comment'] = feature_list[3]
    feature_results['total_retweet'] = feature_list[4]
    feature_results['total_sensitive'] = feature_list[5]
    feature_results['total_sensitive_ratio'] = feature_list[6]
    feature_results['total_negtive'] = feature_list[7]
    feature_results['total_important_user'] = feature_list[8]
    feature_results['total_origin_type'] = feature_list[9]
    feature_results['origin_ratio'] = feature_list[10]
    feature_results['total_retweet_type'] = feature_list[11]
    feature_results['retweet_ratio'] = feature_list[12]
    feature_results['total_comment_type'] = feature_list[13]
    feature_results['comment_ratio'] = feature_list[14]
    feature_results['at_count'] = json.dumps(feature_list[15])

    feature_results['event_trend_delta'] = json.dumps(feature_list[16])
    feature_results['predict_weibo_value'] = predict_weibo_value
    feature_results['predict_user_value'] = predict_user_value
    feature_results['predict_rank'] = predict_rank
    feature_results['update_time'] = time.time()
    '''
    save_event_info_results(event,topic_field,total_num,total_user_fans,\
                                total_comment,total_retweet,total_sensitive,\
                                total_sensitive_ratio,total_negtive,total_important_user,\
                                total_origin_type,origin_ratio,total_retweet_type,retweet_ratio,\
                                total_comment_type,comment_ratio,at_count,event_uid_count,\
                                event_trend_delta,predict_value,predict_rank,update_time)
    '''
    #update macro features & results

    feature_results = json.dumps(feature_results)
    try:
        item_exists = es_prediction.get(index=index_macro_feature_result,doc_type= type_macro_feature_result,\
                                        id=event)['_source']
        es_prediction.update(index=index_macro_feature_result,doc_type=type_macro_feature_result,\
                            id=event,body={'doc':feature_results})
    except:
        es_prediction.index(index=index_macro_feature_result,doc_type=type_macro_feature_result,\
                                id=event,body=feature_results)

    # update task info: "macro_value_finish"
    task_detail = es_prediction.get(index=index_manage_prediction_task, \
            doc_type=type_manage_prediction_task, id=event)["_source"]
    task_detail["macro_value_finish"] = '1'
    es_prediction.index(index=index_manage_prediction_task, \
            doc_type=type_manage_prediction_task, id=event, body=task_detail)
    print 'feature_results::::', feature_results