Example #1
def count_text_num(uid_list, fb_flow_text_index_list):
    count_result = {}
    # The QQ side appears to count per user as well: https://github.com/huxiaoqian/xnr1/blob/82ff9704792c84dddc3e2e0f265c46f3233a786f/xnr/qq_xnr_manage/qq_history_count_timer.py
    for uid in uid_list:
        textnum_query_body = {
            'query': {
                "filtered": {
                    "filter": {
                        "bool": {
                            "must": [
                                {
                                    "term": {
                                        "uid": uid
                                    }
                                },
                            ]
                        }
                    }
                }
            }
        }
        text_num = 0
        for index_name in fb_flow_text_index_list:
            result = es.count(index=index_name,
                              doc_type=flow_text_index_type,
                              body=textnum_query_body)
            if result['_shards']['successful'] != 0:
                text_num += result['count']
        count_result[uid] = text_num
    return count_result
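
A minimal usage sketch, assuming es and flow_text_index_type are set up as in the project's config; the client address, index names, and uids below are illustrative, not taken from the project:

# Hypothetical setup and invocation of count_text_num; every value here is
# illustrative, not from the project's configuration.
from elasticsearch import Elasticsearch

es = Elasticsearch(['localhost:9200'])
flow_text_index_type = 'text'

uid_list = ['100001', '100002']
fb_flow_text_index_list = ['fb_flow_text_2018-01-01', 'fb_flow_text_2018-01-02']
result = count_text_num(uid_list, fb_flow_text_index_list)
print result  # {'100001': <count>, '100002': <count>}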
Example #2
def qq_history_count(xnr_user_no, qq_number, current_time):

    current_date = ts2datetime(current_time)
    timestamp = datetime2ts(current_date)
    last_date = ts2datetime(current_time - DAY)

    group_message_index_name = group_message_index_name_pre + current_date

    query_body = {
        'query': {
            'bool': {
                'must': [{
                    'term': {
                        'speaker_qq_number': qq_number
                    }
                }, {
                    'term': {
                        'xnr_qq_number': qq_number
                    }
                }]
            }
        }
    }

    count_result = es.count(index=group_message_index_name,
                            doc_type=group_message_index_type,
                            body=query_body)

    if count_result['_shards']['successful'] != 0:
        today_count = count_result['count']
    else:
        print 'es index rank error'
        today_count = 0

    # Use the same '<xnr_user_no>_<date>' format as _id_today below; without
    # the '_' separator, yesterday's rollup document would never be found.
    _id_last = xnr_user_no + '_' + last_date

    try:
        get_result = es.get(index=qq_xnr_history_count_index_name,
                            doc_type=qq_xnr_history_count_index_type,
                            id=_id_last)['_source']
        total_count_history = get_result['total_post_num']
    except Exception:
        # No rollup document for yesterday; start the running total at zero.
        total_count_history = 0

    total_count_today = total_count_history + today_count

    _id_today = xnr_user_no + '_' + current_date

    item_dict = dict()
    item_dict['date_time'] = current_date
    item_dict['xnr_user_no'] = xnr_user_no
    item_dict['total_post_num'] = total_count_today
    item_dict['daily_post_num'] = today_count
    item_dict['qq_number'] = qq_number
    item_dict['timestamp'] = timestamp

    es.index(index=qq_xnr_history_count_index_name,
             doc_type=qq_xnr_history_count_index_type,
             id=_id_today, body=item_dict)
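
The daily rollup hinges on the document id embedding the date, so each run can fetch yesterday's total by id; a sketch of the scheme (all values hypothetical):

# Sketch of the per-day rollup id scheme used above (values hypothetical).
xnr_user_no = 'WXNR0001'
current_date = '2018-01-02'
last_date = '2018-01-01'

_id_today = xnr_user_no + '_' + current_date  # 'WXNR0001_2018-01-02'
_id_last = xnr_user_no + '_' + last_date      # 'WXNR0001_2018-01-01'
# Tomorrow's run reads today's document by its id and adds its own daily
# count, so total_post_num accumulates one day at a time.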
Example #3
def influence_active(uid, index_name):
    query_body = {'query': {'term': {'uid': uid}}}

    #index_name = facebook_flow_text_index_name_pre + ts2datetime(current_time)

    es_count = es.count(index=index_name,
                        doc_type=facebook_flow_text_index_type,
                        body=query_body)

    if es_count['_shards']['successful'] != 0:
        active_num = es_count['count']
    else:
        active_num = 0

    return active_num
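
A hypothetical call for "today's" index; in the project, ts2datetime and the index-name prefix come from its time utilities and global config, so the stand-ins below are illustrative:

import time

# Hypothetical stand-ins; in the project these come from time_utils and the
# global ES config.
facebook_flow_text_index_name_pre = 'facebook_flow_text_'

def ts2datetime(ts):
    return time.strftime('%Y-%m-%d', time.localtime(ts))

index_name = facebook_flow_text_index_name_pre + ts2datetime(int(time.time()))
print influence_active('100001', index_name)  # uid is illustrative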
Example #4
def qq_history_count(xnr_user_no, qq_number, current_time):

    if S_TYPE == 'test':
        current_time = datetime2ts(QQ_S_DATE_ASSESSMENT)

    current_date = ts2datetime(current_time)
    last_date = ts2datetime(current_time - DAY)

    group_message_index_name = group_message_index_name_pre + current_date
    qq_xnr_history_count_index_name = qq_xnr_history_count_index_name_pre + last_date

    # Get today's post count
    query_body = {
        'query': {
            'bool': {
                'must': [{
                    'term': {
                        'speaker_qq_number': qq_number
                    }
                }, {
                    'term': {
                        'xnr_qq_number': qq_number
                    }
                }]
            }
        }
    }

    count_result = es.count(index=group_message_index_name,
                            doc_type=group_message_index_type,
                            body=query_body)

    if count_result['_shards']['successful'] != 0:
        today_count = count_result['count']
    else:
        print 'es index rank error'
        today_count = 0

    # Get the historical total number of posts
    try:
        get_result = es.get(index=qq_xnr_history_count_index_name,
                            doc_type=qq_xnr_history_count_index_type,
                            id=xnr_user_no)['_source']

        total_count_history = get_result['total_post_num']

    except Exception:
        total_count_history = 0

    total_count_today = total_count_history + today_count

    item_dict = dict()
    item_dict['total_post_num'] = total_count_today
    item_dict['daily_post_num'] = today_count

    # The most active speaker today in the groups the xnr belongs to
    query_body_total_day = {
        'query': {
            'filtered': {
                'filter': {
                    'term': {
                        'xnr_qq_number': qq_number
                    }
                }
            }
        },
        'aggs': {
            'all_speakers': {
                'terms': {
                    'field': 'speaker_qq_number',
                    "order": {
                        "_count": "desc"
                    }
                }
            }
        }
    }

    try:
        results_total_day = es_xnr.search(index=group_message_index_name,
                                          doc_type=group_message_index_type,
                                          body=query_body_total_day)['aggregations']['all_speakers']['buckets']

        speaker_max = results_total_day[0]['doc_count']
    except Exception:
        speaker_max = today_count

    safe = (float(math.log(today_count + 1)) /
            (math.log(speaker_max + 1) + 1)) * 100

    safe = round(safe, 2)  # keep two decimal places

    item_dict['mark'] = safe

    return item_dict
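
The mark normalizes the xnr's daily post count against the most active speaker in its groups on a log scale; a worked example of the formula with illustrative numbers:

import math

# Illustrative inputs: the xnr posted 20 times, the busiest speaker 50 times.
today_count = 20
speaker_max = 50

safe = (float(math.log(today_count + 1)) /
        (math.log(speaker_max + 1) + 1)) * 100
print round(safe, 2)  # 61.73 -- the log scale keeps the mark within (0, 100)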
Example #5
def get_influence_at_num(xnr_user_no, qq_number, current_time):

    item_dict = {}

    if S_TYPE == 'test':
        current_time = datetime2ts(QQ_S_DATE_ASSESSMENT)

    current_date = ts2datetime(current_time)

    group_message_index_name = group_message_index_name_pre + current_date

    # Number of times the virtual human (xnr) was @-mentioned today
    query_body_xnr = {
        'query': {
            'bool': {
                'must': [{
                    'term': {
                        'xnr_qq_number': qq_number
                    }
                }, {
                    'wildcard': {
                        'text': '*' + '@ME' + '*'
                    }
                }]
            }
        }
    }

    try:
        results_xnr = es_xnr.count(index=group_message_index_name,
                                   doc_type=group_message_index_type,
                                   body=query_body_xnr)

        if results_xnr['_shards']['successful'] != 0:
            at_num_xnr = results_xnr['count']

        else:
            print 'es index rank error'
            at_num_xnr = 0
    except Exception:
        at_num_xnr = 0

    # Get the historical total of @ mentions
    current_time_last = current_time - DAY
    current_date_last = ts2datetime(current_time_last)
    qq_xnr_history_count_index_name = qq_xnr_history_count_index_name_pre + current_date_last

    try:
        result_last = es_xnr.get(index=qq_xnr_history_count_index_name,
                                 doc_type=qq_xnr_history_be_at_index_type,
                                 id=xnr_user_no)['_source']
        total_be_at_num_last = result_last['total_be_at_num']
    except Exception:
        total_be_at_num_last = 0

    item_dict['daily_be_at_num'] = at_num_xnr
    item_dict['total_be_at_num'] = at_num_xnr + total_be_at_num_last

    # Total number of @ mentions in the xnr's groups today
    query_body_total_day = {
        'query': {
            'bool': {
                'must': [{
                    'term': {
                        'xnr_qq_number': qq_number
                    }
                }, {
                    'wildcard': {
                        'text': '*' + '@' + '*'
                    }
                }]
            }
        }
    }

    try:
        results_total_day = es_xnr.count(index=group_message_index_name,
                                         doc_type=group_message_index_type,
                                         body=query_body_total_day)

        if results_total_day['_shards']['successful'] != 0:
            at_num_total_day = results_total_day['count']
        else:
            print 'es index rank error'
            at_num_total_day = 0
    except Exception:
        at_num_total_day = 0

    influence = (float(math.log(at_num_xnr + 1)) /
                 (math.log(at_num_total_day + 1) + 1)) * 100

    influence = round(influence, 2)  # keep two decimal places

    item_dict['mark'] = influence

    # es_xnr.index(index=qq_xnr_history_count_index_name,doc_type=qq_xnr_history_be_at_index_type,\
    #     body=item_dict)

    return item_dict
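
The influence mark applies the same log-normalization, comparing @ME mentions of the xnr against all @ mentions in its groups that day; a worked example with illustrative numbers:

import math

# Illustrative inputs: 5 @ME mentions of the xnr out of 40 total @ mentions.
at_num_xnr = 5
at_num_total_day = 40

influence = (float(math.log(at_num_xnr + 1)) /
             (math.log(at_num_total_day + 1) + 1)) * 100
print round(influence, 2)  # 38.01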
Example #6
def social_sensing(task_detail):
    '''
    with open("prediction_uid.pkl", "r") as f:
        uid_model = pickle.load(f)
    with open("prediction_weibo.pkl", "r") as f:
        weibo_model = pickle.load(f)
    '''
    # Task fields: task name, sensors, end time, previous status, creator, time

    task_name = task_detail[0]
    social_sensors = task_detail[1]
    #ts = int(task_detail[2])
    ts = float(task_detail[2])

    xnr_user_no = task_detail[3]

    print ts2date(ts)
    index_list = []
    important_words = []
    datetime_1 = ts2datetime(ts)
    index_name_1 = flow_text_index_name_pre + datetime_1
    exist_es = es_text.indices.exists(index=index_name_1)
    if exist_es:
        index_list.append(index_name_1)
    datetime_2 = ts2datetime(ts - DAY)
    index_name_2 = flow_text_index_name_pre + datetime_2
    exist_es = es_text.indices.exists(index=index_name_2)
    if exist_es:
        index_list.append(index_name_2)
    if es_text.indices.exists(index=flow_text_index_name_pre +
                              ts2datetime(ts - 2 * DAY)):
        index_list.append(flow_text_index_name_pre + ts2datetime(ts - 2 * DAY))

    # PART 1

    #forward_result = get_forward_numerical_info(task_name, ts, create_by)
    # Lists of original / retweeted weibo mids from the previous time window
    forward_origin_weibo_list, forward_1 = query_mid_list(
        ts - time_interval, social_sensors, forward_time_range)
    forward_retweeted_weibo_list, forward_3 = query_mid_list(
        ts - time_interval, social_sensors, forward_time_range, 3)
    # Original weibo mid list in the current window
    current_mid_list, current_1 = query_mid_list(ts, social_sensors,
                                                 time_interval)
    current_retweeted_mid_list, current_3 = query_mid_list(
        ts, social_sensors, time_interval, 3)
    all_mid_list = []
    all_mid_list.extend(current_mid_list)
    all_mid_list.extend(current_retweeted_mid_list)
    all_mid_list.extend(forward_origin_weibo_list)
    all_mid_list.extend(forward_retweeted_weibo_list)
    all_origin_list = []
    all_origin_list.extend(current_mid_list)
    all_origin_list.extend(forward_origin_weibo_list)
    all_origin_list = list(set(all_origin_list))
    all_retweeted_list = []
    all_retweeted_list.extend(current_retweeted_mid_list)
    all_retweeted_list.extend(
        forward_retweeted_weibo_list)  # mids / root-mids of retweeted weibos
    all_retweeted_list = list(set(all_retweeted_list))

    all_mid_list = filter_mid(all_mid_list)
    all_origin_list = filter_mid(all_origin_list)
    all_retweeted_list = filter_mid(all_retweeted_list)

    print "all mid list: ", len(all_mid_list)
    print "all_origin_list", len(all_origin_list)
    print "all_retweeted_list", len(all_retweeted_list)

    # Query each weibo's retweet and comment counts in the current window, aggregated by message_type
    #statistics_count = query_related_weibo(ts, all_mid_list, time_interval)
    if all_origin_list:
        #origin_weibo_detail = query_hot_weibo(ts, all_origin_list, time_interval)  # original weibo details
        origin_weibo_detail = dict()
        for mid in all_origin_list:
            # Count retweets (message_type 3) and comments (message_type 2) of
            # this original weibo. The original code issued the same fid-only
            # query for both counts, which made them identical; the
            # message_type terms below mirror the retweeted branch.
            retweet_count = es_text.count(
                index=index_list,
                doc_type="text",
                body={"query": {
                    "bool": {
                        "must": [{
                            "term": {
                                "fid": mid
                            }
                        }, {
                            "term": {
                                "message_type": 3
                            }
                        }]
                    }
                }})["count"]
            comment_count = es_text.count(
                index=index_list,
                doc_type="text",
                body={"query": {
                    "bool": {
                        "must": [{
                            "term": {
                                "fid": mid
                            }
                        }, {
                            "term": {
                                "message_type": 2
                            }
                        }]
                    }
                }})["count"]
            tmp = dict()
            tmp["retweeted"] = retweet_count
            tmp["comment"] = comment_count
            origin_weibo_detail[mid] = tmp
    else:
        origin_weibo_detail = {}
    print "len(origin_weibo_detail): ", len(origin_weibo_detail)
    if all_retweeted_list:
        retweeted_weibo_detail = dict()
        for mid in all_retweeted_list:
            retweet_count = es_text.count(index=index_list,
                                          doc_type="text",
                                          body={
                                              "query": {
                                                  "bool": {
                                                      "must": [{
                                                          "term": {
                                                              "root_mid": mid
                                                          }
                                                      }, {
                                                          "term": {
                                                              "message_type": 3
                                                          }
                                                      }]
                                                  }
                                              }
                                          })["count"]
            comment_count = es_text.count(index=index_list,
                                          doc_type="text",
                                          body={
                                              "query": {
                                                  "bool": {
                                                      "must": [{
                                                          "term": {
                                                              "root_mid": mid
                                                          }
                                                      }, {
                                                          "term": {
                                                              "message_type": 2
                                                          }
                                                      }]
                                                  }
                                              }
                                          })["count"]
            tmp = dict()
            tmp["retweeted"] = retweet_count
            tmp["comment"] = comment_count
            retweeted_weibo_detail[mid] = tmp
        #retweeted_weibo_detail = query_hot_weibo(ts, all_retweeted_list, time_interval)  # retweeted weibo details
    else:
        retweeted_weibo_detail = {}
    print "len(retweeted_weibo_detail): ", len(retweeted_weibo_detail)
    #current_total_count = statistics_count['total_count']

    # Total count of all weibos in the current window
    #current_retweeted_count = statistics_count['retweeted']
    #current_comment_count = statistics_count['comment']

    #all_mid_list = list(set(all_origin_list[:100]) | set(all_retweeted_list[:100]))

    # Perceived events, drawn from all_mid_list
    sensitive_text_list = []
    tmp_sensitive_warning = ""
    text_dict = dict()  # text info
    mid_value = dict()  # per-text topic value
    duplicate_dict = dict()  # duplicate map
    portrait_dict = dict()  # background (portrait) info
    classify_text_dict = dict()  # texts for classification
    classify_uid_list = []
    duplicate_text_list = []
    sensitive_words_dict = dict()
    sensitive_weibo_detail = {}
    trendline_dict = dict()
    all_text_dict = dict()

    # Start once an event has occurred (placeholder condition, always true here)
    if 1:
        print "index_list:", index_list

        if index_list and all_mid_list:
            query_body = {
                "query": {
                    "filtered": {
                        "filter": {
                            "terms": {
                                "mid": all_mid_list
                            }
                        }
                    }
                },
                "size": 5000
            }
            search_results = es_text.search(index=index_list,
                                            doc_type="text",
                                            body=query_body)['hits']['hits']
            print "search mid len: ", len(search_results)
            tmp_sensitive_warning = ""
            text_dict = dict()  # text info
            mid_value = dict()  # per-text topic value
            duplicate_dict = dict()  # duplicate map
            portrait_dict = dict()  # background (portrait) info
            classify_text_dict = dict()  # texts for classification
            #classify_uid_list = []
            classify_mid_list = []
            duplicate_text_list = []
            sensitive_words_dict = dict()
            mid_ts_dict = dict()  # text publish time
            uid_prediction_dict = dict()
            weibo_prediction_dict = dict()
            trendline_dict = dict()
            feature_prediction_list = []  # feature lists
            mid_prediction_list = []  # corresponding mids
            if search_results:
                for item in search_results:
                    iter_uid = item['_source']['uid']
                    iter_mid = item['_source']['mid']
                    mid_ts_dict[iter_mid] = item["_source"]["timestamp"]
                    iter_text = item['_source']['text'].encode(
                        'utf-8', 'ignore')
                    iter_sensitive = item['_source'].get('sensitive', 0)
                    tmp_text = get_weibo(item['_source'])
                    all_text_dict[iter_mid] = tmp_text

                    duplicate_text_list.append({
                        "_id":
                        iter_mid,
                        "title":
                        "",
                        "content":
                        iter_text.decode("utf-8", 'ignore')
                    })

                    if iter_sensitive:
                        tmp_sensitive_warning = signal_sensitive_variation  # weibo containing sensitive words
                        sensitive_words_dict[iter_mid] = iter_sensitive

                    keywords_dict = json.loads(
                        item['_source']['keywords_dict'])
                    personal_keywords_dict = dict()
                    for k, v in keywords_dict.iteritems():
                        k = k.encode('utf-8', 'ignore')
                        personal_keywords_dict[k] = v
                    classify_text_dict[iter_mid] = personal_keywords_dict
                    #classify_uid_list.append(iter_uid)
                    classify_mid_list.append(iter_mid)

                # Deduplicate
                print "start duplicate"
                if duplicate_text_list:
                    dup_results = duplicate(duplicate_text_list)
                    for item in dup_results:
                        if item['duplicate']:
                            duplicate_dict[item['_id']] = item['same_from']

                # Classify
                print "start classify"
                mid_value = dict()
                if classify_text_dict:
                    #classify_results = topic_classfiy(classify_uid_list, classify_text_dict)
                    classify_results = topic_classfiy(classify_mid_list,
                                                      classify_text_dict)

                    #print "classify_results: ", classify_results

                    for k, v in classify_results.iteritems():  # mid:value
                        #mid_value[k] = topic_value_dict[v[0]]
                        mid_value[k] = v[0]
                        #feature_list = organize_feature(k, mid_ts_dict[k])
                        #feature_prediction_list.append(feature_list) # feature list
                        #mid_prediction_list.append(k) # corresponding

                # prediction
                """
                print "start prediction"
                weibo_prediction_result = weibo_model.predict(feature_prediction_list)
                uid_prediction_result = uid_model.predict(feature_prediction_list)
                for i in range(len(mid_prediction_list)):
                    if  i % 100 == 0:
                        print i
                    uid_prediction_dict[mid_prediction_list[i]] = uid_prediction_result[i]
                    weibo_prediction_dict[mid_prediction_list[i]] = weibo_prediction_result[i]
                    tmp_trendline = trendline_list(mid_prediction_list[i], weibo_prediction_result[i], mid_ts_dict[mid_prediction_list[i]])
                    trendline_dict[mid_prediction_list[i]] = tmp_trendline
                """
    # organize data

    mid_list = all_text_dict.keys()
    print "final mid:", len(mid_list)
    print "intersection: ", len(set(mid_list) & set(all_mid_list))
    bulk_action = []
    count = 0
    for mid in mid_list:
        iter_dict = dict()
        if mid in origin_weibo_detail:
            iter_dict.update(origin_weibo_detail[mid])
            iter_dict["type"] = 1
        elif mid in retweeted_weibo_detail:
            iter_dict.update(retweeted_weibo_detail[mid])
            iter_dict["type"] = 3
        else:
            iter_dict["retweeted"] = 0
            iter_dict["comment"] = 0
            print "mid in all_mid_list: ", mid in set(all_mid_list)

        #iter_dict["trendline"] = json.dumps(trendline_dict[mid])
        if mid in duplicate_dict:
            iter_dict["duplicate"] = duplicate_dict[mid]
        else:
            iter_dict["duplicate"] = ""

        #iter_dict["uid_prediction"] = uid_prediction_dict[mid]
        #iter_dict["weibo_prediction"] = weibo_prediction_dict[mid]
        iter_dict["compute_status"] = 0  # 尚未计算
        iter_dict["topic_field"] = mid_value[mid]
        iter_dict["detect_ts"] = ts
        iter_dict["xnr_user_no"] = xnr_user_no

        iter_dict.update(all_text_dict[mid])
        count += 1
        print 'iter_dict:::', iter_dict
        _id = xnr_user_no + '_' + mid
        bulk_action.extend([{"index": {"_id": _id}}, iter_dict])
        if count % 500 == 0:
            es_xnr.bulk(bulk_action,
                        index="social_sensing_text",
                        doc_type="text",
                        timeout=600)
            bulk_action = []

    if bulk_action:
        es_xnr.bulk(bulk_action,
                    index="social_sensing_text",
                    doc_type="text",
                    timeout=600)

    return "1"