def query_brust(index_name, field_name, range_1=0, range_2=50000, count=0):
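    # Count (count=1) or list documents whose `field_name` value falls in
    # [range_1, range_2). The non-count branch returns the ids of up to 1000
    # matching docs. Uses the legacy Elasticsearch 1.x/2.x "filtered" query form.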
    query_body = {
        "query": {
            "filtered": {
                "query": {
                    "match_all": {}
                },
                "filter": {
                    "range": {
                        field_name: {
                            "gte": range_1,
                            "lt": range_2
                        }
                    }
                }
            }
        }
    }

    if count == 1:
        result = es.count(index=index_name, doc_type="bci",
                          body=query_body)['count']
        return result

    else:
        query_body['size'] = 1000
        result = es.search(index=index_name, doc_type="bci",
                           body=query_body)['hits']['hits']

        profile_list = []
        for item in result:
            profile_list.append(item['_id'])

        return profile_list
def search_portrait_history_active_info(uid, date, index_name="copy_user_portrait", doctype="user"):
    # date format: 20130901
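    # Fetch the portrait document for `uid`, then look up the index-wide
    # maximum of each field named by the dates in time_series(date)
    # (sort descending, size 1). Returns "NotFound" for a missing uid and
    # None on any other lookup error.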
    date_list = time_series(date)

    try:
        result = es.get(index=index_name, doc_type=doctype, id=uid, _source=True)['_source']
    except NotFoundError:
        return "NotFound"
    except:
        return None
    
    date_max = {}
    for date_str in date_list:
        query_body = {
            'query':{
                'match_all':{}
                },
            'size': 1,
            'sort': [{date_str: {'order': 'desc'}}]
        }
        try:
            max_item = es.search(index=index_name, doc_type=doctype, body=query_body)['hits']['hits']
        except Exception, e:
            raise e
        date_max[date_str] = max_item[0]['_source'][date_str]
Example #3
def get_user_influence(uid, date):
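    # Read the BCI fields for `uid` from the day's influence index (defaulting
    # missing fields to 0), then rank the user: order_count is the number of
    # users with a strictly higher user_index plus one, total_count is the
    # size of the whole index.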
    date = str(date).replace("-","")
    index_name = pre_index + date
    try:
        bci_info = es_cluster.get(index=index_name, doc_type=influence_doctype, id=uid)["_source"]
    except:
        bci_info = {}
    result = {}
    for key in BCI_LIST:
        result[key] = bci_info.get(key, 0)

    user_index = result["user_index"]
    query_body = {
        "query":{
            "filtered":{
                "filter":{
                    "range":{
                        "user_index":{
                            "gt": user_index
                        }
                    }
                }
            }
        }
    }
    total_count = es_cluster.count(index=index_name, doc_type=influence_doctype)['count']
    order_count = es_cluster.count(index=index_name, doc_type=influence_doctype, body=query_body)['count']

    result["total_count"] = total_count
    result["order_count"] = order_count + 1

    return result
def get_user_influence(uid, date):
    date1 = str(date).replace("-","")
    index_name = pre_index + date1
    result = bci_detail(date, uid)
    user_index = result["user_index"]
    query_body = {
        "query":{
            "filtered":{
                "filter":{
                    "range":{
                        "user_index":{
                            "gt": user_index
                        }
                    }
                }
            }
        }
    }
    total_count = es_cluster.count(index=index_name, doc_type=influence_doctype)['count']
    order_count = es_cluster.count(index=index_name, doc_type=influence_doctype, body=query_body)['count']

    result["total_count"] = total_count
    result["order_count"] = order_count + 1

    return result
def query_brust(index_name,field_name, range_1=0, range_2=50000, count=0):
    query_body = {
        "query":{
            "filtered": {
                "query": {
                    "match_all":{}
                },
                "filter": {
                    "range": {
                        field_name: {
                            "gte": range_1,
                            "lt": range_2
                        }
                    }
                }
            }
        }
    }

    if count == 1:
        result = es.count(index=index_name, doc_type="bci", body=query_body)['count']
        return result

    else:
        query_body['size'] = 1000
        result = es.search(index=index_name, doc_type="bci", body=query_body)['hits']['hits']

        profile_list = []
        for item in result:
            profile_list.append(item['_id'])

        return profile_list
def search_top_index(index_name, top_k=1, index_type="bci", top=False, sort_order="user_index"):
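    # Either return the highest value of `sort_order` in the index (top=True),
    # or build a ranked table of the top_k users enriched with profile photo
    # and nick name plus a "1"/"0" flag for presence in the portrait index;
    # for the top_number sort orders a weibo URL for the top mid is appended.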
    query_body = {
        "query": {
            "match_all": {}
        },
        "size": top_k,
        "sort": [{sort_order: {"order": "desc"}}]
    }

    if top:
        result = es.search(index=index_name, doc_type=index_type, body=query_body)['hits']['hits'][0]['_source'][sort_order]
    else:
        search_result = es.search(index=index_name, doc_type=index_type, body=query_body)['hits']['hits']

        uid_list = []
        for item in search_result:
            uid_list.append(item['_id'])
        profile_result = es_profile.mget(index="weibo_user",doc_type="user", body={"ids":uid_list}, _source=True)['docs']
        portrait_result = es_portrait.mget(index="user_portrait", doc_type="user", body={"ids":uid_list}, _source=True)['docs']

        result = []
        rank = 1
        for i in range(len(search_result)):
            info = ['','','','']
            info[0] = rank
            if profile_result[i]['found']:
                info[1] = profile_result[i]['_source'].get('photo_url','')
                info[3] = profile_result[i]['_source'].get('nick_name','')

            info[2] = search_result[i].get('_id','')
            if sort_order in ["user_index","origin_weibo_retweeted_brust_average","origin_weibo_comment_brust_average"]:
                info.append(search_result[i]['_source'][sort_order])
                if portrait_result[i]['found']:
                    info.append("1")
                else:
                    info.append("0")
            elif sort_order == "origin_weibo_retweeted_top_number":
                info.append(search_result[i]['_source']['origin_weibo_retweeted_top_number'])
                mid = search_result[i]['_source']['origin_weibo_top_retweeted_id']
                info.append(weiboinfo2url(info[2],mid))
                if portrait_result[i]['found']:
                    info.append("1")
                else:
                    info.append("0")
            elif sort_order == "origin_weibo_comment_top_number":
                info.append(search_result[i]['_source']['origin_weibo_comment_top_number'])
                mid = search_result[i]['_source']['origin_weibo_top_comment_id']
                info.append(weiboinfo2url(info[2],mid))
                if portrait_result[i]['found']:
                    info.append("1")
                else:
                    info.append("0")

            rank += 1
            result.append(info)

    return result
Example #8
def update_record_index(uid_list):
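    # Build a {uid, low_number: 0} record for every uid and write the update
    # actions produced by expand_update_action in a single bulk request.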
    bulk_action = []
    for each in uid_list:
        info = {}
        info['uid'] = str(each)
        info['low_number'] = 0
        xdata = expand_update_action(info)
        bulk_action.extend([xdata[0], xdata[1]])

    es.bulk(bulk_action, index=index_destination, doc_type=index_destination_doctype, timeout=30)
def count_es(es,
             index_name,
             doctype,
             sort_order="user_index",
             range_1=0,
             range_2=3000):
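    # Count documents whose `sort_order` value lies in [range_1, range_2),
    # using the legacy "filtered" query form.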
    query_body = {
        "query": {
            "filtered": {
                "query": {
                    "match_all": {}
                },
                "filter": {
                    "range": {
                        sort_order: {
                            "gte": range_1,
                            "lt": range_2
                        }
                    }
                }
            }
        }
    }

    result = es.count(index=index_name, doc_type=doctype,
                      body=query_body)['count']

    return result
def search_influence_detail(uid_list, index_name, doctype):
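    # mget the given uids and return only the _source of the first doc.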
    result = es.mget(index=index_name,
                     doc_type=doctype,
                     body={"ids": uid_list},
                     _source=True)["docs"]

    return result[0]['_source']
Example #11
def influenced_detail(uid, date, style):
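    # Load the user's daily influence record and return the text detail for
    # one of its four "top" lists, selected by style: 0 = origin retweeted,
    # 1 = origin comment, 2 = retweeted retweeted, otherwise retweeted comment.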
    date1 = str(date).replace("-", "")
    index_name = pre_index + date1
    # detail_text = {}
    style = int(style)
    try:
        user_info = es_cluster.get(index=index_name, doc_type=influence_doctype, id=uid)["_source"]
    except:
        result = {}
        return result
    origin_retweetd = json.loads(user_info["origin_weibo_retweeted_top"])
    origin_comment = json.loads(user_info["origin_weibo_comment_top"])
    retweeted_retweeted = json.loads(user_info["retweeted_weibo_retweeted_top"])
    retweeted_comment = json.loads(user_info["retweeted_weibo_comment_top"])

    if style == 0:
        detail_text = get_text(origin_retweetd, date, user_info, style)
    elif style == 1:
        detail_text = get_text(origin_comment, date, user_info, style)
    elif style == 2:
        detail_text = get_text(retweeted_retweeted, date, user_info, style)
    else:
        detail_text = get_text(retweeted_comment, date, user_info, style)
    # detail_text["origin_retweeted"] = get_text(origin_retweetd, date)
    # detail_text["origin_comment"] = get_text(origin_comment, date)
    # detail_text["retweeted_retweeted"] = get_text(retweeted_retweeted, date)
    # detail_text["retweeted_comment"] = get_text(retweeted_comment, date)

    return detail_text
def tag_vector(uid, date):
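    # Sum the retweet/comment detail counters from the user's BCI record,
    # compare the two sums with retweeted_threshold and comment_threshold,
    # and return the matching influence_tag in a single-element list
    # (tag "0" when the BCI record is missing).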
    date1 = str(date).replace('-', '')
    index_name = pre_index + date1
    index_flow_text = pre_text_index + date
    result = []

    try:
        bci_result = es_cluster.get(index=index_name, doc_type=influence_doctype, id=uid)["_source"]
    except:
        tag = influence_tag["0"]
        result.append(tag)
        return result

    origin_retweeted = json.loads(bci_result["origin_weibo_retweeted_detail"])
    retweeted_retweeted = json.loads(bci_result["retweeted_weibo_retweeted_detail"])
    origin_comment = json.loads(bci_result["origin_weibo_comment_detail"])
    retweeted_comment = json.loads(bci_result["retweeted_weibo_comment_detail"])
    sum_retweeted = sum(origin_retweeted.values()) + sum(origin_comment.values())
    sum_comment = sum(retweeted_retweeted.values()) + sum(retweeted_comment.values())

    if sum_retweeted >= retweeted_threshold:
        if sum_comment >= comment_threshold:
            tag = influence_tag['3']
        else:
            tag = influence_tag['1']
    else:
        if sum_comment >= comment_threshold:
            tag = influence_tag['2']
        else:
            tag = influence_tag['4']
    result.append(tag)
    return result
Example #13
def influenced_detail(uid, date, style):
    date1 = str(date).replace('-', '')
    index_name = pre_index + date1
    #detail_text = {}
    style = int(style)
    try:
        user_info = es_cluster.get(index=index_name,
                                   doc_type=influence_doctype,
                                   id=uid)["_source"]
    except:
        result = {}
        return result
    origin_retweetd = json.loads(user_info["origin_weibo_retweeted_top"])
    origin_comment = json.loads(user_info['origin_weibo_comment_top'])
    retweeted_retweeted = json.loads(
        user_info["retweeted_weibo_retweeted_top"])
    retweeted_comment = json.loads(user_info["retweeted_weibo_comment_top"])

    if style == 0:
        detail_text = get_text(origin_retweetd, date, user_info, style)
    elif style == 1:
        detail_text = get_text(origin_comment, date, user_info, style)
    elif style == 2:
        detail_text = get_text(retweeted_retweeted, date, user_info, style)
    else:
        detail_text = get_text(retweeted_comment, date, user_info, style)
    #detail_text["origin_retweeted"] = get_text(origin_retweetd, date)
    #detail_text["origin_comment"] = get_text(origin_comment, date)
    #detail_text["retweeted_retweeted"] = get_text(retweeted_retweeted, date)
    #detail_text["retweeted_comment"] = get_text(retweeted_comment, date)

    return detail_text
Example #15
def get_user_detail(date, input_result, status):
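    # Build per-uid display rows (uid, nick name, location, fan count, status
    # count, influence) from the day's BCI index and the weibo_user profiles;
    # influence is user_index normalized onto a 0-100 log scale against the
    # index maximum. Extra columns depend on status (show_in / show_compute /
    # show_in_history).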
    results = []
    if status=='show_in':
        uid_list = input_result
    if status=='show_compute':
        uid_list = input_result.keys()
    if status=='show_in_history':
        uid_list = input_result.keys()
    if date!='all':
        index_name = 'bci_' + ''.join(date.split('-'))
    else:
        now_ts = time.time()
        now_date = ts2datetime(now_ts)
        index_name = 'bci_' + ''.join(now_date.split('-'))
    index_type = 'bci'
    user_bci_result = es_cluster.mget(index=index_name, doc_type=index_type, body={'ids':uid_list}, _source=True)['docs']
    user_profile_result = es_user_profile.mget(index='weibo_user', doc_type='user', body={'ids':uid_list}, _source=True)['docs']
    max_evaluate_influ = get_evaluate_max(index_name)
    for i in range(0, len(uid_list)):
        uid = uid_list[i]
        bci_dict = user_bci_result[i]
        profile_dict = user_profile_result[i]
        try:
            bci_source = bci_dict['_source']
        except:
            bci_source = None
        if bci_source:
            influence = bci_source['user_index']
            influence = math.log(influence/max_evaluate_influ['user_index'] * 9 + 1 ,10)
            influence = influence * 100
        else:
            influence = ''
        try:
            profile_source = profile_dict['_source']
        except:
            profile_source = None
        if profile_source:
            uname = profile_source['nick_name'] 
            location = profile_source['user_location']
            fansnum = profile_source['fansnum']
            statusnum = profile_source['statusnum']
        else:
            uname = ''
            location = ''
            fansnum = ''
            statusnum = ''
        if status == 'show_in':
            results.append([uid, uname, location, fansnum, statusnum, influence])
        if status == 'show_compute':
            in_date = json.loads(input_result[uid])[0]
            compute_status = json.loads(input_result[uid])[1]
            if compute_status == '1':
                compute_status = '3'
            results.append([uid, uname, location, fansnum, statusnum, influence, in_date, compute_status])
        if status == 'show_in_history':
            in_status = input_result[uid]
            results.append([uid, uname, location, fansnum, statusnum, influence, in_status])

    return results
Example #16
def statistics_influence_people(uid, date, style):
    # output: different retweeted and comment counts, uids' domain distribution, topic distribution, registration geo distribution
    results = {}  # retweeted weibo people and comment weibo people
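    # Collect the mids of this user's origin weibo and the root mids of the
    # retweeted weibo that drew retweets (style 0) or comments (otherwise),
    # then delegate to influenced_user_detail and attach the total count.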
    date1 = str(date).replace("-", "")
    index_name = pre_index + date1
    index_flow_text = pre_text_index + date

    try:
        bci_result = es_cluster.get(index=index_name, doc_type=influence_doctype, id=uid)["_source"]
    except:
        bci_result = []
        return results
    origin_retweeted_mid = []  # origin weibo mid
    retweeted_retweeted_mid = []  # retweeted weibo mid
    origin_comment_mid = []
    retweeted_comment_mid = []
    origin_retweeted = json.loads(bci_result["origin_weibo_retweeted_detail"])
    retweeted_retweeted = json.loads(bci_result["retweeted_weibo_retweeted_detail"])
    origin_comment = json.loads(bci_result["origin_weibo_comment_detail"])
    retweeted_comment = json.loads(bci_result["retweeted_weibo_comment_detail"])
    retweeted_total_number = sum(origin_retweeted.values()) + sum(retweeted_retweeted.values())
    comment_total_number = sum(origin_comment.values()) + sum(retweeted_comment.values())
    if origin_retweeted:
        origin_retweeted_mid = filter_mid(origin_retweeted)
    if retweeted_retweeted:
        retweeted_retweeted_mid = filter_mid(retweeted_retweeted)
    if origin_comment:
        origin_comment_mid = filter_mid(origin_comment)
    if retweeted_comment:
        retweeted_comment_mid = filter_mid(retweeted_comment)

    query_body = {"query": {"filtered": {"filter": {"bool": {"should": [], "must": []}}}}, "size": 10000}

    if int(style) == 0:  # retweeted
        retweeted_origin = []
        if retweeted_retweeted_mid:
            text_result = es.mget(
                index=index_flow_text, doc_type=flow_text_index_type, body={"ids": retweeted_retweeted_mid}
            )["docs"]
            for item in text_result:
                mid = item.get("source", {}).get("root_mid", "0")
                retweeted_origin.append(mid)
        retweeted_results = influenced_user_detail(uid, date, origin_retweeted_mid, retweeted_origin, 3)
        retweeted_results["total_number"] = retweeted_total_number
        results = retweeted_results
    else:
        retweeted_origin = []
        if retweeted_comment_mid:
            text_result = es.mget(
                index=index_flow_text, doc_type=flow_text_index_type, body={"ids": retweeted_comment_mid}
            )["docs"]
            for item in text_result:
                mid = item.get("source", {}).get("root_mid", "0")
                retweeted_origin.append(mid)
        comment_results = influenced_user_detail(uid, date, origin_comment_mid, retweeted_origin, 2)
        comment_results["total_number"] = comment_total_number
        results = comment_results

    return results
Example #17
def get_recommentation(submit_user):
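    # For each of the past seven days, read that day's recommendation hash for
    # submit_user from redis, look the uids up in the previous day's BCI index
    # and the weibo_user profiles, normalize user_index onto a 0-100 log scale,
    # and flag whether the uid is already in the "compute" portrait set.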
    if RUN_TYPE:
        now_ts = time.time()
    else:
        now_ts = datetime2ts(RUN_TEST_TIME)

    in_portrait_set = set(r.hkeys("compute"))
    result = []
    for i in range(7):
        iter_ts = now_ts - i*DAY
        iter_date = ts2datetime(iter_ts)
        submit_user_recomment = "recomment_" + submit_user + "_" + str(iter_date)
        bci_date = ts2datetime(iter_ts - DAY)
        submit_user_recomment = r.hkeys(submit_user_recomment)
        bci_index_name = "bci_" + bci_date.replace('-', '')
        exist_bool = es_cluster.indices.exists(index=bci_index_name)
        if not exist_bool:
            continue
        if submit_user_recomment:
            user_bci_result = es_cluster.mget(index=bci_index_name, doc_type="bci", body={'ids':submit_user_recomment}, _source=True)['docs']
            user_profile_result = es_user_profile.mget(index='weibo_user', doc_type='user', body={'ids':submit_user_recomment}, _source=True)['docs']
            max_evaluate_influ = get_evaluate_max(bci_index_name)
            for i in range(len(submit_user_recomment)):
                uid = submit_user_recomment[i]
                bci_dict = user_bci_result[i]
                profile_dict = user_profile_result[i]
                try:
                    bci_source = bci_dict['_source']
                except:
                    bci_source = None
                if bci_source:
                    influence = bci_source['user_index']
                    influence = math.log(influence/max_evaluate_influ['user_index'] * 9 + 1 ,10)
                    influence = influence * 100
                else:
                    influence = ''
                try:
                    profile_source = profile_dict['_source']
                except:
                    profile_source = None
                if profile_source:
                    uname = profile_source['nick_name']
                    location = profile_source['user_location']
                    fansnum = profile_source['fansnum']
                    statusnum = profile_source['statusnum']
                else:
                    uname = ''
                    location = ''
                    fansnum = ''
                    statusnum = ''
                if uid in in_portrait_set:
                    in_portrait = "1"
                else:
                    in_portrait = "0"
                recomment_day = iter_date
                result.append([iter_date, uid, uname, location, fansnum, statusnum, influence, in_portrait])

    return result    
def statistics_influence_people(uid, date, style):
    # output: different retweeted and comment counts, uids' domain distribution, topic distribution, registration geo distribution
    results = {} # retweeted weibo people and comment weibo people
    date1 = str(date).replace('-', '')
    index_name = pre_index + date1
    index_flow_text = pre_text_index + date

    try:
        bci_result = es_cluster.get(index=index_name, doc_type=influence_doctype, id=uid)["_source"]
    except:
        bci_result = []
        return results
    origin_mid = [] # origin weibo mid
    retweeted_mid = [] # retweeted weibo mid

    query_body = {
        "query":{
            "filtered":{
                "filter":{
                    "bool":{
                        "must":[
                        ]
                    }
                }
            }
        },
        "size":1000
    }

    body_1 = copy.deepcopy(query_body)
    body_2 = copy.deepcopy(query_body)

    body_1["query"]["filtered"]["filter"]["bool"]["must"].extend([{"term":{"message_type": 1}}, {"term":{"uid": uid}}])
    result_1 = es.search(index=index_flow_text, doc_type=flow_text_index_type, body=body_1)["hits"]["hits"]
    if result_1:
        for item in result_1:
            origin_mid.append(item['_id'])

    body_1["query"]["filtered"]["filter"]["bool"]["must"].extend([{"term":{"message_type": 3}}, {"term":{"uid": uid}}])
    result_2 = es.search(index=index_flow_text, doc_type=flow_text_index_type, body=body_2)["hits"]["hits"]
    if result_2:
        for item in result_2:
            if item['_source'].get('root_mid', ''):
                retweeted_mid.append(item['_source']['root_mid'])    
    

    if int(style) == 0: # retweeted
        retweeted_results = influenced_user_detail(uid, date, origin_mid, retweeted_mid, 3)
        results = retweeted_results
    else:
        comment_results = influenced_user_detail(uid, date, origin_mid, retweeted_mid, 2)
        results = comment_results
    return results
Example #19
def comment_on_influence(uid, date):
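    # Map user_index onto one of six influence descriptions via the threshold
    # table, then for each of the four total/burst metric pairs append the
    # matching conclusion and underline strings. Returns [result, underline].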
    date1 = str(date).replace('-', '')
    index_name = pre_index + date1
    index_flow_text = pre_text_index + date
    result = []
    underline = []

    try:
        bci_result = es_cluster.get(index=index_name,
                                    doc_type=influence_doctype,
                                    id=uid)["_source"]
    except:
        description = CURRENT_INFLUENCE_CONCLUSION['0']
        result.append(description)
        return ([result, underline])

    user_index = bci_result['user_index']
    if user_index < CURRNET_INFLUENCE_THRESHOULD[0]:
        description = CURRENT_INFLUENCE_CONCLUSION['0']
    elif user_index >= CURRNET_INFLUENCE_THRESHOULD[
            0] and user_index < CURRNET_INFLUENCE_THRESHOULD[1]:
        description = CURRENT_INFLUENCE_CONCLUSION['1']
    elif user_index >= CURRNET_INFLUENCE_THRESHOULD[
            1] and user_index < CURRNET_INFLUENCE_THRESHOULD[2]:
        description = CURRENT_INFLUENCE_CONCLUSION['2']
    elif user_index >= CURRNET_INFLUENCE_THRESHOULD[
            2] and user_index < CURRNET_INFLUENCE_THRESHOULD[3]:
        description = CURRENT_INFLUENCE_CONCLUSION['3']
    elif user_index >= CURRNET_INFLUENCE_THRESHOULD[
            3] and user_index < CURRNET_INFLUENCE_THRESHOULD[4]:
        description = CURRENT_INFLUENCE_CONCLUSION['4']
    else:
        description = CURRENT_INFLUENCE_CONCLUSION['5']
    result.append(description)

    for i in range(4):
        if bci_result[INFLUENCE_TOTAL_LIST[i]] > INFLUENCE_TOTAL_THRESHOULD[i]:
            result.append(INFLUENCE_TOTAL_CONCLUSION[i])
            if bci_result[
                    INFLUENCE_BRUST_LIST[i]] > INFLUENCE_BRUST_THRESHOULD[i]:
                result.append(INFLUENCE_BRUST_CONCLUSION[i])
                underline.append(UNDERLINE_CONCLUSION[i])
            else:
                result.append('')
                underline.append('')
        else:
            result.extend(['', ''])
            underline.append('')

    return [result, underline]
def search_portrait_history_active_info(uid,
                                        date,
                                        index_name=copy_portrait_index_name,
                                        doctype=copy_portrait_index_name):
    # date format: 20130901
    date_list = time_series(date)

    try:
        result = es.get(index=index_name,
                        doc_type=doctype,
                        id=uid,
                        _source=True)['_source']
    except NotFoundError:
        return "NotFound"
    except:
        return None

    date_max = {}
    for date_str in date_list:
        query_body = {
            'query': {
                'match_all': {}
            },
            'size': 1,
            'sort': [{
                date_str: {
                    'order': 'desc'
                }
            }]
        }
        try:
            max_item = es.search(index=index_name,
                                 doc_type=doctype,
                                 body=query_body)['hits']['hits']
        except Exception, e:
            raise e
        date_max[date_str] = max_item[0]['_source'][date_str]
def search_k(es, index_name, index_type, start, field="user_index", size=100):
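    # Return the _source of `size` documents starting at offset `start`,
    # sorted by `field` in descending order.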
    query_body = {
        "query":{
            "match_all": {}
            },
        "size": size,
        "from": start,
        "sort": [{field: {"order": "desc"}}]
    }

    result = es.search(index=index_name, doc_type=index_type, body=query_body)['hits']['hits']

    search_list = []
    for item in result:
        search_list.append(item['_source'])

    return search_list
Example #22
def get_evaluate_max(index_name):
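    # Find the index-wide maximum of each evaluation field (currently only
    # user_index) by sorting descending and taking the first hit.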
    max_result = {}
    index_type = 'bci'
    evaluate_index = ['user_index']
    for evaluate in evaluate_index:
        query_body = {
            'query':{
                'match_all':{}
                },
            'size':1,
            'sort':[{evaluate: {'order': 'desc'}}]
            }
        try:
            result = es_cluster.search(index=index_name, doc_type=index_type, body=query_body)['hits']['hits']
        except Exception, e:
            raise e
        max_evaluate = result[0]['_source'][evaluate]
        max_result[evaluate] = max_evaluate

    return max_result
Example #24
def comment_on_influence(uid, date):
    date1 = str(date).replace("-", "")
    index_name = pre_index + date1
    index_flow_text = pre_text_index + date
    result = []
    underline = []

    try:
        bci_result = es_cluster.get(index=index_name, doc_type=influence_doctype, id=uid)["_source"]
    except:
        description = CURRENT_INFLUENCE_CONCLUSION["0"]
        result.append(description)
        return [result, underline]

    user_index = bci_result["user_index"]
    if user_index < CURRNET_INFLUENCE_THRESHOULD[0]:
        description = CURRENT_INFLUENCE_CONCLUSION["0"]
    elif user_index >= CURRNET_INFLUENCE_THRESHOULD[0] and user_index < CURRNET_INFLUENCE_THRESHOULD[1]:
        description = CURRENT_INFLUENCE_CONCLUSION["1"]
    elif user_index >= CURRNET_INFLUENCE_THRESHOULD[1] and user_index < CURRNET_INFLUENCE_THRESHOULD[2]:
        description = CURRENT_INFLUENCE_CONCLUSION["2"]
    elif user_index >= CURRNET_INFLUENCE_THRESHOULD[2] and user_index < CURRNET_INFLUENCE_THRESHOULD[3]:
        description = CURRENT_INFLUENCE_CONCLUSION["3"]
    elif user_index >= CURRNET_INFLUENCE_THRESHOULD[3] and user_index < CURRNET_INFLUENCE_THRESHOULD[4]:
        description = CURRENT_INFLUENCE_CONCLUSION["4"]
    else:
        description = CURRENT_INFLUENCE_CONCLUSION["5"]
    result.append(description)

    for i in range(4):
        if bci_result[INFLUENCE_TOTAL_LIST[i]] > INFLUENCE_TOTAL_THRESHOULD[i]:
            result.append(INFLUENCE_TOTAL_CONCLUSION[i])
            if bci_result[INFLUENCE_BRUST_LIST[i]] > INFLUENCE_BRUST_THRESHOULD[i]:
                result.append(INFLUENCE_BRUST_CONCLUSION[i])
                underline.append(UNDERLINE_CONCLUSION[i])
            else:
                result.append("")
                underline.append("")
        else:
            result.extend(["", ""])
            underline.append("")

    return [result, underline]
def comment_on_influence(uid, date):
    date1 = str(date).replace('-', '')
    index_name = pre_index + date1
    index_flow_text = pre_text_index + date
    result = []
    underline = []

    try:
        bci_result = es_cluster.get(index=index_name, doc_type=influence_doctype, id=uid)["_source"]
    except:
        description = CURRENT_INFLUENCE_CONCLUSION['0']
        result.append(description)
        return ([result, underline])

    user_index = bci_result['user_index']
    if user_index < CURRNET_INFLUENCE_THRESHOULD[0]:
        description = CURRENT_INFLUENCE_CONCLUSION['0']
    elif user_index >= CURRNET_INFLUENCE_THRESHOULD[0] and user_index < CURRNET_INFLUENCE_THRESHOULD[1]:
        description = CURRENT_INFLUENCE_CONCLUSION['1']
    elif user_index >= CURRNET_INFLUENCE_THRESHOULD[1] and user_index < CURRNET_INFLUENCE_THRESHOULD[2]:
        description = CURRENT_INFLUENCE_CONCLUSION['2']
    elif user_index >= CURRNET_INFLUENCE_THRESHOULD[2] and user_index < CURRNET_INFLUENCE_THRESHOULD[3]:
        description = CURRENT_INFLUENCE_CONCLUSION['3']
    elif user_index >= CURRNET_INFLUENCE_THRESHOULD[3] and user_index < CURRNET_INFLUENCE_THRESHOULD[4]:
        description = CURRENT_INFLUENCE_CONCLUSION['4']
    else:
        description = CURRENT_INFLUENCE_CONCLUSION['5']
    result.append(description)

    for i in range(4):
        if bci_result[INFLUENCE_TOTAL_LIST[i]] > INFLUENCE_TOTAL_THRESHOULD[i]:
            result.append(INFLUENCE_TOTAL_CONCLUSION[i])
            if bci_result[INFLUENCE_BRUST_LIST[i]] > INFLUENCE_BRUST_THRESHOULD[i]:
                result.append(INFLUENCE_BRUST_CONCLUSION[i])
                underline.append(UNDERLINE_CONCLUSION[i])
            else:
                result.append('')
                underline.append('')
        else:
            result.extend(['',''])
            underline.append('')

    return [result, underline]
def search_k(es, index_name, index_type, start, field="user_index", size=100):
    query_body = {
        "query": {
            "match_all": {}
        },
        "size": size,
        "from": start,
        "sort": [{
            field: {
                "order": "desc"
            }
        }]
    }

    result = es.search(index=index_name, doc_type=index_type,
                       body=query_body)['hits']['hits']

    search_list = []
    for item in result:
        search_list.append(item['_source'])

    return search_list
def count_es(es, index_name,doctype, sort_order="user_index",range_1=0, range_2=3000):
    query_body = {
        "query":{
            "filtered": {
                "query": {
                    "match_all":{}
                },
                "filter": {
                    "range": {
                        sort_order: {
                            "gte": range_1,
                            "lt": range_2
                        }
                    }
                }
            }
        }
    }


    result = es.count(index=index_name, doc_type=doctype, body=query_body)['count']

    return result
def search_portrait_history_active_info(uid, date, index_name="copy_user_portrait", doctype="user"):
    # date format: 20130901
    date_list = time_series(date)

    try:
        result = es.get(index=index_name, doc_type=doctype, id=uid, _source=True)['_source']
    except NotFoundError:
        return "NotFound"
    except:
        return None

    return_dict = {}
    for item in date_list:
        return_dict[item] = result.get(item, 0)

    in_list = []
    for item in sorted(date_list):
        in_list.append(return_dict[item])
    #print 'in_list:', in_list
    max_influence = max(in_list)
    ave_influence = sum(in_list) / float(7)
    min_influence = min(in_list)
    if max_influence - min_influence <= 400 and ave_influence >= 900:
        mark = u'平稳高影响力'    # stable, high influence
    elif max_influence - min_influence > 400 and ave_influence >= 900:
        mark = u'波动高影响力'    # fluctuating, high influence
    elif max_influence - min_influence <= 400 and ave_influence < 900 and ave_influence >= 500:
        mark = u'平稳一般影响力'  # stable, average influence
    elif max_influence - min_influence > 400 and ave_influence < 900 and ave_influence >= 500:
        mark = u'波动一般影响力'  # fluctuating, average influence
    elif max_influence - min_influence <= 400 and ave_influence < 500:
        mark = u'平稳低影响力'    # stable, low influence
    else:
        mark = u'波动低影响力'    # fluctuating, low influence
    description = [u'该用户为', mark]  # u'该用户为' = "this user is"
    return [in_list, description]
Example #29
 sensitive_string = "sensitive_score_" + tmp_ts
 query_sensitive_body = {
     "query":{
         "match_all":{}
     },
     "size":1,
     "sort":{sensitive_string:{"order":"desc"}}
 }
 try:
     top_sensitive_result = es_bci_history.search(index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX, body=query_sensitive_body, _source=False, fields=[sensitive_string])['hits']['hits']
     top_sensitive = top_sensitive_result[0]['fields'][sensitive_string][0]
 except Exception, reason:
     print Exception, reason
     top_sensitive = 400
 index_type = 'bci'
 user_bci_result = es_cluster.mget(index=index_name, doc_type=index_type, body={'ids':uid_list}, _source=True)['docs']
 user_profile_result = es_user_profile.mget(index='weibo_user', doc_type='user', body={'ids':uid_list}, _source=True)['docs']
 bci_history_result = es_bci_history.mget(index=bci_history_index_name, doc_type=bci_history_index_type, body={"ids":uid_list}, fields=['user_fansnum', 'weibo_month_sum'])['docs']
 sensitive_history_result = es_bci_history.mget(index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX, body={'ids':uid_list}, fields=[sensitive_string], _source=False)['docs']
 max_evaluate_influ = get_evaluate_max(index_name)
 for i in range(0, len(uid_list)):
     uid = uid_list[i]
     bci_dict = user_bci_result[i]
     profile_dict = user_profile_result[i]
     bci_history_dict = bci_history_result[i]
     sensitive_history_dict = sensitive_history_result[i]
     #print sensitive_history_dict
     try:
         bci_source = bci_dict['_source']
     except:
         bci_source = None
def search_influence_detail(uid_list, index_name, doctype):
    result = es.mget(index=index_name, doc_type=doctype, body={"ids": uid_list}, _source=True)["docs"]

    return result[0]['_source']
Example #31
def get_user_detail(date, input_result, status, user_type="influence", auth=""):
    results = []
    if status=='show_in':
        uid_list = input_result
    if status=='show_compute':
        uid_list = input_result.keys()
    if status=='show_in_history':
        uid_list = input_result.keys()
    if date!='all':
        index_name = 'bci_' + ''.join(date.split('-'))
    else:
        now_ts = time.time()
        now_date = ts2datetime(now_ts)
        index_name = 'bci_' + ''.join(now_date.split('-'))
    index_type = 'bci'
    user_bci_result = es_cluster.mget(index=index_name, doc_type=index_type, body={'ids':uid_list}, _source=True)['docs']
    user_profile_result = es_user_profile.mget(index='weibo_user', doc_type='user', body={'ids':uid_list}, _source=True)['docs']
    max_evaluate_influ = get_evaluate_max(index_name)
    for i in range(0, len(uid_list)):
        uid = uid_list[i]
        bci_dict = user_bci_result[i]
        profile_dict = user_profile_result[i]
        try:
            bci_source = bci_dict['_source']
        except:
            bci_source = None
        if bci_source:
            influence = bci_source['user_index']
            influence = math.log(influence/max_evaluate_influ['user_index'] * 9 + 1 ,10)
            influence = influence * 100
        else:
            influence = ''
        try:
            profile_source = profile_dict['_source']
        except:
            profile_source = None
        if profile_source:
            uname = profile_source['nick_name'] 
            location = profile_source['user_location']
            fansnum = profile_source['fansnum']
            statusnum = profile_source['statusnum']
        else:
            uname = ''
            location = ''
            fansnum = ''
            statusnum = ''
        if status == 'show_in':
            if user_type == "sensitive":
                tmp_ts = datetime2ts(date) - DAY
                tmp_data = r_cluster.hget("sensitive_"+str(tmp_ts), uid)
                if tmp_data:
                    sensitive_dict = json.loads(tmp_data)
                    sensitive_words = sensitive_dict.keys()
                else:
                    sensitive_words = []
                results.append([uid, uname, location, fansnum, statusnum, influence, sensitive_words])
            else:
                results.append([uid, uname, location, fansnum, statusnum, influence])
            if auth:
                hashname_submit = "submit_recomment_" + date
                tmp_data = json.loads(r.hget(hashname_submit, uid))
                recommend_list = (tmp_data['operation']).split('&')
                admin_list = []
                admin_list.append(tmp_data['system'])
                admin_list.append(list(set(recommend_list)))
                admin_list.append(len(recommend_list))
                results[-1].extend(admin_list)
        if status == 'show_compute':
            in_date = json.loads(input_result[uid])[0]
            compute_status = json.loads(input_result[uid])[1]
            if compute_status == '1':
                compute_status = '3'
            results.append([uid, uname, location, fansnum, statusnum, influence, in_date, compute_status])
        if status == 'show_in_history':
            in_status = input_result[uid]
            if user_type == "sensitive":
                tmp_ts = datetime2ts(date) - DAY
                tmp_data = r_cluster.hget("sensitive_"+str(tmp_ts), uid)
                if tmp_data:
                    sensitive_dict = json.loads(tmp_data)
                    sensitive_words = sensitive_dict.keys()
                results.append([uid, uname, location, fansnum, statusnum, influence, in_status, sensitive_words])
            else:
                results.append([uid, uname, location, fansnum, statusnum, influence, in_status])

    return results
def search_top_index(index_name,
                     top_k=1,
                     index_type="bci",
                     top=False,
                     sort_order="user_index"):
    query_body = {
        "query": {
            "match_all": {}
        },
        "size": top_k,
        "sort": [{
            sort_order: {
                "order": "desc"
            }
        }]
    }

    if top:
        result = es.search(
            index=index_name, doc_type=index_type,
            body=query_body)['hits']['hits'][0]['_source'][sort_order]
    else:
        search_result = es.search(index=index_name,
                                  doc_type=index_type,
                                  body=query_body)['hits']['hits']

        uid_list = []
        for item in search_result:
            uid_list.append(item['_id'])
        profile_result = es_profile.mget(index=profile_index_name,
                                         doc_type=profile_index_type,
                                         body={"ids": uid_list},
                                         _source=True)['docs']
        portrait_result = es_portrait.mget(index=portrait_index_name,
                                           doc_type=portrait_index_type,
                                           body={"ids": uid_list},
                                           _source=True)['docs']

        result = []
        rank = 1
        for i in range(len(search_result)):
            info = ['', '', '', '']
            info[0] = rank
            if profile_result[i]['found']:
                info[1] = profile_result[i]['_source'].get('photo_url', '')
                info[3] = profile_result[i]['_source'].get('nick_name', '')

            info[2] = search_result[i].get('_id', '')
            if sort_order in [
                    "user_index", "origin_weibo_retweeted_brust_average",
                    "origin_weibo_comment_brust_average"
            ]:
                info.append(search_result[i]['_source'][sort_order])
                if portrait_result[i]['found']:
                    info.append("1")
                else:
                    info.append("0")
            elif sort_order == "origin_weibo_retweeted_top_number":
                info.append(search_result[i]['_source']
                            ['origin_weibo_retweeted_top_number'])
                mid = search_result[i]['_source'][
                    'origin_weibo_top_retweeted_id']
                info.append(weiboinfo2url(info[2], mid))
                if portrait_result[i]['found']:
                    info.append("1")
                else:
                    info.append("0")
            elif sort_order == "origin_weibo_comment_top_number":
                info.append(search_result[i]['_source']
                            ['origin_weibo_comment_top_number'])
                mid = search_result[i]['_source'][
                    'origin_weibo_top_comment_id']
                info.append(weiboinfo2url(info[2], mid))
                if portrait_result[i]['found']:
                    info.append("1")
                else:
                    info.append("0")

            rank += 1
            result.append(info)

    return result
def influenced_detail(uid, date, style):
    date1 = str(date).replace('-', '')
    index_name = pre_index + date1
    index_text = "flow_text_" + date
    #detail_text = {}
    style = int(style)
    try:
        user_info = es_cluster.get(index=index_name, doc_type=influence_doctype, id=uid)["_source"]
    except:
        result = {}
        return result
    origin_retweetd_dict = json.loads(user_info["origin_weibo_retweeted_detail"])
    origin_comment_dict = json.loads(user_info['origin_weibo_comment_detail'])
    retweeted_retweeted_dict = json.loads(user_info["retweeted_weibo_retweeted_detail"])
    retweeted_comment_dict = json.loads(user_info["retweeted_weibo_comment_detail"])

    origin_retweetd = sorted(origin_retweetd_dict.items(), key=lambda x:x[1], reverse=True)
    origin_comment = sorted(origin_comment_dict.items(), key=lambda x:x[1], reverse=True)
    retweeted_retweeted = sorted(retweeted_retweeted_dict.items(), key=lambda x:x[1], reverse=True)
    retweeted_comment = sorted(retweeted_comment_dict.items(), key=lambda x:x[1], reverse=True)

    query_body_origin = {
        "query":{
            "filtered":{
                "filter":{
                    "bool":{
                        "must":[
                            {"term":{"message_type": 1}},
                            {"term":{"uid": uid}}
                        ]
                    }
                }
            }
        },
        "size": 10000
    }
    result_1 = es.search(index=index_text, doc_type="text", body=query_body_origin)['hits']['hits']
    origin_set = set()
    if result_1:
        for item in result_1:
            origin_set.add(item['_id'])

    query_body_retweeted = {
        "query":{
            "filtered":{
                "filter":{
                    "bool":{
                        "must":[
                            {"term":{"message_type": 3}},
                            {"term":{"uid": uid}}
                        ]
                    }
                }
            }
        },
        "size": 10000
    }
    result_2 = es.search(index=index_text, doc_type="text", body=query_body_retweeted)['hits']['hits']
    retweeted_set = set()
    if result_2:
        for item in result_2:
            retweeted_set.add(item['_id'])
    
    # keep only the entries whose mid appears among this user's own weibo ids
    # (filter into new lists rather than removing from a list while iterating over it)
    origin_retweetd = [item for item in origin_retweetd if item[0] in origin_set]
    origin_comment = [item for item in origin_comment if item[0] in origin_set]
    retweeted_retweeted = [item for item in retweeted_retweeted if item[0] in retweeted_set]
    retweeted_comment = [item for item in retweeted_comment if item[0] in retweeted_set]

    if style == 0:
        detail_text = get_text(origin_retweetd[:20], date, user_info, style)
    elif style == 1:
        detail_text = get_text(origin_comment[:20], date, user_info, style)
    elif style == 2:
        detail_text = get_text(retweeted_retweeted[:20], date, user_info, style)
    else:
        detail_text = get_text(retweeted_comment[:20], date, user_info, style)
    #detail_text["origin_retweeted"] = get_text(origin_retweetd, date)
    #detail_text["origin_comment"] = get_text(origin_comment, date)
    #detail_text["retweeted_retweeted"] = get_text(retweeted_retweeted, date)
    #detail_text["retweeted_comment"] = get_text(retweeted_comment, date)

    return detail_text
Example #34
def get_user_detail(date, input_result, status, user_type="influence", auth=""):
    bci_date = ts2datetime(datetime2ts(date) - DAY)
    results = []
    if status=='show_in':
        uid_list = input_result
    if status=='show_compute':
        uid_list = input_result.keys()
    if status=='show_in_history':
        uid_list = input_result.keys()
    if date!='all':
        index_name = 'bci_' + ''.join(bci_date.split('-'))
    else:
        now_ts = time.time()
        now_date = ts2datetime(now_ts)
        index_name = 'bci_' + ''.join(now_date.split('-'))
    index_type = 'bci'
    user_bci_result = es_cluster.mget(index=index_name, doc_type=index_type, body={'ids':uid_list}, _source=True)['docs']
    user_profile_result = es_user_profile.mget(index='weibo_user', doc_type='user', body={'ids':uid_list}, _source=True)['docs']
    max_evaluate_influ = get_evaluate_max(index_name)
    for i in range(0, len(uid_list)):
        uid = uid_list[i]
        bci_dict = user_bci_result[i]
        profile_dict = user_profile_result[i]
        try:
            bci_source = bci_dict['_source']
        except:
            bci_source = None
        if bci_source:
            influence = bci_source['user_index']
            influence = math.log(influence/max_evaluate_influ['user_index'] * 9 + 1 ,10)
            influence = influence * 100
        else:
            influence = ''
        try:
            profile_source = profile_dict['_source']
        except:
            profile_source = None
        if profile_source:
            uname = profile_source['nick_name'] 
            location = profile_source['user_location']
            fansnum = profile_source['fansnum']
            statusnum = profile_source['statusnum']
        else:
            uname = ''
            location = ''
            fansnum = ''
            statusnum = ''
        if status == 'show_in':
            if user_type == "sensitive":
                tmp_ts = datetime2ts(date) - DAY
                tmp_data = r_cluster.hget("sensitive_"+str(tmp_ts), uid)
                if tmp_data:
                    sensitive_dict = json.loads(tmp_data)
                    sensitive_words = sensitive_dict.keys()
                else:
                    sensitive_words = []
                results.append([uid, uname, location, fansnum, statusnum, influence, sensitive_words])
            else:
                results.append([uid, uname, location, fansnum, statusnum, influence])
            if auth:
                hashname_submit = "submit_recomment_" + date
                tmp_data = json.loads(r.hget(hashname_submit, uid))
                recommend_list = (tmp_data['operation']).split('&')
                admin_list = []
                admin_list.append(tmp_data['system'])
                admin_list.append(list(set(recommend_list)))
                admin_list.append(len(recommend_list))
                results[-1].extend(admin_list)
        if status == 'show_compute':
            in_date = json.loads(input_result[uid])[0]
            compute_status = json.loads(input_result[uid])[1]
            if compute_status == '1':
                compute_status = '3'
            results.append([uid, uname, location, fansnum, statusnum, influence, in_date, compute_status])
        if status == 'show_in_history':
            in_status = input_result[uid]
            if user_type == "sensitive":
                tmp_ts = datetime2ts(date) - DAY
                tmp_data = r_cluster.hget("sensitive_"+str(tmp_ts), uid)
                if tmp_data:
                    sensitive_dict = json.loads(tmp_data)
                    sensitive_words = sensitive_dict.keys()
                results.append([uid, uname, location, fansnum, statusnum, influence, in_status, sensitive_words])
            else:
                results.append([uid, uname, location, fansnum, statusnum, influence, in_status])

    return results
Example #35
 sensitive_string = "sensitive_score_" + tmp_ts
 query_sensitive_body = {
     "query":{
         "match_all":{}
     },
     "size":1,
     "sort":{sensitive_string:{"order":"desc"}}
 }
 try:
     top_sensitive_result = es_bci_history.search(index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX, body=query_sensitive_body, _source=False, fields=[sensitive_string])['hits']['hits']
     top_sensitive = top_sensitive_result[0]['fields'][sensitive_string][0]
 except Exception, reason:
     print Exception, reason
     top_sensitive = 400
 index_type = 'bci'
 user_bci_result = es_cluster.mget(index=index_name, doc_type=index_type, body={'ids':uid_list}, _source=True)['docs']
 user_profile_result = es_user_profile.mget(index='weibo_user', doc_type='user', body={'ids':uid_list}, _source=True)['docs']
 bci_history_result = es_bci_history.mget(index=bci_history_index_name, doc_type=bci_history_index_type, body={"ids":uid_list}, fields=['user_fansnum', 'weibo_month_sum'])['docs']
 sensitive_history_result = es_bci_history.mget(index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX, body={'ids':uid_list}, fields=[sensitive_string], _source=False)['docs']
 max_evaluate_influ = get_evaluate_max(index_name)
 for i in range(0, len(uid_list)):
     uid = uid_list[i]
     bci_dict = user_bci_result[i]
     profile_dict = user_profile_result[i]
     bci_history_dict = bci_history_result[i]
     sensitive_history_dict = sensitive_history_result[i]
     #print sensitive_history_dict
     try:
         bci_source = bci_dict['_source']
     except:
         bci_source = None
Example #36
def get_recommentation(submit_user):
    if RUN_TYPE:
        now_ts = time.time()
    else:
        now_ts = datetime2ts(RUN_TEST_TIME)

    in_portrait_set = set(r.hkeys("compute"))
    result = []
    for i in range(7):
        iter_ts = now_ts - i * DAY
        iter_date = ts2datetime(iter_ts)
        submit_user_recomment = "recomment_" + submit_user + "_" + str(
            iter_date)
        bci_date = ts2datetime(iter_ts - DAY)
        submit_user_recomment = r.hkeys(submit_user_recomment)
        bci_index_name = "bci_" + bci_date.replace('-', '')
        exist_bool = es_cluster.indices.exists(index=bci_index_name)
        if not exist_bool:
            continue
        if submit_user_recomment:
            user_bci_result = es_cluster.mget(
                index=bci_index_name,
                doc_type="bci",
                body={'ids': submit_user_recomment},
                _source=True)['docs']
            user_profile_result = es_user_profile.mget(
                index='weibo_user',
                doc_type='user',
                body={'ids': submit_user_recomment},
                _source=True)['docs']
            max_evaluate_influ = get_evaluate_max(bci_index_name)
            for i in range(len(submit_user_recomment)):
                uid = submit_user_recomment[i]
                bci_dict = user_bci_result[i]
                profile_dict = user_profile_result[i]
                try:
                    bci_source = bci_dict['_source']
                except:
                    bci_source = None
                if bci_source:
                    influence = bci_source['user_index']
                    influence = math.log(
                        influence / max_evaluate_influ['user_index'] * 9 + 1,
                        10)
                    influence = influence * 100
                else:
                    influence = ''
                try:
                    profile_source = profile_dict['_source']
                except:
                    profile_source = None
                if profile_source:
                    uname = profile_source['nick_name']
                    location = profile_source['user_location']
                    fansnum = profile_source['fansnum']
                    statusnum = profile_source['statusnum']
                else:
                    uname = ''
                    location = ''
                    fansnum = ''
                    statusnum = ''
                if uid in in_portrait_set:
                    in_portrait = "1"
                else:
                    in_portrait = "0"
                recomment_day = iter_date
                result.append([
                    iter_date, uid, uname, location, fansnum, statusnum,
                    influence, in_portrait
                ])

    return result
def statistics_influence_people(uid, date, style):
    # output: different retweeted and comment counts, uids' domain distribution, topic distribution, registration geo distribution
    results = {} # retweeted weibo people and comment weibo people
    date1 = str(date).replace('-', '')
    index_name = pre_index + date1
    index_flow_text = pre_text_index + date

    try:
        bci_result = es_cluster.get(index=index_name, doc_type=influence_doctype, id=uid)["_source"]
    except:
        bci_result = []
        return results
    origin_mid = [] # origin weibo mid
    retweeted_mid = [] # retweeted weibo mid

    query_body = {
        "query":{
            "filtered":{
                "filter":{
                    "bool":{
                        "must":[
                        ]
                    }
                }
            }
        },
        "size":1000
    }

    body_1 = copy.deepcopy(query_body)
    body_2 = copy.deepcopy(query_body)

    body_1["query"]["filtered"]["filter"]["bool"]["must"].extend([{"term":{"message_type": 1}}, {"term":{"uid": uid}}])
    result_1 = es.search(index=index_flow_text, doc_type=flow_text_index_type, body=body_1)["hits"]["hits"]
    if result_1:
        for item in result_1:
            origin_mid.append(item['_id'])

    body_1["query"]["filtered"]["filter"]["bool"]["must"].extend([{"term":{"message_type": 3}}, {"term":{"uid": uid}}])
    result_2 = es.search(index=index_flow_text, doc_type=flow_text_index_type, body=body_2)["hits"]["hits"]
    if result_2:
        for item in result_2:
            if item['_source'].get('root_mid', ''):
                retweeted_mid.append(item['_source']['root_mid'])    
    
    origin_retweeted = json.loads(bci_result["origin_weibo_retweeted_detail"])
    retweeted_retweeted = json.loads(bci_result["retweeted_weibo_retweeted_detail"])
    origin_comment = json.loads(bci_result["origin_weibo_comment_detail"])
    retweeted_comment = json.loads(bci_result["retweeted_weibo_comment_detail"])

    """
    retweeted_total_number = sum(origin_retweeted.values()) + sum(retweeted_retweeted.values())
    comment_total_number = sum(origin_comment.values()) + sum(retweeted_comment.values())
    if origin_retweeted:
        origin_retweeted_mid = filter_mid(origin_retweeted)
    if retweeted_retweeted:
        retweeted_retweeted_mid = filter_mid(retweeted_retweeted)
    if origin_comment:
        origin_comment_mid = filter_mid(origin_comment)
    if retweeted_comment:
        retweeted_comment_mid = filter_mid(retweeted_comment)

    query_body = {
        "query":{
            "filtered":{
                "filter":{
                    "bool":{
                        "should":[
                        ],
                        "must": [
                        ]
                    }
                }
            }
        },
        "size":10000
    }
    """

    if int(style) == 0: # retweeted
        retweeted_results = influenced_user_detail(uid, date, origin_mid, retweeted_mid, 3)
        results = retweeted_results
    else:
        comment_results = influenced_user_detail(uid, date, origin_mid, retweeted_mid, 2)
        results = comment_results

    return results
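Both searches above start from the same filtered/bool template and specialize their own deep copy, which is why the message_type 1 filters go into body_1 and the message_type 3 filters into body_2. A minimal sketch of that pattern (BASE_BODY and build_flow_text_query are illustrative names, not part of the original module):

import copy

BASE_BODY = {
    "query": {"filtered": {"filter": {"bool": {"must": []}}}},
    "size": 1000
}

def build_flow_text_query(uid, message_type):
    # Each caller works on its own copy, so the appended term filters never
    # leak between the origin-weibo (message_type 1) and retweet (3) searches.
    body = copy.deepcopy(BASE_BODY)
    body["query"]["filtered"]["filter"]["bool"]["must"].extend([
        {"term": {"message_type": message_type}},
        {"term": {"uid": uid}},
    ])
    return body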
Example #38
def statistics_influence_people(uid, date, style):
    # output: distinct retweet and comment users, their domain distribution, topic distribution, registration geo distribution
    results = {}  # retweeted weibo people and comment weibo people
    date1 = str(date).replace('-', '')
    index_name = pre_index + date1
    index_flow_text = pre_text_index + date

    try:
        bci_result = es_cluster.get(index=index_name,
                                    doc_type=influence_doctype,
                                    id=uid)["_source"]
    except:
        return results
    origin_retweeted_mid = []  # origin weibo mid
    retweeted_retweeted_mid = []  # retweeted weibo mid
    origin_comment_mid = []
    retweeted_comment_mid = []
    origin_retweeted = json.loads(bci_result["origin_weibo_retweeted_detail"])
    retweeted_retweeted = json.loads(
        bci_result["retweeted_weibo_retweeted_detail"])
    origin_comment = json.loads(bci_result["origin_weibo_comment_detail"])
    retweeted_comment = json.loads(
        bci_result["retweeted_weibo_comment_detail"])
    retweeted_total_number = sum(origin_retweeted.values()) + sum(
        retweeted_retweeted.values())
    comment_total_number = sum(origin_comment.values()) + sum(
        retweeted_comment.values())
    if origin_retweeted:
        origin_retweeted_mid = filter_mid(origin_retweeted)
    if retweeted_retweeted:
        retweeted_retweeted_mid = filter_mid(retweeted_retweeted)
    if origin_comment:
        origin_comment_mid = filter_mid(origin_comment)
    if retweeted_comment:
        retweeted_comment_mid = filter_mid(retweeted_comment)

    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "should": [],
                        "must": []
                    }
                }
            }
        },
        "size": 10000
    }

    if int(style) == 0:  # retweeted
        retweeted_origin = []
        if retweeted_retweeted_mid:
            text_result = es.mget(index=index_flow_text,
                                  doc_type=flow_text_index_type,
                                  body={"ids":
                                        retweeted_retweeted_mid})["docs"]
            for item in text_result:
                mid = item.get("_source", {}).get("root_mid", '0')
                retweeted_origin.append(mid)
        retweeted_results = influenced_user_detail(uid, date,
                                                   origin_retweeted_mid,
                                                   retweeted_origin, 3)
        retweeted_results["total_number"] = retweeted_total_number
        results = retweeted_results
    else:
        retweeted_origin = []
        if retweeted_comment_mid:
            text_result = es.mget(index=index_flow_text,
                                  doc_type=flow_text_index_type,
                                  body={"ids": retweeted_comment_mid})["docs"]
            for item in text_result:
                mid = item.get("_source", {}).get("root_mid", '0')
                retweeted_origin.append(mid)
        comment_results = influenced_user_detail(uid, date, origin_comment_mid,
                                                 retweeted_origin, 2)
        comment_results["total_number"] = comment_total_number
        results = comment_results

    return results
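The two es.mget calls above return the raw Elasticsearch multi-get response, where each entry in "docs" carries "found" and, when the document exists, an "_source" dict. A small defensive sketch of pulling root_mid values out of such a response (extract_root_mids is an illustrative helper, not part of the original code):

def extract_root_mids(mget_response):
    # Walk the standard elasticsearch-py mget response and collect root_mid
    # from every document that was actually found; fall back to "0" when the
    # field is missing, mirroring the handling above.
    root_mids = []
    for doc in mget_response.get("docs", []):
        if doc.get("found"):
            root_mids.append(doc.get("_source", {}).get("root_mid", "0"))
    return root_mids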
Example #39
def influenced_user_detail(uid, date, origin_retweeted_mid, retweeted_retweeted_mid, message_type, default_number=20):
    query_body = {
        "query":{
            "filtered":{
                "filter":{
                    "bool":{
                        "must": [
                        ]
                    }
                }
            }
        },
        "size":100000,
        "sort":{"user_fansnum":{"order":"desc"}}
    }
    # detailed list of influenced users
    date1 = str(date).replace('-', '')
    index_name = pre_index + date1
    index_flow_text = pre_text_index + date
    origin_retweeted_uid = [] # influenced user uid_list
    retweeted_retweeted_uid = []
    origin_comment_uid = []
    retweeted_comment_uid = []
    query_origin = copy.deepcopy(query_body)
    query_retweeted = copy.deepcopy(query_body)
    if origin_retweeted_mid: # users who retweeted / commented on the origin weibo (by message_type)
        query_origin["query"]["filtered"]["filter"]["bool"]["must"].append({"terms": {"root_mid": origin_retweeted_mid}})
        query_origin["query"]["filtered"]["filter"]["bool"]["must"].extend([{"term":{"message_type": message_type}}, {"term":{"root_uid": uid}}])
        origin_retweeted_result = es.search(index=index_flow_text, doc_type=flow_text_index_type, body=query_origin, fields=["uid"])["hits"]["hits"]
        if origin_retweeted_result:
            for item in origin_retweeted_result:
                origin_retweeted_uid.append(item["fields"]["uid"][0])
    if retweeted_retweeted_mid: # users who retweeted / commented on the weibo this user retweeted (by message_type)
        query_retweeted["query"]["filtered"]["filter"]["bool"]["must"].append({"terms": {"root_mid": retweeted_retweeted_mid}})
        query_retweeted["query"]["filtered"]["filter"]["bool"]["must"].extend([{"term":{"message_type": message_type}},{"term": {"directed_uid": uid}}])
        retweeted_retweeted_result = es.search(index=index_flow_text, doc_type=flow_text_index_type, body=query_retweeted, fields=["uid"])["hits"]["hits"]
        if retweeted_retweeted_result:
            for item in retweeted_retweeted_result:
                retweeted_retweeted_uid.append(item["fields"]["uid"][0])
    retweeted_uid_list = [] # all retweeted user list
    retweeted_results = {} # statistics of all retweeted uid information
    retweeted_domain = {}
    retweeted_topic = {}
    retweeted_geo = {}
    bci_results = {}
    in_portrait = []
    out_portrait = []
    average_influence = 0
    total_influence = 0
    count = 0
    all_uid_set = set(origin_retweeted_uid) | set(retweeted_retweeted_uid)

    retweeted_uid_list.extend(origin_retweeted_uid)
    retweeted_uid_list.extend(retweeted_retweeted_uid)
    retweeted_uid_list = list(set(retweeted_uid_list) - set([uid])) # deduplicate and drop the user's own uid
    if retweeted_uid_list:
        user_portrait_result = es_user_portrait.mget(index=user_portrait, doc_type=portrait_index_type, body={"ids": retweeted_uid_list}, fields=["domain", "topic_string", "activity_geo_dict","importance", "influence"])["docs"]
        bci_index = "bci_" + date.replace('-', '')
        bci_results = es_cluster.mget(index=bci_index, doc_type="bci", body={"ids":retweeted_uid_list}, fields=['user_index'])["docs"]
        for item in user_portrait_result:
            if item["found"]:
                temp = []
                count += 1
                temp.append(item['_id'])
                temp.append(item["fields"]["importance"][0])
                in_portrait.append(temp)
                temp_domain = item["fields"]["domain"][0].split('&')
                temp_topic = item["fields"]["topic_string"][0].split('&')
                temp_geo = json.loads(item["fields"]["activity_geo_dict"][0])[-1].keys()
                #total_influence += item["fields"]["influence"][0]
                retweeted_domain = aggregation(temp_domain, retweeted_domain)
                retweeted_topic = aggregation(temp_topic, retweeted_topic)
                retweeted_geo = aggregation(temp_geo, retweeted_geo)
            else:
                out_portrait.append(item['_id'])
        retweeted_domain = proportion(retweeted_domain)
        retweeted_topic = proportion(retweeted_topic)
        retweeted_geo = proportion(retweeted_geo)


    if bci_results:
        total_influence = 0
        for item in bci_results:
            if item['found']:
                total_influence += item['fields']['user_index'][0]
    try:
        average_influence = float(total_influence) / len(retweeted_uid_list)
    except ZeroDivisionError:
        average_influence = 0

    sorted_retweeted_domain = sorted(retweeted_domain.items(),key=lambda x:x[1], reverse=True)
    sorted_retweeted_topic = sorted(retweeted_topic.items(),key=lambda x:x[1], reverse=True)
    sorted_retweeted_geo = sorted(retweeted_geo.items(), key=lambda x:x[1], reverse=True)
    retweeted_results["domian"] = sorted_retweeted_domain[:5]
    retweeted_results["topic"] = sorted_retweeted_topic[:5]
    retweeted_results["geo"] = sorted_retweeted_geo[:5]
    retweeted_results["influence"] = average_influence
    in_portrait = sorted(in_portrait, key=lambda x:x[1], reverse=True)

    temp_list = []
    for item in in_portrait:
        temp_list.append(item[0])
    retweeted_results['in_portrait_number'] = len(temp_list)
    retweeted_results['out_portrait_number'] = len(out_portrait)
    in_portrait_url = get_user_url(temp_list[:default_number])
    out_portrait_url = get_user_url(out_portrait[:default_number])
    retweeted_results["in_portrait"] = in_portrait_url
    retweeted_results["out_portrait"] = out_portrait_url
    retweeted_results["total_number"] = len(temp_list) + len(out_portrait)
 

    return retweeted_results
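influenced_user_detail leans on aggregation() and proportion(), which are defined elsewhere in the original module and not shown in these examples. Judging only from the call sites above, aggregation accumulates label counts into a running dict and proportion turns those counts into ratios; a plausible minimal sketch under that assumption:

def aggregation(labels, counter):
    # Accumulate occurrence counts for each label into the running dict.
    for label in labels:
        if not label:
            continue
        counter[label] = counter.get(label, 0) + 1
    return counter

def proportion(counter):
    # Convert raw counts into fractions of the total so the top-5 slices
    # taken above are comparable across users.
    total = float(sum(counter.values()))
    if not total:
        return counter
    return dict((key, value / total) for key, value in counter.items())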