def get_user_influence(uid, date):
    date1 = str(date).replace("-","")
    index_name = pre_index + date1
    result = bci_detail(date, uid)  # overall influence detail
    sensitive_result = bci_detail(date, uid, 1)  # sensitive-only detail
    user_index = result["user_index"]
    query_body = {
        "query":{
            "filtered":{
                "filter":{
                    "range":{
                        "user_index":{
                            "gt": user_index
                        }
                    }
                }
            }
        }
    }
    total_count = es_cluster.count(index=index_name, doc_type=influence_doctype)['count']
    order_count = es_cluster.count(index=index_name, doc_type=influence_doctype, body=query_body)['count']

    result["total_count"] = total_count
    result["order_count"] = order_count + 1

    return [sensitive_result, result]
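These snippets come from a larger module; none of the imports, Elasticsearch clients, or index constants are shown. A minimal sketch of the setup they assume (names are taken from the code itself; hosts, prefixes, and constant values are placeholder guesses):

import json
import copy
import math

from elasticsearch import Elasticsearch

# Placeholder wiring; the real deployment points these handles at separate clusters.
es = es_cluster = es_bci = es_text = es_flow_text = Elasticsearch(["127.0.0.1:9200"])
# es_user_profile, es_user_portrait, ES_CLUSTER_FLOW1, es_sensitive_history and the
# redis_cluster / r_cluster handles follow the same pattern.
pre_index = "bci_"             # daily influence index prefix, e.g. bci_20130901
pre_text_index = "flow_text_"  # daily flow-text index prefix
influence_doctype = "bci"
flow_text_index_type = "text"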
def tag_vector(uid, date):
    date1 = str(date).replace('-', '')
    index_name = pre_index + date1
    result = []

    try:
        bci_result = es_cluster.get(index=index_name, doc_type=influence_doctype, id=uid)["_source"]
    except Exception:  # no influence record for this uid on this date
        tag = influence_tag["0"]
        result.append(tag)
        return result

    origin_retweeted = json.loads(bci_result["origin_weibo_retweeted_detail"])
    retweeted_retweeted = json.loads(bci_result["retweeted_weibo_retweeted_detail"])
    origin_comment = json.loads(bci_result["origin_weibo_comment_detail"])
    retweeted_comment = json.loads(bci_result["retweeted_weibo_comment_detail"])
    sum_retweeted = sum(origin_retweeted.values()) + sum(retweeted_retweeted.values())
    sum_comment = sum(origin_comment.values()) + sum(retweeted_comment.values())

    if sum_retweeted >= retweeted_threshold:
        if sum_comment >= comment_threshold:
            tag = influence_tag['3']  # high retweets and high comments
        else:
            tag = influence_tag['1']  # high retweets only
    else:
        if sum_comment >= comment_threshold:
            tag = influence_tag['2']  # high comments only
        else:
            tag = influence_tag['4']  # neither
    result.append(tag)
    return result
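tag_vector reads its labels and cut-offs from configuration that is not shown. A hypothetical sketch of its shape (the keys match the lookups above; every value is invented for illustration):

# Hypothetical config; only the key names come from the code above.
retweeted_threshold = 1000
comment_threshold = 1000
influence_tag = {
    "0": "no influence data",
    "1": "retweet-driven influence",
    "2": "comment-driven influence",
    "3": "retweet- and comment-driven influence",
    "4": "little interaction",
}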
Example #3
def get_sensitive_user_detail(uid_list, date, sensitive):
    results = []
    index_name = str(date).replace('-','') # index_name:20130901
    user_bci_results = es_cluster.mget(index=index_name, doc_type='bci', body={'ids':uid_list}, _source=True)['docs']
    user_profile_results = es_user_profile.mget(index="weibo_user", doc_type="user", body={"ids":uid_list}, _source=True)['docs']
    for i in range(0, len(uid_list)):
        personal_info = ['']*6
        uid = uid_list[i]
        personal_info[0] = uid_list[i]
        if user_profile_results[i]['found']:
            profile_dict = user_profile_results[i]['_source']
            personal_info[1] = profile_dict['nick_name']
            personal_info[2] = profile_dict['user_location']
            personal_info[3] = profile_dict['fansnum']
            personal_info[4] = profile_dict['statusnum']
        if user_bci_results[i]['found']:
            personal_info[5] = user_bci_results[i]['_source'].get('user_index', 0)
        else:
            personal_info[5] = 0
        if sensitive:
            sensitive_words = r_cluster.hget('sensitive_' + index_name, str(uid))
            if sensitive_words:
                sensitive_dict = json.loads(sensitive_words)
                personal_info.append(sensitive_dict.keys())
            else:
                personal_info.append([])
        results.append(personal_info)
    return results
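Each returned row is positional. A usage sketch (uids and date are made up) showing the layout when sensitive=1:

# row layout: [uid, nick_name, user_location, fansnum, statusnum, user_index, sensitive_words]
rows = get_sensitive_user_detail(['1234567890', '2345678901'], '2013-09-01', 1)
for uid, name, location, fans, statuses, bci, words in rows:
    print uid, name, bci, words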
def statistics_influence_people(uid, date, style, sensitive=0):
    # output: retweet and comment counts, influenced users' domain distribution, topic distribution, registration geo distribution
    results = {}  # retweeted-weibo people and comment-weibo people
    date1 = str(date).replace('-', '')
    index_name = pre_index + date1
    print index_name
    index_flow_text = pre_text_index + date

    try:
        bci_result = es_cluster.get(index=index_name, doc_type=influence_doctype, id=uid)["_source"]
    except Exception:  # no influence record: return empty results
        return results
    origin_mid = [] # origin weibo mid
    retweeted_mid = [] # retweeted weibo mid

    query_body = {
        "query":{
            "filtered":{
                "filter":{
                    "bool":{
                        "must":[
                        ]
                    }
                }
            }
        },
        "size":1000
    }

    if sensitive:
        query_body["query"]["filtered"]["filter"]["bool"]["must"].append({"range":{"sensitive":{"gt":0}}})

    body_1 = copy.deepcopy(query_body)
    body_2 = copy.deepcopy(query_body)

    body_1["query"]["filtered"]["filter"]["bool"]["must"].extend([{"term":{"message_type": 1}}, {"term":{"uid": uid}}])
    result_1 = es.search(index=index_flow_text, doc_type=flow_text_index_type, body=body_1)["hits"]["hits"]
    if result_1:
        for item in result_1:
            origin_mid.append(item['_id'])

    body_1["query"]["filtered"]["filter"]["bool"]["must"].extend([{"term":{"message_type": 3}}, {"term":{"uid": uid}}])
    result_2 = es.search(index=index_flow_text, doc_type=flow_text_index_type, body=body_2)["hits"]["hits"]
    if result_2:
        for item in result_2:
            if item['_source'].get('root_mid', ''):
                retweeted_mid.append(item['_source']['root_mid'])    

    if int(style) == 0: # retweeted
        retweeted_results = influenced_user_detail(uid, date, origin_mid, retweeted_mid, 3)
        results = retweeted_results
    else:
        comment_results = influenced_user_detail(uid, date, origin_mid, retweeted_mid, 2)
        results = comment_results
    return results
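A usage sketch (the uid is made up): style 0 aggregates the users uid influenced through retweets, any other style aggregates commenters, and sensitive=1 restricts the source weibos to sensitive ones.

retweet_influence = statistics_influence_people('1234567890', '2013-09-01', 0)
comment_influence = statistics_influence_people('1234567890', '2013-09-01', 1, sensitive=1)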
Example #5
def get_history_max():
    max_results = {}
    bci_max = ES_CLUSTER_FLOW1.search(index="bci_history", doc_type="bci", body={"query":{"match_all":{}}, "size":1, \
            "sort":{"bci_day_last":{"order":"desc"}}})["hits"]["hits"]
    sensitive_max = es_sensitive_history.search(index="sensitive_history", doc_type="sensitive", body={"query":{"match_all":{}},\
            "size":1,"sort":{"last_value":{"order":"desc"}}})["hits"]["hits"]
    max_results["max_bci"] = bci_max[0]["_source"]["bci_day_last"]
    max_results["max_sensitive"] = sensitive_max[0]["_source"]["last_value"]

    return max_results
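The two maxima returned here feed normalize_index in full_text_search below. That helper is not shown; a plausible sketch reuses the log scaling that get_sensitive_user_detail applies to user_index (an assumption, not the confirmed implementation):

def normalize_index(value, max_value):
    # Assumed: map [0, max_value] onto [0, 100] with the same log curve used
    # for user_index elsewhere in this module.
    try:
        return math.log(value / float(max_value) * 9 + 1, 10) * 100
    except (ValueError, ZeroDivisionError):
        return 0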
Example #7
def get_sensitive_user_detail(uid_list, date, sensitive):
    es_cluster = es_user_profile
    ts = datetime2ts(date)
    results = []
    index_name = pre_influence_index + str(date).replace(
        '-', '')  # index_name:20130901
    user_bci_results = es_bci.mget(index=index_name,
                                   doc_type='bci',
                                   body={'ids': uid_list},
                                   _source=False,
                                   fields=['user_index'])['docs']
    user_profile_results = es_user_profile.mget(index="weibo_user",
                                                doc_type="user",
                                                body={"ids": uid_list},
                                                _source=True)['docs']
    top_influence_value = get_top_value("user_index", es_bci, index_name, "bci")
    for i in range(0, len(uid_list)):
        personal_info = [''] * 6
        uid = uid_list[i]
        personal_info[0] = uid_list[i]
        personal_info[1] = uid_list[i]
        if user_profile_results[i]['found']:
            profile_dict = user_profile_results[i]['_source']
            uname = profile_dict['nick_name']
            if uname:
                personal_info[1] = uname
            personal_info[2] = profile_dict['user_location']
            personal_info[3] = profile_dict['fansnum']
            personal_info[4] = profile_dict['statusnum']
        if user_bci_results[i]['found']:
            try:
                tmp_bci = user_bci_results[i]['fields']['user_index'][0]
                influence = math.log(
                    tmp_bci / float(top_influence_value) * 9 + 1, 10) * 100
                personal_info[5] = influence
            except:
                personal_info[5] = 0
        else:
            personal_info[5] = 0
        if sensitive:
            sensitive_words = redis_cluster.hget('sensitive_' + str(ts),
                                                 str(uid))
            if sensitive_words:
                sensitive_dict = json.loads(sensitive_words)
                personal_info.append(sensitive_dict.keys())
            else:
                personal_info.append([])
        else:
            personal_info.append([])
        results.append(personal_info)
    return results
def comment_on_influence(uid, date):
    date1 = str(date).replace('-', '')
    index_name = pre_index + date1
    result = []
    underline = []

    try:
        bci_result = es_cluster.get(index=index_name, doc_type=influence_doctype, id=uid)["_source"]
    except Exception:  # no influence record for this uid on this date
        description = CURRENT_INFLUENCE_CONCLUSION['0']
        result.append(description)
        return [result, underline]

    user_index = bci_result['user_index']
    if user_index < CURRNET_INFLUENCE_THRESHOULD[0]:
        description = CURRENT_INFLUENCE_CONCLUSION['0']
    elif user_index < CURRNET_INFLUENCE_THRESHOULD[1]:
        description = CURRENT_INFLUENCE_CONCLUSION['1']
    elif user_index < CURRNET_INFLUENCE_THRESHOULD[2]:
        description = CURRENT_INFLUENCE_CONCLUSION['2']
    elif user_index < CURRNET_INFLUENCE_THRESHOULD[3]:
        description = CURRENT_INFLUENCE_CONCLUSION['3']
    elif user_index < CURRNET_INFLUENCE_THRESHOULD[4]:
        description = CURRENT_INFLUENCE_CONCLUSION['4']
    else:
        description = CURRENT_INFLUENCE_CONCLUSION['5']
    result.append(description)

    for i in range(4):
        if bci_result[INFLUENCE_TOTAL_LIST[i]] > INFLUENCE_TOTAL_THRESHOULD[i]:
            result.append(INFLUENCE_TOTAL_CONCLUSION[i])
            if bci_result[INFLUENCE_BRUST_LIST[i]] > INFLUENCE_BRUST_THRESHOULD[i]:
                result.append(INFLUENCE_BRUST_CONCLUSION[i])
                underline.append(UNDERLINE_CONCLUSION[i])
            else:
                result.append('')
                underline.append('')
        else:
            result.extend(['',''])
            underline.append('')

    return [result, underline]
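comment_on_influence depends on several config tables that are not shown. A hypothetical sketch of their shape (identifier spellings, including CURRNET/THRESHOULD/BRUST, are kept exactly as the code spells them; the list entries are assumed to mirror the field names bci_detail computes; all numeric values are invented):

CURRNET_INFLUENCE_THRESHOULD = [100, 500, 2000, 10000, 50000]  # five cut points -> six levels
CURRENT_INFLUENCE_CONCLUSION = dict((str(i), 'influence level %d description' % i) for i in range(6))
INFLUENCE_TOTAL_LIST = [  # bci_result fields compared against INFLUENCE_TOTAL_THRESHOULD
    "origin_weibo_retweeted_total_number", "origin_weibo_comment_total_number",
    "retweeted_weibo_retweeted_total_number", "retweeted_weibo_comment_total_number",
]
INFLUENCE_BRUST_LIST = [  # burst-average fields compared against INFLUENCE_BRUST_THRESHOULD
    "origin_weibo_retweeted_brust_average", "origin_weibo_comment_brust_average",
    "retweeted_weibo_retweeted_brust_average", "retweeted_weibo_comment_brust_average",
]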
Example #11
def full_text_search(keywords, uid, start_time, end_time, size):
    results = []
    uid_list = []
    user_profile_list = []
    query_body = {
        "query": {
            "filtered":{
                "filter":{
                    "bool": {
                        "must": []
                    }
                }
            }
        },
        "size":size,
        "sort":{"timestamp":{"order": 'desc'}}
    }

    if RUN_TYPE:
        query_body["sort"] = {"user_fansnum":{"order": 'desc'}}

    if uid:
        query_body["query"]["filtered"]["filter"]["bool"]["must"].append({"term":{"uid":uid}})

    if keywords:
        keywords_list = keywords.split(',')
        for word in keywords_list:
            query_body["query"]["filtered"]["filter"]["bool"]["must"].append({'wildcard':{'text':{'wildcard':'*'+word+'*'}}})

    index_list = []
    exist_bool = es_flow_text.indices.exists(index="flow_text_"+end_time)
    if start_time:
        start_ts = datetime2ts(start_time)
        end_ts = datetime2ts(end_time)
        ts = end_ts
        while 1:
            index_name = "flow_text_"+ts2datetime(ts)
            exist_bool = es_flow_text.indices.exists(index=index_name)
            if exist_bool:
                index_list.append(index_name)
            if ts == start_ts:
                break
            else:
                ts -= 3600*24

    print index_list
    # no usable flow_text index in the requested window
    if not index_list:
        return [], []

    search_results = es_flow_text.search(index=index_list, doc_type="text", body=query_body)["hits"]["hits"]
    for item in search_results:
        uid_list.append(item['_source']['uid'])
    if not uid_list:  # no hits, nothing to enrich
        return results, user_profile_list
    history_max = get_history_max()
    personal_field = ["nick_name", "fansnum", "statusnum","user_location"]
    user_info = get_user_profile(uid_list, personal_field)
    bci_results = ES_CLUSTER_FLOW1.mget(index="bci_history", doc_type="bci", body={"ids":uid_list}, _source=False, fields=["bci_day_last"])["docs"]
    sensitive_results = es_sensitive_history.mget(index="sensitive_history", doc_type="sensitive", body={"ids":uid_list}, _source=False, fields=["last_value"])["docs"]

    count = 0
    for item in search_results:
        item = item['_source']
        iter_item = []
        iter_item.append(item['uid'])
        iter_item.append(user_info[count][1])
        iter_item.append(item['text'])
        iter_item.append(ts2date(item['timestamp']))
        iter_item.append(item['geo'])
        if item.get("sensitive_words_string", ''):
            iter_item.append(item['sensitive_words_string'].split('&'))
        else:
            iter_item.append([])
        iter_item.append(item.get('retweeted', 0))
        iter_item.append(item.get('comment', 0))
        count += 1
        results.append(iter_item)

    user_set = set()
    # bci_results and sensitive_results are parallel to user_info, so advance
    # the index even when a duplicate uid is skipped.
    for count, item in enumerate(user_info):
        if item[0] in user_set:
            continue
        else:
            user_set.add(item[0])
        if bci_results[count]["found"]:
            bci_value = bci_results[count]["fields"]["bci_day_last"][0]
            item.append(normalize_index(bci_value, history_max["max_bci"]))
        else:
            item.append(0)
        if sensitive_results[count]["found"]:
            sensitive_value = sensitive_results[count]['fields']['last_value'][0]
            item.append(normalize_index(sensitive_value, history_max["max_sensitive"]))
        else:
            item.append(0)
        user_profile_list.append(item)

    return results, user_profile_list
def influenced_user_detail(uid, date, origin_retweeted_mid, retweeted_retweeted_mid, message_type, default_number=20):
    query_body = {
        "query":{
            "filtered":{
                "filter":{
                    "bool":{
                        "must": [
                        ]
                    }
                }
            }
        },
        "size":20000,
    }
    if RUN_TYPE == 1:
        query_body["sort"] = {"user_fansnum":{"order":"desc"}}

    # details of the influenced users
    date1 = str(date).replace('-', '')
    index_name = pre_index + date1
    index_flow_text = pre_text_index + date
    origin_retweeted_uid = [] # influenced user uid_list
    retweeted_retweeted_uid = []
    origin_comment_uid = []
    retweeted_comment_uid = []
    query_origin = copy.deepcopy(query_body)
    query_retweeted = copy.deepcopy(query_body)
    if origin_retweeted_mid: # all users who retweeted/commented on the user's original weibos (per message_type)
        query_origin["query"]["filtered"]["filter"]["bool"]["must"].append({"terms": {"root_mid": origin_retweeted_mid}})
        query_origin["query"]["filtered"]["filter"]["bool"]["must"].extend([{"term":{"message_type": message_type}}, {"term":{"root_uid": uid}}])
        origin_retweeted_result = es.search(index=index_flow_text, doc_type=flow_text_index_type, body=query_origin, fields=["uid"])["hits"]["hits"]
        if origin_retweeted_result:
            for item in origin_retweeted_result:
                origin_retweeted_uid.append(item["fields"]["uid"][0])
    if retweeted_retweeted_mid: # all users who retweeted/commented via the user's retweeted weibos (per message_type)
        query_retweeted["query"]["filtered"]["filter"]["bool"]["must"].append({"terms": {"root_mid": retweeted_retweeted_mid}})
        query_retweeted["query"]["filtered"]["filter"]["bool"]["must"].extend([{"term":{"message_type": message_type}},{"term": {"directed_uid": uid}}])
        retweeted_retweeted_result = es.search(index=index_flow_text, doc_type=flow_text_index_type, body=query_retweeted, fields=["uid"])["hits"]["hits"]
        if retweeted_retweeted_result:
            for item in retweeted_retweeted_result:
                retweeted_retweeted_uid.append(item["fields"]["uid"][0])
    retweeted_uid_list = [] # all retweeted user list
    retweeted_results = {} # statistics of all retweeted uid information
    retweeted_domain = {}
    retweeted_topic = {}
    retweeted_geo = {}
    bci_results = {}
    in_portrait = []
    out_portrait = []
    average_influence = 0
    total_influence = 0
    count = 0

    retweeted_uid_list.extend(origin_retweeted_uid)
    retweeted_uid_list.extend(retweeted_retweeted_uid)
    retweeted_uid_list = list(set(retweeted_uid_list) - set([uid])) # filter uids
    if retweeted_uid_list:
        user_portrait_result = es_user_portrait.mget(index=user_portrait, doc_type=portrait_index_type, body={"ids": retweeted_uid_list}, fields=["domain", "topic_string", "activity_geo_dict","importance", "influence"])["docs"]
        bci_index = "bci_" + date.replace('-', '')
        bci_results = es_cluster.mget(index=bci_index, doc_type="bci", body={"ids":retweeted_uid_list}, fields=['user_index'])["docs"]
        for item in user_portrait_result:
            if item["found"]:
                temp = []
                count += 1
                temp.append(item['_id'])
                temp.append(item["fields"]["importance"][0])
                in_portrait.append(temp)
                temp_domain = item["fields"]["domain"][0].split('&')
                temp_topic = item["fields"]["topic_string"][0].split('&')
                temp_geo = json.loads(item["fields"]["activity_geo_dict"][0])[-1].keys()
                #total_influence += item["fields"]["influence"][0]
                retweeted_domain = aggregation(temp_domain, retweeted_domain)
                retweeted_topic = aggregation(temp_topic, retweeted_topic)
                retweeted_geo = aggregation(temp_geo, retweeted_geo)
            else:
                out_portrait.append(item['_id'])
        retweeted_domain = proportion(retweeted_domain)
        retweeted_topic = proportion(retweeted_topic)
        retweeted_geo = proportion(retweeted_geo)


    if bci_results:
        total_influence = 0
        for item in bci_results:
            if item['found']:
                total_influence += item['fields']['user_index'][0]
    try:
        average_influence = total_influence / len(retweeted_uid_list)
    except ZeroDivisionError:
        average_influence = 0

    sorted_retweeted_domain = sorted(retweeted_domain.items(),key=lambda x:x[1], reverse=True)
    sorted_retweeted_topic = sorted(retweeted_topic.items(),key=lambda x:x[1], reverse=True)
    sorted_retweeted_geo = sorted(retweeted_geo.items(), key=lambda x:x[1], reverse=True)
    retweeted_results["domian"] = sorted_retweeted_domain[:5]
    retweeted_results["topic"] = sorted_retweeted_topic[:5]
    retweeted_results["geo"] = sorted_retweeted_geo[:5]
    retweeted_results["influence"] = average_influence
    in_portrait = sorted(in_portrait, key=lambda x:x[1], reverse=True)

    temp_list = []
    for item in in_portrait:
        temp_list.append(item[0])
    retweeted_results['in_portrait_number'] = len(temp_list)
    retweeted_results['out_portrait_number'] = len(out_portrait)
    in_portrait_url = get_user_url(temp_list[:default_number])
    out_portrait_url = get_user_url(out_portrait[:default_number])
    retweeted_results["in_portrait"] = in_portrait_url
    retweeted_results["out_portrait"] = out_portrait_url
    retweeted_results["total_number"] = len(temp_list) + len(out_portrait)
 

    return retweeted_results
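influenced_user_detail leans on two helpers, aggregation and proportion, that are not shown. A sketch of what they evidently do (tally labels, then turn tallies into ratios); the exact implementation is an assumption:

def aggregation(items, counter):
    # Tally each label into the running counter.
    for label in items:
        counter[label] = counter.get(label, 0) + 1
    return counter

def proportion(counter):
    # Convert raw tallies into ratios of the total.
    total = sum(counter.values())
    if not total:
        return counter
    return dict((k, v / float(total)) for k, v in counter.items())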
def influenced_people(uid, mid, influence_style, date, default_number=20):
    # uid: the influencer
    # mid: which weibo; for a retweeted weibo, look up its root_mid
    # influence_style: retweeted (0) or comment (1)
    date1 = ts2datetime(datetime2ts(date)).replace('-', '')
    index_name = pre_index + date1
    index_flow_text = pre_text_index + date
    text_result = es.get(index=index_flow_text, doc_type=flow_text_index_type, id=mid)["_source"]
    temp_mid = text_result.get("root_mid",'') #判断微博是否是原创微博
    if temp_mid:
        mid_type = 1 # 非原创微博
    else:
        mid_type = 0 # 原创微博
    query_body = {
        "query":{
            "filtered":{
                "filter":{
                    "bool":{
                        "must":[
                        ]
                    }
                }
            }
        },
        "size": 30000
    }
    if RUN_TYPE:
        query_body["sort"] = {"user_fansnum":{"order":"desc"}}

    if int(mid_type) == 0:
        if int(influence_style) == 0: # origin weibo, all retweeted people
            query_body["query"]["filtered"]["filter"]["bool"]["must"].extend([{"term": {"root_uid": uid}}, {"term": {"message_type": 3}}, {"term": {"root_mid": mid}}])
        else: # commented people
            query_body["query"]["filtered"]["filter"]["bool"]["must"].extend([{"term": {"directed_uid": uid}}, {"term": {"message_type": 2}}, {"term": {"root_mid": mid}}])
    else:
        if int(influence_style) == 0: # origin weibo, all retweeted people
            query_body["query"]["filtered"]["filter"]["bool"]["must"].extend([{"term": {"directed_uid": uid}}, {"term": {"message_type": 3}}, {"term": {"root_mid": temp_mid}}])
        else: # commented people
            query_body["query"]["filtered"]["filter"]["bool"]["must"].extend([{"term": {"directed_uid": uid}}, {"term": {"message_type": 2}}, {"term": {"root_mid": temp_mid}}])
    search_results = es.search(index=index_flow_text, doc_type=flow_text_index_type, body=query_body, _source=False, fields=["uid"], timeout=30)["hits"]["hits"]
    results = []  # influenced uid list, excluding the user himself
    if search_results:
        for item in search_results:
            if int(item["fields"]["uid"][0]) != int(uid):
                results.append(item["fields"]["uid"][0])
        results = list(set(results))

    bci_index = "bci_" + date.replace('-','')

    if results:
        portrait_results = es_user_portrait.mget(index=user_portrait, doc_type=portrait_index_type, body={"ids": results}, fields=["domain", "topic_string", "activity_geo_dict","importance", "influence"])["docs"]
        bci_results = es_cluster.mget(index=bci_index, doc_type='bci', body={"ids":results}, fields=['user_index'])['docs']
    else:
        portrait_results = {}
        bci_results = {}


    in_portrait = []
    out_portrait = []
    in_portrait_info = []
    retweeted_domain = {}
    retweeted_topic = {}
    retweeted_geo = {}
    average_influence = 0
    total_influence = 0
    count = 0

    if bci_results:
        total_influence = 0
        for item in bci_results:
            if item['found']:
                total_influence += item['fields']['user_index'][0]
    try:
        average_influence = total_influence / len(results)
    except ZeroDivisionError:
        average_influence = 0

    if portrait_results:
        for item in portrait_results:
            if item["found"]:
                temp = []
                count += 1
                temp.append(item['_id'])
                temp.append(item["fields"]["importance"][0])
                in_portrait.append(temp)
                temp_domain = item["fields"]["domain"][0].split('&')
                temp_topic = item["fields"]["topic_string"][0].split('&')
                temp_geo = json.loads(item["fields"]["activity_geo_dict"][0])[-1].keys()
                #total_influence += item["fields"]["influence"][0]
                retweeted_domain = aggregation(temp_domain, retweeted_domain)
                retweeted_topic = aggregation(temp_topic, retweeted_topic)
                retweeted_geo = aggregation(temp_geo, retweeted_geo)
            else:
                out_portrait.append(item['_id'])
        retweeted_domain = proportion(retweeted_domain)
        retweeted_topic = proportion(retweeted_topic)
        retweeted_geo = proportion(retweeted_geo)
        #try:
        #    average_influence = total_influence/count
        #except:
        #    average_influence = 0
    sorted_retweeted_domain = sorted(retweeted_domain.items(),key=lambda x:x[1], reverse=True)
    sorted_retweeted_topic = sorted(retweeted_topic.items(),key=lambda x:x[1], reverse=True)
    sorted_retweeted_geo = sorted(retweeted_geo.items(), key=lambda x:x[1], reverse=True)

    retweeted_results = dict()
    retweeted_results["domian"] = sorted_retweeted_domain[:5]
    retweeted_results["topic"] = sorted_retweeted_topic[:5]
    retweeted_results["geo"] = sorted_retweeted_geo[:5]
    retweeted_results["influence"] = average_influence
    in_portrait = sorted(in_portrait, key=lambda x:x[1], reverse=True)


    temp_list = []
    for item in in_portrait:
        temp_list.append(item[0])
    retweeted_results['in_portrait_number'] = len(temp_list)
    retweeted_results['out_portrait_number'] = len(out_portrait)
    in_portrait_url = get_user_url(temp_list[:default_number])
    out_portrait_url = get_user_url(out_portrait[:default_number])

    return_results = dict()
    return_results["influence_users"] = [in_portrait_url, out_portrait_url]
    return_results["influence_distribution"] = retweeted_results

    return return_results
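A usage sketch (uid and mid are made up); influence_style 0 collects the retweeters of the given weibo, 1 collects its commenters:

detail = influenced_people('1234567890', '3456789012345678', 0, '2013-09-01')
in_portrait_urls, out_portrait_urls = detail["influence_users"]
distribution = detail["influence_distribution"]  # top domains sit under the 'domian' key, spelled as in the code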
Example #14
def full_text_search(keywords, uid, start_time, end_time, size):
    results = []
    uid_list = []
    user_profile_list = []
    query_body = {
        "query": {
                    "bool": {
                        "must": []
                    }
        },
        "size":size,
        "sort":{"timestamp":{"order": 'desc'}}
    }

    if RUN_TYPE:
        query_body["sort"] = {"user_fansnum":{"order": 'desc'}}

    if uid:
        query_body["query"]["bool"]["must"].append({"term":{"uid":uid}})

    if keywords:
        keywords_list = keywords.split(',')
        for word in keywords_list:
            query_body["query"]["bool"]["must"].append({'wildcard':{'text':{'wildcard':'*'+word+'*'}}})

    index_list = []
    exist_bool = es_flow_text.indices.exists(index="flow_text_"+end_time)
    if start_time:
        start_ts = datetime2ts(start_time)
        end_ts = datetime2ts(end_time)
        ts = end_ts
        while 1:
            index_name = "flow_text_"+ts2datetime(ts)
            exist_bool = es_flow_text.indices.exists(index=index_name)
            if exist_bool:
                index_list.append(index_name)
            if ts == start_ts:
                break
            else:
                ts -= 3600*24

    print index_list
    # no usable flow_text index in the requested window
    if not index_list:
        return [[], []]

    search_results = es_flow_text.search(index=index_list, doc_type="text", body=query_body)["hits"]["hits"]
    for item in search_results:
        uid_list.append(item['_source']['uid'])
    user_info = []
    if uid_list:
        history_max = get_history_max()
        personal_field = ["nick_name", "fansnum", "statusnum","user_location"]
        user_info = get_user_profile(uid_list, personal_field)
        bci_results = ES_CLUSTER_FLOW1.mget(index="bci_history", doc_type="bci", body={"ids":uid_list}, _source=False, fields=["bci_day_last"])["docs"]
        in_portrait = es_user_portrait.mget(index="sensitive_user_portrait", doc_type="user", body={"ids":uid_list}, _source=False)["docs"]
        sensitive_results = es_sensitive_history.mget(index="sensitive_history", doc_type="sensitive", body={"ids":uid_list}, _source=False, fields=["last_value"])["docs"]
    print "len search: ", len(search_results)

    count = 0
    # uid uname text date geo sensitive_words retweeted comment
    for item in search_results:
        item = item['_source']
        iter_item = []
        iter_item.append(item['uid'])
        iter_item.append(user_info[count][1])
        iter_item.append(item['text'])
        iter_item.append(ts2date(item['timestamp']))
        iter_item.append(item['geo'])
        if item.get("sensitive_words_string", ''):
            iter_item.append(item['sensitive_words_string'].split('&'))
        else:
            iter_item.append([])
        iter_item.append(item.get('retweeted', 0))
        iter_item.append(item.get('comment', 0))
        count += 1
        results.append(iter_item)

    user_set = set()
    # uid, "nick_name", "fansnum", "statusnum", "user_location", bci, sensitive
    # bci_results/sensitive_results/in_portrait are parallel to user_info, so
    # advance the index even when a duplicate uid is skipped.
    for count, item in enumerate(user_info):
        if item[0] in user_set:
            continue
        else:
            user_set.add(item[0])
        if bci_results[count]["found"]:
            if bci_results[count].has_key("fields"):
                bci_value = bci_results[count]["fields"]["bci_day_last"][0]
            else:
                bci_value = 0
            item.append(normalize_index(bci_value, history_max["max_bci"]))
        else:
            item.append(0)
        if sensitive_results[count]["found"]:
            if sensitive_results[count].has_key("fields"):
                sensitive_value = sensitive_results[count]['fields']['last_value'][0]
            else:
                sensitive_value = 0
            item.append(normalize_index(sensitive_value, history_max["max_sensitive"]))
        else:
            item.append(0)
        if in_portrait[count]["found"]:
            item.append("1")
        else:
            item.append("0")
        user_profile_list.append(item)

    return results, user_profile_list
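A usage sketch (keywords and dates are made up); the two return values are parallel row lists whose layouts match the inline comments above:

# weibo rows: uid, uname, text, date, geo, sensitive_words, retweeted, comment
# user rows:  uid, nick_name, fansnum, statusnum, user_location, bci, sensitive, in_portrait
weibo_rows, user_rows = full_text_search('keyword1,keyword2', '', '2013-09-01', '2013-09-03', 100)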
Example #15
def bci_detail(date, uid, sensitive=0):
    if not sensitive:
        bci_index = "bci_" + date.replace("-", "")
        try:
            bci_result = es_bci.get(index=bci_index, doc_type="bci", id=uid)["_source"]
        except Exception:  # no BCI record for this uid on this date
            bci_result = dict()

        try:
            origin_retweeted = json.loads(bci_result.get("origin_weibo_retweeted_detail", "{}"))
        except Exception:
            origin_retweeted = {}
        origin_weibo_retweeted_brust_average = bci_result.get("origin_weibo_retweeted_brust_average", 0)  # burst average
        try:
            origin_comment = json.loads(bci_result.get("origin_weibo_comment_detail", "{}"))
        except Exception:
            origin_comment = {}
        origin_weibo_comment_brust_average = bci_result.get("origin_weibo_comment_brust_average", 0)
        try:
            retweeted_retweeted = json.loads(bci_result.get("retweeted_weibo_retweeted_detail", "{}"))
        except Exception:
            retweeted_retweeted = {}
        retweeted_weibo_retweeted_brust_average = bci_result.get("retweeted_weibo_retweeted_brust_average", 0)
        try:
            retweeted_comment = json.loads(bci_result.get("retweeted_weibo_comment_detail", "{}"))
        except Exception:
            retweeted_comment = {}
        retweeted_weibo_comment_brust_average = bci_result.get("retweeted_weibo_comment_brust_average", 0)

    origin_query = query_body(1, uid)
    text_index = "flow_text_" + date
    if sensitive:
        # append() mutates in place and returns None, so filter the query body
        # itself instead of searching with append()'s return value
        origin_query["query"]["filtered"]["filter"]["bool"]["must"].append({"range": {"sensitive": {"gt": 0}}})
    origin_text = es_text.search(index=text_index, doc_type="text", body=origin_query)["hits"]["hits"]
    # print origin_text
    retweeted_query = query_body(3, uid)
    if sensitive:
        retweeted_query["query"]["filtered"]["filter"]["bool"]["must"].append({"range": {"sensitive": {"gt": 0}}})
    retweeted_text = es_text.search(index=text_index, doc_type="text", body=retweeted_query)["hits"]["hits"]

    origin_weibo_number = len(origin_text)  # number of original weibos
    retweeted_weibo_number = len(retweeted_text)  # number of retweeted weibos

    retweet_total_number = 0  # total retweets received
    comment_total_number = 0  # total comments received
    origin_retweet_total_number = 0  # total retweets of original weibos
    origin_comment_total_number = 0  # total comments on original weibos
    retweet_retweet_total_number = 0  # total retweets of retweeted weibos
    retweet_comment_total_number = 0  # total comments on retweeted weibos
    origin_retweet_average_number = 0  # average retweets per original weibo
    origin_comment_average_number = 0  # average comments per original weibo
    retweet_retweet_average_number = 0  # average retweets per retweeted weibo
    retweet_comment_average_number = 0  # average comments per retweeted weibo
    origin_retweet_top_number = 0  # max retweets of an original weibo
    origin_comment_top_number = 0  # max comments on an original weibo
    retweet_retweet_top_number = 0  # max retweets of a retweeted weibo
    retweet_comment_top_number = 0  # max comments on a retweeted weibo
    origin_sensitive_words_dict = dict()
    retweeted_sensitive_words_dict = dict()
    for item in origin_text:
        retweet_total_number += item["_source"].get("retweeted", 0)
        comment_total_number += item["_source"].get("comment", 0)
        origin_retweet_total_number += item["_source"].get("retweeted", 0)
        origin_comment_total_number += item["_source"].get("comment", 0)
        if origin_retweet_top_number < item["_source"].get("retweeted", 0):
            origin_retweet_top_number = item["_source"].get("retweeted", 0)
        if origin_comment_top_number < item["_source"].get("comment", 0):
            origin_comment_top_number = item["_source"].get("comment", 0)
        if sensitive:
            sensitive_words_dict = json.loads(item["_source"]["sensitive_words_dict"])
            if sensitive_words_dict:
                for k, v in sensitive_words_dict.iteritems():
                    try:
                        origin_sensitive_words_dict[k] += v
                    except:
                        origin_sensitive_words_dict[k] = v
    for item in retweeted_text:
        retweet_total_number += item["_source"].get("retweeted", 0)
        comment_total_number += item["_source"].get("comment", 0)
        retweet_retweet_total_number += item["_source"].get("retweeted", 0)
        retweet_comment_total_number += item["_source"].get("comment", 0)
        if retweet_retweet_top_number < item["_source"].get("retweeted", 0):
            retweet_retweet_top_number = item["_source"].get("retweeted", 0)
        if retweet_comment_top_number < item["_source"].get("comment", 0):
            retweet_comment_top_number = item["_source"].get("comment", 0)
        if sensitive:
            sensitive_words_dict = json.loads(item["_source"]["sensitive_words_dict"])
            if sensitive_words_dict:
                for k, v in sensitive_words_dict.iteritems():
                    try:
                        retweeted_sensitive_words_dict[k] += v
                    except:
                        retweeted_sensitive_words_dict[k] = v
    try:
        average_retweet_number = retweet_total_number / (origin_weibo_number + retweeted_weibo_number)  # average retweets per weibo
    except ZeroDivisionError:
        average_retweet_number = 0
    try:
        average_comment_number = comment_total_number / (origin_weibo_number + retweeted_weibo_number)  # average comments per weibo
    except ZeroDivisionError:
        average_comment_number = 0

    try:
        origin_retweet_average_number = origin_retweet_total_number / origin_weibo_number
    except ZeroDivisionError:
        origin_retweet_average_number = 0
    try:
        origin_comment_average_number = origin_comment_total_number / origin_weibo_number
    except ZeroDivisionError:
        origin_comment_average_number = 0
    try:
        retweet_retweet_average_number = retweet_retweet_total_number / retweeted_weibo_number
    except ZeroDivisionError:
        retweet_retweet_average_number = 0
    try:
        retweet_comment_average_number = retweet_comment_total_number / retweeted_weibo_number
    except ZeroDivisionError:
        retweet_comment_average_number = 0

    result = dict()
    result["origin_weibo_number"] = origin_weibo_number
    result["retweeted_weibo_number"] = retweeted_weibo_number
    result["origin_weibo_retweeted_total_number"] = origin_retweet_total_number
    result["origin_weibo_comment_total_number"] = origin_comment_total_number
    result["retweeted_weibo_retweeted_total_number"] = retweet_retweet_total_number
    result["retweeted_weibo_comment_total_number"] = retweet_comment_total_number
    result["origin_weibo_retweeted_average_number"] = origin_retweet_average_number
    result["origin_weibo_comment_average_number"] = origin_comment_average_number
    result["retweeted_weibo_retweeted_average_number"] = retweet_retweet_average_number
    result["retweeted_weibo_comment_average_number"] = retweet_comment_average_number
    result["origin_weibo_retweeted_top_number"] = origin_retweet_top_number
    result["origin_weibo_comment_top_number"] = origin_comment_top_number
    result["retweeted_weibo_retweeted_top_number"] = retweet_retweet_top_number
    result["retweeted_weibo_comment_top_number"] = retweet_comment_top_number
    if not sensitive:
        result["origin_weibo_comment_brust_average"] = origin_weibo_comment_brust_average
        result["origin_weibo_retweeted_brust_average"] = origin_weibo_retweeted_brust_average
        result["retweeted_weibo_comment_brust_average"] = retweeted_weibo_comment_brust_average
        result["retweeted_weibo_retweeted_brust_average"] = retweeted_weibo_retweeted_brust_average
        result["user_index"] = bci_result.get("user_index", 0)
    else:
        result["retweeted_sensitive_words_list"] = sorted(
            retweeted_sensitive_words_dict.items(), key=lambda x: x[1], reverse=True
        )
        result["origin_sensitive_words_list"] = sorted(
            origin_sensitive_words_dict.items(), key=lambda x: x[1], reverse=True
        )
        result["retweeted_sensitive_words_number"] = len(retweeted_sensitive_words_dict)
        result["origin_sensitive_words_number"] = len(origin_sensitive_words_dict)

    return result
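bci_detail builds its searches through a query_body(message_type, uid) helper that is not shown. Since the sensitive branch appends filters into query.filtered.filter.bool.must, the helper plausibly looks like this (a sketch under that assumption; the size value is a guess):

def query_body(message_type, uid):
    # Assumed shape: the path query.filtered.filter.bool.must must already
    # exist, because the sensitive branch above appends into it.
    return {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [
                            {"term": {"uid": uid}},
                            {"term": {"message_type": message_type}},
                        ]
                    }
                }
            }
        },
        "size": 1000,  # placeholder page size
    }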