Example #1
def get_group_list(task_name):
    results = []
    try:
        es_results = es.get(index=index_name, doc_type=index_type, id=task_name)['_source']
    except:
        return results
    #print 'es_result:', es_results['uid_list'], type(es_results['uid_list'])
    uid_list = es_results['uid_list']
    user_portrait_attribute = es.mget(index='user_portrait', doc_type='user', body={'ids':uid_list})['docs']
    evaluate_max = get_evaluate_max()
    for item in user_portrait_attribute:
        uid = item['_id']
        try:
            source = item['_source']
            uname = source['uname']
            gender = source['gender']
            location = source['location']
            importance = source['importance']
            normal_importance = math.log(importance / evaluate_max['importance'] * 9 + 1, 10) * 100
            influence = source['influence']
            normal_influence = math.log(influence / evaluate_max['influence'] * 9 + 1, 10) * 100
            results.append([uid, uname, gender, location, normal_importance, normal_influence])
        except:
            results.append([uid])
    return results
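All of the examples on this page rely on module-level objects that the snippets themselves do not define (the Elasticsearch clients, index names, Redis handles, and helpers such as get_evaluate_max). A minimal sketch of that setup follows; the connection addresses, index names, and maxima are assumptions, not the project's real configuration.

import math
import json

from elasticsearch import Elasticsearch

# Hypothetical connection settings; the real project loads these from its config.
es = Elasticsearch(['127.0.0.1:9200'])               # portrait / group-task cluster
es_user_profile = Elasticsearch(['127.0.0.1:9200'])  # weibo_user profile cluster

index_name = 'group_result'   # assumed group-task index
index_type = 'group'          # assumed group-task doc_type

def get_evaluate_max():
    # Stand-in for the project's helper: it normally reads the current maxima
    # of the evaluation metrics from ES; fixed values are used here.
    return {'importance': 1000.0, 'influence': 1000.0}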
Example #2
def get_group_list(task_name):
    results = []
    try:
        es_results = es.get(index=index_name,
                            doc_type=index_type,
                            id=task_name)['_source']
    except:
        return results
    #print 'es_result:', es_results['uid_list'], type(es_results['uid_list'])
    uid_list = es_results['uid_list']
    user_portrait_attribute = es.mget(index='user_portrait',
                                      doc_type='user',
                                      body={'ids': uid_list})['docs']
    evaluate_max = get_evaluate_max()
    for item in user_portrait_attribute:
        uid = item['_id']
        try:
            source = item['_source']
            uname = source['uname']
            gender = source['gender']
            location = source['location']
            importance = source['importance']
            normal_importance = math.log(
                importance / evaluate_max['importance'] * 9 + 1, 10) * 100
            influence = source['influence']
            normal_influence = math.log(
                influence / evaluate_max['influence'] * 9 + 1, 10) * 100
            results.append([
                uid, uname, gender, location, normal_importance,
                normal_influence
            ])
        except:
            results.append([uid])
    return results
Example #3
def test_influence_rank(domain, date, order):
    uid_list = domain_dict[domain]
    order = str(order)
    query_body = {
        "query":{
            "match_all": {}
        },
        "sort": {search_order[order]: {"order": "desc"}},
        "size": 100
    }
    search_result = es.search(index=date, doc_type="bci", body=query_body)['hits']['hits']
    portrait_result = es.mget(index='sensitive_user_portrait', doc_type='user',  body={"ids": uid_list})['docs']
    results = []
    for i in range(len(search_result)):
        detail = []
        try:
            detail.append(search_result[i]['_source'][search_order[order]])
        except:
            print uid_list[i]
            detail.append(0)
        try:
            temp = portrait_result[i]['_source']
        except:
            continue
        detail.extend([temp['uid'], temp['uname'], temp['photo_url'], temp['activeness'], temp['importance'], temp['sensitive']])
        results.append(detail)
    sorted_list = sorted(results, key=lambda x:x[0], reverse=True)
    return sorted_list
Example #4
def search_domain(domain, date, order, number=100):
    # top-influence users in the domain
    count = 0
    count_n = 0
    result_list = []
    date = str(date).replace('-', '')
    order = str(order)

    query_body = {
        "query":{
            "match_all": {}
        },
        "sort": {search_order[order]: {"order": "desc"}},
        "size": 1+count
    }

    while 1:
        search_results = es.search(index=date, doc_type='bci', body=query_body)['hits']['hits']
        uid_list = []
        count += 1
        for item in search_results:
            uid_list.append(item['_id'])
        portrait_results = es.mget(index='sensitive_user_portrait', doc_type='user', body={"ids":uid_list})['docs']
        for item in portrait_results:
            print item
            domain_list = (item['_source']['topic_string']).split('&')  # attention
            if domain in set(domain_list):
                result_list.append([item['_id'], item['_source']['uname'], item['_source']['photo_url'], item['_source']['influence'], item['_source']['sensitive'], item['_source']['importance'], item['_source']['activeness']])
                count_n += 1
        if count_n >= int(number):
            break

    return result_list
Example #5
def identify_uid_list_in(uid_list):
    in_set = set()
    search_result = es.mget(index='sensitive_user_portrait', doc_type="user", body={'ids':uid_list})['docs']
    for item in search_result:
        if item['found']:
            in_set.add(item['_id'])

    return in_set
Example #6
def identify_uid_list_in(uid_list):
    in_set = set()
    search_result = es.mget(index="sensitive_user_portrait", doc_type="user", body={"ids": uid_list})["docs"]
    for item in search_result:
        if item["found"]:
            in_set.add(item["_id"])

    return in_set
Example #7
def identify_uid_list_in(uid_list):
    in_set = set()
    search_result = es.mget(index='sensitive_user_portrait',
                            doc_type="user",
                            body={'ids': uid_list})['docs']
    for item in search_result:
        if item['found']:
            in_set.add(item['_id'])

    return in_set
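Examples #5-#7 are the same helper: es.mget returns a stub document with found set to False for ids that do not exist, which is what lets the function filter a candidate list down to users already in the portrait index. A small usage sketch (the uids are made up):

candidate_uids = ['1111111111', '2222222222', '3333333333']   # hypothetical ids
in_portrait = identify_uid_list_in(candidate_uids)
out_of_portrait = [uid for uid in candidate_uids if uid not in in_portrait]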
Example #8
def get_user_info(uid_list):
    results = {}
    search_results = es.mget(index='sensitive_user_portrait', doc_type='user', body={'ids':uid_list})['docs']
    fields = ['uid', 'uname', 'domain', 'sensitive', 'importance', 'influence', 'activeness']
    for item in search_results:
        detail = []
        for field in fields:
            detail.append(item['_source'][field])
        results[item['_id']] = detail

    return results
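get_user_info indexes item['_source'][field] directly, so a single uid that is missing from sensitive_user_portrait (or a document lacking one of the listed fields) raises a KeyError for the whole call. A hedged defensive variant of the same lookup; skipping missing documents and filling absent fields with None are assumptions, not the original author's choice.

def get_user_info_safe(uid_list):
    results = {}
    docs = es.mget(index='sensitive_user_portrait', doc_type='user',
                   body={'ids': uid_list})['docs']
    fields = ['uid', 'uname', 'domain', 'sensitive', 'importance',
              'influence', 'activeness']
    for item in docs:
        if not item.get('found'):
            continue                      # skip uids that are not in the portrait index
        source = item['_source']
        results[item['_id']] = [source.get(field) for field in fields]
    return results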
Example #9
def search_follower(uid, sensitive):
    results = dict()
    stat_results = dict()
    for db_num in R_DICT:
        r = R_DICT[db_num]
        if sensitive:
            br_uid_results = r.hgetall('sensitive_be_retweet_' + str(uid))
        else:
            br_uid_results = r.hgetall('be_retweet_' + str(uid))
        if br_uid_results:
            for br_uid in br_uid_results:
                if br_uid != uid:
                    try:
                        stat_results[br_uid] += br_uid_results[br_uid]
                    except:
                        stat_results[br_uid] = br_uid_results[br_uid]
    if not stat_results:
        return [None, 0]
    try:
        sort_stat_results = sorted(stat_results.items(),
                                   key=lambda x: x[1],
                                   reverse=True)[:20]
    except:
        return [None, 0]

    uid_list = [item[0] for item in sort_stat_results]
    es_profile_results = es_user_profile.mget(index='weibo_user',
                                              doc_type='user',
                                              body={'ids': uid_list})['docs']
    es_portrait_results = es.mget(index='sensitive_user_portrait',
                                  doc_type='user',
                                  body={'ids': uid_list})['docs']

    result_list = []
    for i in range(len(es_profile_results)):
        item = es_profile_results[i]
        uid = item['_id']
        try:
            source = item['_source']
            uname = source['nick_name']
        except:
            uname = u'unknown'

        portrait_item = es_portrait_results[i]
        try:
            source = portrait_item['_source']
            in_status = 1
        except:
            in_status = 0
        result_list.append([uid, [uname, stat_results[uid], in_status]])
    return [result_list[:20], len(stat_results)]
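One caveat for search_follower (and the search_retweet variants below): redis-py's hgetall returns hash values as strings, so if the retweet counters are stored that way, stat_results[br_uid] += br_uid_results[br_uid] concatenates text instead of adding numbers, and the later sort by value is not numeric. A hedged helper that casts before accumulating; whether the stored values really are strings depends on how the hashes are written.

def accumulate_counts(hash_values, stat_results, exclude_uid):
    # Add the counters from one hgetall result into stat_results,
    # casting the string values that redis-py returns to integers.
    for follower_uid, raw_count in hash_values.items():
        if follower_uid == exclude_uid:
            continue
        stat_results[follower_uid] = stat_results.get(follower_uid, 0) + int(raw_count)
    return stat_results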
Example #10
def search_retweet(uid, sensitive):
    stat_results = dict()
    results = dict()
    for db_num in R_DICT:
        r = R_DICT[db_num]
        if not sensitive:
            ruid_results = r.hgetall('retweet_' + str(uid))
        else:
            ruid_results = r.hgetall('sensitive_retweet_' +
                                     str(uid))  # because of sensitive weibo
        if ruid_results:
            for ruid in ruid_results:
                if ruid != uid:
                    if stat_results.has_key(ruid):
                        stat_results[ruid] += ruid_results[ruid]
                    else:
                        stat_results[ruid] = ruid_results[ruid]

    if stat_results:
        sort_stat_results = sorted(stat_results.items(),
                                   key=lambda x: x[1],
                                   reverse=True)[:20]
    else:
        return [None, 0]
    uid_list = [item[0] for item in sort_stat_results]
    es_profile_results = es_user_profile.mget(index='weibo_user',
                                              doc_type='user',
                                              body={'ids': uid_list})['docs']
    es_portrait_results = es.mget(index='sensitive_user_portrait',
                                  doc_type='user',
                                  body={'ids': uid_list})['docs']
    result_list = []
    for i in range(len(es_profile_results)):
        item = es_profile_results[i]
        uid = item['_id']
        if item['found']:
            uname = item['_source']['nick_name']
        else:
            uname = u'unknown'

        portrait_item = es_portrait_results[i]
        if portrait_item['found']:
            in_status = 1
        else:
            in_status = 0

        result_list.append([uid, [uname, stat_results[uid], in_status]])

    return [result_list[:20], len(stat_results)]
Example #11
def search_domain(domain, date, order, number=100):
    # top-influence users in the domain
    count = 0
    count_n = 0
    result_list = []
    date = str(date).replace('-', '')
    order = str(order)

    query_body = {
        "query": {
            "match_all": {}
        },
        "sort": {
            search_order[order]: {
                "order": "desc"
            }
        },
        "size": 1 + count
    }

    while 1:
        search_results = es.search(index=date, doc_type='bci',
                                   body=query_body)['hits']['hits']
        uid_list = []
        count += 1
        for item in search_results:
            uid_list.append(item['_id'])
        portrait_results = es.mget(index='sensitive_user_portrait',
                                   doc_type='user',
                                   body={"ids": uid_list})['docs']
        for item in portrait_results:
            print item
            domain_list = (item['_source']['topic_string']).split(
                '&')  # attention
            if domain in set(domain_list):
                result_list.append([
                    item['_id'], item['_source']['uname'],
                    item['_source']['photo_url'], item['_source']['influence'],
                    item['_source']['sensitive'],
                    item['_source']['importance'],
                    item['_source']['activeness']
                ])
                count_n += 1
        if count_n >= int(number):
            break

    return result_list
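In both copies of search_domain on this page, query_body is built once with "size": 1 + count while count is still 0, and the loop never enlarges that window, so every pass re-fetches the same single top hit and the loop can spin indefinitely when the domain is rare. A hedged rework that grows the window on each pass; the page size of 100 and the skip-on-missing-document behaviour are assumptions.

def search_domain_paged(domain, date, order, number=100, page_size=100):
    date = str(date).replace('-', '')
    order = str(order)
    query_body = {
        "query": {"match_all": {}},
        "sort": {search_order[order]: {"order": "desc"}},
        "size": page_size,
    }
    result_list = []
    seen = 0
    while len(result_list) < int(number):
        query_body["size"] = seen + page_size          # enlarge the window each pass
        hits = es.search(index=date, doc_type='bci', body=query_body)['hits']['hits']
        new_hits = hits[seen:]                         # only the part not scanned yet
        if not new_hits:
            break                                      # ran out of BCI documents
        uid_list = [item['_id'] for item in new_hits]
        portrait_docs = es.mget(index='sensitive_user_portrait', doc_type='user',
                                body={"ids": uid_list})['docs']
        for item in portrait_docs:
            if not item.get('found'):
                continue
            source = item['_source']
            if domain in source['topic_string'].split('&'):
                result_list.append([item['_id'], source['uname'], source['photo_url'],
                                    source['influence'], source['sensitive'],
                                    source['importance'], source['activeness']])
        seen += len(new_hits)
    return result_list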
Example #12
def search_follower(uid, sensitive):
    results = dict()
    stat_results = dict()
    if 1:
        r = r_cluster
        if sensitive:
            br_uid_results = r.hgetall('sensitive_be_retweet_'+str(uid))
        else:
            br_uid_results = r.hgetall('be_retweet_'+str(uid))
        if br_uid_results:
            for br_uid in br_uid_results:
                if br_uid != uid:
                    try:
                        stat_results[br_uid] += br_uid_results[br_uid]
                    except:
                        stat_results[br_uid] = br_uid_results[br_uid]
    if not stat_results:
        return [None, 0]
    try:
        sort_stat_results = sorted(stat_results.items(), key=lambda x:x[1], reverse=True)[:20]
    except:
        return [None, 0]

    uid_list = [item[0] for item in sort_stat_results]
    es_profile_results = es_user_profile.mget(index='weibo_user', doc_type='user', body={'ids':uid_list})['docs']
    es_portrait_results = es.mget(index='sensitive_user_portrait', doc_type='user', body={'ids':uid_list})['docs']

    result_list = []
    for i in range(len(es_profile_results)):
        item = es_profile_results[i]
        uid = item['_id']
        try:
            source = item['_source']
            uname = source['nick_name']
        except:
            uname = u'unknown'

        portrait_item = es_portrait_results[i]
        try:
            source = portrait_item['_source']
            in_status = 1
        except:
            in_status = 0
        result_list.append([uid,[uname, stat_results[uid], in_status]])
    return [result_list[:20], len(stat_results)]
Example #13
def search_follower(uid, sensitive):
    results = dict()
    stat_results = dict()
    for db_num in R_DICT:
        r = R_DICT[db_num]
        if sensitive:
            br_uid_results = r.hgetall("sensitive_be_retweet_" + str(uid))
        else:
            br_uid_results = r.hgetall("be_retweet_" + str(uid))
        if br_uid_results:
            for br_uid in br_uid_results:
                if br_uid != uid:
                    try:
                        stat_results[br_uid] += br_uid_results[br_uid]
                    except:
                        stat_results[br_uid] = br_uid_results[br_uid]
    if not stat_results:
        return [None, 0]
    try:
        sort_stat_results = sorted(stat_results.items(), key=lambda x: x[1], reverse=True)[:20]
    except:
        return [None, 0]

    uid_list = [item[0] for item in sort_stat_results]
    es_profile_results = es_user_profile.mget(index="weibo_user", doc_type="user", body={"ids": uid_list})["docs"]
    es_portrait_results = es.mget(index="sensitive_user_portrait", doc_type="user", body={"ids": uid_list})["docs"]

    result_list = []
    for i in range(len(es_profile_results)):
        item = es_profile_results[i]
        uid = item["_id"]
        try:
            source = item["_source"]
            uname = source["nick_name"]
        except:
            uname = u"unknown"

        portrait_item = es_portrait_results[i]
        try:
            source = portrait_item["_source"]
            in_status = 1
        except:
            in_status = 0
        result_list.append([uid, [uname, stat_results[uid], in_status]])
    return [result_list[:20], len(stat_results)]
Example #14
def get_group_tag(group_name):
    result = {}
    order_result = []
    # get the group task's uid list
    # get each user's tags
    # tally tag statistics
    try:
        group_task_result = es.get(index=group_index_name, doc_type=group_index_type, id=group_name)
    except:
        return 'no group task'
    try:
        uid_list = group_task_result['_source']['uid_list']
    except:
        return 'no user'
    try:
        user_result = es.mget(index=user_index_name, doc_type=user_index_type, body={'ids': uid_list})['docs']
    except Exception, e:
        raise e
Example #15
def get_user_tag(uid_list):
    result = {}
    user_result = es.mget(index=user_index_name, doc_type=user_index_type, body={'ids':uid_list})['docs']
    print 'user_result:', user_result
    for user_item in user_result:
        uid = user_item['_id']
        result[uid] = []
        try:
            source = user_item['_source']
        except:
            source = {}
        for key in source:
            if key not in identify_attribute_list:
                value = source[key]
                tag_string = key+':'+value
                result[uid].append(tag_string)

    return result
Example #16
def search_retweet(uid, sensitive):
    stat_results = dict()
    results = dict()
    for db_num in R_DICT:
        r = R_DICT[db_num]
        if not sensitive:
            ruid_results = r.hgetall("retweet_" + str(uid))
        else:
            ruid_results = r.hgetall("sensitive_retweet_" + str(uid))  # because of sensitive weibo
        if ruid_results:
            for ruid in ruid_results:
                if ruid != uid:
                    if stat_results.has_key(ruid):
                        stat_results[ruid] += ruid_results[ruid]
                    else:
                        stat_results[ruid] = ruid_results[ruid]

    if stat_results:
        sort_stat_results = sorted(stat_results.items(), key=lambda x: x[1], reverse=True)[:20]
    else:
        return [None, 0]
    uid_list = [item[0] for item in sort_stat_results]
    es_profile_results = es_user_profile.mget(index="weibo_user", doc_type="user", body={"ids": uid_list})["docs"]
    es_portrait_results = es.mget(index="sensitive_user_portrait", doc_type="user", body={"ids": uid_list})["docs"]
    result_list = []
    for i in range(len(es_profile_results)):
        item = es_profile_results[i]
        uid = item["_id"]
        if item["found"]:
            uname = item["_source"]["nick_name"]
        else:
            uname = u"unknown"

        portrait_item = es_portrait_results[i]
        if portrait_item["found"]:
            in_status = 1
        else:
            in_status = 0

        result_list.append([uid, [uname, stat_results[uid], in_status]])

    return [result_list[:20], len(stat_results)]
Example #17
def search_retweet(uid, sensitive):
    stat_results = dict()
    results = dict()
    if 1:
        r = r_cluster
        if not sensitive:
            ruid_results = r.hgetall('retweet_'+str(uid))
        else:
            ruid_results = r.hgetall('sensitive_retweet_'+str(uid)) # because of sensitive weibo
        if ruid_results:
            for ruid in ruid_results:
                if ruid != uid:
                    if stat_results.has_key(ruid):
                        stat_results[ruid] += ruid_results[ruid]
                    else:
                        stat_results[ruid] = ruid_results[ruid]

    if stat_results:
        sort_stat_results = sorted(stat_results.items(), key=lambda x:x[1], reverse=True)[:20]
    else:
        return [None, 0]
    uid_list = [item[0] for item in sort_stat_results]
    es_profile_results = es_user_profile.mget(index='weibo_user', doc_type='user', body={'ids':uid_list})['docs']
    es_portrait_results = es.mget(index='sensitive_user_portrait', doc_type='user', body={'ids':uid_list})['docs']
    result_list = []
    for i in range(len(es_profile_results)):
        item = es_profile_results[i]
        uid = item['_id']
        if item['found']:
            uname = item['_source']['nick_name']
        else:
            uname = u'unknown'

        portrait_item = es_portrait_results[i]
        if portrait_item['found']:
            in_status = 1
        else:
            in_status = 0

        result_list.append([uid,[uname, stat_results[uid], in_status]])

    return [result_list[:20], len(stat_results)]
Example #18
def get_user_tag(uid_list):
    result = {}
    user_result = es.mget(index=user_index_name,
                          doc_type=user_index_type,
                          body={'ids': uid_list})['docs']
    print 'user_result:', user_result
    for user_item in user_result:
        uid = user_item['_id']
        result[uid] = []
        try:
            source = user_item['_source']
        except:
            source = {}
        for key in source:
            if key not in identify_attribute_list:
                value = source[key]
                tag_string = key + ':' + value
                result[uid].append(tag_string)

    return result
Example #19
def get_group_tag(group_name):
    result = {}
    order_result = []
    # get the group task's uid list
    # get each user's tags
    # tally tag statistics
    try:
        group_task_result = es.get(index=group_index_name,
                                   doc_type=group_index_type,
                                   id=group_name)
    except:
        return 'no group task'
    try:
        uid_list = group_task_result['_source']['uid_list']
    except:
        return 'no user'
    try:
        user_result = es.mget(index=user_index_name,
                              doc_type=user_index_type,
                              body={'ids': uid_list})['docs']
    except Exception, e:
        raise e
Example #20
def test_influence_rank(domain, date, order):
    uid_list = domain_dict[domain]
    order = str(order)
    query_body = {
        "query": {
            "match_all": {}
        },
        "sort": {
            search_order[order]: {
                "order": "desc"
            }
        },
        "size": 100
    }
    search_result = es.search(index=date, doc_type="bci",
                              body=query_body)['hits']['hits']
    portrait_result = es.mget(index='sensitive_user_portrait',
                              doc_type='user',
                              body={"ids": uid_list})['docs']
    results = []
    for i in range(len(search_result)):
        detail = []
        try:
            detail.append(search_result[i]['_source'][search_order[order]])
        except:
            print uid_list[i]
            detail.append(0)
        try:
            temp = portrait_result[i]['_source']
        except:
            continue
        detail.extend([
            temp['uid'], temp['uname'], temp['photo_url'], temp['activeness'],
            temp['importance'], temp['sensitive']
        ])
        results.append(detail)
    sorted_list = sorted(results, key=lambda x: x[0], reverse=True)
    return sorted_list
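Both copies of test_influence_rank pair search_result[i] with portrait_result[i], but the two lists come from different queries (a sorted match_all over the BCI index versus an mget on the domain's uid list), so row i of one list is not guaranteed to describe the same user as row i of the other. A hedged variant that keys the BCI scores by uid before joining; it keeps the same inputs and output shape, but the join-by-uid behaviour is an assumption about the intent.

def test_influence_rank_by_uid(domain, date, order):
    uid_list = domain_dict[domain]
    order = str(order)
    query_body = {
        "query": {"match_all": {}},
        "sort": {search_order[order]: {"order": "desc"}},
        "size": 100,
    }
    bci_hits = es.search(index=date, doc_type="bci", body=query_body)['hits']['hits']
    # BCI documents use the uid as their _id, so index the scores by uid.
    score_by_uid = dict((hit['_id'], hit['_source'][search_order[order]])
                        for hit in bci_hits)
    portrait_docs = es.mget(index='sensitive_user_portrait', doc_type='user',
                            body={"ids": uid_list})['docs']
    results = []
    for doc in portrait_docs:
        if not doc.get('found'):
            continue
        temp = doc['_source']
        score = score_by_uid.get(doc['_id'], 0)
        results.append([score, temp['uid'], temp['uname'], temp['photo_url'],
                        temp['activeness'], temp['importance'], temp['sensitive']])
    return sorted(results, key=lambda x: x[0], reverse=True)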
Example #21
def get_group_results(task_name, module):
    result = []
    try:
        es_result = es.get(index=index_name, doc_type=index_type,
                           id=task_name)['_source']
        #print 'result:', result
    except:
        return None
    #basic module: gender, count, verified
    if module == 'overview':
        task_name = es_result['task_name']
        submit_date = es_result['submit_date']
        state = es_result['state']
        tightness = es_result['tightness']
        activeness = es_result['activeness']
        importance = es_result['importance']
        influence = es_result['influence']
        result = [
            task_name, submit_date, state, tightness, activeness, importance,
            influence
        ]
    if module == 'basic':
        gender_dict = json.loads(es_result['gender'])
        count = es_result['count']
        verified = es_result['verified']
        if verified:
            verified_dict = json.loads(verified)
        result = [gender_dict, count, verified]
    if module == 'activity':
        activity_geo_dict = json.loads(es_result['activity_geo'])
        sort_activity_geo = sorted(activity_geo_dict.items(),
                                   key=lambda x: x[1],
                                   reverse=True)
        activity_geo = sort_activity_geo[:50]
        activity_trend = json.loads(es_result['activity_trend'])
        online_pattern_dict = json.loads(es_result['online_pattern'])
        sort_online_pattern = sorted(online_pattern_dict.items(),
                                     key=lambda x: x[1],
                                     reverse=True)
        online_pattern = sort_online_pattern[:50]
        geo_track = json.loads(es_result['geo_track'])
        result = [activity_geo, activity_trend, online_pattern, geo_track]
    if module == 'social':
        #degree_his = json.loads(es_result['degree_his'])
        density = es_result['density']
        retweet_weibo_count = es_result['retweet_weibo_count']
        retweet_user_count = es_result['retweet_user_count']
        retweet_relation = json.loads(es_result['retweet_relation'])
        uid_list = []
        for relation in retweet_relation:
            uid_list.append(relation[0])
            uid_list.append(relation[1])
        es_portrait_result = es.mget(index='user_portrait',
                                     doc_type='user',
                                     body={'ids': uid_list})['docs']
        es_count = 0
        new_retweet_relation = []
        for relation in retweet_relation:
            source_uid = relation[0]
            source_item = es_portrait_result[es_count]
            try:
                source = source_item['_source']
                source_uname = source['uname']
            except:
                source_uname = ''
            target_uid = relation[1]
            es_count += 1
            target_item = es_portrait_result[es_count]
            try:
                source = target_item['_source']
                target_uname = source['uname']
            except:
                target_uname = ''

            count = relation[2]
            new_retweet_relation.append(
                [source_uid, source_uname, target_uid, target_uname, count])
        uid_list = []
        out_beretweet_relation = json.loads(
            es_result['out_beretweet_relation'])
        uid_list = []
        uid_list = [item[0] for item in out_beretweet_relation]
        es_portrait_result = es.mget(index='user_portrait',
                                     doc_type='user',
                                     body={'ids': uid_list})['docs']
        es_count = 0
        new_out_beretweet_relation = []
        for i in range(len(uid_list)):
            item = es_portrait_result[i]
            uid = item['_id']
            try:
                source = item['_source']
                uname = source['uname']
            except:
                uname = ''
            out_relation_item = out_beretweet_relation[i][1:]
            a = [uid, uname]
            a.extend(out_relation_item)
            #print 'add_item:', add_item
            new_out_beretweet_relation.append(a)
        result = [
            new_retweet_relation, density, retweet_weibo_count,
            retweet_user_count, new_out_beretweet_relation
        ]
    if module == 'think':
        domain_dict = json.loads(es_result['domain'])
        topic_dict = json.loads(es_result['topic'])
        psycho_status = json.loads(es_result['psycho_status'])
        psycho_feature = json.loads(es_result['psycho_feature'])
        result = [domain_dict, topic_dict, psycho_status, psycho_feature]
    if module == 'text':
        hashtag_dict = json.loads(es_result['hashtag'])
        sort_hashtag = sorted(hashtag_dict.items(),
                              key=lambda x: x[1],
                              reverse=True)
        hashtag = sort_hashtag[:50]
        emoticon_dict = json.loads(es_result['emoticon'])
        sort_emoticon = sorted(emoticon_dict.items(),
                               key=lambda x: x[1],
                               reverse=True)
        emoticon = sort_emoticon[:5]
        keyword_dict = json.loads(es_result['keywords'])
        sort_keyword = sorted(keyword_dict.items(),
                              key=lambda x: x[1],
                              reverse=True)
        keyword = sort_keyword[:50]
        result = [hashtag, keyword, emoticon]
    if module == 'influence':
        importance_dis = json.loads(es_result['importance_his'])
        activeness_his = json.loads(es_result['activeness_his'])
        influence_his = json.loads(es_result['influence_his'])
        user_influence_list = json.loads(es_result['user_influence_list'])
        user_influence_result = []
        for user_item in user_influence_list:
            uid = user_item[0]
            result_item = user_item[:5]
            for i in range(5, 9):
                item = user_item[i]
                mid = item[1]
                number = item[0]
                if mid != 0 and uid:
                    weibolink = weiboinfo2url(uid, mid)
                else:
                    weibolink = None
                result_item.append((number, mid, weibolink))
            user_influence_result.append(result_item)
        '''
        origin_max_retweeted_number =es_result['origin_max_retweeted_number']
        origin_max_retweeted_id = es_result['origin_max_retweeted_id']
        origin_max_retweeted_user = es_result['origin_max_retweeted_user']
        if origin_max_retweeted_id != 0 and origin_max_retweeted_user != 0:
            origin_max_retweeted_weibolink = weiboinfo2url(origin_max_retweeted_user, origin_max_retweeted_id)
        else:
            origin_max_retweeted_weibolink = None

        origin_max_comment_number = es_result['origin_max_comment_number']
        origin_max_comment_id = es_result['origin_max_comment_id']
        origin_max_comment_user = es_result['origin_max_comment_user']
        if origin_max_comment_id !=0 and origin_max_comment_user != 0:
            origin_max_comment_weibolink = weiboinfo2url(origin_max_comment_user, origin_max_comment_id)
        else:
            origin_max_comment_weibolink = None
        
        retweet_max_retweeted_number = es_result['retweet_max_retweeted_number']
        retweet_max_retweeted_id = es_result['retweet_max_retweeted_id']
        retweet_max_retweeted_user = es_result['retweet_max_retweeted_user']
        if retweet_max_retweeted_id != 0 and retweet_max_retweeted_user != 0:
            retweet_max_retweeted_weibolink = weiboinfo2url(retweet_max_retweeted_user, retweet_max_retweeted_id)
        else:
            retweet_max_retweeted_weibolink = None

        retweet_max_comment_number = es_result['retweet_max_comment_number']
        retweet_max_comment_id = es_result['retweet_max_comment_id']
        retweet_max_comment_user = es_result['retweet_max_comment_user']
        if retweet_max_comment_id != 0 and retweet_max_comment_user != 0:
            retweet_max_comment_weibolink = weiboinfo2url(retweet_max_comment_user, retweet_max_comment_id)
        else:
            retweet_max_comment_weibolink = None
        '''
        result = [
            importance_dis, activeness_his, influence_his,
            user_influence_result
        ]
    #print result
    return result
Example #22
def get_group_results(task_name, module):
    result = []
    try:
        es_result = es.get(index=index_name, doc_type=index_type, id=task_name)['_source']
        #print 'result:', result
    except:
        return None
    #basic module: gender, count, verified
    if module=='overview':
        task_name = es_result['task_name']
        submit_date = es_result['submit_date']
        state = es_result['state']
        tightness = es_result['tightness']
        activeness = es_result['activeness']
        importance = es_result['importance']
        influence = es_result['influence']
        result = [task_name, submit_date, state, tightness, activeness, importance, influence]
    if module=='basic':
        gender_dict = json.loads(es_result['gender'])
        count = es_result['count']
        verified = es_result['verified']
        if verified:
            verified_dict = json.loads(verified)
        result = [gender_dict, count, verified]
    if module=='activity':
        activity_geo_dict = json.loads(es_result['activity_geo'])
        sort_activity_geo = sorted(activity_geo_dict.items(), key=lambda x:x[1], reverse=True)
        activity_geo = sort_activity_geo[:50]
        activity_trend = json.loads(es_result['activity_trend'])
        online_pattern_dict = json.loads(es_result['online_pattern'])
        sort_online_pattern = sorted(online_pattern_dict.items(), key=lambda x:x[1], reverse=True)
        online_pattern = sort_online_pattern[:50]
        geo_track = json.loads(es_result['geo_track'])
        result = [activity_geo, activity_trend, online_pattern, geo_track]
    if module=='social':
        #degree_his = json.loads(es_result['degree_his'])
        density = es_result['density']
        retweet_weibo_count = es_result['retweet_weibo_count']
        retweet_user_count = es_result['retweet_user_count']
        retweet_relation = json.loads(es_result['retweet_relation'])
        uid_list = []
        for relation in retweet_relation:
            uid_list.append(relation[0])
            uid_list.append(relation[1])
        es_portrait_result = es.mget(index='user_portrait', doc_type='user', body={'ids':uid_list})['docs']
        es_count = 0
        new_retweet_relation = []
        for relation in retweet_relation:
            source_uid = relation[0]
            source_item = es_portrait_result[es_count]
            try:
                source = source_item['_source']
                source_uname = source['uname']
            except:
                source_uname = ''
            target_uid = relation[1]
            es_count += 1
            target_item = es_portrait_result[es_count]
            try:
                source = target_item['_source']
                target_uname = source['uname']
            except:
                target_uname = ''

            count = relation[2]
            new_retweet_relation.append([source_uid, source_uname, target_uid, target_uname, count])
        uid_list = []
        out_beretweet_relation = json.loads(es_result['out_beretweet_relation'])
        uid_list = []
        uid_list = [item[0] for item in out_beretweet_relation]
        es_portrait_result = es.mget(index='user_portrait', doc_type='user', body={'ids':uid_list})['docs']
        es_count = 0
        new_out_beretweet_relation = []
        for i in range(len(uid_list)):
            item = es_portrait_result[i]
            uid = item['_id']
            try:
                source = item['_source']
                uname = source['uname']
            except:
                uname = ''
            out_relation_item = out_beretweet_relation[i][1:]
            a = [uid, uname]
            a.extend(out_relation_item)
            #print 'add_item:', add_item
            new_out_beretweet_relation.append(a)
        result = [new_retweet_relation, density, retweet_weibo_count, retweet_user_count, new_out_beretweet_relation]
    if module=='think':
        domain_dict = json.loads(es_result['domain'])
        topic_dict = json.loads(es_result['topic'])
        psycho_status = json.loads(es_result['psycho_status'])
        psycho_feature = json.loads(es_result['psycho_feature'])
        result = [domain_dict, topic_dict, psycho_status, psycho_feature]
    if module=='text':
        hashtag_dict = json.loads(es_result['hashtag'])
        sort_hashtag = sorted(hashtag_dict.items(), key=lambda x:x[1], reverse=True)
        hashtag = sort_hashtag[:50]
        emoticon_dict = json.loads(es_result['emoticon'])
        sort_emoticon = sorted(emoticon_dict.items(), key=lambda x:x[1], reverse=True)
        emoticon = sort_emoticon[:5]
        keyword_dict = json.loads(es_result['keywords'])
        sort_keyword = sorted(keyword_dict.items(), key=lambda x:x[1], reverse=True)
        keyword = sort_keyword[:50]
        result = [hashtag, keyword, emoticon]
    if module=='influence':
        importance_dis = json.loads(es_result['importance_his'])
        activeness_his = json.loads(es_result['activeness_his'])
        influence_his = json.loads(es_result['influence_his'])
        user_influence_list = json.loads(es_result['user_influence_list'])
        user_influence_result = []
        for user_item in user_influence_list:
            uid = user_item[0]
            result_item = user_item[:5]
            for i in range(5,9):
                item = user_item[i]
                mid = item[1]
                number = item[0]
                if mid != 0 and uid:
                    weibolink = weiboinfo2url(uid, mid)
                else:
                    weibolink = None
                result_item.append((number, mid, weibolink))
            user_influence_result.append(result_item)
        '''
        origin_max_retweeted_number =es_result['origin_max_retweeted_number']
        origin_max_retweeted_id = es_result['origin_max_retweeted_id']
        origin_max_retweeted_user = es_result['origin_max_retweeted_user']
        if origin_max_retweeted_id != 0 and origin_max_retweeted_user != 0:
            origin_max_retweeted_weibolink = weiboinfo2url(origin_max_retweeted_user, origin_max_retweeted_id)
        else:
            origin_max_retweeted_weibolink = None

        origin_max_comment_number = es_result['origin_max_comment_number']
        origin_max_comment_id = es_result['origin_max_comment_id']
        origin_max_comment_user = es_result['origin_max_comment_user']
        if origin_max_comment_id !=0 and origin_max_comment_user != 0:
            origin_max_comment_weibolink = weiboinfo2url(origin_max_comment_user, origin_max_comment_id)
        else:
            origin_max_comment_weibolink = None
        
        retweet_max_retweeted_number = es_result['retweet_max_retweeted_number']
        retweet_max_retweeted_id = es_result['retweet_max_retweeted_id']
        retweet_max_retweeted_user = es_result['retweet_max_retweeted_user']
        if retweet_max_retweeted_id != 0 and retweet_max_retweeted_user != 0:
            retweet_max_retweeted_weibolink = weiboinfo2url(retweet_max_retweeted_user, retweet_max_retweeted_id)
        else:
            retweet_max_retweeted_weibolink = None

        retweet_max_comment_number = es_result['retweet_max_comment_number']
        retweet_max_comment_id = es_result['retweet_max_comment_id']
        retweet_max_comment_user = es_result['retweet_max_comment_user']
        if retweet_max_comment_id != 0 and retweet_max_comment_user != 0:
            retweet_max_comment_weibolink = weiboinfo2url(retweet_max_comment_user, retweet_max_comment_id)
        else:
            retweet_max_comment_weibolink = None
        '''
        result = [importance_dis, activeness_his, influence_his, user_influence_result]
    #print result
    return result
Example #23
def influenced_user_detail(uid, date, origin_retweeted_mid, retweeted_retweeted_mid, message_type, default_number=20):
    query_body = {
        "query":{
            "filtered":{
                "filter":{
                    "bool":{
                        "must": [
                        ]
                    }
                }
            }
        },
        "size":20000,
    }
    if RUN_TYPE == 1:
        query_body["sort"] = {"user_fansnum":{"order":"desc"}}

    # detailed list of the users who were influenced
    date1 = str(date).replace('-', '')
    index_name = pre_index + date1
    index_flow_text = pre_text_index + date
    origin_retweeted_uid = [] # influenced user uid_list
    retweeted_retweeted_uid = []
    origin_comment_uid = []
    retweeted_comment_uid = []
    query_origin = copy.deepcopy(query_body)
    query_retweeted = copy.deepcopy(query_body)
    if origin_retweeted_mid:  # all users who retweeted this original weibo
        query_origin["query"]["filtered"]["filter"]["bool"]["must"].append({"terms": {"root_mid": origin_retweeted_mid}})
        query_origin["query"]["filtered"]["filter"]["bool"]["must"].extend([{"term":{"message_type": message_type}}, {"term":{"root_uid": uid}}])
        origin_retweeted_result = es.search(index=index_flow_text, doc_type=flow_text_index_type, body=query_origin, fields=["uid"])["hits"]["hits"]
        if origin_retweeted_result:
            for item in origin_retweeted_result:
                origin_retweeted_uid.append(item["fields"]["uid"][0])
    if retweeted_retweeted_mid:  # all users who commented on this original weibo
        query_retweeted["query"]["filtered"]["filter"]["bool"]["must"].append({"terms": {"root_mid": retweeted_retweeted_mid}})
        query_retweeted["query"]["filtered"]["filter"]["bool"]["must"].extend([{"term":{"message_type": message_type}},{"term": {"directed_uid": uid}}])
        retweeted_retweeted_result = es.search(index=index_flow_text, doc_type=flow_text_index_type, body=query_retweeted, fields=["uid"])["hits"]["hits"]
        if retweeted_retweeted_result:
            for item in retweeted_retweeted_result:
                retweeted_retweeted_uid.append(item["fields"]["uid"][0])
    retweeted_uid_list = [] # all retweeted user list
    retweeted_results = {} # statistics of all retweeted uid information
    retweeted_domain = {}
    retweeted_topic = {}
    retweeted_geo = {}
    bci_results = {}
    in_portrait = []
    out_portrait = []
    average_influence = 0
    total_influence = 0
    count = 0
    all_uid_set = set(origin_retweeted_uid) | set(retweeted_retweeted_uid)

    retweeted_uid_list.extend(origin_retweeted_uid)
    retweeted_uid_list.extend(retweeted_retweeted_uid)
    retweeted_uid_list = list(set(retweeted_uid_list) - set([uid])) # filter uids
    if retweeted_uid_list:
        user_portrait_result = es_user_portrait.mget(index=user_portrait, doc_type=portrait_index_type, body={"ids": retweeted_uid_list}, fields=["domain", "topic_string", "activity_geo_dict","importance", "influence"])["docs"]
        bci_index = "bci_" + date.replace('-', '')
        bci_results = es_cluster.mget(index=bci_index, doc_type="bci", body={"ids":retweeted_uid_list}, fields=['user_index'])["docs"]
        for item in user_portrait_result:
            if item["found"]:
                temp = []
                count += 1
                temp.append(item['_id'])
                temp.append(item["fields"]["importance"][0])
                in_portrait.append(temp)
                temp_domain = item["fields"]["domain"][0].split('&')
                temp_topic = item["fields"]["topic_string"][0].split('&')
                temp_geo = json.loads(item["fields"]["activity_geo_dict"][0])[-1].keys()
                #total_influence += item["fields"]["influence"][0]
                retweeted_domain = aggregation(temp_domain, retweeted_domain)
                retweeted_topic = aggregation(temp_topic, retweeted_topic)
                retweeted_geo = aggregation(temp_geo, retweeted_geo)
            else:
                out_portrait.append(item['_id'])
        retweeted_domain = proportion(retweeted_domain)
        retweeted_topic = proportion(retweeted_topic)
        retweeted_geo = proportion(retweeted_geo)


    if bci_results:
        total_influence = 0
        for item in bci_results:
            if item['found']:
                total_influence += item['fields']['user_index'][0]
    try:
        average_influence = total_influence/len(retweeted_uid_list)
    except:
        average_influence = 0

    sorted_retweeted_domain = sorted(retweeted_domain.items(),key=lambda x:x[1], reverse=True)
    sorted_retweeted_topic = sorted(retweeted_topic.items(),key=lambda x:x[1], reverse=True)
    sorted_retweeted_geo = sorted(retweeted_geo.items(), key=lambda x:x[1], reverse=True)
    retweeted_results["domian"] = sorted_retweeted_domain[:5]
    retweeted_results["topic"] = sorted_retweeted_topic[:5]
    retweeted_results["geo"] = sorted_retweeted_geo[:5]
    retweeted_results["influence"] = average_influence
    in_portrait = sorted(in_portrait, key=lambda x:x[1], reverse=True)

    temp_list = []
    for item in in_portrait:
        temp_list.append(item[0])
    retweeted_results['in_portrait_number'] = len(temp_list)
    retweeted_results['out_portrait_number'] = len(out_portrait)
    in_portrait_url = get_user_url(temp_list[:default_number])
    out_portrait_url = get_user_url(out_portrait[:default_number])
    retweeted_results["in_portrait"] = in_portrait_url
    retweeted_results["out_portrait"] = out_portrait_url
    retweeted_results["total_number"] = len(temp_list) + len(out_portrait)
 

    return retweeted_results
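influenced_user_detail and influenced_people both lean on two helpers that are not shown on this page, aggregation and proportion. Minimal stand-ins that match how they are called here (count labels into a dict, then turn the counts into fractions); the real project's implementations may differ.

def aggregation(labels, counter):
    # Count each label (domain, topic, or geo name) into the running dict.
    for label in labels:
        counter[label] = counter.get(label, 0) + 1
    return counter

def proportion(counter):
    # Convert raw counts into fractions of the total, leaving an empty dict alone.
    total = sum(counter.values())
    if not total:
        return counter
    return dict((k, float(v) / total) for k, v in counter.items())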
Example #24
def influenced_people(uid, mid, influence_style, date, default_number=20):
    # uid
    # which weibo: mid; for a retweeted weibo, look up its root_mid
    # influence_style: retweeted (0) or comment (1)
    date1 = ts2datetime(datetime2ts(date)).replace('-', '')
    index_name = pre_index + date1
    index_flow_text = pre_text_index + date
    text_result = es.get(index=index_flow_text, doc_type=flow_text_index_type, id=mid)["_source"]
    temp_mid = text_result.get("root_mid", '')  # check whether this weibo is an original post
    if temp_mid:
        mid_type = 1  # not an original weibo (it is a retweet)
    else:
        mid_type = 0  # original weibo
    query_body = {
        "query":{
            "filtered":{
                "filter":{
                    "bool":{
                        "must":[
                        ]
                    }
                }
            }
        },
        "size": 30000
    }
    if RUN_TYPE:
        query_body["sort"] = {"user_fansnum":{"order":"desc"}}

    if int(mid_type) == 0:
        if int(influence_style) == 0: # origin weibo, all retweeted people
            query_body["query"]["filtered"]["filter"]["bool"]["must"].extend([{"term": {"root_uid": uid}}, {"term": {"message_type": 3}}, {"term": {"root_mid": mid}}])
        else: # commented people
            query_body["query"]["filtered"]["filter"]["bool"]["must"].extend([{"term": {"directed_uid": uid}}, {"term": {"message_type": 2}}, {"term": {"root_mid": mid}}])
    else:
        if int(influence_style) == 0: # origin weibo, all retweeted people
            query_body["query"]["filtered"]["filter"]["bool"]["must"].extend([{"term": {"directed_uid": uid}}, {"term": {"message_type": 3}}, {"term": {"root_mid": temp_mid}}])
        else: # commented people
            query_body["query"]["filtered"]["filter"]["bool"]["must"].extend([{"term": {"directed_uid": uid}}, {"term": {"message_type": 2}}, {"term": {"root_mid": temp_mid}}])
    search_results = es.search(index=index_flow_text, doc_type=flow_text_index_type, body=query_body, _source=False, fields=["uid"], timeout=30)["hits"]["hits"]
    results = [] # uid_list
    if search_results:
        for item in search_results:
            if int(item["fields"]["uid"][0]) == int(uid):
                pass
            else:
                results.append(item["fields"]["uid"][0])
        results = list(set(results))
    else:
        results = []

    bci_index = "bci_" + date.replace('-','')

    if results:
        portrait_results = es_user_portrait.mget(index=user_portrait, doc_type=portrait_index_type, body={"ids": results}, fields=["domain", "topic_string", "activity_geo_dict","importance", "influence"])["docs"]
        bci_results = es_cluster.mget(index=bci_index, doc_type='bci', body={"ids":results}, fields=['user_index'])['docs']
    else:
        portrait_results = {}
        bci_results = {}


    in_portrait = []
    out_portrait = []
    in_portrait_info = []
    retweeted_domain = {}
    retweeted_topic = {}
    retweeted_geo = {}
    average_influence = 0
    total_influence = 0
    count = 0

    if bci_results:
        total_influence = 0
        for item in bci_results:
            if item['found']:
                total_influence += item['fields']['user_index'][0]
    try:
        average_influence = total_influence/len(results)
    except:
        average_influence = 0

    if portrait_results:
        for item in portrait_results:
            if item["found"]:
                temp = []
                count += 1
                temp.append(item['_id'])
                temp.append(item["fields"]["importance"][0])
                in_portrait.append(temp)
                temp_domain = item["fields"]["domain"][0].split('&')
                temp_topic = item["fields"]["topic_string"][0].split('&')
                temp_geo = json.loads(item["fields"]["activity_geo_dict"][0])[-1].keys()
                #total_influence += item["fields"]["influence"][0]
                retweeted_domain = aggregation(temp_domain, retweeted_domain)
                retweeted_topic = aggregation(temp_topic, retweeted_topic)
                retweeted_geo = aggregation(temp_geo, retweeted_geo)
            else:
                out_portrait.append(item['_id'])
        retweeted_domain = proportion(retweeted_domain)
        retweeted_topic = proportion(retweeted_topic)
        retweeted_geo = proportion(retweeted_geo)
        #try:
        #    average_influence = total_influence/count
        #except:
        #    average_influence = 0
    sorted_retweeted_domain = sorted(retweeted_domain.items(),key=lambda x:x[1], reverse=True)
    sorted_retweeted_topic = sorted(retweeted_topic.items(),key=lambda x:x[1], reverse=True)
    sorted_retweeted_geo = sorted(retweeted_geo.items(), key=lambda x:x[1], reverse=True)

    retweeted_results = dict()
    retweeted_results["domian"] = sorted_retweeted_domain[:5]
    retweeted_results["topic"] = sorted_retweeted_topic[:5]
    retweeted_results["geo"] = sorted_retweeted_geo[:5]
    retweeted_results["influence"] = average_influence
    in_portrait = sorted(in_portrait, key=lambda x:x[1], reverse=True)


    temp_list = []
    for item in in_portrait:
        temp_list.append(item[0])
    retweeted_results['in_portrait_number'] = len(temp_list)
    retweeted_results['out_portrait_number'] = len(out_portrait)
    in_portrait_url = get_user_url(temp_list[:default_number])
    out_portrait_url = get_user_url(out_portrait[:default_number])

    return_results = dict()
    return_results["influence_users"] = [in_portrait_url, out_portrait_url]
    return_results["influence_distribution"] = retweeted_results

    return return_results