Пример #1
0
def query_hot_mid(ts, keywords_list, text_type,size=100):
    """Rank ``root_mid`` values of keyword-matching messages by frequency.

    Matches documents with ``message_type`` "0" whose ``keywords_string``
    contains any entry of ``keywords_list`` and whose ``timestamp`` lies in
    ``[ts - time_interval, ts)``, then aggregates them by ``root_mid``.

    Exactly one date-named index is searched: the one for the date of
    ``ts`` when the window start falls on the same date, otherwise the one
    for the date of the window start.  If that index does not exist the
    result is empty.

    ``text_type`` is accepted but not used.  ``size`` caps the number of
    aggregation buckets.

    Returns a list of ``[root_mid, doc_count]`` pairs.
    """
    window_start = ts - time_interval
    must_clauses = [
        {"range": {"timestamp": {"gte": window_start, "lt": ts}}},
        {"terms": {"keywords_string": keywords_list}},
        {"term": {"message_type": "0"}},
    ]
    query_body = {
        "query": {"filtered": {"filter": {"bool": {"must": must_clauses}}}},
        "aggs": {"all_interests": {"terms": {"field": "root_mid", "size": size}}},
    }

    end_date = ts2datetime(ts)
    start_date = ts2datetime(window_start)
    end_index = flow_text_index_name_pre + end_date
    end_index_exists = es_text.indices.exists(end_index)
    start_index = flow_text_index_name_pre + start_date
    start_index_exists = es_text.indices.exists(start_index)
    print("%s %s" % (end_date, start_date))

    # Pick the single index to query, or None when it is missing.
    if end_date == start_date:
        chosen_index = end_index if end_index_exists else None
    else:
        chosen_index = start_index if start_index_exists else None

    if chosen_index is None:
        buckets = []
    else:
        response = es_text.search(index=chosen_index, doc_type=flow_text_index_type, body=query_body)
        buckets = response["aggregations"]["all_interests"]["buckets"]

    hot_mid_list = []
    for bucket in buckets:
        print(bucket)
        hot_mid_list.append([bucket['key'], bucket['doc_count']])

    return hot_mid_list
Пример #2
0
def query_mid_list(ts, keywords_list, time_segment, social_sensors=None):
    """Collect the ids of messages that mention any of the given keywords.

    Matches documents whose ``keywords_string`` contains any entry of
    ``keywords_list`` and whose ``timestamp`` lies in
    ``[ts - time_segment, ts)``.  When ``social_sensors`` is non-empty the
    match is additionally restricted to those uids.

    The date-named index for ``ts`` is searched, plus the index for the
    window start when it falls on a different date.  Indexes that do not
    exist are skipped.

    Args:
        ts: end of the time window (exclusive).
        keywords_list: keywords to match against ``keywords_string``.
        time_segment: length of the time window.
        social_sensors: optional iterable of uids to restrict the search to;
            ``None`` or empty means no restriction.

    Returns:
        list: de-duplicated document ``_id`` values (order not defined).
    """
    must_clauses = [
        {"range": {"timestamp": {"gte": ts - time_segment, "lt": ts}}},
        {"terms": {"keywords_string": keywords_list}},
    ]
    if social_sensors:
        must_clauses.append({"terms": {"uid": social_sensors}})
    query_body = {
        "query": {"filtered": {"filter": {"bool": {"must": must_clauses}}}},
        "size": 10000,
    }

    end_date = ts2datetime(ts)
    start_date = ts2datetime(ts - time_segment)
    index_dates = [end_date]
    if start_date != end_date:
        # The window crosses a date boundary: include the earlier index too.
        index_dates.append(start_date)

    origin_mid_set = set()  # ids of all matching messages
    for index_date in index_dates:
        index_name = flow_text_index_name_pre + index_date
        if not es_text.indices.exists(index_name):
            continue
        hits = es_text.search(index=index_name, doc_type=flow_text_index_type,
                              body=query_body, fields=["root_mid"])["hits"]["hits"]
        # Only the document id is used; the requested root_mid field is ignored.
        origin_mid_set.update(hit["_id"] for hit in hits)

    return list(origin_mid_set)
def statistics_influence_people(uid, date, style):
    """Summarise the users influenced by ``uid``'s messages on ``date``.

    Gathers the ids of the user's ``message_type`` 1 messages and the
    ``root_mid`` values of the user's ``message_type`` 3 messages, then
    delegates to ``influenced_user_detail``.

    Args:
        uid: user id to analyse.
        date: date string; used verbatim for the flow-text index name and
            with dashes removed for the influence index name.
        style: ``0`` (or anything ``int()`` turns into 0) selects the
            retweet view (detail type 3); any other value selects the
            comment view (detail type 2).

    Returns:
        Whatever ``influenced_user_detail`` returns, or an empty dict when
        the user's influence document cannot be fetched.
    """
    results = {}
    index_name = pre_index + str(date).replace('-', '')
    index_flow_text = pre_text_index + date

    # The influence document is only used as an existence check.
    try:
        es_cluster.get(index=index_name, doc_type=influence_doctype, id=uid)["_source"]
    except Exception:
        return results

    def _search_user_messages(message_type):
        # Each query gets its own filter list so the two searches cannot
        # contaminate each other.
        body = {
            "query": {
                "filtered": {
                    "filter": {
                        "bool": {
                            "must": [
                                {"term": {"message_type": message_type}},
                                {"term": {"uid": uid}},
                            ]
                        }
                    }
                }
            },
            "size": 1000,
        }
        return es.search(index=index_flow_text, doc_type=flow_text_index_type,
                         body=body)["hits"]["hits"]

    # origin weibo mid
    origin_mid = [hit['_id'] for hit in _search_user_messages(1)]
    # retweeted weibo mid (root of each message_type 3 message)
    retweeted_mid = [
        hit['_source']['root_mid']
        for hit in _search_user_messages(3)
        if hit['_source'].get('root_mid', '')
    ]

    detail_type = 3 if int(style) == 0 else 2  # 3: retweeted, 2: comment
    return influenced_user_detail(uid, date, origin_mid, retweeted_mid, detail_type)
Пример #4
0
def search_group_sentiment_weibo(task_name, start_ts, sentiment):
    """List one day of a group's messages that carry a given sentiment.

    Args:
        task_name: id of the group document holding ``uid_list``.
        start_ts: start of the window; messages in
            ``[start_ts, start_ts + DAY)`` are returned.
        sentiment: sentiment code to match.  The string ``'2'`` is special:
            it matches any sentiment listed in ``SENTIMENT_SECOND``.

    Returns:
        list of dicts with keys ``uid``, ``uname``, ``ip``, ``geo``,
        ``text``, ``timestamp`` and ``sentiment`` in ascending timestamp
        order; or the string ``'task name invalid'`` /
        ``'task uid list null'`` when the group cannot be resolved.
    """
    # step 1: resolve the uids that belong to the task
    try:
        group_result = es_group_result.get(index=group_index_name, doc_type=group_index_type,
                                           id=task_name, _source=False, fields=['uid_list'])
    except Exception:
        group_result = {}
    if not group_result:
        return 'task name invalid'
    try:
        uid_list = group_result['fields']['uid_list']
    except (KeyError, TypeError):
        uid_list = []
    if not uid_list:
        return 'task uid list null'

    # step 2: map uid -> uname for users that have a portrait
    uid2uname = {}
    try:
        user_portrait_result = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type,
                                                     body={'ids': uid_list}, _source=False, fields=['uname'])['docs']
    except Exception:
        user_portrait_result = []
    for item in user_portrait_result:
        if item.get('found'):
            uid2uname[item['_id']] = item['fields']['uname'][0]

    # step 3: build the query for the requested sentiment
    if sentiment != '2':
        sentiment_clause = {'term': {'sentiment': sentiment}}
    else:
        sentiment_clause = {'terms': {'sentiment': SENTIMENT_SECOND}}
    must_clauses = [
        {'terms': {'uid': uid_list}},
        sentiment_clause,
        {'range': {'timestamp': {'gte': start_ts, 'lt': start_ts + DAY}}},
    ]

    # step 4: search the index named after the date of start_ts
    flow_text_index_name = flow_text_index_name_pre + str(ts2datetime(start_ts))
    try:
        flow_text_result = es_flow_text.search(
            index=flow_text_index_name, doc_type=flow_text_index_type,
            body={'query': {'bool': {'must': must_clauses}},
                  'sort': [{'timestamp': {'order': 'asc'}}],
                  'size': MAX_VALUE})['hits']['hits']
    except Exception:
        flow_text_result = []

    weibo_list = []
    for flow_text_item in flow_text_result:
        source = flow_text_item['_source']
        author_uid = source['uid']
        try:
            geo = '\t'.join(source['geo'].split('&'))
        except (KeyError, AttributeError, TypeError):
            geo = ''
        weibo_list.append({
            'uid': author_uid,
            # Authors without a portrait (or a failed portrait lookup) must
            # not abort the whole listing.
            'uname': uid2uname.get(author_uid, 'unknown'),
            'ip': source['ip'],
            'geo': geo,
            'text': source['text'],
            'timestamp': source['timestamp'],
            'sentiment': source['sentiment'],
        })

    return weibo_list
Пример #5
0
def get_user_ip(uid):
    """Return the ``ip`` of one of the user's messages from the last 7 days.

    The seven date-named indexes preceding "today" are searched in one
    request ("today" is ``RUN_TEST_TIME`` when ``RUN_TYPE == 0``, otherwise
    the current date).  The ip of the first hit is returned.

    Returns:
        The ``ip`` field of the first hit, or ``''`` when the user has no
        message in the searched indexes.
    """
    now_timestamp = datetime2ts(ts2datetime(time.time()))
    if RUN_TYPE == 0:
        now_timestamp = datetime2ts(RUN_TEST_TIME)
    flow_text_index_list = [
        flow_text_index_name_pre + ts2datetime(now_timestamp - DAY * i)
        for i in range(7, 0, -1)
    ]

    query_body = {
        'query': {
            'filtered': {
                'filter': {
                    'term': {
                        'uid': uid
                    }
                }
            }
        },
        'size': 10,
    }
    weibo_all = es_flow_text.search(index=flow_text_index_list,
                                    doc_type=flow_text_index_type,
                                    body=query_body)['hits']['hits']
    if not weibo_all:
        # No message found: report "unknown" instead of raising IndexError.
        return ''
    return weibo_all[0]["_source"]["ip"]
Пример #6
0
def get_repost_weibo(mid, weibo_timestamp):
    """Fetch messages that reference ``mid`` as their root, with user info.

    Searches the index named after the date of ``weibo_timestamp`` for
    documents with ``root_mid == mid``, ``message_type == 2`` and a
    timestamp not earlier than ``weibo_timestamp``.  A failed search is
    treated as "no results".

    Returns:
        list: the ``_source`` dict of every hit, each extended with a
        ``'user'`` entry taken from ``get_user_profile_weibo``.
    """
    # NOTE(review): message_type 2 is used for "repost" here - confirm this
    # matches the message_type convention used by the rest of the project.
    index_name = flow_text_index_name_pre + ts2datetime(weibo_timestamp)
    query_body = {
        'query': {
            'bool': {
                'must': [
                    {'term': {'root_mid': mid}},
                    {'range': {'timestamp': {'gte': weibo_timestamp}}},
                    {'term': {'message_type': 2}}
                ]
            }
        }
    }
    try:
        flow_text_result = es_flow_text.search(index=index_name, doc_type=flow_text_index_type,
                                               body=query_body)['hits']['hits']
    except Exception:
        flow_text_result = []

    sources = [hit['_source'] for hit in flow_text_result]
    repost_user_info_dict = get_user_profile_weibo([source['uid'] for source in sources])

    statuses = []
    for item_source in sources:
        # The uid lives in _source, not on the hit envelope.
        item_source['user'] = repost_user_info_dict[item_source['uid']]
        statuses.append(item_source)

    return statuses
def get_repost_weibo(mid, weibo_timestamp):
    """Return the messages whose root is ``mid``, annotated with user info.

    The index named after the date of ``weibo_timestamp`` is queried for
    documents with ``root_mid == mid``, ``message_type == 2`` and
    ``timestamp >= weibo_timestamp``.  If the search raises, the result is
    empty.

    Returns:
        list: each hit's ``_source`` dict with an added ``'user'`` key
        looked up in the mapping returned by ``get_user_profile_weibo``.
    """
    # NOTE(review): "repost" is queried as message_type 2 - verify against
    # the project's message_type codes.
    index_date = ts2datetime(weibo_timestamp)
    index_name = flow_text_index_name_pre + index_date
    must_clauses = [
        {'term': {'root_mid': mid}},
        {'range': {'timestamp': {'gte': weibo_timestamp}}},
        {'term': {'message_type': 2}},
    ]
    query_body = {'query': {'bool': {'must': must_clauses}}}
    try:
        flow_text_result = es_flow_text.search(index=index_name, doc_type=flow_text_index_type,
                                               body=query_body)['hits']['hits']
    except Exception:
        flow_text_result = []

    repost_uid_list = [hit['_source']['uid'] for hit in flow_text_result]
    repost_user_info_dict = get_user_profile_weibo(repost_uid_list)

    statuses = []
    for hit in flow_text_result:
        item_source = hit['_source']
        # Read uid from _source; the hit envelope has no 'uid' key.
        item_source['user'] = repost_user_info_dict[item_source['uid']]
        statuses.append(item_source)

    return statuses
Пример #8
0
def get_sen_ratio(topic,start_ts,end_ts):
    """Count messages mentioning ``topic`` per sentiment value.

    Matches documents whose ``text`` satisfies the wildcard
    ``*<topic>*`` and whose ``timestamp`` lies in ``[start_ts, end_ts]``,
    aggregated on the ``sentiment`` field.  Only one index is searched:
    the one for '2013-09-07' when ``RUN_TYPE == 0``, otherwise the one for
    the current date.

    Returns the aggregation buckets as delivered by Elasticsearch.
    """
    text_clause = {'wildcard': {'text': '*' + topic + '*'}}
    time_clause = {'range': {'timestamp': {'lte': end_ts, 'gte': start_ts}}}
    query_body = {
        'query': {'bool': {'must': [text_clause, time_clause]}},
        'aggs': {'all_interests': {'terms': {'field': 'sentiment'}}},
    }

    date = '2013-09-07' if RUN_TYPE == 0 else ts2datetime(time.time())
    print(query_body)

    response = es_flow_text.search(index=flow_text_index_name_pre + date,
                                   doc_type=flow_text_index_type,
                                   body=query_body)
    return response['aggregations']['all_interests']['buckets']
Пример #9
0
def get_psycho_status(uid_list):
    """Compute per-user sentiment ratios over the ``WEEK`` days before today.

    For each of the ``WEEK`` date-named indexes preceding "today"
    (``RUN_TEST_TIME`` when ``RUN_TYPE == 0``, otherwise the current date)
    the messages of the given users are fetched and counted per sentiment
    value.  An index whose search fails is skipped.

    For every uid two ratio tables are produced, both relative to the
    user's total message count:

    * ``'second'``: one ratio per entry of ``SENTIMENT_SECOND``.
    * ``'first'``: one ratio per entry of ``SENTIMENT_FIRST``, where the
      pseudo sentiment ``'7'`` stands for the sum of all
      ``SENTIMENT_SECOND`` counts.

    A ratio is the integer ``0`` when the sentiment was not seen or the
    user has no messages.

    Returns:
        dict: ``{uid: {'first': {sentiment: ratio}, 'second': {sentiment: ratio}}}``
    """
    def _ratio(counts, sentiment_item, total):
        # Unseen sentiment or empty history -> 0, never a division error.
        if not total or sentiment_item not in counts:
            return 0
        return float(counts[sentiment_item]) / total

    # time range for es_flow_text
    now_date_ts = datetime2ts(ts2datetime(time.time()))
    if RUN_TYPE == 0:
        now_date_ts = datetime2ts(RUN_TEST_TIME)
    start_date_ts = now_date_ts - DAY * WEEK

    # count messages per uid and sentiment
    uid_sentiment_dict = {}
    query_body = {'query': {'filtered': {'filter': {'terms': {'uid': uid_list}}}}, 'size': MAX_VALUE}
    for i in range(0, WEEK):
        flow_text_index_name = flow_text_index_name_pre + ts2datetime(start_date_ts + DAY * i)
        try:
            flow_text_exist = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type,
                                                  body=query_body, _source=False,
                                                  fields=['uid', 'sentiment'])['hits']['hits']
        except Exception:
            flow_text_exist = []
        for flow_text_item in flow_text_exist:
            uid = flow_text_item['fields']['uid'][0]
            sentiment_key = str(flow_text_item['fields']['sentiment'][0])
            user_counts = uid_sentiment_dict.setdefault(uid, {})
            user_counts[sentiment_key] = user_counts.get(sentiment_key, 0) + 1

    # compute first and second level ratios
    results = {}
    for uid in uid_list:
        # Work on a copy: the synthetic '7' entry added below must not leak
        # into the shared counts, or a uid listed twice would be counted
        # with an inflated total on its second pass.
        user_sentiment_result = dict(uid_sentiment_dict.get(uid, {}))
        all_count = sum(user_sentiment_result.values())
        second_sentiment_all_count = sum(
            count for item, count in user_sentiment_result.items()
            if item in SENTIMENT_SECOND
        )

        second_ratios = {}
        for sentiment_item in SENTIMENT_SECOND:
            second_ratios[sentiment_item] = _ratio(user_sentiment_result, sentiment_item, all_count)

        user_sentiment_result['7'] = second_sentiment_all_count
        first_ratios = {}
        for sentiment_item in SENTIMENT_FIRST:
            first_ratios[sentiment_item] = _ratio(user_sentiment_result, sentiment_item, all_count)

        results[uid] = {'first': first_ratios, 'second': second_ratios}

    return results
Пример #10
0
def search_group_sentiment_weibo(task_name, start_ts, sentiment):
    """Return a group's messages of one day filtered by sentiment.

    Args:
        task_name: id of the group document that stores ``uid_list``.
        start_ts: window start; the window is ``[start_ts, start_ts + DAY)``.
        sentiment: sentiment code to match; the string ``'2'`` expands to
            every sentiment in ``SENTIMENT_SECOND``.

    Returns:
        list of dicts (keys ``uid``, ``uname``, ``ip``, ``geo``, ``text``,
        ``timestamp``, ``sentiment``) sorted by ascending timestamp, or the
        string ``'task name invalid'`` / ``'task uid list null'`` when the
        group or its uid list cannot be found.
    """
    # step 1: get the uids of task_name
    try:
        group_result = es_group_result.get(index=group_index_name, doc_type=group_index_type,
                                           id=task_name, _source=False, fields=['uid_list'])
    except Exception:
        group_result = {}
    if not group_result:
        return 'task name invalid'
    try:
        uid_list = group_result['fields']['uid_list']
    except (KeyError, TypeError):
        uid_list = []
    if not uid_list:
        return 'task uid list null'

    # step 2: get uid2uname for users with a portrait document
    try:
        user_portrait_result = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type,
                                                     body={'ids': uid_list}, _source=False, fields=['uname'])['docs']
    except Exception:
        user_portrait_result = []
    uid2uname = {}
    for portrait in user_portrait_result:
        if portrait.get('found'):
            uid2uname[portrait['_id']] = portrait['fields']['uname'][0]

    # step 3: assemble the must-clauses
    time_clause = {'range': {'timestamp': {'gte': start_ts, 'lt': start_ts + DAY}}}
    if sentiment == '2':
        sentiment_clause = {'terms': {'sentiment': SENTIMENT_SECOND}}
    else:
        sentiment_clause = {'term': {'sentiment': sentiment}}
    query_body = [{'terms': {'uid': uid_list}}, sentiment_clause, time_clause]

    # step 4: search the index for the date of start_ts
    iter_date = ts2datetime(start_ts)
    flow_text_index_name = flow_text_index_name_pre + str(iter_date)
    try:
        flow_text_result = es_flow_text.search(
            index=flow_text_index_name, doc_type=flow_text_index_type,
            body={'query': {'bool': {'must': query_body}},
                  'sort': [{'timestamp': {'order': 'asc'}}],
                  'size': MAX_VALUE})['hits']['hits']
    except Exception:
        flow_text_result = []

    weibo_list = []
    for flow_text_item in flow_text_result:
        source = flow_text_item['_source']
        weibo = {}
        weibo['uid'] = source['uid']
        # A missing portrait must not raise KeyError for the whole request.
        weibo['uname'] = uid2uname.get(weibo['uid'], 'unknown')
        weibo['ip'] = source['ip']
        try:
            weibo['geo'] = '\t'.join(source['geo'].split('&'))
        except (KeyError, AttributeError, TypeError):
            weibo['geo'] = ''
        weibo['text'] = source['text']
        weibo['timestamp'] = source['timestamp']
        weibo['sentiment'] = source['sentiment']
        weibo_list.append(weibo)

    return weibo_list
Пример #11
0
def get_influence_content(uid, timestamp_from, timestamp_to):
    """List a user's messages in ``[timestamp_from, timestamp_to)``.

    The time range is cut into one sub-range per calendar day (steps of
    ``DAY`` starting at the day of ``timestamp_from``), because each day is
    searched in its own date-named index.  The first and last sub-ranges
    are clipped to the requested bounds; empty sub-ranges are skipped.  A
    search that raises contributes no results.

    Returns:
        list of dicts with keys ``timestamp`` (formatted by ``ts2date``),
        ``ip``, ``text`` and ``geo`` (``'&'`` separators replaced by tabs),
        covering every searched day.
    """
    # split the timestamp range into per-day ranges
    from_date_ts = datetime2ts(ts2datetime(timestamp_from))
    to_date_ts = datetime2ts(ts2datetime(timestamp_to))
    day_ranges = []
    iter_date_ts = from_date_ts
    # "<=" so that the (partial) day of timestamp_to is included as well.
    while iter_date_ts <= to_date_ts:
        range_from = max(iter_date_ts, timestamp_from)
        range_to = min(iter_date_ts + DAY, timestamp_to)
        if range_from < range_to:
            day_ranges.append((range_from, range_to))
        iter_date_ts += DAY

    # search each day's index and accumulate the hits
    iter_result = []
    for range_from, range_to in day_ranges:
        flow_text_index_name = flow_text_index_name_pre + ts2datetime(range_from)
        query = [
            {'term': {'uid': uid}},
            {'range': {'timestamp': {'gte': range_from, 'lt': range_to}}},
        ]
        try:
            flow_text_exist = es_flow_text.search(
                index=flow_text_index_name, doc_type=flow_text_index_type,
                body={'query': {'bool': {'must': query}},
                      'sort': [{'timestamp': 'asc'}]})['hits']['hits']
        except Exception:
            flow_text_exist = []
        iter_result.extend(flow_text_exist)

    # build the weibo list from the hits of ALL days, not only the last one
    weibo_list = []
    for item in iter_result:
        source = item['_source']
        weibo_list.append({
            'timestamp': ts2date(source['timestamp']),
            'ip': source['ip'],
            'text': source['text'],
            'geo': '\t'.join(source['geo'].split('&')),
        })

    return weibo_list
Пример #12
0
def new_get_user_weibo(uid, sort_type):
    """Return up to 100 of a user's messages from the last 7 days.

    The seven date-named indexes preceding "today" are searched one by one
    (up to 100 hits each, sorted by ``sort_type``); the combined hits are
    then sorted by ``sort_type`` in descending order and cut to 100.  When
    ``RUN_TYPE == 0`` "today" is ``RUN_TEST_TIME`` and ``sort_type`` is
    forced to ``'timestamp'``.  An index whose search fails is skipped.

    Returns:
        list of rows ``[mid, uid, text, ip, city, timestamp, date,
        retweet_count, comment_count, sensitive_score]``; the three
        counters are 0 unless ``RUN_TYPE == 1``.  Empty when the user has
        no messages in the searched indexes.
    """
    now_date = ts2datetime(time.time())
    # run_type
    if RUN_TYPE == 0:
        now_date = RUN_TEST_TIME
        sort_type = 'timestamp'

    # step1: get user name
    # NOTE(review): uname is looked up but not part of the returned rows.
    try:
        user_profile_result = es_user_profile.get(index=profile_index_name, doc_type=profile_index_type,
                                                  id=uid, _source=False, fields=['nick_name'])
        uname = user_profile_result['fields']['nick_name'][0]
    except Exception:
        uname = ''

    # step2: get user weibo
    now_date_ts = datetime2ts(now_date)
    weibo_list = []
    for i in range(7, 0, -1):
        index_name = flow_text_index_name_pre + ts2datetime(now_date_ts - i * DAY)
        try:
            weibo_result = es_flow_text.search(
                index=index_name, doc_type=flow_text_index_type,
                body={'query': {'filtered': {'filter': {'term': {'uid': uid}}}},
                      'sort': sort_type, 'size': 100})['hits']['hits']
        except Exception:
            weibo_result = []
        weibo_list.extend(weibo_result)

    if not weibo_list:
        # Nothing found: return early instead of indexing an empty list.
        return []

    sort_weibo_list = sorted(weibo_list,
                             key=lambda x: x['_source'][sort_type],
                             reverse=True)[:100]

    results = []
    for weibo_item in sort_weibo_list:
        source = weibo_item['_source']
        # NOTE(review): the "ip" column is read from the 'geo' field and
        # passed to ip2city - confirm this is intended.
        ip = source['geo']
        timestamp = source['timestamp']
        # run_type
        if RUN_TYPE == 1:
            retweet_count = source['retweet_count']
            comment_count = source['comment_count']
            sensitive_score = source['sensitive']
        else:
            retweet_count = 0
            comment_count = 0
            sensitive_score = 0
        results.append([
            source['mid'], source['uid'], source['text'], ip, ip2city(ip),
            timestamp, ts2date(timestamp), retweet_count, comment_count,
            sensitive_score
        ])

    return results
Пример #13
0
def search_weibo(root_uid,uid,mtype):
    """Pair a user's recent messages with the root messages they refer to.

    Step 1 searches the seven date-named indexes preceding the reference
    date for messages of ``uid`` with ``message_type == mtype`` (the filter
    also lists ``root_uid`` / ``directed_uid`` equal to ``root_uid`` as
    ``should`` clauses).  Step 2 looks up, for each hit, the message whose
    ``mid`` equals the hit's ``root_mid`` in the hit's own index and the
    six date-named indexes before it.

    Returns:
        list of dicts, one per (message, root message) pair, with keys
        ``'last_text'`` = ``[text, text, timestamp]`` of the user's message
        and ``'ori_text'`` = ``[text, timestamp]`` of the root message.
        Messages whose root is not found produce no entry.
    """
    # NOTE(review): the filter is sent at the top level of the body rather
    # than wrapped in 'query' - confirm this is what the cluster expects.
    query_body = {
        'filter': {
            'bool': {
                'must': [{'term': {'uid': uid}},
                         {'term': {'message_type': mtype}}],
                'should': [{'term': {'root_uid': root_uid}},
                           {'term': {'directed_uid': root_uid}}],
            }
        }
    }

    if RUN_TYPE == 1:
        # NOTE(review): now_date is not defined in this function; it must
        # exist at module level or this branch raises NameError.
        base_ts = datetime2ts(now_date)
    else:
        base_ts = datetime2ts(RUN_TEST_TIME)
    index_list = [
        flow_text_index_name_pre + ts2datetime(base_ts - i * DAY)
        for i in range(7, 0, -1)
    ]
    results = es_flow_text.search(index=index_list, doc_type=flow_text_index_type,
                                  body=query_body)['hits']['hits']

    len_pre = len(flow_text_index_name_pre)
    f_result = []
    for result in results:
        source = result['_source']
        # NOTE(review): the text is stored twice in last_text; kept as is.
        last_text = [source['text'], source['text'], source['timestamp']]
        mid = source['root_mid']

        # The index name suffix is the date of the index holding the hit.
        index_date_ts = datetime2ts(result['_index'][len_pre:])
        # one week back; use range(0, 30) for one month
        root_index = [
            flow_text_index_name_pre + ts2datetime(index_date_ts - j * DAY)
            for j in range(0, 7)
        ]
        results0 = es_flow_text.search(index=root_index, doc_type=flow_text_index_type,
                                       body={'query': {'term': {'mid': mid}}})['hits']['hits']
        for result0 in results0:
            root_source = result0['_source']
            # Build a fresh entry per root match so every entry carries
            # both 'last_text' and 'ori_text'.
            f_result.append({
                'last_text': list(last_text),
                'ori_text': [root_source['text'], root_source['timestamp']],
            })
    return f_result
Пример #14
0
def get_psycho_status(uid_list):
    """Return first- and second-level sentiment ratios for each uid.

    Messages of the given users are read from the ``WEEK`` date-named
    indexes preceding "today" (``RUN_TEST_TIME`` when ``RUN_TYPE == 0``,
    otherwise the current date) and counted per sentiment value; an index
    whose search fails contributes nothing.

    Ratios are relative to the user's total message count:

    * ``'second'`` holds one ratio per entry of ``SENTIMENT_SECOND``.
    * ``'first'`` holds one ratio per entry of ``SENTIMENT_FIRST``; the
      pseudo sentiment ``'7'`` is the sum of all ``SENTIMENT_SECOND``
      counts.

    The integer ``0`` is used when a sentiment was not seen or the user
    has no messages at all.

    Returns:
        dict: ``{uid: {'first': {sentiment: ratio}, 'second': {sentiment: ratio}}}``
    """
    def _ratio(counts, sentiment_item, total):
        # Explicit checks replace the exception-driven fallback to 0.
        if sentiment_item in counts and total:
            return float(counts[sentiment_item]) / total
        return 0

    # time for es_flow_text
    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    # run_type
    if RUN_TYPE == 0:
        now_date_ts = datetime2ts(RUN_TEST_TIME)
    start_date_ts = now_date_ts - DAY * WEEK

    uid_sentiment_dict = {}
    search_body = {'query': {'filtered': {'filter': {'terms': {'uid': uid_list}}}}, 'size': MAX_VALUE}
    for day_offset in range(0, WEEK):
        flow_text_index_date = ts2datetime(start_date_ts + DAY * day_offset)
        flow_text_index_name = flow_text_index_name_pre + flow_text_index_date
        try:
            hits = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type,
                                       body=search_body, _source=False,
                                       fields=['uid', 'sentiment'])['hits']['hits']
        except Exception:
            hits = []
        for hit in hits:
            hit_uid = hit['fields']['uid'][0]
            sentiment_key = str(hit['fields']['sentiment'][0])
            counts = uid_sentiment_dict.setdefault(hit_uid, {})
            counts[sentiment_key] = counts.get(sentiment_key, 0) + 1

    # compute first and second psycho_status
    results = {}
    for uid in uid_list:
        # Copy before adding the synthetic '7' key so the shared counts stay
        # untouched; otherwise a repeated uid would see an inflated total.
        user_sentiment_result = dict(uid_sentiment_dict.get(uid, {}))
        all_count = sum(user_sentiment_result.values())
        second_sentiment_all_count = sum(
            user_sentiment_result[item] for item in user_sentiment_result
            if item in SENTIMENT_SECOND
        )
        results[uid] = {'first': {}, 'second': {}}
        for sentiment_item in SENTIMENT_SECOND:
            results[uid]['second'][sentiment_item] = _ratio(user_sentiment_result, sentiment_item, all_count)
        user_sentiment_result['7'] = second_sentiment_all_count
        for sentiment_item in SENTIMENT_FIRST:
            results[uid]['first'][sentiment_item] = _ratio(user_sentiment_result, sentiment_item, all_count)

    return results
Пример #15
0
def get_activity_weibo(task_name, start_ts):
    """List a group's messages in the ``FOUR_HOUR`` window after ``start_ts``.

    Args:
        task_name: id of the group document holding ``uid_list``.
        start_ts: window start; the window is
            ``[start_ts, start_ts + FOUR_HOUR)``.  Only the index named
            after the date of ``start_ts`` is searched.

    Returns:
        list of dicts with keys ``timestamp`` (formatted by ``ts2date``),
        ``ip``, ``text`` and ``geo`` sorted by timestamp; or the string
        ``'task name invalid'`` / ``'task uid list null'`` when the group
        cannot be resolved.  A failed search yields an empty list.
    """
    # step1: get task_name uid
    try:
        group_result = es_group_result.get(index=group_index_name, doc_type=group_index_type,
                                           id=task_name, _source=False, fields=['uid_list'])
    except Exception:
        group_result = {}
    if not group_result:
        return 'task name invalid'
    try:
        uid_list = group_result['fields']['uid_list']
    except (KeyError, TypeError):
        uid_list = []
    if not uid_list:
        return 'task uid list null'

    # step2: get uid2uname
    # NOTE(review): uid2uname is built but not consumed by the result below.
    uid2uname = {}
    try:
        user_portrait_result = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type,
                                                     body={'ids': uid_list}, _source=False, fields=['uname'])['docs']
    except Exception:
        user_portrait_result = []
    for item in user_portrait_result:
        # Only record names of users that were found; never reuse the name
        # of a previous user or touch an unbound variable.
        if item.get('found'):
            uid2uname[item['_id']] = item['fields']['uname'][0]

    # step3: search time_segment weibo
    end_ts = start_ts + FOUR_HOUR
    flow_text_index_name = flow_text_index_name_pre + ts2datetime(start_ts)
    query = [
        {'terms': {'uid': uid_list}},
        {'range': {'timestamp': {'gte': start_ts, 'lt': end_ts}}},
    ]
    try:
        flow_text_es_result = es_flow_text.search(
            index=flow_text_index_name, doc_type=flow_text_index_type,
            body={'query': {'bool': {'must': query}}, 'sort': 'timestamp',
                  'size': MAX_VALUE})['hits']['hits']
    except Exception:
        flow_text_es_result = []

    results = []
    for item in flow_text_es_result:
        source = item['_source']
        geo = source['geo']
        if geo:
            # A string geo is '&'-separated; joining it directly would put
            # a tab between every character.  A list is joined as is.
            geo_parts = geo.split('&') if hasattr(geo, 'split') else geo
            geo_text = '\t'.join(geo_parts)
        else:
            geo_text = ''
        results.append({
            'timestamp': ts2date(source['timestamp']),
            'ip': source['ip'],
            'text': source['text'],
            'geo': geo_text,
        })

    return results
Пример #16
0
def get_activity_weibo(task_name, start_ts):
    """Return the messages a group posted within ``FOUR_HOUR`` of ``start_ts``.

    Args:
        task_name: id of the group document that stores ``uid_list``.
        start_ts: start of the window ``[start_ts, start_ts + FOUR_HOUR)``;
            its date also selects the single index that is searched.

    Returns:
        list of dicts (keys ``timestamp``, ``ip``, ``text``, ``geo``)
        sorted by timestamp, where ``timestamp`` is formatted by
        ``ts2date``; or the string ``'task name invalid'`` /
        ``'task uid list null'`` when the group or its uid list is
        missing.  If the search raises, the list is empty.
    """
    # step1: get task_name uid
    try:
        group_result = es_group_result.get(index=group_index_name, doc_type=group_index_type,
                                           id=task_name, _source=False, fields=['uid_list'])
    except Exception:
        group_result = {}
    if not group_result:
        return 'task name invalid'
    try:
        uid_list = group_result['fields']['uid_list']
    except (KeyError, TypeError):
        uid_list = []
    if not uid_list:
        return 'task uid list null'

    # step2: get uid2uname
    # NOTE(review): the name map is not used further down in this function.
    try:
        user_portrait_result = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type,
                                                     body={'ids': uid_list}, _source=False, fields=['uname'])['docs']
    except Exception:
        user_portrait_result = []
    uid2uname = {}
    for portrait in user_portrait_result:
        if not portrait.get('found'):
            # Skip missing portraits instead of assigning a stale or
            # undefined uname.
            continue
        uid2uname[portrait['_id']] = portrait['fields']['uname'][0]

    # step3: search time_segment weibo
    time_segment = FOUR_HOUR
    end_ts = start_ts + time_segment
    time_date = ts2datetime(start_ts)
    flow_text_index_name = flow_text_index_name_pre + time_date
    query = []
    query.append({'terms': {'uid': uid_list}})
    query.append({'range': {'timestamp': {'gte': start_ts, 'lt': end_ts}}})
    try:
        flow_text_es_result = es_flow_text.search(
            index=flow_text_index_name, doc_type=flow_text_index_type,
            body={'query': {'bool': {'must': query}}, 'sort': 'timestamp',
                  'size': MAX_VALUE})['hits']['hits']
    except Exception:
        flow_text_es_result = []

    results = []
    for item in flow_text_es_result:
        source = item['_source']
        weibo = {}
        weibo['timestamp'] = ts2date(source['timestamp'])
        weibo['ip'] = source['ip']
        weibo['text'] = source['text']
        geo = source['geo']
        if not geo:
            weibo['geo'] = ''
        elif hasattr(geo, 'split'):
            # String geo values are '&'-separated; split before joining so
            # the tabs land between components, not between characters.
            weibo['geo'] = '\t'.join(geo.split('&'))
        else:
            weibo['geo'] = '\t'.join(geo)
        results.append(weibo)

    return results
Пример #17
0
def new_get_user_weibo(uid, sort_type):
    """Collect the top 100 messages of a user from the previous 7 days.

    Each of the seven date-named indexes before "today" is searched for up
    to 100 of the user's messages sorted by ``sort_type``; all hits are
    then re-sorted by ``sort_type`` (descending) and truncated to 100.
    With ``RUN_TYPE == 0`` "today" is ``RUN_TEST_TIME`` and ``sort_type``
    is replaced by ``'timestamp'``.  Failed searches are ignored.

    Returns:
        list of rows ``[mid, uid, text, ip, city, timestamp, date,
        retweet_count, comment_count, sensitive_score]``.  The last three
        values come from the document only when ``RUN_TYPE == 1`` and are
        0 otherwise.  The list is empty when no message is found.
    """
    now_date = ts2datetime(time.time())
    # run_type
    if RUN_TYPE == 0:
        now_date = RUN_TEST_TIME
        sort_type = 'timestamp'

    # step1: get user name
    # NOTE(review): the nick name is fetched but never returned.
    uname = ''
    try:
        user_profile_result = es_user_profile.get(index=profile_index_name, doc_type=profile_index_type,
                                                  id=uid, _source=False, fields=['nick_name'])
    except Exception:
        user_profile_result = {}
    if user_profile_result:
        nick_names = user_profile_result.get('fields', {}).get('nick_name') or ['']
        uname = nick_names[0]

    # step2: get user weibo
    base_ts = datetime2ts(now_date)
    search_body = {'query': {'filtered': {'filter': {'term': {'uid': uid}}}}, 'sort': sort_type, 'size': 100}
    weibo_list = []
    for i in range(7, 0, -1):
        iter_date = ts2datetime(base_ts - i * DAY)
        index_name = flow_text_index_name_pre + iter_date
        try:
            weibo_result = es_flow_text.search(index=index_name, doc_type=flow_text_index_type,
                                               body=search_body)['hits']['hits']
        except Exception:
            weibo_result = []
        if weibo_result:
            weibo_list.extend(weibo_result)

    results = []
    if not weibo_list:
        # No hits at all: nothing to sort and nothing to index into.
        return results

    sort_weibo_list = sorted(weibo_list, key=lambda x: x['_source'][sort_type], reverse=True)[:100]
    for weibo_item in sort_weibo_list:
        source = weibo_item['_source']
        mid = source['mid']
        author_uid = source['uid']
        text = source['text']
        # NOTE(review): "ip" is taken from the 'geo' field and handed to
        # ip2city - verify that this is the intended field.
        ip = source['geo']
        timestamp = source['timestamp']
        date = ts2date(timestamp)
        # run_type
        if RUN_TYPE == 1:
            retweet_count = source['retweet_count']
            comment_count = source['comment_count']
            sensitive_score = source['sensitive']
        else:
            retweet_count = comment_count = sensitive_score = 0
        city = ip2city(ip)
        results.append([mid, author_uid, text, ip, city, timestamp, date,
                        retweet_count, comment_count, sensitive_score])

    return results
Пример #18
0
def get_social_inter_content(uid1, uid2, type_mark):
    """Collect the weibo exchanged between two users in the last seven daily indices.

    Args:
        uid1: id of the first user.
        uid2: id of the second user (converted with ``int`` for the
            ``directed_uid`` filter).
        type_mark: when ``'out'``, weibo sent by ``uid2`` and directed at
            ``uid1`` are included too; otherwise only ``uid1`` -> ``uid2``.

    Returns:
        A list of dicts with the keys ``timestamp``, ``ip``, ``geo``,
        ``text`` (``&`` replaced by tabs), ``uid``, ``uname``,
        ``directed_uid`` and ``directed_uname``.  Entries are grouped by day
        (oldest first) and sorted by ascending timestamp within a day.
        Names that cannot be resolved are reported as ``'unknown'``.
    """
    # run_type 1 uses today's date, anything else the fixed test date
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(int(time.time())))
    else:
        now_date_ts = datetime2ts(RUN_TEST_TIME)

    # uid -> uname for both users
    try:
        portrait_result = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type,
                                                body={'ids': [uid1, uid2]}, _source=False,
                                                fields=['uid', 'uname'])['docs']
    except Exception:
        portrait_result = []
    uid2uname = {}
    for item in portrait_result:
        if item['found']:
            uid2uname[item['_id']] = item['fields']['uname'][0]
        else:
            uid2uname[item['_id']] = 'unknown'

    # the query does not depend on the day, so build it once
    should_clauses = [
        {'bool': {'must': [{'term': {'uid': uid1}}, {'term': {'directed_uid': int(uid2)}}]}}
    ]
    if type_mark == 'out':
        should_clauses.append(
            {'bool': {'must': [{'term': {'uid': uid2}}, {'term': {'directed_uid': int(uid1)}}]}})
    search_body = {
        'query': {'bool': {'should': should_clauses}},
        'sort': [{'timestamp': {'order': 'asc'}}],
        'size': MAX_VALUE,
    }

    weibo_list = []
    for i in range(7, 0, -1):
        flow_text_index_name = flow_text_index_name_pre + str(ts2datetime(now_date_ts - i * DAY))
        try:
            flow_text_result = es_flow_text.search(index=flow_text_index_name,
                                                   doc_type=flow_text_index_type,
                                                   body=search_body)['hits']['hits']
        except Exception:
            # best effort: skip days whose index cannot be searched
            flow_text_result = []
        for flow_text in flow_text_result:
            source = flow_text['_source']
            directed_uid = str(source['directed_uid'])
            weibo_list.append({
                'timestamp': source['timestamp'],
                'ip': source['ip'],
                'geo': source['geo'],
                'text': source['text'].replace('&', '\t'),
                'uid': source['uid'],
                # .get(): the name lookup above may have failed or missed an id
                'uname': uid2uname.get(source['uid'], 'unknown'),
                'directed_uid': directed_uid,
                'directed_uname': uid2uname.get(directed_uid, 'unknown'),
            })

    return weibo_list
Пример #19
0
def get_social_inter_content(uid1, uid2, type_mark):
    """Collect the weibo exchanged between two users in the last seven daily indices.

    Args:
        uid1: id of the first user.
        uid2: id of the second user (converted with ``int`` for the
            ``directed_uid`` filter).
        type_mark: when ``'out'``, weibo sent by ``uid2`` and directed at
            ``uid1`` are included too; otherwise only ``uid1`` -> ``uid2``.

    Returns:
        A list of dicts with the keys ``timestamp``, ``ip``, ``geo``,
        ``text`` (``&`` replaced by tabs), ``uid``, ``uname``,
        ``directed_uid`` and ``directed_uname``.  Entries are grouped by day
        (oldest first) and sorted by ascending timestamp within a day.
        Names that cannot be resolved are reported as ``'unknown'``.
    """
    # run_type 1 uses today's date, anything else the fixed test date
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(int(time.time())))
    else:
        now_date_ts = datetime2ts(RUN_TEST_TIME)

    # uid -> uname for both users
    try:
        portrait_result = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type,
                                                body={'ids': [uid1, uid2]}, _source=False,
                                                fields=['uid', 'uname'])['docs']
    except Exception:
        portrait_result = []
    uid2uname = {}
    for item in portrait_result:
        if item['found']:
            uid2uname[item['_id']] = item['fields']['uname'][0]
        else:
            uid2uname[item['_id']] = 'unknown'

    # the query does not depend on the day, so build it once
    should_clauses = [
        {'bool': {'must': [{'term': {'uid': uid1}}, {'term': {'directed_uid': int(uid2)}}]}}
    ]
    if type_mark == 'out':
        should_clauses.append(
            {'bool': {'must': [{'term': {'uid': uid2}}, {'term': {'directed_uid': int(uid1)}}]}})
    search_body = {
        'query': {'bool': {'should': should_clauses}},
        'sort': [{'timestamp': {'order': 'asc'}}],
        'size': MAX_VALUE,
    }

    weibo_list = []
    for i in range(7, 0, -1):
        flow_text_index_name = flow_text_index_name_pre + str(ts2datetime(now_date_ts - i * DAY))
        try:
            flow_text_result = es_flow_text.search(index=flow_text_index_name,
                                                   doc_type=flow_text_index_type,
                                                   body=search_body)['hits']['hits']
        except Exception:
            # best effort: skip days whose index cannot be searched
            flow_text_result = []
        for flow_text in flow_text_result:
            source = flow_text['_source']
            directed_uid = str(source['directed_uid'])
            weibo_list.append({
                'timestamp': source['timestamp'],
                'ip': source['ip'],
                'geo': source['geo'],
                'text': source['text'].replace('&', '\t'),
                'uid': source['uid'],
                # .get(): the name lookup above may have failed or missed an id
                'uname': uid2uname.get(source['uid'], 'unknown'),
                'directed_uid': directed_uid,
                'directed_uname': uid2uname.get(directed_uid, 'unknown'),
            })

    return weibo_list
def read_flow_text_sentiment(uid_list):
    """Read the weibo (including their sentiment label) of the given users.

    Args:
        uid_list: list of user ids (strings).

    Returns:
        A tuple ``(word_dict, weibo_list)``.

        ``word_dict`` maps each uid to its word frequencies summed over all
        of that user's weibo, e.g. ``{uid1: {'w1': f1, 'w2': f2, ...}, ...}``.

        ``weibo_list`` holds one ``[uid, text, sentiment, timestamp]`` record
        per weibo, e.g. ``[[uid1, text1, s1, ts1], [uid2, text2, s2, ts2]]``.

        uid, text and the words are UTF-8 encoded byte strings.
    """
    word_counters = {}  # uid -> Counter of word frequencies
    weibo_list = []
    # NOTE(review): the reference date is pinned to 2013-09-08; switch to
    # datetime2ts(ts2datetime(time.time())) to read the live indices.
    now_date_ts = datetime2ts("2013-09-08")
    start_date_ts = now_date_ts - DAY * WEEK
    search_body = {"query": {"filtered": {"filter": {"terms": {"uid": uid_list}}}}, "size": MAX_VALUE}
    wanted_fields = ["text", "uid", "sentiment", "keywords_dict", "timestamp"]

    for i in range(0, WEEK):
        flow_text_index_name = flow_text_index_name_pre + ts2datetime(start_date_ts + DAY * i)
        print(flow_text_index_name)
        try:
            flow_text_exist = es_flow_text.search(
                index=flow_text_index_name,
                doc_type=flow_text_index_type,
                body=search_body,
                _source=False,
                fields=wanted_fields,
            )["hits"]["hits"]
        except Exception:
            # best effort: skip days whose index cannot be searched
            flow_text_exist = []

        for flow_text_item in flow_text_exist:
            item_fields = flow_text_item["fields"]
            uid = item_fields["uid"][0].encode("utf-8")
            text = item_fields["text"][0].encode("utf-8")
            sentiment = int(item_fields["sentiment"][0])
            ts = item_fields["timestamp"][0]
            # Re-key the decoded JSON with UTF-8 byte strings explicitly
            # instead of round-tripping the document through eval().
            keywords = dict(
                (word.encode("utf-8"), freq)
                for word, freq in json.loads(item_fields["keywords_dict"][0]).items()
            )
            word_counters.setdefault(uid, Counter()).update(keywords)
            weibo_list.append([uid, text, sentiment, ts])

    # hand plain dicts back to the caller
    word_dict = dict((uid, dict(counter)) for uid, counter in word_counters.items())
    return word_dict, weibo_list
Пример #21
0
def get_user_geo(uid, dropped_geos=u"中国&美国"):
    """Return the set of places associated with a user.

    The user's portrait record is consulted first.  When it has no
    ``activity_geo`` value, the places are collected from the ``geo`` field
    of the user's weibo in the seven most recent daily indices.

    :param uid: id of the user
    :param dropped_geos: "&"-separated place names to leave out of the result
        (original note: every geo value contains the country name)
    :return: set of place names without the dropped ones
    """
    dropped = set(dropped_geos.split("&"))

    # user portrait lookup
    try:
        # NOTE(review): the portrait index is read with profile_index_type as
        # doc_type -- confirm this is intended.
        user_portrait_result = es_user_portrait. \
            get_source(index=portrait_index_name, doc_type=profile_index_type, id=uid)
    except NotFoundError:
        user_portrait_result = None

    activity_geo = user_portrait_result.get("activity_geo") if user_portrait_result else None
    if activity_geo:
        # The stored value is JSON, never a set, so it has to be converted
        # before the set difference below.
        # assumes a string value uses the same "&"-separated layout as the
        # weibo geo field -- TODO confirm
        if hasattr(activity_geo, "split"):
            geos = set(activity_geo.split("&"))
        else:
            geos = set(activity_geo)
    else:
        # no geo in the portrait: derive it from the recently posted weibo
        now_timestamp = datetime2ts(ts2datetime(time.time()))
        if RUN_TYPE == 0:
            now_timestamp = datetime2ts(RUN_TEST_TIME)
        flow_text_index_list = [flow_text_index_name_pre + ts2datetime(now_timestamp - DAY * i)
                                for i in range(7, 0, -1)]

        weibo_all = es_flow_text.search(index=flow_text_index_list,
                                        doc_type=flow_text_index_type,
                                        body={
                                            'query': {
                                                'filtered': {
                                                    'filter': {
                                                        'term': {
                                                            'uid': uid
                                                        }
                                                    }
                                                }
                                            },
                                            'size': 2000,
                                        })['hits']['hits']
        geos = set()
        for hit in weibo_all:
            geo = hit["_source"].get("geo")
            if geo:
                geos |= set(geo.split("&"))

    # apply the exclusion list on both paths
    return geos - dropped
Пример #22
0
def get_influence_content(uid, timestamp_from, timestamp_to):
    """Return the weibo a user posted in ``[timestamp_from, timestamp_to)``.

    The interval is split per calendar day because the index name is built
    from the day (``flow_text_index_name_pre`` + date); every day touched by
    the interval is searched in its own index.

    Args:
        uid: id of the user.
        timestamp_from: inclusive start of the interval (unix timestamp).
        timestamp_to: exclusive end of the interval (unix timestamp).

    Returns:
        A list of dicts with the keys ``timestamp`` (formatted by
        ``ts2date``), ``ip``, ``text`` and ``geo`` (``&`` replaced by tabs,
        ``''`` when the weibo has no geo), oldest day first and ascending by
        timestamp within a day.
    """
    # One range filter per day, clipped to the requested interval.  The loop
    # runs up to timestamp_to so the day containing the end is covered too.
    day_searches = []
    day_start_ts = datetime2ts(ts2datetime(timestamp_from))
    while day_start_ts < timestamp_to:
        next_day_ts = day_start_ts + DAY
        range_filter = {'range': {'timestamp': {
            'gte': max(day_start_ts, timestamp_from),
            'lt': min(next_day_ts, timestamp_to),
        }}}
        day_searches.append((flow_text_index_name_pre + ts2datetime(day_start_ts), range_filter))
        day_start_ts = next_day_ts

    # NOTE(review): no 'size' is given, so each daily search returns at most
    # the Elasticsearch default number of hits -- confirm this is intended.
    hits = []
    for flow_text_index_name, range_filter in day_searches:
        must_clauses = [{'term': {'uid': uid}}, range_filter]
        try:
            day_hits = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type,
                                           body={'query': {'bool': {'must': must_clauses}},
                                                 'sort': [{'timestamp': 'asc'}]})['hits']['hits']
        except Exception:
            # best effort: skip days whose index cannot be searched
            day_hits = []
        hits.extend(day_hits)

    # build the result from the hits of *all* days
    weibo_list = []
    for item in hits:
        source = item['_source']
        weibo = {}
        weibo['timestamp'] = ts2date(source['timestamp'])
        weibo['ip'] = source['ip']
        weibo['text'] = source['text']
        if source['geo']:
            weibo['geo'] = '\t'.join(source['geo'].split('&'))
        else:
            weibo['geo'] = ''
        weibo_list.append(weibo)

    return weibo_list
Пример #23
0
def read_flow_text_sentiment(uid_list):
    '''Read the weibo (including their sentiment label) of the given users.

    Args:
        uid_list: list of user ids (strings).

    Returns:
        A tuple ``(word_dict, weibo_list)``.

        ``word_dict`` maps each uid to its word frequencies summed over all
        of that user's weibo, e.g. ``{uid1: {'w1': f1, 'w2': f2, ...}, ...}``.

        ``weibo_list`` holds one ``[uid, text, sentiment, timestamp]`` record
        per weibo, e.g. ``[[uid1, text1, s1, ts1], [uid2, text2, s2, ts2]]``.

        uid, text and the words are UTF-8 encoded byte strings.
    '''
    word_counters = {}  # uid -> Counter of word frequencies
    weibo_list = []
    # NOTE(review): the reference date is pinned to 2013-09-08; switch to
    # datetime2ts(ts2datetime(time.time())) to read the live indices.
    now_date_ts = datetime2ts('2013-09-08')
    start_date_ts = now_date_ts - DAY * WEEK
    search_body = {'query': {'filtered': {'filter': {'terms': {'uid': uid_list}}}}, 'size': MAX_VALUE}
    wanted_fields = ['text', 'uid', 'sentiment', 'keywords_dict', 'timestamp']

    for i in range(0, WEEK):
        flow_text_index_name = flow_text_index_name_pre + ts2datetime(start_date_ts + DAY * i)
        print(flow_text_index_name)
        try:
            flow_text_exist = es_flow_text.search(index=flow_text_index_name,
                                                  doc_type=flow_text_index_type,
                                                  body=search_body, _source=False,
                                                  fields=wanted_fields)['hits']['hits']
        except Exception:
            # best effort: skip days whose index cannot be searched
            flow_text_exist = []

        for flow_text_item in flow_text_exist:
            item_fields = flow_text_item['fields']
            uid = item_fields['uid'][0].encode('utf-8')
            text = item_fields['text'][0].encode('utf-8')
            sentiment = int(item_fields['sentiment'][0])
            ts = item_fields['timestamp'][0]
            # Re-key the decoded JSON with UTF-8 byte strings explicitly
            # instead of round-tripping the document through eval().
            keywords = dict(
                (word.encode('utf-8'), freq)
                for word, freq in json.loads(item_fields['keywords_dict'][0]).items()
            )
            word_counters.setdefault(uid, Counter()).update(keywords)
            weibo_list.append([uid, text, sentiment, ts])

    # hand plain dicts back to the caller
    word_dict = dict((uid, dict(counter)) for uid, counter in word_counters.items())
    return word_dict, weibo_list
Пример #24
0
def cctv_video_rec(uid, k=10):
    """Pick videos for a user from the two video catalogues.

    The words of the user's weibo text in the seven most recent daily
    indices are matched against the topic names loaded from
    ``RIO_VIDEO_INFO_FILE``.

    Args:
        uid: id of the user.
        k: maximum number of videos per catalogue.

    Returns:
        A dict with two lists: ``"tiger"`` is a random sample of up to ``k``
        entries loaded from ``TIGER_VIDEO_INFO_FILE``; ``"rio"`` holds up to
        ``k`` videos of the topics that occur among the user's words, or of
        randomly chosen topics when none matches.
    """
    now_timestamp = datetime2ts(ts2datetime(time.time()))
    if RUN_TYPE == 0:
        now_timestamp = datetime2ts(RUN_TEST_TIME)
    flow_text_index_list = [flow_text_index_name_pre + ts2datetime(now_timestamp - DAY * i)
                            for i in range(7, 0, -1)]

    weibo_all = es_flow_text.search(index=flow_text_index_list,
                                    doc_type=flow_text_index_type,
                                    body={
                                        'query': {
                                            'filtered': {
                                                'filter': {
                                                    'term': {
                                                        'uid': uid
                                                    }
                                                }
                                            }
                                        },
                                        'size': 100,
                                    })['hits']['hits']
    user_words = set()
    for weibo in weibo_all:
        # segment the weibo text (not the ip field) into words
        weibo_text = weibo["_source"]["text"]
        user_words |= set(jieba.cut(weibo_text))

    rio_dict = load_topic_video_dict(RIO_VIDEO_INFO_FILE)
    tiger_videos = load_videos(TIGER_VIDEO_INFO_FILE)

    ret_dict = dict()
    # random.sample raises ValueError when asked for more than is available
    ret_dict["tiger"] = random.sample(tiger_videos, min(k, len(tiger_videos)))

    topics = list(rio_dict.keys())
    user_pref_topic = set(topics) & user_words
    # nothing matched: fall back to random topics
    if len(user_pref_topic) == 0:
        user_pref_topic = set(random.sample(topics, min(k, len(topics))))
    ret_dict["rio"] = list()
    for topic in user_pref_topic:
        ret_dict["rio"].extend(rio_dict[topic])
        if len(ret_dict["rio"]) >= k:
            ret_dict["rio"] = ret_dict["rio"][:k]
            break
    return ret_dict
Пример #25
0
def read_flow_text_sentiment(uid_list):
    '''Read the weibo (including their sentiment label) of the given users.

    Args:
        uid_list: list of user ids (strings).

    Returns:
        A tuple ``(word_dict, weibo_list)``.

        ``word_dict`` maps each uid to its word frequencies summed over all
        of that user's weibo, e.g. ``{uid1: {'w1': f1, 'w2': f2, ...}, ...}``.

        ``weibo_list`` holds one ``[uid, text, sentiment, timestamp]`` record
        per weibo, e.g. ``[[uid1, text1, s1, ts1], [uid2, text2, s2, ts2]]``.

        uid, text and the words are UTF-8 encoded byte strings.
    '''
    word_counters = {}  # uid -> Counter of word frequencies
    weibo_list = []
    # NOTE(review): the reference date is pinned to 2013-09-08; switch to
    # datetime2ts(ts2datetime(time.time())) to read the live indices.
    now_date_ts = datetime2ts('2013-09-08')
    start_date_ts = now_date_ts - DAY * WEEK
    search_body = {'query': {'filtered': {'filter': {'terms': {'uid': uid_list}}}}, 'size': MAX_VALUE}
    wanted_fields = ['text', 'uid', 'sentiment', 'keywords_dict', 'timestamp']

    for i in range(0, WEEK):
        flow_text_index_name = flow_text_index_name_pre + ts2datetime(start_date_ts + DAY * i)
        print(flow_text_index_name)
        try:
            flow_text_exist = es_flow_text.search(index=flow_text_index_name,
                                                  doc_type=flow_text_index_type,
                                                  body=search_body, _source=False,
                                                  fields=wanted_fields)['hits']['hits']
        except Exception:
            # best effort: skip days whose index cannot be searched
            flow_text_exist = []

        for flow_text_item in flow_text_exist:
            item_fields = flow_text_item['fields']
            uid = item_fields['uid'][0].encode('utf-8')
            text = item_fields['text'][0].encode('utf-8')
            sentiment = int(item_fields['sentiment'][0])
            ts = item_fields['timestamp'][0]
            # Re-key the decoded JSON with UTF-8 byte strings explicitly
            # instead of round-tripping the document through eval().
            keywords = dict(
                (word.encode('utf-8'), freq)
                for word, freq in json.loads(item_fields['keywords_dict'][0]).items()
            )
            word_counters.setdefault(uid, Counter()).update(keywords)
            weibo_list.append([uid, text, sentiment, ts])

    # hand plain dicts back to the caller
    word_dict = dict((uid, dict(counter)) for uid, counter in word_counters.items())
    return word_dict, weibo_list
Пример #26
0
def localRec(uid, k=200):
    """Recommend weibo whose ip shares its leading part with the user's ip.

    Args:
        uid: id of the user to recommend for.
        k: not used by the current implementation.

    Returns:
        A list of weibo ``_source`` dicts, at most one per ip and only those
        accepted by ``is_suit``, each extended with ``len`` (text length),
        ``weibo_url``, ``photo_url`` and ``nick_name`` (both ``"None"`` when
        the author's profile is not available).
    """
    # RUN_TYPE 0 -> use the fixed RUN_TEST_TIME date, 1 -> use the current time
    now_timestamp = datetime2ts(ts2datetime(time.time()))
    if RUN_TYPE == 0:
        now_timestamp = datetime2ts(RUN_TEST_TIME)

    flow_text_index_list = [flow_text_index_name_pre + ts2datetime(now_timestamp - DAY * i)
                            for i in range(7, 0, -1)]

    # Original note: filtering on text length directly in Elasticsearch
    # (regexp ".{100,}") works but is very slow, so it is not done here.

    # search prefix: the user's ip without its last two dot-separated parts
    ip = get_user_ip(uid)
    ip_prefix = ".".join(ip.split(".")[:-2])
    weibo_all = es_flow_text.search(index=flow_text_index_list,
                                    doc_type=ads_weibo_index_type,
                                    body={
                                        "query": {
                                            "bool": {
                                                "must": [{
                                                    "prefix": {
                                                        "text.ip": ip_prefix
                                                    }
                                                }]
                                            }
                                        },
                                        "size": 2000
                                    })["hits"]["hits"]

    local_weibo_rec = []
    weibo_user_uids = [hit["_source"]["uid"] for hit in weibo_all]
    user_profiles = search_user_profile_by_user_ids(weibo_user_uids)
    seen_ips = set()
    for hit in weibo_all:
        weibo = hit["_source"]
        weibo_text = weibo["text"]
        if weibo["ip"] in seen_ips:
            continue
        # only one weibo per ip
        seen_ips.add(weibo["ip"])
        if not is_suit(weibo_text):
            continue
        weibo["len"] = len(weibo_text)
        try:
            mid = weibo["mid"]
            author_uid = weibo["uid"]
        except KeyError:
            continue
        weibo["weibo_url"] = weiboinfo2url(author_uid, mid)
        # many authors may have no profile record
        if author_uid in user_profiles:
            weibo["photo_url"] = user_profiles[author_uid]["photo_url"]
            weibo["nick_name"] = user_profiles[author_uid]["nick_name"]
        else:
            weibo["photo_url"] = "None"
            weibo["nick_name"] = "None"
        # keep the weibo whether or not a profile was found
        local_weibo_rec.append(weibo)
    return local_weibo_rec
Пример #27
0
def adsRec(uid, queryInterval=HOUR * 4):
    '''
    Recommend ad weibo to a user.

    Weibo newer than ``queryInterval`` seconds before the reference date are
    read from the seven most recent daily indices and ranked by
    ``adsPreferred`` against the user's keyword-derived topic distribution.

    :param uid: user id
    :param queryInterval: how far before the reference date to look (seconds)
    :return: whatever ``adsPreferred`` returns (per the original notes: the ad
        weibo list ordered by relevance to the user)
    '''
    # RUN_TYPE 1 -> today; otherwise -> the day before RUN_TEST_TIME
    if RUN_TYPE == 1:
        reference_date = ts2datetime(time.time())
    else:
        reference_date = ts2datetime(datetime2ts(RUN_TEST_TIME) - DAY)

    # user preferences: the "&"-separated keywords of the portrait record
    portrait = es_user_portrait.get_source(
        index=portrait_index_name, doc_type=profile_index_type, id=uid)
    user_key_words = set(portrait["keywords_string"].split("&"))

    # the candidate weibo are computed on the fly from the raw daily indices
    base_timestamp = datetime2ts(ts2datetime(time.time()))
    if RUN_TYPE == 0:
        base_timestamp = datetime2ts(RUN_TEST_TIME)

    daily_indices = []
    for days_back in range(7, 0, -1):
        day = ts2datetime(base_timestamp - DAY * days_back)
        daily_indices.append(flow_text_index_name_pre + day)

    search_body = {
        'query': {
            "filtered": {
                "filter": {
                    "range": {
                        "timestamp": {
                            "gte": datetime2ts(reference_date) - queryInterval
                        }
                    }
                }
            }
        },
        'size': 2000,
    }
    response = es_flow_text.search(
        index=daily_indices,
        doc_type=ads_weibo_index_type,
        body=search_body)
    candidate_weibo = response['hits']['hits']

    # per-topic word weights loaded from ADS_TOPIC_TFIDF_DIR
    topic_word_weight_dic = construct_topic_word_weight_dic(
        ADS_TOPIC_TFIDF_DIR)

    # Original note: the user's distribution over the ad topics is derived
    # from the keywords, because the existing topics do not suit ads well.
    user_topic_dic = construct_topic_feature_dic(user_key_words,
                                                 topic_word_weight_dic)

    return adsPreferred(user_topic_dic, candidate_weibo,
                        topic_word_weight_dic, 30)
Пример #28
0
def get_sensitive_weibo_detail(ts, social_sensors, sensitive_words_list, message_type, size=100):
    """Fetch the newest weibo that carry sensitive keywords in one time window.

    The window is ``[ts - time_interval, ts)``.  When both ends fall on the
    same day that day's index is searched, otherwise the index of the day
    the window starts on; a missing index gives an empty result.

    Args:
        ts: exclusive end of the window (unix timestamp).
        social_sensors: optional list of uids to restrict the search to.
        sensitive_words_list: keywords to match against ``keywords_string``.
        message_type: value the weibo's ``message_type`` must have.
        size: maximum number of weibo to fetch.

    Returns:
        A list of rows ``[uid, nick_name, photo_url, text, sentiment, date,
        geo, common_keywords, message_type]``, newest first.  ``nick_name``
        is ``"unknown"`` and ``photo_url`` is ``""`` for users without a
        profile record.
    """
    must_filters = [
        {"range": {"timestamp": {"gte": ts - time_interval, "lt": ts}}},
        {"term": {"message_type": message_type}},
        {"terms": {"keywords_string": sensitive_words_list}},
    ]
    if social_sensors:
        must_filters.append({"terms": {"uid": social_sensors}})
    query_body = {
        "query": {"filtered": {"filter": {"bool": {"must": must_filters}}}},
        "size": size,
        "sort": {"timestamp": {"order": "desc"}},
    }

    end_day = ts2datetime(ts)
    start_day = ts2datetime(ts - time_interval)
    end_index = flow_text_index_name_pre + end_day
    end_index_exists = es_text.indices.exists(end_index)
    start_index = flow_text_index_name_pre + start_day
    start_index_exists = es_text.indices.exists(start_index)

    # pick the single index to search, if it exists
    if end_day == start_day:
        target_index = end_index if end_index_exists else None
    else:
        target_index = start_index if start_index_exists else None
    if target_index is None:
        return []

    hits = es_text.search(index=target_index, doc_type=flow_text_index_type,
                          body=query_body)["hits"]["hits"]
    if not hits:
        return []

    # profile lookup for every author, in hit order
    author_uids = [hit["_source"]['uid'] for hit in hits]
    profile_docs = es_profile.mget(index=profile_index_name, doc_type=profile_index_type,
                                   body={"ids": author_uids},
                                   fields=['nick_name', 'photo_url'])["docs"]

    sensitive_words = set(sensitive_words_list)
    rows = []
    for position, hit in enumerate(hits):
        weibo = hit['_source']
        profile = profile_docs[position]
        if profile['found']:
            nick_name = profile["fields"]["nick_name"][0]
            photo_url = profile["fields"]["photo_url"][0]
        else:
            nick_name = "unknown"
            photo_url = ""
        # sensitive keywords that actually occur in this weibo
        matched_words = sensitive_words & set(weibo['keywords_string'].split('&'))
        rows.append([
            weibo['uid'],
            nick_name,
            photo_url,
            weibo["text"],
            weibo["sentiment"],
            ts2date(weibo['timestamp']),
            weibo['geo'],
            list(matched_words),
            weibo['message_type'],
        ])

    return rows
Пример #29
0
def get_sensitive_text_detail(task_name, ts, user, order):
    """Return the sensitive weibo recorded for one sensing-task snapshot.

    The snapshot document (``doc_type`` = ``user-task_name``, ``id`` = ``ts``)
    stores, per mid, a total value plus retweet and comment counts.  The
    matching weibo are read from the indices of ``ts``'s day and the day
    before, and joined with the authors' profile data.

    Args:
        task_name: name of the sensing task.
        ts: snapshot timestamp (unix timestamp).
        user: owner of the task.
        order: ``"total"``, ``"retweeted"`` or ``"comment"`` sort by the
            recorded value, ``"ts"`` by date, ``"sensitive"`` by sensitive
            score (all descending); any other value keeps the stored order.

    Returns:
        A list of rows ``[uid, nick_name, photo_url, text, sentiment, date,
        geo, message_type, retweeted, comment, sensitive]``.  Only mids whose
        weibo was found are included; ``nick_name`` falls back to the uid and
        ``photo_url`` to ``""`` when no profile data is available.
    """
    _id = user + '-' + task_name
    task_detail = es.get(index=index_sensing_task, doc_type=_id, id=ts)['_source']
    # "or {}": a stored null must behave like an empty detail
    weibo_detail = json.loads(task_detail['sensitive_weibo_detail']) or {}

    # rows of [mid, total, retweeted, comment]
    weibo_detail_list = [[mid, item[mid], item['retweeted'], item['comment']]
                         for mid, item in weibo_detail.items()]
    mid_list = list(weibo_detail)
    if not mid_list:
        return []

    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "terms": {"mid": mid_list}
                }
            }
        },
        # without an explicit size only the default page of hits comes back
        "size": len(mid_list)
    }

    index_list = []
    for day in (ts2datetime(ts), ts2datetime(ts - DAY)):
        index_name = flow_text_index_name_pre + day
        if es_text.indices.exists(index_name):
            index_list.append(index_name)
    if not index_list:
        return []

    search_results = es_text.search(index=index_list, doc_type=flow_text_index_type,
                                    body=query_body)["hits"]["hits"]
    if not search_results:
        return []

    uid_list = []
    text_dict = dict()  # mid -> weibo source (the hit's _id is the mid)
    for hit in search_results:
        uid_list.append(hit["_source"]['uid'])
        text_dict[hit['_id']] = hit['_source']

    portrait_dict = dict()  # uid -> profile data
    portrait_result = es_profile.mget(index=profile_index_name, doc_type=profile_index_type,
                                      body={"ids": uid_list},
                                      fields=['nick_name', 'photo_url'])["docs"]
    for doc in portrait_result:
        if doc['found']:
            portrait_dict[doc['_id']] = {"nick_name": doc["fields"]["nick_name"][0],
                                         "photo_url": doc["fields"]["photo_url"][0]}
        else:
            portrait_dict[doc['_id']] = {"nick_name": doc['_id'], "photo_url": ""}

    # orders that are decided by the recorded task values
    sort_column = {"total": 1, "retweeted": 2, "comment": 3}.get(order)
    if sort_column is not None:
        sorted_list = sorted(weibo_detail_list, key=lambda row: row[sort_column], reverse=True)
    else:
        sorted_list = weibo_detail_list

    results = []
    for mid, _total, retweeted, comment in sorted_list:
        iter_text = text_dict.get(mid)
        if not iter_text:
            continue
        uid = iter_text['uid']
        iter_portrait = portrait_dict.get(uid) or {"nick_name": uid, "photo_url": ''}
        results.append([
            uid,
            iter_portrait['nick_name'],
            iter_portrait['photo_url'],
            iter_text["text"],
            iter_text["sentiment"],
            ts2date(iter_text['timestamp']),
            iter_text['geo'],
            iter_text['message_type'],
            retweeted,
            comment,
            iter_text.get('sensitive', 0),
        ])

    # orders that are decided by the weibo itself
    if order == "ts":
        results = sorted(results, key=lambda row: row[5], reverse=True)
    elif order == "sensitive":
        results = sorted(results, key=lambda row: row[-1], reverse=True)

    return results
Пример #30
0
def search_sentiment_detail_all_keywords(start_ts, task_type, task_detail, time_segment, sentiment, sort_type):
    """Collect weibo, users and keywords for a keyword task in one time segment.

    Args:
        start_ts: start of the segment (unix timestamp, int or numeric string).
        task_type: not used by the current implementation.
        task_detail: comma-separated keywords; a weibo matches when its text
            contains any of them.
        time_segment: key into ``str2segment`` giving the segment length.
        sentiment: sentiment label to filter on; ``'7'`` selects every label
            in ``SENTIMENT_SECOND``.
        sort_type: sort specification passed to the weibo search.

    Returns:
        A dict with the keys ``weibo`` (weibo list with user names added),
        ``in_portrait_result`` and ``out_portrait_result`` (at most
        ``SENTIMENT_MAX_USER`` ``(uid, info)`` pairs each, sorted descending
        by ``info[1]`` and ``info[3]`` respectively) and ``keywords``
        (``[keyword, doc_count]`` pairs).
    """
    results = {}
    #step0: build the keyword part of the query
    keywords_list = task_detail.split(',')
    print('keywords_list: %s' % (keywords_list,))
    keyword_nest_body_list = [{'wildcard': {'text': '*' + keyword + '*'}} for keyword in keywords_list]
    must_query_list = [{'bool': {'should': keyword_nest_body_list}}]

    #step1: get weibo from flow_text
    start_ts = int(start_ts)
    start_date = ts2datetime(start_ts)
    end_ts = start_ts + str2segment[time_segment]
    if sentiment == '7':
        query_sentiment_list = SENTIMENT_SECOND
    else:
        query_sentiment_list = [sentiment]
    must_query_list.append({'range': {'timestamp': {'gte': start_ts, 'lt': end_ts}}})
    must_query_list.append({'terms': {'sentiment': query_sentiment_list}})
    query_body = {
        'query': {'bool': {'must': must_query_list}},
        'size': SENTIMENT_MAX_TEXT,
        'sort': sort_type
    }
    flow_text_index_name = flow_text_index_name_pre + start_date
    print('flow_text_index_name: %s' % (flow_text_index_name,))
    try:
        flow_text_result = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type,
                                               body=query_body)['hits']['hits']
    except Exception:
        # best effort: treat a failing search as "no weibo"
        flow_text_result = []
    print('flow_text_result: %s' % len(flow_text_result))
    print('show weibo list')
    show_weibo_list, user_set = deal_show_weibo_list(flow_text_result)
    print('get keyword')

    #step2: get keywords from flow_text
    keyword_query_dict = {
        'query': {'bool': {'must': must_query_list}},
        'aggs': {
            'all_interests': {
                'terms': {
                    'field': 'keywords_string',
                    'size': SENTIMENT_MAX_KEYWORDS
                }
            }
        }
    }
    try:
        keyword_buckets = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type,
                                              body=keyword_query_dict)['aggregations']['all_interests']['buckets']
    except Exception:
        # same index as step1: a failure there must not crash here either
        keyword_buckets = []
    keyword_counts = [[bucket['key'], bucket['doc_count']] for bucket in keyword_buckets]

    #step3: get user information
    filter_type = 'in-out'
    in_portrait_result, out_portrait_result = identify_user_portrait(user_set, filter_type)
    #step4: add uname to show weibo list
    show_weibo_list = add_uname2weibo(show_weibo_list, in_portrait_result, out_portrait_result)
    #step5: results
    results['weibo'] = show_weibo_list
    results['in_portrait_result'] = sorted(in_portrait_result.items(), key=lambda x: x[1][1], reverse=True)[:SENTIMENT_MAX_USER]
    results['out_portrait_result'] = sorted(out_portrait_result.items(), key=lambda x: x[1][3], reverse=True)[:SENTIMENT_MAX_USER]
    results['keywords'] = keyword_counts
    return results
Пример #31
0
def get_origin_weibo_detail(ts, user, task_name, size, order, message_type=1):
    """Return the weibo recorded for one sensing-task time point, grouped by duplicates.

    The task document (doc_type ``user + '-' + task_name``, id ``ts``) supplies
    the per-mid counters, the topic value of each mid and the duplicate
    mapping.  The weibo texts are read from the flow-text indexes of the day
    of ``ts`` and of the day before; nick names and photos come from the
    profile index.

    Args:
        ts: timestamp; id of the task document and anchor of the two daily
            flow-text indexes that are searched.
        user: first half of the task document's doc_type.
        task_name: second half of the task document's doc_type.
        size: accepted for interface compatibility; not used here.
        order: "total", "retweeted" or "comment" pre-sorts the counters by that
            column (descending); any other value keeps the stored order.
            The final ranking below is a stable sort, so this only breaks ties.
        message_type: 1 reads ``origin_weibo_detail``, 2 reads
            ``retweeted_weibo_detail``, anything else reads
            ``sensitive_weibo_detail``.

    Returns:
        A list of groups.  Each group is a list of rows; weibo that the task
        marked as duplicates of each other share one group.  A row is
        ``[uid, nick_name, photo_url, text, sentiment, date, geo,
        message_type, keywords_string, retweeted, comment, sensitive,
        timestamp, topic_value, mid]``.  Empty when nothing was found.
    """
    _id = user + '-' + task_name
    task_detail = es_user_portrait.get(index=index_sensing_task,
                                       doc_type=_id,
                                       id=ts)['_source']

    mid_value = json.loads(task_detail['mid_topic_value'])
    duplicate_dict = json.loads(task_detail['duplicate_dict'])
    # Invert {k: v} into {v: [first k, v, further k ...]} so that every group
    # lists all mids that are duplicates of each other.
    duplicate_groups = dict()
    for k, v in duplicate_dict.items():
        if v in duplicate_groups:
            duplicate_groups[v].append(k)
        else:
            duplicate_groups[v] = [k, v]

    if message_type == 1:
        detail_field = 'origin_weibo_detail'
    elif message_type == 2:
        detail_field = 'retweeted_weibo_detail'
    else:
        detail_field = 'sensitive_weibo_detail'
    weibo_detail = json.loads(task_detail[detail_field])

    # Rows of [mid, value stored under the mid itself, retweeted, comment].
    weibo_detail_list = []
    if weibo_detail:
        for iter_mid, item in weibo_detail.items():
            weibo_detail_list.append(
                [iter_mid, item[iter_mid], item['retweeted'], item['comment']])
    mid_list = list(weibo_detail) if weibo_detail else []
    print(len(mid_list))

    results = []
    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "terms": {
                        "mid": mid_list
                    }
                }
            }
        },
        "size": 1000,
        "sort": {
            "timestamp": {
                "order": "desc"
            }
        }
    }

    # Search today's and yesterday's flow-text index, whichever exist.
    index_list = []
    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts - DAY)
    index_name = flow_text_index_name_pre + datetime
    print(es_text)
    exist_es = es_text.indices.exists(index_name)
    print(exist_es)
    if exist_es:
        index_list.append(index_name)
    index_name_1 = flow_text_index_name_pre + datetime_1
    if es_text.indices.exists(index_name_1):
        index_list.append(index_name_1)

    if index_list and mid_list:
        search_results = es_text.search(index=index_list,
                                        doc_type=flow_text_index_type,
                                        body=query_body)["hits"]["hits"]
    else:
        search_results = []

    sort_results = []
    if not search_results:
        return sort_results

    uid_list = []
    text_dict = dict()  # text information, keyed by _id (the mid)
    for item in search_results:
        uid_list.append(item["_source"]['uid'])
        text_dict[item['_id']] = item['_source']

    portrait_dict = dict()  # profile information, keyed by uid
    if uid_list:
        portrait_result = es_profile.mget(
            index=profile_index_name,
            doc_type=profile_index_type,
            body={"ids": uid_list},
            fields=['nick_name', 'photo_url'])["docs"]
        for item in portrait_result:
            if item['found']:
                portrait_dict[item['_id']] = {
                    "nick_name": item["fields"]["nick_name"][0],
                    "photo_url": item["fields"]["photo_url"][0]
                }
            else:
                # Unknown user: fall back to the uid as display name.
                portrait_dict[item['_id']] = {
                    "nick_name": item['_id'],
                    "photo_url": ""
                }

    order_column = {"total": 1, "retweeted": 2, "comment": 3}.get(order)
    if order_column is None:
        sorted_list = weibo_detail_list
    else:
        sorted_list = sorted(weibo_detail_list,
                             key=operator.itemgetter(order_column),
                             reverse=True)

    for item in sorted_list:
        mid = item[0]
        iter_text = text_dict.get(mid, {})
        if not iter_text:
            # The task knows this mid but its text was not found in the indexes.
            continue
        uid = iter_text['uid']
        temp = [uid]
        iter_portrait = portrait_dict.get(uid, {})
        if iter_portrait:
            temp.append(iter_portrait['nick_name'])
            temp.append(iter_portrait['photo_url'])
        else:
            temp.extend([uid, ''])
        temp.append(iter_text["text"])
        temp.append(iter_text["sentiment"])
        temp.append(ts2date(iter_text['timestamp']))
        temp.append(iter_text['geo'])
        if message_type == 1:
            temp.append(1)
        elif message_type == 2:
            temp.append(3)
        else:
            temp.append(iter_text['message_type'])
        temp.append(iter_text['keywords_string'])
        temp.append(item[2])
        temp.append(item[3])
        temp.append(iter_text.get('sensitive', 0))
        temp.append(iter_text['timestamp'])
        temp.append(mid_value[mid])
        temp.append(mid)
        results.append(temp)

    # Rank by sensitive score (-4), topic value (-2), retweet count (-6).
    results = sorted(results,
                     key=operator.itemgetter(-4, -2, -6),
                     reverse=True)
    mid_index_dict = dict()
    for position, item in enumerate(results):
        sort_results.append([item])
        mid_index_dict[item[-1]] = position

    if duplicate_groups:
        merged_away = []
        for group in duplicate_groups.values():
            # Positions of the group's mids that are present in the result.
            # Membership is tested explicitly: position 0 is a valid hit, and
            # a mid listed twice must not be merged into itself.
            positions = []
            for mid in group:
                if mid in mid_index_dict:
                    position = mid_index_dict[mid]
                    if position not in positions:
                        positions.append(position)
            if len(positions) < 2:
                continue
            keep = min(positions)
            positions.remove(keep)
            for position in positions:
                sort_results[keep].extend(sort_results[position])
                merged_away.append(sort_results[position])
        if merged_away:
            # Drop the merged groups by identity so that equal-looking groups
            # or a group queued twice cannot remove the wrong entry or raise.
            merged_ids = set(id(group) for group in merged_away)
            sort_results = [group for group in sort_results
                            if id(group) not in merged_ids]

    return sort_results
Пример #32
0
def get_positive_weibo_detail(ts, social_sensors, keywords_list, size, sentiment_type=1):
    """Return up to 100 recent weibo of a given sentiment related to the sensed mids.

    The mids come from two ``query_mid_list`` calls (previous and current
    period).  Weibo posted in ``[ts - time_interval, ts)`` that carry the
    requested sentiment are then fetched, newest first.

    Args:
        ts: end timestamp of the window.
        social_sensors: passed through to ``query_mid_list``.
        keywords_list: keywords passed to ``query_mid_list``; also intersected
            with each weibo's own keywords for the ``common_keywords`` column.
        size: accepted for interface compatibility; the query size is fixed
            at 100.
        sentiment_type: 0 or 1 selects exactly that sentiment value; any other
            value selects sentiments "2" and "3".

    Returns:
        A list of rows ``[uid, nick_name, photo_url, text, sentiment, date,
        geo, common_keywords, message_type]``; empty when the index is missing
        or nothing matches.
    """
    # NOTE(review): the first call passes `time_segment`, the second
    # `time_interval`; `time_segment` is not defined in this function, so it
    # must be a module-level name - confirm this asymmetry is intended.
    former_mid_list = query_mid_list(ts-time_interval, keywords_list, time_segment, social_sensors)  # mids of the previous period
    current_mid_list = query_mid_list(ts, keywords_list, time_interval, social_sensors)
    mid_list = list(former_mid_list) + list(current_mid_list)

    # Each clause must be a dict; appending a list here would nest a list
    # inside "must" and produce a malformed query.
    if int(sentiment_type) in (0, 1):
        sentiment_filter = {"term": {"sentiment": sentiment_type}}
    else:
        sentiment_filter = {"terms": {"sentiment": ["2", "3"]}}

    query_body = {
        "query":{
            "filtered":{
                "filter":{
                    "bool":{
                        "must":[
                            {"range":{
                                "timestamp":{
                                    "gte": ts - time_interval,
                                    "lt": ts
                                }
                            }},
                            sentiment_filter
                        ],
                        "should":[
                            {"terms": {"root_mid": mid_list}},
                            {"terms": {"mid": mid_list}},
                        ]
                    }
                }
            }
        },
        "sort": {"timestamp": {"order": "desc"}},
        "size": 100
    }

    # The index of the day containing the window start is the one queried:
    # when ts and ts - time_interval fall on the same day both names are
    # identical, otherwise the earlier day's index is used.
    index_name = flow_text_index_name_pre + ts2datetime(ts-time_interval)
    if es_text.indices.exists(index_name):
        search_results = es_text.search(index=index_name, doc_type=flow_text_index_type, body=query_body)["hits"]["hits"]
    else:
        search_results = []

    results = []
    if not search_results:
        return results

    uid_list = [hit["_source"]['uid'] for hit in search_results]
    # The rows below rely on the returned docs lining up one-to-one, in
    # order, with uid_list.
    portrait_result = es_profile.mget(index=profile_index_name, doc_type=profile_index_type, body={"ids":uid_list}, fields=['nick_name', 'photo_url'])["docs"]

    wanted_keywords = set(keywords_list)
    for hit, portrait in zip(search_results, portrait_result):
        item = hit['_source']
        temp = [item['uid']]
        if portrait['found']:
            temp.append(portrait["fields"]["nick_name"][0])
            temp.append(portrait["fields"]["photo_url"][0])
        else:
            temp.append("unknown")
            temp.append("")
        temp.append(item["text"])
        temp.append(item["sentiment"])
        temp.append(ts2date(item['timestamp']))
        temp.append(item['geo'])
        common_keywords = wanted_keywords & set(item['keywords_string'].split('&'))
        temp.append(list(common_keywords))
        temp.append(item['message_type'])
        results.append(temp)

    return results
Пример #33
0
def search_retweet_network_keywords(task_id, uid):
    """Count, per peer user, the keyword-matching weibo linking ``uid`` to that peer.

    The task document supplies ``start_date``, ``end_date`` and
    ``query_keywords`` (keywords joined by '&').  Every daily flow-text index
    in that inclusive date range is searched for weibo whose text contains at
    least one keyword.

    Args:
        task_id: id of the network-keywords task document.
        uid: the user whose retweet network is built.

    Returns:
        ``{'retweet': ..., 'be_retweet': ...}`` where each value is whatever
        ``retweet_dict2results(uid, counts)`` returns.  For 'retweet' the
        counts are keyed by the ``directed_uid`` of weibo posted by ``uid``;
        for 'be_retweet' they are keyed by the ``uid`` of weibo whose
        ``directed_uid`` is ``uid``.
    """
    task_results = es_network_task.get(index=network_keywords_index_name, \
                doc_type=network_keywords_index_type, id=task_id)['_source']

    # step1: list every day of the task, e.g. ['2013-09-01', '2013-09-02']
    start_ts = datetime2ts(task_results['start_date'])
    end_ts = datetime2ts(task_results['end_date'])
    iter_query_date_list = []
    iter_date_ts = start_ts
    while iter_date_ts <= end_ts:
        iter_query_date_list.append(ts2datetime(iter_date_ts))
        iter_date_ts += DAY

    # step2: the keyword clause shared by both directions
    keyword_nest_body_list = [
        {'wildcard': {'text': '*' + keywords_item + '*'}}
        for keywords_item in task_results['query_keywords'].split('&')
    ]
    keyword_query = {'bool': {'should': keyword_nest_body_list}}

    def count_peers(owner_field, peer_field):
        # Count hits per `peer_field` value among weibo whose `owner_field`
        # is `uid`.  A fresh "must" list is built for every direction so the
        # two queries cannot contaminate each other.
        must_list = [keyword_query, {'term': {owner_field: uid}}]
        item_results = {}
        for iter_date in iter_query_date_list:
            flow_text_index_name = flow_text_index_name_pre + iter_date
            query_body = {
                'query':{
                    'bool':{
                        'must': must_list
                    }
                },
                'size': 100
            }
            flow_text_result = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type,\
                        body=query_body)['hits']['hits']
            for item in flow_text_result:
                peer_uid = item['_source'][peer_field]
                item_results[peer_uid] = item_results.get(peer_uid, 0) + 1
        return item_results

    network_results = {}
    # retweet: weibo posted by uid, counted by the user they are directed at
    network_results['retweet'] = retweet_dict2results(
        uid, count_peers('uid', 'directed_uid'))
    # be_retweet: weibo directed at uid, counted by the user who posted them
    # (counting directed_uid here would always yield uid itself)
    network_results['be_retweet'] = retweet_dict2results(
        uid, count_peers('directed_uid', 'uid'))
    return network_results
Пример #34
0
def new_get_user_weibo(uid, sort_type):
    """Return one user's weibo from the seven daily indexes before today, deduplicated by mid.

    Args:
        uid: the user whose weibo are fetched.
        sort_type: 'timestamp', 'retweet_count', 'comment_count' or
            'sensitive'; rows are sorted descending by that column.  Any other
            value returns the rows in retrieval order.  When ``RUN_TYPE`` is 0
            the sort is forced to 'timestamp'.

    Returns:
        A list of rows ``[mid, uid, text, ip, city, timestamp, date,
        retweet_count, comment_count, sensitive_score, weibo_url]``.  When
        ``RUN_TYPE`` is not 1 the three counters are always 0.
    """
    results = []
    weibo_list = []
    now_date = ts2datetime(time.time())
    #run_type
    if RUN_TYPE == 0:
        now_date = RUN_TEST_TIME
        sort_type = 'timestamp'
    #step1: get user weibo, oldest day first
    for i in range(7, 0, -1):
        if RUN_TYPE == 1:
            iter_date = ts2datetime(datetime2ts(now_date) - i * DAY)
        else:
            iter_date = '2013-09-01'
        index_name = flow_text_index_name_pre + iter_date
        print('726')
        try:
            weibo_result = es_flow_text.search(index=index_name, doc_type=flow_text_index_type,\
                    body={'query':{'filtered':{'filter':{'term': {'uid': uid}}}}, 'size':MAX_VALUE})['hits']['hits']
        except Exception:
            # Best effort: a day whose index cannot be searched contributes nothing.
            weibo_result = []
        print('732 %d' % len(weibo_result))
        if weibo_result:
            weibo_list.extend(weibo_result)

    #step2: build one row per distinct mid
    mid_set = set()
    for weibo_item in weibo_list:
        source = weibo_item['_source']
        mid = source['mid']
        if mid in mid_set:
            continue
        mid_set.add(mid)
        weibo_uid = source['uid']
        ip = source['ip']
        timestamp = source['timestamp']
        #run_type
        if RUN_TYPE == 1:
            retweet_count = source.get('retweeted', 0)
            comment_count = source.get('comment', 0)
            sensitive_score = source.get('sensitive', 0)
        else:
            retweet_count = 0
            comment_count = 0
            sensitive_score = 0
        results.append([
            mid, weibo_uid, source['text'], ip, ip2city(ip), timestamp,
            ts2date(timestamp), retweet_count, comment_count,
            sensitive_score, weiboinfo2url(weibo_uid, mid)
        ])

    #step3: sort by the requested column
    sort_column = {
        'timestamp': 5,
        'retweet_count': 7,
        'comment_count': 8,
        'sensitive': 9,
    }.get(sort_type)
    if sort_column is None:
        # Unknown sort key: return the rows unsorted instead of failing.
        sort_results = results
    else:
        sort_results = sorted(results, key=lambda row: row[sort_column], reverse=True)
    print('778')
    return sort_results
def influenced_detail(uid, date, style):
    """Return the text detail of a user's 20 most retweeted / commented weibo of one day.

    The per-weibo counters are read from the user's influence document; only
    mids that also appear among the user's own weibo in that day's flow-text
    index are kept.

    Args:
        uid: the user.
        date: day string; used verbatim for the flow-text index name and with
            '-' removed for the influence index name.
        style: 0 = original weibo ranked by retweets, 1 = original weibo
            ranked by comments, 2 = retweeted weibo ranked by retweets,
            anything else = retweeted weibo ranked by comments.

    Returns:
        Whatever ``get_text`` returns for the selected top-20 ``(mid, count)``
        pairs, or ``{}`` when the influence document cannot be read.
    """
    index_name = pre_index + str(date).replace('-', '')
    index_text = "flow_text_" + date
    style = int(style)
    try:
        user_info = es_cluster.get(index=index_name, doc_type=influence_doctype, id=uid)["_source"]
    except Exception:
        return {}

    def ranked(field):
        # (mid, count) pairs of one counter dict, highest count first.
        return sorted(json.loads(user_info[field]).items(), key=lambda x: x[1], reverse=True)

    def own_mids(message_type):
        # _id of every weibo of this message type posted by the user that day.
        query_body = {
            "query":{
                "filtered":{
                    "filter":{
                        "bool":{
                            "must":[
                                {"term":{"message_type": message_type}},
                                {"term":{"uid": uid}}
                            ]
                        }
                    }
                }
            },
            "size": 10000
        }
        hits = es.search(index=index_text, doc_type="text", body=query_body)['hits']['hits']
        return set(item['_id'] for item in hits)

    origin_retweetd = ranked("origin_weibo_retweeted_detail")
    origin_comment = ranked('origin_weibo_comment_detail')
    retweeted_retweeted = ranked("retweeted_weibo_retweeted_detail")
    retweeted_comment = ranked("retweeted_weibo_comment_detail")

    origin_set = own_mids(1)
    retweeted_set = own_mids(3)

    # Filter into new lists: removing from a list while iterating over it
    # skips the element that follows every removal.
    if style == 0:
        selected = [item for item in origin_retweetd if item[0] in origin_set]
    elif style == 1:
        selected = [item for item in origin_comment if item[0] in origin_set]
    elif style == 2:
        selected = [item for item in retweeted_retweeted if item[0] in retweeted_set]
    else:
        selected = [item for item in retweeted_comment if item[0] in retweeted_set]

    return get_text(selected[:20], date, user_info, style)
def influenced_people(uid, mid, influence_style, date, default_number=20):
    """Describe the users who retweeted or commented on one weibo of ``uid``.

    Args:
        uid: author of the weibo.
        mid: id of the weibo; when it has a ``root_mid`` (it is itself a
            retweet) the search is done on that root mid.
        influence_style: 0 = users who retweeted, otherwise users who commented.
        date: day string used for the flow-text index name.
        default_number: maximum number of users passed to ``get_user_url``
            for each of the two user lists.

    Returns:
        A dict with
        ``influence_users``: ``[in_portrait_url, out_portrait_url]`` (users
        found / not found in the portrait index, via ``get_user_url``), and
        ``influence_distribution``: top-5 ``domian`` (sic), ``topic`` and
        ``geo`` distributions, the average ``influence`` and the
        ``in_portrait_number`` / ``out_portrait_number`` counts.
    """
    index_flow_text = pre_text_index + date
    text_result = es.get(index=index_flow_text, doc_type=flow_text_index_type, id=mid)["_source"]
    temp_mid = text_result.get("root_mid",'')  # non-empty only for a non-original weibo

    message_type = 3 if int(influence_style) == 0 else 2
    if temp_mid:
        # non-original weibo: search on its root mid
        must_list = [{"term": {"directed_uid": uid}}, {"term": {"message_type": message_type}}, {"term": {"root_mid": temp_mid}}]
    elif message_type == 3:
        # original weibo, all users who retweeted it
        must_list = [{"term": {"root_uid": uid}}, {"term": {"message_type": 3}}, {"term": {"root_mid": mid}}]
    else:
        # original weibo, all users who commented on it
        must_list = [{"term": {"directed_uid": uid}}, {"term": {"message_type": 2}}, {"term": {"root_mid": mid}}]
    query_body = {
        "query":{
            "filtered":{
                "filter":{
                    "bool":{
                        "must": must_list
                    }
                }
            }
        },
        "size": 100000
    }
    search_results = es.search(index=index_flow_text, doc_type=flow_text_index_type, body=query_body, fields=["uid"], timeout=30)["hits"]["hits"]

    # Distinct influenced uids, excluding the author.
    results = []
    for item in search_results:
        hit_uid = item["fields"]["uid"][0]
        if int(hit_uid) != int(uid):
            results.append(hit_uid)
    results = list(set(results))

    if results:
        portrait_results = es_user_portrait.mget(index=user_portrait, doc_type=portrait_index_type, body={"ids": results}, fields=["domain", "topic_string", "activity_geo_dict","importance", "influence"])["docs"]
    else:
        portrait_results = []

    in_portrait = []   # [uid, importance] of users found in the portrait index
    out_portrait = []  # uids not found in the portrait index
    retweeted_domain = {}
    retweeted_topic = {}
    retweeted_geo = {}
    average_influence = 0
    total_influence = 0
    count = 0

    if portrait_results:
        for item in portrait_results:
            if not item["found"]:
                out_portrait.append(item['_id'])
                continue
            fields = item["fields"]
            count += 1
            in_portrait.append([item['_id'], fields["importance"][0]])
            temp_domain = fields["domain"][0].split('&')
            temp_topic = fields["topic_string"][0].split('&')
            # only the last entry of the activity-geo list is counted
            temp_geo = json.loads(fields["activity_geo_dict"][0])[-1].keys()
            total_influence += fields["influence"][0]
            retweeted_domain = aggregation(temp_domain, retweeted_domain)
            retweeted_topic = aggregation(temp_topic, retweeted_topic)
            retweeted_geo = aggregation(temp_geo, retweeted_geo)
        retweeted_domain = proportion(retweeted_domain)
        retweeted_topic = proportion(retweeted_topic)
        retweeted_geo = proportion(retweeted_geo)
        if count:
            average_influence = total_influence/count

    def top_five(distribution):
        return sorted(distribution.items(), key=lambda x:x[1], reverse=True)[:5]

    retweeted_results = dict()
    retweeted_results["domian"] = top_five(retweeted_domain)  # key spelling kept for callers
    retweeted_results["topic"] = top_five(retweeted_topic)
    retweeted_results["geo"] = top_five(retweeted_geo)
    retweeted_results["influence"] = average_influence

    in_portrait = sorted(in_portrait, key=lambda x:x[1], reverse=True)
    temp_list = [item[0] for item in in_portrait]
    retweeted_results['in_portrait_number'] = len(temp_list)
    retweeted_results['out_portrait_number'] = len(out_portrait)
    in_portrait_url = get_user_url(temp_list[:default_number])
    out_portrait_url = get_user_url(out_portrait[:default_number])

    return_results = dict()
    return_results["influence_users"] = [in_portrait_url, out_portrait_url]
    return_results["influence_distribution"] = retweeted_results

    return return_results
def influenced_user_detail(uid, date, origin_retweeted_mid, retweeted_retweeted_mid, message_type, default_number=20):
    """Summarise the users who reacted to a set of weibo of ``uid`` on one day.

    Args:
        uid: the influencing user; excluded from the result.
        date: day string used for the flow-text index name.
        origin_retweeted_mid: mids matched against ``root_mid`` together with
            ``root_uid == uid``; skipped when empty.
        retweeted_retweeted_mid: mids matched against ``root_mid`` together
            with ``directed_uid == uid``; skipped when empty.
        message_type: message type the reacting weibo must have (the visible
            caller passes 3 for retweets and 2 for comments).
        default_number: maximum number of users passed to ``get_user_url``
            for each of the two user lists.

    Returns:
        A dict with the top-5 ``domian`` (sic), ``topic`` and ``geo``
        distributions, the average ``influence``, ``in_portrait_number``,
        ``out_portrait_number``, ``in_portrait`` / ``out_portrait`` (results
        of ``get_user_url``) and ``total_number``.
    """
    index_flow_text = pre_text_index + date

    def search_uids(root_mids, owner_field):
        # uids of weibo of `message_type` whose root_mid is in `root_mids` and
        # whose `owner_field` is `uid`.  A fresh query body is built for every
        # search so clauses of one search never leak into the next.
        query_body = {
            "query":{
                "filtered":{
                    "filter":{
                        "bool":{
                            "must": [
                                {"terms": {"root_mid": root_mids}},
                                {"term": {"message_type": message_type}},
                                {"term": {owner_field: uid}}
                            ]
                        }
                    }
                }
            },
            "size":10000
        }
        hits = es.search(index=index_flow_text, doc_type=flow_text_index_type, body=query_body, fields=["uid"])["hits"]["hits"]
        return [item["fields"]["uid"][0] for item in hits]

    retweeted_uid_list = []  # every influenced uid
    if origin_retweeted_mid:  # users reacting to the user's original weibo
        retweeted_uid_list.extend(search_uids(origin_retweeted_mid, "root_uid"))
    if retweeted_retweeted_mid:  # users reacting to weibo the user retweeted
        retweeted_uid_list.extend(search_uids(retweeted_retweeted_mid, "directed_uid"))
    retweeted_uid_list = list(set(retweeted_uid_list) - set([uid]))  # drop duplicates and the user itself

    retweeted_results = {}  # statistics over all influenced uids
    retweeted_domain = {}
    retweeted_topic = {}
    retweeted_geo = {}
    in_portrait = []   # [uid, importance] of users found in the portrait index
    out_portrait = []  # uids not found in the portrait index
    average_influence = 0
    total_influence = 0
    count = 0
    if retweeted_uid_list:
        user_portrait_result = es_user_portrait.mget(index=user_portrait, doc_type=portrait_index_type, body={"ids": retweeted_uid_list}, fields=["domain", "topic_string", "activity_geo_dict","importance", "influence"])["docs"]
        for item in user_portrait_result:
            if not item["found"]:
                out_portrait.append(item['_id'])
                continue
            fields = item["fields"]
            count += 1
            in_portrait.append([item['_id'], fields["importance"][0]])
            temp_domain = fields["domain"][0].split('&')
            temp_topic = fields["topic_string"][0].split('&')
            # only the last entry of the activity-geo list is counted
            temp_geo = json.loads(fields["activity_geo_dict"][0])[-1].keys()
            total_influence += fields["influence"][0]
            retweeted_domain = aggregation(temp_domain, retweeted_domain)
            retweeted_topic = aggregation(temp_topic, retweeted_topic)
            retweeted_geo = aggregation(temp_geo, retweeted_geo)
        retweeted_domain = proportion(retweeted_domain)
        retweeted_topic = proportion(retweeted_topic)
        retweeted_geo = proportion(retweeted_geo)
        if count:
            average_influence = total_influence/count

    def top_five(distribution):
        return sorted(distribution.items(), key=lambda x:x[1], reverse=True)[:5]

    retweeted_results["domian"] = top_five(retweeted_domain)  # key spelling kept for callers
    retweeted_results["topic"] = top_five(retweeted_topic)
    retweeted_results["geo"] = top_five(retweeted_geo)
    retweeted_results["influence"] = average_influence

    in_portrait = sorted(in_portrait, key=lambda x:x[1], reverse=True)
    temp_list = [item[0] for item in in_portrait]
    retweeted_results['in_portrait_number'] = len(temp_list)
    retweeted_results['out_portrait_number'] = len(out_portrait)
    retweeted_results["in_portrait"] = get_user_url(temp_list[:default_number])
    retweeted_results["out_portrait"] = get_user_url(out_portrait[:default_number])
    retweeted_results["total_number"] = len(temp_list) + len(out_portrait)

    return retweeted_results
def statistics_influence_people(uid, date, style):
    """Summarise the users who retweeted or commented on ``uid``'s weibo of one day.

    Collects the mids of the user's original weibo and the root mids of the
    weibo the user retweeted that day, then delegates to
    ``influenced_user_detail``.

    Args:
        uid: the user.
        date: day string; used verbatim for the flow-text index name and with
            '-' removed for the influence index name.
        style: 0 = users who retweeted (message type 3), otherwise users who
            commented (message type 2).

    Returns:
        The dict produced by ``influenced_user_detail``, or ``{}`` when the
        user's influence document for that day cannot be read.
    """
    results = {}
    index_name = pre_index + str(date).replace('-', '')
    index_flow_text = pre_text_index + date

    # The influence document is only used as an existence check here.
    try:
        es_cluster.get(index=index_name, doc_type=influence_doctype, id=uid)["_source"]
    except Exception:
        return results

    def own_weibo(message_type):
        # Hits for the user's weibo of one message type; every call builds its
        # own query body so the two searches stay independent.
        query_body = {
            "query":{
                "filtered":{
                    "filter":{
                        "bool":{
                            "must":[
                                {"term":{"message_type": message_type}},
                                {"term":{"uid": uid}}
                            ]
                        }
                    }
                }
            },
            "size":1000
        }
        return es.search(index=index_flow_text, doc_type=flow_text_index_type, body=query_body)["hits"]["hits"]

    # mids of the user's original weibo
    origin_mid = [item['_id'] for item in own_weibo(1)]

    # root mids of the weibo the user retweeted
    retweeted_mid = []
    for item in own_weibo(3):
        root_mid = item['_source'].get('root_mid', '')
        if root_mid:
            retweeted_mid.append(root_mid)

    message_type = 3 if int(style) == 0 else 2  # 3: retweeted, 2: comment
    results = influenced_user_detail(uid, date, origin_mid, retweeted_mid, message_type)

    return results
Пример #39
0
def group_user_weibo(task_name, submit_user, sort_type):
    """Return the recent weibo posted by the members of a group task.

    The group document is looked up by the id ``submit_user + '-' + task_name``.
    For each of the seven daily flow-text indices ending at the current date
    (day offsets 6..0) up to 100 hits are fetched, already ordered by
    ``sort_type`` descending, and flattened into rows.

    :param task_name: group task name (second half of the document id)
    :param submit_user: submitting user (first half of the document id)
    :param sort_type: 'timestamp', 'retweet'/'retweeted', 'comment' or
        'sensitive'; 'retweet' is an alias of 'retweeted'.  When RUN_TYPE is 0
        it is overridden with 'timestamp'.
    :return: the string 'group no exist' when the group document cannot be
        loaded or is empty; otherwise a list of rows
        ``[mid, uid, uname, text, ip, city, timestamp, date, retweet_count,
        comment_count, sensitive_score, weibo_url]`` sorted descending on the
        column matching ``sort_type``.  An unrecognised ``sort_type`` leaves
        the rows in fetch order instead of raising UnboundLocalError.
    """
    now_date = ts2datetime(time.time())
    if sort_type == 'retweet':
        sort_type = 'retweeted'
    # run_type: test mode pins the date and always orders by time
    if RUN_TYPE == 0:
        now_date = RUN_TEST_TIME
        sort_type = 'timestamp'
    # step1: get group user
    task_id = submit_user + '-' + task_name
    try:
        group_exist_result = es_group_result.get(
            index=group_index_name, doc_type=group_index_type, id=task_id)['_source']
    except Exception:
        # best-effort lookup: any failure is reported as a missing group
        group_exist_result = {}
    if not group_exist_result:
        return 'group no exist'
    # step2: get user weibo list, one daily index at a time
    uid_list = group_exist_result['uid_list']
    now_ts = datetime2ts(now_date)
    query_body = {
        'query': {'filtered': {'filter': {'terms': {'uid': uid_list}}}},
        'sort': [{sort_type: {'order': 'desc'}}],
        'size': 100,
    }
    raw_weibo_list = []
    for i in range(6, -1, -1):
        index_name = flow_text_index_name_pre + ts2datetime(now_ts - i * DAY)
        try:
            weibo_result = es_flow_text.search(
                index=index_name, doc_type=flow_text_index_type,
                body=query_body)['hits']['hits']
        except Exception:
            # a failed search for one day (e.g. missing index) is skipped
            weibo_result = []
        raw_weibo_list.extend(weibo_result)
    # step3: get user name
    try:
        portrait_exist_result = es_user_portrait.mget(
            index=portrait_index_name, doc_type=portrait_index_type,
            body={'ids': uid_list})['docs']
    except Exception:
        portrait_exist_result = []
    uid2uname_dict = {}
    for portrait_item in portrait_exist_result:
        if portrait_item['found']:
            uname = portrait_item['_source']['uname']
        else:
            uname = 'unknown'
        uid2uname_dict[portrait_item['_id']] = uname
    weibo_list = []
    for weibo_item in raw_weibo_list:
        source = weibo_item['_source']
        mid = source['mid']
        uid = source['uid']
        # the portrait lookup may have failed or may not cover this uid
        uname = uid2uname_dict.get(uid, 'unknown')
        ip = source['geo']  # passed to ip2city below
        timestamp = source['timestamp']
        # run_type: counters are only read in production mode
        if RUN_TYPE == 1:
            retweet_count = source.get('retweeted', 0)
            comment_count = source.get('comment', 0)
            sensitive_score = source.get('sensitive', 0)
        else:
            retweet_count = 0
            comment_count = 0
            sensitive_score = 0
        weibo_list.append([
            mid, uid, uname, source['text'], ip, ip2city(ip), timestamp,
            ts2date(timestamp), retweet_count, comment_count, sensitive_score,
            weiboinfo2url(uid, mid)
        ])
    # row column that carries the value of each supported sort_type
    sort_column = {
        'timestamp': 6,
        'retweeted': 8,
        'comment': 9,
        'sensitive': 10,
    }.get(sort_type)
    if sort_column is None:
        return weibo_list
    return sorted(weibo_list, key=lambda x: x[sort_column], reverse=True)
Пример #40
0
def search_sentiment_detail_in_topic(start_ts, task_type, task_detail, time_segment, sentiment, sort_type):
    results = {}
    start_ts = int(start_ts)
    start_date = ts2datetime(start_ts)
    end_ts = start_ts + str2segment[time_segment]
    print 'start_ts:', ts2datetime(start_ts)
    print 'end_ts:', ts2datetime(end_ts)
    if sentiment == '7':
        query_sentiment_list = SENTIMENT_SECOND
    else:
        query_sentiment_list = [sentiment]
    user_topic = task_detail
    #step1: iter get weibo and user in topic
    iter_user_count = 0
    in_user_result = {}
    all_filter_weibo_list = []
    sort_evaluate_max = SENTIMENT_SORT_EVALUATE_MAX
    flow_text_index_name = flow_text_index_name_pre + start_date
    print 'flow_text_index_name:', flow_text_index_name
    while len(in_user_result) < SENTIMENT_MAX_USER:
        print 'in_user_result:', len(in_user_result)
        print 'sort_evaluate_max:', sort_evaluate_max
        query_body = {
        'query':{
            'filtered':{
                'filter':{
                    'bool':{
                        'must':[
                            {'range': {sort_type: {'lt': sort_evaluate_max}}},
                            {'terms': {'sentiment': query_sentiment_list}},
                            {'range': {'timestamp':{'gte': start_ts, 'lt': end_ts}}}
                            ]
                        }
                    }
                }
            },
        'sort': [{sort_type: {'order': 'desc'}}],
        'size': SENTIMENT_ITER_TEXT_COUNT
        }
        try:
            flow_text_result = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type,\
                    body=query_body)['hits']['hits']
        except:
            flow_text_result = []
        print 'len flow_text_result:', len(flow_text_result)
        if not flow_text_result:
            break
        weibo_list, user_set = deal_show_weibo_list(flow_text_result)
        #filter topic user
        filter_type = 'topic'
        print 'identify user portrait topic'
        in_portrait_result = identify_user_portrait_domain_topic(user_set, filter_type, user_topic)
        filter_weibo_list = filter_weibo_in(weibo_list, in_portrait_result)
        if filter_weibo_list:
            all_filter_weibo_list.extend(filter_weibo_list)
        if in_portrait_result:
            in_user_result = dict(in_user_result, **in_portrait_result)
        sort_evaluate_max = flow_text_result[-1]['_source'][sort_type]
    query_uid_list = in_user_result.keys()
    #step2: get keywords from flow_text
    print 'get keyword'
    keyword_query_dict = {
            'query':{
                'filtered':{
                    'filter':{
                        'bool':{
                            'must':[
                                {'range': {'timestamp': {'gte': start_ts, 'lt': end_ts}}},
                                {'terms': {'uid': query_uid_list}}
                            ]
                        }
                    }
                }
            },
            'aggs':{
                'all_interests':{
                    'terms':{
                        'field': 'keywords_string',
                        'size': SENTIMENT_MAX_KEYWORDS
                    }
                }
            }
        }
    show_keywords_dict = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type,\
            body=keyword_query_dict)['aggregations']['all_interests']['buckets']
    keywords_list = [[item['key'], item['doc_count']] for item in show_keywords_dict]
    #step3: get results
    results['weibo'] = all_filter_weibo_list
    results['in_portrait_result'] = sorted(in_user_result.items(), key=lambda x:x[1][1], reverse=True)
    results['keywords'] = keywords_list
    return results
def get_retweet_weibo_detail(ts, user, task_name, size, text_type, type_value):
    """List weibo whose root_mid belongs to a sensing task snapshot.

    The task snapshot is read from ``index_sensing_task`` using
    ``user + '-' + task_name`` as the doc type and ``ts`` as the id.  The mids
    found in its 'origin_weibo_detail' and 'retweeted_weibo_detail' JSON
    fields are then searched as ``root_mid`` within
    ``[ts - time_interval, ts)``, newest first, at most 100 hits.

    :param ts: end of the time window, also the id of the task snapshot
    :param user: first half of the task doc type
    :param task_name: second half of the task doc type
    :param size: accepted but not used; the query size is fixed at 100
    :param text_type: 'message_type' or 'sentiment' adds a filter on that
        field; any other value adds none
    :param type_value: value(s) for the extra filter.  For 'sentiment' a
        value of length 1 becomes a ``term`` filter, anything else ``terms``.
    :return: list of rows ``[uid, nick_name, photo_url, text, sentiment,
        date, geo, message_type]``; when the profile is not found the uid
        stands in for nick_name and photo_url is ''.
    """
    task_doc_type = user + '-' + task_name
    task_detail = es_user_portrait.get(index=index_sensing_task, doc_type=task_doc_type, id=ts)['_source']
    origin_weibo_detail = json.loads(task_detail['origin_weibo_detail'])
    retweeted_weibo_detail = json.loads(task_detail['retweeted_weibo_detail'])

    mid_list = list(origin_weibo_detail.keys()) + list(retweeted_weibo_detail.keys())

    must_filters = [
        {"range": {"timestamp": {"gte": ts - time_interval, "lt": ts}}},
        {"terms": {"root_mid": mid_list}},
    ]
    if text_type == "message_type":
        must_filters.append({"term": {text_type: type_value}})
    if text_type == "sentiment":
        clause = "term" if len(type_value) == 1 else "terms"
        must_filters.append({clause: {text_type: type_value}})

    query_body = {
        "query": {"filtered": {"filter": {"bool": {"must": must_filters}}}},
        "sort": {"timestamp": {"order": "desc"}},
        "size": 100,
    }

    # Both candidate daily indices are probed, in the same order as before.
    end_day = ts2datetime(ts)
    start_day = ts2datetime(ts - time_interval)
    end_index = flow_text_index_name_pre + end_day
    end_index_exists = es_text.indices.exists(end_index)
    start_index = flow_text_index_name_pre + start_day
    start_index_exists = es_text.indices.exists(start_index)

    # 1. query the weibo: a window inside one day uses that day's index,
    #    a window crossing midnight uses the index of the earlier day
    if end_day == start_day and end_index_exists:
        target_index = end_index
    elif end_day != start_day and start_index_exists:
        target_index = start_index
    else:
        return []

    hits = es_text.search(index=target_index, doc_type=flow_text_index_type, body=query_body)["hits"]["hits"]
    if not hits:
        return []

    # 2. fetch the profile of every author; docs are read back by position
    uid_list = [hit["_source"]['uid'] for hit in hits]
    portrait_result = es_profile.mget(index=profile_index_name, doc_type=profile_index_type, body={"ids": uid_list}, fields=['nick_name', 'photo_url'])["docs"]

    results = []
    for position, hit in enumerate(hits):
        item = hit['_source']
        profile = portrait_result[position]
        # uid, nick_name, photo_url, text, sentiment, date, geo, message_type
        row = [item['uid']]
        if profile['found']:
            row.append(profile["fields"]["nick_name"][0])
            row.append(profile["fields"]["photo_url"][0])
        else:
            row.append(item['uid'])
            row.append("")
        row.extend([
            item["text"],
            item["sentiment"],
            ts2date(item['timestamp']),
            item['geo'],
            item["message_type"],
        ])
        results.append(row)

    return results
Пример #42
0
def group_user_weibo(task_name, submit_user, sort_type):
    """Return the top 100 recent weibo posted by the members of a group task.

    The group document is looked up by the id ``submit_user + '-' + task_name``.
    For each of the seven daily flow-text indices before the current date
    (day offsets 7..1) the 100 highest documents by ``sort_type`` are
    fetched.  The merged hits are sorted descending on that field and cut to
    100.

    :param task_name: group task name (second half of the document id)
    :param submit_user: submitting user (first half of the document id)
    :param sort_type: flow-text field to order by; overridden with
        'timestamp' when RUN_TYPE is 0
    :return: the string 'group no exist' when the group document cannot be
        loaded or is empty; otherwise a list of rows
        ``[mid, uid, uname, text, ip, city, timestamp, date, retweet_count,
        comment_count, sensitive_score, weibo_url]``
    """
    now_date = ts2datetime(time.time())
    # run_type: test mode pins the date and always orders by time
    if RUN_TYPE == 0:
        now_date = RUN_TEST_TIME
        sort_type = 'timestamp'
    # step1: get group user
    task_id = submit_user + '-' + task_name
    try:
        group_exist_result = es_group_result.get(
            index=group_index_name, doc_type=group_index_type, id=task_id)['_source']
    except Exception:
        # best-effort lookup: any failure is reported as a missing group
        group_exist_result = {}
    if not group_exist_result:
        return 'group no exist'
    # step2: get user weibo list
    uid_list = group_exist_result['uid_list']
    now_ts = datetime2ts(now_date)
    query_body = {
        'query': {'filtered': {'filter': {'terms': {'uid': uid_list}}}},
        # A bare field name sorts ascending, which would fetch the 100 lowest
        # documents per day; the final ranking below needs the highest ones.
        'sort': [{sort_type: {'order': 'desc'}}],
        'size': 100,
    }
    raw_weibo_list = []
    for i in range(7, 0, -1):
        index_name = flow_text_index_name_pre + ts2datetime(now_ts - i * DAY)
        try:
            weibo_result = es_flow_text.search(
                index=index_name, doc_type=flow_text_index_type,
                body=query_body)['hits']['hits']
        except Exception:
            # a failed search for one day (e.g. missing index) is skipped
            weibo_result = []
        raw_weibo_list.extend(weibo_result)
    sort_weibo_list = sorted(raw_weibo_list, key=lambda x: x['_source'][sort_type], reverse=True)[:100]
    # step3: get user name
    try:
        portrait_exist_result = es_user_portrait.mget(
            index=portrait_index_name, doc_type=portrait_index_type,
            body={'ids': uid_list})['docs']
    except Exception:
        portrait_exist_result = []
    uid2uname_dict = {}
    for portrait_item in portrait_exist_result:
        if portrait_item['found']:
            uname = portrait_item['_source']['uname']
        else:
            uname = 'unknown'
        uid2uname_dict[portrait_item['_id']] = uname
    weibo_list = []
    for weibo_item in sort_weibo_list:
        source = weibo_item['_source']
        mid = source['mid']
        uid = source['uid']
        # the portrait lookup may have failed or may not cover this uid
        uname = uid2uname_dict.get(uid, 'unknown')
        text = source['text']
        ip = source['geo']  # passed to ip2city below
        timestamp = source['timestamp']
        date = ts2date(timestamp)
        weibo_url = weiboinfo2url(uid, mid)
        # run_type: counters are only read in production mode
        if RUN_TYPE == 1:
            retweet_count = source['retweet_count']
            comment_count = source['comment_count']
            sensitive_score = source['sensitive_score']
        else:
            retweet_count = 0
            comment_count = 0
            sensitive_score = 0
        city = ip2city(ip)
        weibo_list.append([mid, uid, uname, text, ip, city, timestamp, date, retweet_count, comment_count, sensitive_score, weibo_url])
    return weibo_list
Пример #43
0
def influenced_people(uid, mid, influence_style, date, default_number=20):
    """Describe the users who retweeted or commented on one weibo of ``uid``.

    The weibo ``mid`` is loaded from the flow-text index of ``date``.  When it
    carries a ``root_mid`` it is treated as a non-original weibo and that
    root_mid is searched for; otherwise ``mid`` itself is used.  Matching
    messages are reduced to distinct author uids (excluding ``uid``) and those
    users are looked up in the user portrait index.

    :param uid: owner of the weibo
    :param mid: id of the weibo in the flow-text index
    :param influence_style: 0 selects retweets (message_type 3); any other
        integer value selects comments (message_type 2)
    :param date: date suffix of the flow-text index
    :param default_number: how many in-/out-of-portrait users are passed to
        ``get_user_url``
    :return: dict with
        'influence_users': ``[in_portrait_url, out_portrait_url]`` and
        'influence_distribution': dict with the top-5 'domian', 'topic' and
        'geo' items, the average 'influence', 'in_portrait_number' and
        'out_portrait_number'
    """
    index_flow_text = pre_text_index + date
    text_result = es.get(index=index_flow_text,
                         doc_type=flow_text_index_type,
                         id=mid)["_source"]
    # a non-empty root_mid marks the weibo as not original
    temp_mid = text_result.get("root_mid", '')
    print(temp_mid)

    is_retweet_style = int(influence_style) == 0
    message_type = 3 if is_retweet_style else 2
    if temp_mid:
        # non-original weibo: search by its root, addressed to uid
        uid_field = "directed_uid"
        target_root_mid = temp_mid
    else:
        # original weibo: retweets carry uid as root_uid, comments as
        # directed_uid
        uid_field = "root_uid" if is_retweet_style else "directed_uid"
        target_root_mid = mid
    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [
                            {"term": {uid_field: uid}},
                            {"term": {"message_type": message_type}},
                            {"term": {"root_mid": target_root_mid}},
                        ]
                    }
                }
            }
        },
        "size": 100000
    }
    search_results = es.search(index=index_flow_text,
                               doc_type=flow_text_index_type,
                               body=query_body,
                               fields=["uid"],
                               timeout=30)["hits"]["hits"]

    # distinct influenced users, without the owner
    influenced_uids = set()
    for item in search_results:
        hit_uid = item["fields"]["uid"][0]
        if int(hit_uid) != int(uid):
            influenced_uids.add(hit_uid)
    results = list(influenced_uids)

    portrait_results = []
    if results:
        portrait_results = es_user_portrait.mget(index=user_portrait,
                                                 doc_type=portrait_index_type,
                                                 body={"ids": results},
                                                 fields=[
                                                     "domain", "topic_string",
                                                     "activity_geo_dict",
                                                     "importance", "influence"
                                                 ])["docs"]

    in_portrait = []
    out_portrait = []
    retweeted_domain = {}
    retweeted_topic = {}
    retweeted_geo = {}
    average_influence = 0
    total_influence = 0
    count = 0

    if portrait_results:
        for item in portrait_results:
            if not item["found"]:
                out_portrait.append(item['_id'])
                continue
            fields = item["fields"]
            count += 1
            in_portrait.append([item['_id'], fields["importance"][0]])
            temp_domain = fields["domain"][0].split('&')
            temp_topic = fields["topic_string"][0].split('&')
            # only the last entry of the geo list is counted
            temp_geo = json.loads(fields["activity_geo_dict"][0])[-1].keys()
            total_influence += fields["influence"][0]
            retweeted_domain = aggregation(temp_domain, retweeted_domain)
            retweeted_topic = aggregation(temp_topic, retweeted_topic)
            retweeted_geo = aggregation(temp_geo, retweeted_geo)
        retweeted_domain = proportion(retweeted_domain)
        retweeted_topic = proportion(retweeted_topic)
        retweeted_geo = proportion(retweeted_geo)
        if count:
            average_influence = total_influence / count

    def _top_five(distribution):
        # highest share first, at most five entries
        return sorted(distribution.items(), key=lambda x: x[1], reverse=True)[:5]

    retweeted_results = dict()
    # NOTE: the misspelled key "domian" is kept; consumers read it as is
    retweeted_results["domian"] = _top_five(retweeted_domain)
    retweeted_results["topic"] = _top_five(retweeted_topic)
    retweeted_results["geo"] = _top_five(retweeted_geo)
    retweeted_results["influence"] = average_influence
    in_portrait = sorted(in_portrait, key=lambda x: x[1], reverse=True)

    temp_list = [item[0] for item in in_portrait]
    print(temp_list[:20])
    print(out_portrait[:20])
    retweeted_results['in_portrait_number'] = len(temp_list)
    retweeted_results['out_portrait_number'] = len(out_portrait)
    in_portrait_url = get_user_url(temp_list[:default_number])
    out_portrait_url = get_user_url(out_portrait[:default_number])

    return_results = dict()
    return_results["influence_users"] = [in_portrait_url, out_portrait_url]
    return_results["influence_distribution"] = retweeted_results

    return return_results
Пример #44
0
def search_sentiment_detail_in_topic(start_ts, task_type, task_detail, time_segment, sentiment, sort_type):
    """Collect weibo of one sentiment written by portrait users of a topic.

    Pages through a single daily flow-text index in descending ``sort_type``
    order.  Each page is reduced to the users accepted by
    ``identify_user_portrait_domain_topic`` for the given topic.  Paging stops
    once ``SENTIMENT_MAX_USER`` users are collected or a page comes back
    empty (search failures count as an empty page).  Afterwards the most
    frequent ``keywords_string`` values of the collected users are aggregated.

    :param start_ts: start of the time window (anything ``int()`` accepts)
    :param task_type: not used by this function; kept for the interface
    :param task_detail: topic handed to identify_user_portrait_domain_topic
    :param time_segment: key into ``str2segment`` giving the window length
    :param sentiment: sentiment label; '7' expands to ``SENTIMENT_SECOND``
    :param sort_type: flow-text field used for ordering and paging; 'retweet'
        is an alias of 'retweeted', and 'timestamp' is forced when the window
        starts on the current date
    :return: dict with 'weibo' (filtered weibo list), 'in_portrait_result'
        (user items sorted descending on ``item[1][1]``) and 'keywords'
        (``[key, doc_count]`` pairs; empty when no user was collected)
    """
    results = {}
    start_ts = int(start_ts)
    start_date = ts2datetime(start_ts)
    end_ts = start_ts + str2segment[time_segment]
    if sort_type == 'retweet':
        sort_type = 'retweeted'
    now_date = ts2datetime(time.time())
    if start_date == now_date:
        sort_type = 'timestamp'
    if sentiment == '7':
        query_sentiment_list = SENTIMENT_SECOND
    else:
        query_sentiment_list = [sentiment]
    user_topic = task_detail
    # step1: iter get weibo and user in topic
    in_user_result = {}
    all_filter_weibo_list = []
    sort_evaluate_max = SENTIMENT_SORT_EVALUATE_MAX
    flow_text_index_name = flow_text_index_name_pre + start_date
    while len(in_user_result) < SENTIMENT_MAX_USER:
        query_body = {
            'query': {
                'filtered': {
                    'filter': {
                        'bool': {
                            'must': [
                                # only documents strictly below the last page
                                {'range': {sort_type: {'lt': sort_evaluate_max}}},
                                {'terms': {'sentiment': query_sentiment_list}},
                                {'range': {'timestamp': {'gte': start_ts, 'lt': end_ts}}}
                            ]
                        }
                    }
                }
            },
            'sort': [{sort_type: {'order': 'desc'}}],
            'size': SENTIMENT_ITER_TEXT_COUNT
        }
        try:
            flow_text_result = es_flow_text.search(
                index=flow_text_index_name, doc_type=flow_text_index_type,
                body=query_body)['hits']['hits']
        except Exception:
            # best-effort: a failed search ends the paging like an empty page
            flow_text_result = []
        if not flow_text_result:
            break
        weibo_list, user_set = deal_show_weibo_list(flow_text_result)
        # filter topic user
        filter_type = 'topic'
        in_portrait_result = identify_user_portrait_domain_topic(user_set, filter_type, user_topic)
        filter_weibo_list = filter_weibo_in(weibo_list, in_portrait_result)
        if filter_weibo_list:
            all_filter_weibo_list.extend(filter_weibo_list)
        if in_portrait_result:
            in_user_result = dict(in_user_result, **in_portrait_result)
        # the next page starts below the smallest sort value seen so far
        sort_evaluate_max = flow_text_result[-1]['_source'][sort_type]
    query_uid_list = list(in_user_result)
    # step2: get keywords from flow_text
    keywords_list = []
    # With no collected user the aggregation cannot match anything, and it
    # would raise if the paging stopped because the index is unavailable.
    if query_uid_list:
        keyword_query_dict = {
            'query': {
                'filtered': {
                    'filter': {
                        'bool': {
                            'must': [
                                {'range': {'timestamp': {'gte': start_ts, 'lt': end_ts}}},
                                {'terms': {'uid': query_uid_list}}
                            ]
                        }
                    }
                }
            },
            'aggs': {
                'all_interests': {
                    'terms': {
                        'field': 'keywords_string',
                        'size': SENTIMENT_MAX_KEYWORDS
                    }
                }
            }
        }
        show_keywords_dict = es_flow_text.search(
            index=flow_text_index_name, doc_type=flow_text_index_type,
            body=keyword_query_dict)['aggregations']['all_interests']['buckets']
        keywords_list = [[item['key'], item['doc_count']] for item in show_keywords_dict]
    # step3: get results
    results['weibo'] = all_filter_weibo_list
    results['in_portrait_result'] = sorted(in_user_result.items(), key=lambda x: x[1][1], reverse=True)
    results['keywords'] = keywords_list
    return results
Пример #45
0
def influenced_user_detail(uid,
                           date,
                           origin_retweeted_mid,
                           retweeted_retweeted_mid,
                           message_type,
                           default_number=20):
    """Describe the users who reacted to a set of weibo of ``uid``.

    Two independent searches are run on the flow-text index of ``date``: one
    for messages whose ``root_mid`` is in ``origin_retweeted_mid`` and whose
    ``root_uid`` is ``uid``, one for messages whose ``root_mid`` is in
    ``retweeted_retweeted_mid`` and whose ``directed_uid`` is ``uid``.  Each
    search uses its own query body, so the filters of the first search no
    longer leak into the second one.  The distinct author uids (excluding
    ``uid``) are then looked up in the user portrait index.

    :param uid: owner of the weibo
    :param date: date suffix of the flow-text index
    :param origin_retweeted_mid: mids of the user's original weibo
    :param retweeted_retweeted_mid: root mids of the user's retweeted weibo
    :param message_type: message_type value the matching messages must have
    :param default_number: how many in-/out-of-portrait users are passed to
        ``get_user_url``
    :return: dict with the top-5 'domian', 'topic' and 'geo' items, the
        average 'influence', 'in_portrait_number', 'out_portrait_number',
        'in_portrait' and 'out_portrait'
    """
    # details of the influenced people
    index_flow_text = pre_text_index + date
    # users reacting to the user's original weibo
    origin_retweeted_uid = _influenced_uids(
        index_flow_text, origin_retweeted_mid, message_type, "root_uid", uid)
    # users reacting to the weibo the user retweeted
    retweeted_retweeted_uid = _influenced_uids(
        index_flow_text, retweeted_retweeted_mid, message_type,
        "directed_uid", uid)

    retweeted_results = {}  # statistics of all retweeted uid information
    retweeted_domain = {}
    retweeted_topic = {}
    retweeted_geo = {}
    in_portrait = []
    out_portrait = []
    average_influence = 0
    total_influence = 0
    count = 0
    # all influenced users, without the owner
    retweeted_uid_list = list(
        set(origin_retweeted_uid + retweeted_retweeted_uid) - set([uid]))
    if retweeted_uid_list:
        user_portrait_result = es_user_portrait.mget(
            index=user_portrait,
            doc_type=portrait_index_type,
            body={"ids": retweeted_uid_list},
            fields=[
                "domain", "topic_string", "activity_geo_dict", "importance",
                "influence"
            ])["docs"]
        for item in user_portrait_result:
            if not item["found"]:
                out_portrait.append(item['_id'])
                continue
            fields = item["fields"]
            count += 1
            in_portrait.append([item['_id'], fields["importance"][0]])
            temp_domain = fields["domain"][0].split('&')
            temp_topic = fields["topic_string"][0].split('&')
            # only the last entry of the geo list is counted
            temp_geo = json.loads(fields["activity_geo_dict"][0])[-1].keys()
            total_influence += fields["influence"][0]
            retweeted_domain = aggregation(temp_domain, retweeted_domain)
            retweeted_topic = aggregation(temp_topic, retweeted_topic)
            retweeted_geo = aggregation(temp_geo, retweeted_geo)
        retweeted_domain = proportion(retweeted_domain)
        retweeted_topic = proportion(retweeted_topic)
        retweeted_geo = proportion(retweeted_geo)
        if count:
            average_influence = total_influence / count

    def _top_five(distribution):
        # highest share first, at most five entries
        return sorted(distribution.items(), key=lambda x: x[1], reverse=True)[:5]

    # NOTE: the misspelled key "domian" is kept; consumers read it as is
    retweeted_results["domian"] = _top_five(retweeted_domain)
    retweeted_results["topic"] = _top_five(retweeted_topic)
    retweeted_results["geo"] = _top_five(retweeted_geo)
    retweeted_results["influence"] = average_influence
    in_portrait = sorted(in_portrait, key=lambda x: x[1], reverse=True)

    temp_list = [item[0] for item in in_portrait]
    retweeted_results['in_portrait_number'] = len(temp_list)
    retweeted_results['out_portrait_number'] = len(out_portrait)
    retweeted_results["in_portrait"] = get_user_url(temp_list[:default_number])
    retweeted_results["out_portrait"] = get_user_url(out_portrait[:default_number])

    return retweeted_results


def _influenced_uids(index_flow_text, mid_list, message_type, uid_field, uid):
    """Return the author uids of messages reacting to any mid in mid_list.

    Builds a fresh query body on every call.  A single mid is required via
    ``must``; several mids are listed under ``should``.  Returns [] without
    searching when ``mid_list`` is empty.
    """
    if not mid_list:
        return []
    mid_terms = [{"term": {"root_mid": iter_mid}} for iter_mid in mid_list]
    bool_filter = {"should": [], "must": []}
    if len(mid_terms) == 1:
        bool_filter["must"].extend(mid_terms)
    else:
        bool_filter["should"].extend(mid_terms)
    bool_filter["must"].extend([
        {"term": {"message_type": message_type}},
        {"term": {uid_field: uid}},
    ])
    query_body = {
        "query": {"filtered": {"filter": {"bool": bool_filter}}},
        "size": 10000
    }
    hits = es.search(index=index_flow_text,
                     doc_type=flow_text_index_type,
                     body=query_body,
                     fields=["uid"])["hits"]["hits"]
    return [item["fields"]["uid"][0] for item in hits]
Пример #46
0
def search_sentiment_detail_all_keywords(start_ts, task_type, task_detail, time_segment, sentiment, sort_type):
    """Collect weibo, users and hot keywords for a keyword task in one time segment.

    Args:
        start_ts: start of the segment as an epoch timestamp (anything
            ``int()`` accepts).
        task_type: unused here; kept for interface compatibility.
        task_detail: comma-separated keywords. Each keyword becomes a
            ``*keyword*`` wildcard clause on the ``text`` field and the
            clauses are combined with ``should``.
        time_segment: key into ``str2segment``; the looked-up value is added
            to ``start_ts`` to get the exclusive end of the segment.
        sentiment: sentiment label to filter on; ``'7'`` selects every label
            in ``SENTIMENT_SECOND``.
        sort_type: field used to rank and page through the weibo
            (``'retweet'`` is accepted as an alias of ``'retweeted'``). It is
            replaced by ``'timestamp'`` when ``start_ts`` falls on the
            current date.

    Returns:
        dict with the keys ``'weibo'``, ``'in_portrait_result'``,
        ``'out_portrait_result'`` (both sorted and cut to
        ``SENTIMENT_MAX_USER`` entries) and ``'keywords'`` (a list of
        ``[key, doc_count]`` pairs).
    """
    results = {}
    if sort_type == 'retweet':
        sort_type = 'retweeted'
    start_ts = int(start_ts)
    start_date = ts2datetime(start_ts)
    if start_date == ts2datetime(time.time()):
        sort_type = 'timestamp'

    # step0: one wildcard clause per keyword
    keyword_nest_body_list = [
        {'wildcard': {'text': '*' + keywords_item + '*'}}
        for keywords_item in task_detail.split(',')
    ]

    # step1: get weibo from flow_text
    end_ts = start_ts + str2segment[time_segment]
    if sentiment == '7':
        query_sentiment_list = SENTIMENT_SECOND
    else:
        query_sentiment_list = [sentiment]
    keyword_clause = {'bool': {'should': keyword_nest_body_list}}
    time_clause = {'range': {'timestamp': {'gte': start_ts, 'lt': end_ts}}}
    sentiment_clause = {'terms': {'sentiment': query_sentiment_list}}
    # The index depends only on the start date. Binding it before the loop
    # keeps it defined for step2 even when the loop body never runs.
    flow_text_index_name = flow_text_index_name_pre + start_date

    in_user_result = {}
    out_user_result = {}
    all_filter_weibo_list = []
    sort_evaluate_max = SENTIMENT_SORT_EVALUATE_MAX
    while len(in_user_result) < SENTIMENT_MAX_USER:
        # Page downwards: every round only asks for documents whose sort
        # value is strictly below the last one seen in the previous round.
        query_body = {
            'query': {
                'bool': {
                    'must': [
                        {'range': {sort_type: {'lt': sort_evaluate_max}}},
                        time_clause,
                        sentiment_clause,
                        keyword_clause
                    ]
                }
            },
            'size': SENTIMENT_ITER_TEXT_COUNT,
            'sort': [{sort_type: {'order': 'desc'}}]
        }
        try:
            flow_text_result = es_flow_text.search(
                index=flow_text_index_name, doc_type=flow_text_index_type,
                body=query_body)['hits']['hits']
        except Exception:
            # Best effort: a failed search ends the paging instead of
            # aborting the request. Interpreter-exit signals still propagate.
            flow_text_result = []
        if not flow_text_result:
            break
        show_weibo_list, user_set = deal_show_weibo_list(flow_text_result)
        filter_type = 'in-out'
        in_portrait_result, out_portrait_result = identify_user_portrait(user_set, filter_type)
        if len(all_filter_weibo_list) <= SENTIMENT_MAX_TEXT and show_weibo_list:
            all_filter_weibo_list.extend(show_weibo_list)
        if in_portrait_result:
            in_user_result.update(in_portrait_result)
        if out_portrait_result:
            out_user_result.update(out_portrait_result)
        sort_evaluate_max = flow_text_result[-1]['_source'][sort_type]

    # step2: get keywords from flow_text
    # NOTE(review): unlike the search above this call is not guarded, so a
    # missing index still raises here - confirm whether that is intended.
    keyword_query_dict = {
        'query': {
            'bool': {
                'must': [keyword_clause, time_clause, sentiment_clause]
            }
        },
        'aggs': {
            'all_interests': {
                'terms': {
                    'field': 'keywords_string',
                    'size': SENTIMENT_MAX_KEYWORDS
                }
            }
        }
    }
    show_keywords_dict = es_flow_text.search(
        index=flow_text_index_name, doc_type=flow_text_index_type,
        body=keyword_query_dict)['aggregations']['all_interests']['buckets']
    keywords_list = [[item['key'], item['doc_count']] for item in show_keywords_dict]

    # step3: user information was accumulated while paging
    in_portrait_result = in_user_result
    out_portrait_result = out_user_result
    # step4: add uname to show weibo list
    show_weibo_list = add_uname2weibo(all_filter_weibo_list, in_portrait_result, out_portrait_result)
    # step5: results
    results['weibo'] = show_weibo_list
    results['in_portrait_result'] = sorted(
        in_portrait_result.items(), key=lambda x: x[1][1], reverse=True)[:SENTIMENT_MAX_USER]
    results['out_portrait_result'] = sorted(
        out_portrait_result.items(), key=lambda x: x[1][3], reverse=True)[:SENTIMENT_MAX_USER]
    results['keywords'] = keywords_list
    return results
Пример #47
0
def aggregation_hot_keywords(start_time, stop_time, keywords_list):
    """Count keyword occurrences between two timestamps across the daily indices.

    The same terms aggregation is run on every daily flow-text index from the
    date of ``stop_time`` back to the date of ``start_time`` (stepping by
    ``day_time``); indices that do not exist are skipped and the per-day
    counts are summed.

    Args:
        start_time: inclusive lower bound of the ``timestamp`` range.
        stop_time: exclusive upper bound of the ``timestamp`` range.
        keywords_list: only documents whose ``keywords_string`` matches one of
            these terms are counted.

    Returns:
        A list of ``(keyword, count)`` tuples sorted by count, highest first,
        cut to ``AGGRAGATION_KEYWORDS_NUMBER`` entries.
    """
    start_time = int(start_time)
    stop_time = int(stop_time)
    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [
                            {"terms": {"keywords_string": keywords_list}},
                            {"range": {
                                "timestamp": {
                                    "gte": start_time,
                                    "lt": stop_time
                                }
                            }}
                        ]
                    }
                }
            }
        },
        "aggs": {
            "all_keywords": {
                "terms": {"field": "keywords_string", "size": PRE_AGGREGATION_NUMBER}
            }
        }
    }

    keywords_dict = dict()
    start_date = ts2datetime(float(start_time))
    ts = float(stop_time)
    while True:
        current_date = ts2datetime(ts)
        index_name = flow_text_index_name_pre + current_date
        if es_text.indices.exists(index_name):
            buckets = es_text.search(
                index=index_name, doc_type=flow_text_index_type,
                body=query_body)["aggregations"]['all_keywords']['buckets']
            for item in buckets:
                keywords_dict[item['key']] = keywords_dict.get(item['key'], 0) + item['doc_count']
        # Stop once the start date has been processed. The second test ends
        # the walk when start_time lies after stop_time; without it the loop
        # would step back one day at a time forever.
        if current_date == start_date or ts <= start_time:
            break
        ts -= day_time

    return_dict = sorted(keywords_dict.items(), key=lambda x: x[1], reverse=True)[:AGGRAGATION_KEYWORDS_NUMBER]
    return return_dict
def get_origin_weibo_detail(ts, user, task_name, size, order, message_type=1):
    """Return the weibo stored for one sensing-task snapshot, grouped by duplicates.

    Args:
        ts: snapshot timestamp; used as the document id of the task record and
            to select the flow-text indices of ``ts`` and ``ts - DAY``.
        user: joined with ``task_name`` as ``user + '-' + task_name`` to form
            the doc_type of the task record.
        task_name: see ``user``.
        size: unused here; kept for interface compatibility.
        order: ``"total"``, ``"retweeted"`` or ``"comment"`` pre-sorts the
            weibo by that counter; any other value keeps the stored order.
            The final ranking is applied afterwards with a stable sort, so
            ``order`` only decides between rows that tie on that ranking.
        message_type: 1 reads ``origin_weibo_detail``, 2 reads
            ``retweeted_weibo_detail``, anything else reads
            ``sensitive_weibo_detail``.

    Returns:
        A list of groups, each group being a list of rows. Every weibo starts
        as its own group; weibo that ``duplicate_dict`` puts together are
        merged into the group of the highest-ranked one. A row is
        ``[uid, nick_name, photo_url, text, sentiment, date, geo,
        message_type, retweeted, comment, sensitive, timestamp,
        mid_value[mid], mid]``. An empty list is returned when no text is
        found.
    """
    _id = user + '-' + task_name
    task_detail = es_user_portrait.get(index=index_sensing_task, doc_type=_id, id=ts)['_source']

    mid_value = json.loads(task_detail['mid_topic_value'])
    duplicate_dict = json.loads(task_detail['duplicate_dict'])
    # Group the mids by the value they map to; each group also holds that
    # value itself.
    tmp_duplicate_dict = dict()
    for k, v in duplicate_dict.items():
        if v in tmp_duplicate_dict:
            tmp_duplicate_dict[v].append(k)
        else:
            tmp_duplicate_dict[v] = [k, v]

    if message_type == 1:
        weibo_detail = json.loads(task_detail['origin_weibo_detail'])
    elif message_type == 2:
        weibo_detail = json.loads(task_detail['retweeted_weibo_detail'])
    else:
        weibo_detail = json.loads(task_detail['sensitive_weibo_detail'])
    # Rows of [mid, item[mid], retweeted, comment]; item[mid] is the value
    # the "total" order sorts on.
    weibo_detail_list = []
    if weibo_detail:
        for iter_mid, item in weibo_detail.items():
            weibo_detail_list.append([iter_mid, item[iter_mid], item['retweeted'], item['comment']])
    mid_list = list(weibo_detail.keys())

    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "terms": {"mid": mid_list}
                }
            }
        },
        "size": 1000,
        "sort": {"timestamp": {"order": "desc"}}
    }

    # Search the daily indices of ts and of the day before, when they exist.
    index_list = []
    for day_ts in (ts, ts - DAY):
        index_name = flow_text_index_name_pre + ts2datetime(day_ts)
        if es_text.indices.exists(index_name):
            index_list.append(index_name)

    if index_list and mid_list:
        search_results = es_text.search(index=index_list, doc_type=flow_text_index_type, body=query_body)["hits"]["hits"]
    else:
        search_results = []
    if not search_results:
        return []

    uid_list = []
    text_dict = dict()  # text information, keyed by mid
    portrait_dict = dict()  # profile information, keyed by uid
    for item in search_results:
        uid_list.append(item["_source"]['uid'])
        text_dict[item['_id']] = item['_source']  # _id is the mid
    portrait_result = es_profile.mget(index=profile_index_name, doc_type=profile_index_type, body={"ids": uid_list}, fields=['nick_name', 'photo_url'])["docs"]
    for item in portrait_result:
        if item['found']:
            portrait_dict[item['_id']] = {"nick_name": item["fields"]["nick_name"][0], "photo_url": item["fields"]["photo_url"][0]}
        else:
            portrait_dict[item['_id']] = {"nick_name": item['_id'], "photo_url": ""}

    if order == "total":
        sorted_list = sorted(weibo_detail_list, key=lambda x: x[1], reverse=True)
    elif order == "retweeted":
        sorted_list = sorted(weibo_detail_list, key=lambda x: x[2], reverse=True)
    elif order == "comment":
        sorted_list = sorted(weibo_detail_list, key=lambda x: x[3], reverse=True)
    else:
        sorted_list = weibo_detail_list

    results = []
    for item in sorted_list:
        mid = item[0]
        iter_text = text_dict.get(mid, {})
        if not iter_text:
            # no text was found for this mid in the searched indices
            continue
        uid = iter_text['uid']
        temp = [uid]
        iter_portrait = portrait_dict.get(uid, {})
        if iter_portrait:
            temp.append(iter_portrait['nick_name'])
            temp.append(iter_portrait['photo_url'])
        else:
            temp.extend([uid, ''])
        temp.append(iter_text["text"])
        temp.append(iter_text["sentiment"])
        temp.append(ts2date(iter_text['timestamp']))
        temp.append(iter_text['geo'])
        if message_type == 1:
            temp.append(1)
        elif message_type == 2:
            temp.append(3)
        else:
            temp.append(iter_text['message_type'])
        temp.append(item[2])
        temp.append(item[3])
        temp.append(iter_text.get('sensitive', 0))
        temp.append(iter_text['timestamp'])
        temp.append(mid_value[mid])
        temp.append(mid)
        results.append(temp)

    # Rank by sensitive (-4), then mid_value[mid] (-2), then retweeted (-6).
    results = sorted(results, key=operator.itemgetter(-4, -2, -6), reverse=True)
    sort_results = [[item] for item in results]
    mid_index_dict = dict((item[-1], position) for position, item in enumerate(results))

    # Merge every duplicate group into its highest-ranked member.
    dropped = set()
    for group in tmp_duplicate_dict.values():
        positions = []
        for mid in group:
            # Membership test, not truthiness: rank 0 is a valid position and
            # must be able to act as the head of a group.
            if mid in mid_index_dict and mid_index_dict[mid] not in positions:
                positions.append(mid_index_dict[mid])
        if len(positions) < 2:
            continue
        head = min(positions)
        for position in positions:
            if position != head:
                sort_results[head].extend(sort_results[position])
                dropped.add(position)
    if dropped:
        # Drop by position so an entry absorbed by several groups is removed
        # exactly once.
        sort_results = [entry for position, entry in enumerate(sort_results) if position not in dropped]

    return sort_results
Пример #49
0
def get_retweet_weibo_detail(ts, user, task_name, size, text_type, type_value):
    """List the weibo that descend from a sensing task's tracked weibo.

    Looks up the task record ``user + '-' + task_name`` at ``ts``, takes the
    mids of its origin and retweeted weibo, and searches one daily flow-text
    index for documents whose ``root_mid`` is among them and whose timestamp
    lies in ``[ts - time_interval, ts)``.

    Args:
        ts: snapshot timestamp (also the document id of the task record).
        user: first part of the task record's doc_type.
        task_name: second part of the task record's doc_type.
        size: unused here; the query always asks for 100 hits.
        text_type: ``"message_type"`` or ``"sentiment"`` adds a filter on that
            field; any other value adds none.
        type_value: value for that filter. For ``"sentiment"`` a value of
            length 1 is sent as ``term``, anything else as ``terms``.

    Returns:
        A list of rows ``[uid, nick_name, photo_url, text, sentiment, date,
        geo, message_type]``, newest first; empty when the index is missing
        or nothing matches.
    """
    task_detail = es_user_portrait.get(index=index_sensing_task,
                                       doc_type=user + '-' + task_name,
                                       id=ts)['_source']
    origin_weibo_detail = json.loads(task_detail['origin_weibo_detail'])
    retweeted_weibo_detail = json.loads(task_detail['retweeted_weibo_detail'])
    mid_list = list(origin_weibo_detail.keys())
    mid_list += retweeted_weibo_detail.keys()

    filter_clauses = [
        {"range": {"timestamp": {"gte": ts - time_interval, "lt": ts}}},
        {"terms": {"root_mid": mid_list}},
    ]
    if text_type == "message_type":
        filter_clauses.append({"term": {text_type: type_value}})
    if text_type == "sentiment":
        clause_kind = "term" if len(type_value) == 1 else "terms"
        filter_clauses.append({clause_kind: {text_type: type_value}})
    query_body = {
        "query": {"filtered": {"filter": {"bool": {"must": filter_clauses}}}},
        "sort": {"timestamp": {"order": "desc"}},
        "size": 100
    }

    date_now = ts2datetime(ts)
    date_before = ts2datetime(ts - time_interval)
    index_now = flow_text_index_name_pre + date_now
    has_index_now = es_text.indices.exists(index_now)
    index_before = flow_text_index_name_pre + date_before
    has_index_before = es_text.indices.exists(index_before)

    # 1. query the weibo: the window's own day when it does not cross
    #    midnight, otherwise the day the window starts in
    if date_now == date_before:
        target_index = index_now if has_index_now else None
    else:
        target_index = index_before if has_index_before else None
    if target_index is None:
        return []
    hits = es_text.search(index=target_index,
                          doc_type=flow_text_index_type,
                          body=query_body)["hits"]["hits"]
    if not hits:
        return []

    # 2. fetch the profile of every author; docs are read by hit position
    uid_list = [hit["_source"]['uid'] for hit in hits]
    portrait_result = es_profile.mget(index=profile_index_name,
                                      doc_type=profile_index_type,
                                      body={"ids": uid_list},
                                      fields=['nick_name', 'photo_url'])["docs"]

    results = []
    for position, hit in enumerate(hits):
        source = hit['_source']
        row = [source['uid']]
        profile = portrait_result[position]
        if profile['found']:
            row.append(profile["fields"]["nick_name"][0])
            row.append(profile["fields"]["photo_url"][0])
        else:
            # no profile: fall back to the uid as name and an empty photo url
            row.append(source['uid'])
            row.append("")
        row.append(source["text"])
        row.append(source["sentiment"])
        row.append(ts2date(source['timestamp']))
        row.append(source['geo'])
        row.append(source["message_type"])
        results.append(row)

    return results
def influenced_detail(uid, date, style):
    """Return the text details of a user's 20 most retweeted or commented weibo.

    Args:
        uid: user whose weibo are searched.
        date: date string appended to ``"flow_text_"`` to name the index; also
            passed on to ``get_text``.
        style: converted with ``int()``. 0 ranks the user's original weibo
            (``message_type`` 1) by ``retweeted``, 1 ranks them by
            ``comment``, 2 ranks the user's retweets (``message_type`` 3) by
            ``retweeted``, and any other value ranks the retweets by
            ``comment``.

    Returns:
        Whatever ``get_text`` returns for the top 20
        ``[mid, retweeted, comment]`` triples.
    """
    style = int(style)
    index_text = "flow_text_" + date
    message_type = 1 if style in (0, 1) else 3
    sort_field = "retweeted" if style in (0, 2) else "comment"

    # Only the message type that this style ranks is queried; the other
    # 10000-hit result set would be discarded anyway.
    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [
                            {"term": {"message_type": message_type}},
                            {"term": {"uid": uid}}
                        ]
                    }
                }
            }
        },
        "size": 10000
    }
    hits = es.search(index=index_text, doc_type="text", body=query_body)['hits']['hits']

    # A missing counter is treated as 0.
    weibo_set = [
        [item['_id'], item['_source'].get("retweeted", 0), item['_source'].get("comment", 0)]
        for item in hits
    ]
    sort_column = 1 if sort_field == "retweeted" else 2
    sorted_list = sorted(weibo_set, key=lambda x: x[sort_column], reverse=True)
    return get_text(sorted_list[:20], date, style)
Пример #51
0
def get_sensitive_text_detail(task_name, ts, user, order):
    """Return the sensitive weibo stored for one sensing-task snapshot.

    Args:
        task_name: joined with ``user`` as ``user + '-' + task_name`` to form
            the doc_type of the task record.
        ts: snapshot timestamp; used as the document id of the task record and
            to select the flow-text indices of ``ts`` and ``ts - DAY``.
        user: see ``task_name``.
        order: ``"total"``, ``"retweeted"`` or ``"comment"`` sorts by that
            counter of the task record; ``"ts"`` sorts the finished rows by
            their date column and ``"sensitive"`` by their sensitive column.
            All sorts are descending; any other value keeps the stored order.

    Returns:
        A list of rows ``[uid, nick_name, photo_url, text, sentiment, date,
        geo, message_type, retweeted, comment, sensitive]``; empty when no
        text is found.
    """
    _id = user + '-' + task_name
    task_detail = es.get(index=index_sensing_task, doc_type=_id, id=ts)['_source']
    weibo_detail = json.loads(task_detail['sensitive_weibo_detail'])

    # Rows of [mid, item[mid], retweeted, comment]; item[mid] is the value
    # the "total" order sorts on.
    weibo_detail_list = []
    if weibo_detail:
        for iter_mid, item in weibo_detail.items():
            weibo_detail_list.append([iter_mid, item[iter_mid], item['retweeted'], item['comment']])
    mid_list = list(weibo_detail.keys())

    results = []
    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "terms": {"mid": mid_list}
                }
            }
        },
        # Without an explicit size Elasticsearch returns only 10 hits, which
        # silently drops every further requested mid. Same cap as
        # get_origin_weibo_detail.
        "size": 1000
    }

    # Search the daily indices of ts and of the day before, when they exist.
    index_list = []
    for day_ts in (ts, ts - DAY):
        index_name = flow_text_index_name_pre + ts2datetime(day_ts)
        if es_text.indices.exists(index_name):
            index_list.append(index_name)

    if index_list and mid_list:
        search_results = es_text.search(index=index_list, doc_type=flow_text_index_type, body=query_body)["hits"]["hits"]
    else:
        search_results = []
    if not search_results:
        return results

    uid_list = []
    text_dict = dict()  # text information, keyed by mid
    portrait_dict = dict()  # profile information, keyed by uid
    for item in search_results:
        uid_list.append(item["_source"]['uid'])
        text_dict[item['_id']] = item['_source']  # _id is the mid
    portrait_result = es_profile.mget(index=profile_index_name, doc_type=profile_index_type, body={"ids": uid_list}, fields=['nick_name', 'photo_url'])["docs"]
    for item in portrait_result:
        if item['found']:
            portrait_dict[item['_id']] = {"nick_name": item["fields"]["nick_name"][0], "photo_url": item["fields"]["photo_url"][0]}
        else:
            portrait_dict[item['_id']] = {"nick_name": item['_id'], "photo_url": ""}

    if order == "total":
        sorted_list = sorted(weibo_detail_list, key=lambda x: x[1], reverse=True)
    elif order == "retweeted":
        sorted_list = sorted(weibo_detail_list, key=lambda x: x[2], reverse=True)
    elif order == "comment":
        sorted_list = sorted(weibo_detail_list, key=lambda x: x[3], reverse=True)
    else:
        sorted_list = weibo_detail_list

    for item in sorted_list:
        mid = item[0]
        iter_text = text_dict.get(mid, {})
        if not iter_text:
            # no text was found for this mid in the searched indices
            continue
        uid = iter_text['uid']
        temp = [uid]
        iter_portrait = portrait_dict.get(uid, {})
        if iter_portrait:
            temp.append(iter_portrait['nick_name'])
            temp.append(iter_portrait['photo_url'])
        else:
            temp.extend([uid, ''])
        temp.append(iter_text["text"])
        temp.append(iter_text["sentiment"])
        temp.append(ts2date(iter_text['timestamp']))
        temp.append(iter_text['geo'])
        temp.append(iter_text['message_type'])
        temp.append(item[2])
        temp.append(item[3])
        temp.append(iter_text.get('sensitive', 0))
        results.append(temp)

    if results and order == "ts":
        # column 5 is the ts2date() rendering of the timestamp
        results = sorted(results, key=lambda x: x[5], reverse=True)

    if results and order == "sensitive":
        results = sorted(results, key=lambda x: x[-1], reverse=True)

    return results