def end_track_task(task_name):
    status = 0
    try:
        task_exist = es.get(index=index_name, doc_type=index_type, id=task_name)['_source']
    except Exception:
        return 'task name does not exist'
    task_status = task_exist['status']
    if task_status == 0:
        return 'task has already ended'
    else:
        task_exist['status'] = 0
        # compute the end time, aligned to the next 15-minute boundary
        now_ts = time.time()
        now_date = ts2datetime(now_ts)
        now_date_ts = datetime2ts(now_date)
        time_segment = int((now_ts - now_date_ts) / 900) + 1
        end_ts = now_date_ts + time_segment * 900
        end_date = ts2date(end_ts)
        task_exist['end_date'] = end_date
        task_user = task_exist['uid_list']
        status = change_user_count(task_user)
        if status == 0:
            return 'failed to change user task count'
        else:
            es.index(index=index_name, doc_type=index_type, id=task_name, body=task_exist)
            status = delete_task_redis(task_name)
            if status == 0:
                return 'failed to delete task from redis'
            else:
                return 'successfully changed task status to end'
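The end-time arithmetic above rounds the current timestamp up to the next 15-minute (900-second) boundary. A minimal, self-contained sketch of that calculation; the ts2datetime/datetime2ts stand-ins below only mimic the project's own time_utils helpers:

import time

# stand-ins for the project's helpers (assumption: they map a unix timestamp
# to a 'YYYY-MM-DD' string and back to that day's midnight timestamp)
def ts2datetime(ts):
    return time.strftime('%Y-%m-%d', time.localtime(ts))

def datetime2ts(date):
    return int(time.mktime(time.strptime(date, '%Y-%m-%d')))

def next_segment_ts(now_ts):
    now_date_ts = datetime2ts(ts2datetime(now_ts))        # midnight of the current day
    time_segment = int((now_ts - now_date_ts) / 900) + 1  # elapsed 900s slots, rounded up
    return now_date_ts + time_segment * 900

now = time.time()
print(next_segment_ts(now) - now)  # seconds until the next 15-minute boundary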
def get_activity_weibo(task_name,
                       submit_user,
                       start_ts,
                       time_segment=FOUR_HOUR):
    results = []
    #step1: get task_name uid
    task_id = submit_user + task_name
    try:
        group_result = es_group_result.get(index=group_index_name, doc_type=group_index_type ,\
                id=task_id, _source=False, fields=['uid_list'])
    except:
        group_result = {}
    if group_result == {}:
        return 'task name invalid'
    try:
        uid_list = group_result['fields']['uid_list']
    except:
        uid_list = []
    if uid_list == []:
        return 'task uid list null'
    #step2: get uid2uname
    uid2uname = {}
    try:
        user_portrait_result = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type, \
                body = {'ids':uid_list}, _source=False, fields=['uname'])['docs']
    except:
        user_portrait_result = []
    for item in user_portrait_result:
        uid = item['_id']
        if item['found']:
            uname = item['fields']['uname'][0]
        else:
            uname = uid  # fall back to the uid when no portrait doc exists
        uid2uname[uid] = uname
    #step3: search time_segment weibo
    end_ts = start_ts + time_segment
    time_date = ts2datetime(start_ts)
    flow_text_index_name = flow_text_index_name_pre + time_date
    query = []
    query.append({'terms': {'uid': uid_list}})
    query.append({'range': {'timestamp': {'gte': start_ts, 'lt': end_ts}}})
    try:
        flow_text_es_result = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type, \
                body={'query':{'bool':{'must':query}}, 'sort':'timestamp', 'size':MAX_VALUE})['hits']['hits']
    except:
        flow_text_es_result = []
    for item in flow_text_es_result:
        weibo = {}
        source = item['_source']
        weibo['timestamp'] = ts2date(source['timestamp'])
        weibo['ip'] = source['ip']
        weibo['text'] = source['text']
        if source['geo']:
            # geo is an '&'-separated string in flow_text
            weibo['geo'] = '\t'.join(source['geo'].split('&'))
        else:
            weibo['geo'] = ''
        results.append(weibo)

    return results
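The uid2uname step is an mget with a found-check; when the portrait document is missing, the code above now falls back to the uid itself. A tiny runnable sketch of that mapping over a mocked mget response:

# docs below mimic the shape of an elasticsearch mget response
docs = [
    {'_id': 'u1', 'found': True, 'fields': {'uname': ['alice']}},
    {'_id': 'u2', 'found': False},
]
uid2uname = {}
for item in docs:
    uid = item['_id']
    uname = item['fields']['uname'][0] if item['found'] else uid
    uid2uname[uid] = uname
print(uid2uname)  # {'u1': 'alice', 'u2': 'u2'}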
def get_influence_content(uid, timestamp_from, timestamp_to):
    weibo_list = []
    # split timestamp range to new_range_dict_list
    from_date_ts = datetime2ts(ts2datetime(timestamp_from))
    to_date_ts = datetime2ts(ts2datetime(timestamp_to))
    new_range_dict_list = []
    if from_date_ts != to_date_ts:
        iter_date_ts = from_date_ts
        while iter_date_ts < to_date_ts:
            iter_next_date_ts = iter_date_ts + DAY
            new_range_dict_list.append({"range": {"timestamp": {"gte": iter_date_ts, "lt": iter_next_date_ts}}})
            iter_date_ts = iter_next_date_ts
        if new_range_dict_list[0]["range"]["timestamp"]["gte"] < timestamp_from:
            new_range_dict_list[0]["range"]["timestamp"]["gte"] = timestamp_from
        if new_range_dict_list[-1]["range"]["timestamp"]["lt"] > timestamp_to:
            new_range_dict_list[-1]["range"]["timestamp"]["lt"] = timestamp_to
    else:
        new_range_dict_list = [{"range": {"timestamp": {"gte": timestamp_from, "lt": timestamp_to}}}]
    # iter date to search flow_text
    iter_result = []
    for range_item in new_range_dict_list:
        range_from_ts = range_item["range"]["timestamp"]["gte"]
        range_from_date = ts2datetime(range_from_ts)
        flow_text_index_name = flow_text_index_name_pre + range_from_date
        query = []
        query.append({"term": {"uid": uid}})
        query.append(range_item)
        try:
            flow_text_exist = es_flow_text.search(
                index=flow_text_index_name,
                doc_type=flow_text_index_type,
                body={"query": {"bool": {"must": query}}, "sort": [{"timestamp": "asc"}]},
            )["hits"]["hits"]
        except:
            flow_text_exist = []
        iter_result.extend(flow_text_exist)
    # get weibo list
    for item in iter_result:  # iterate all ranges, not just the last batch
        source = item["_source"]
        weibo = {}
        weibo["timestamp"] = ts2date(source["timestamp"])
        weibo["ip"] = source["ip"]
        weibo["text"] = source["text"]
        if source["geo"]:
            weibo["geo"] = "\t".join(source["geo"].split("&"))
        else:
            weibo["geo"] = ""
        weibo_list.append(weibo)

    return weibo_list
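get_influence_content splits [timestamp_from, timestamp_to) into per-day windows because flow_text lives in one index per day. A standalone sketch of the splitting, assuming DAY is 86400 and using local stand-ins for the date helpers; note that, like the original, the partial day after the last midnight is not covered when the span crosses days:

import time

DAY = 3600 * 24

def ts2datetime(ts):
    return time.strftime('%Y-%m-%d', time.localtime(ts))

def datetime2ts(date):
    return int(time.mktime(time.strptime(date, '%Y-%m-%d')))

def split_range_by_day(timestamp_from, timestamp_to):
    from_date_ts = datetime2ts(ts2datetime(timestamp_from))
    to_date_ts = datetime2ts(ts2datetime(timestamp_to))
    if from_date_ts == to_date_ts:
        return [(timestamp_from, timestamp_to)]
    ranges = []
    iter_ts = from_date_ts
    while iter_ts < to_date_ts:
        ranges.append((iter_ts, iter_ts + DAY))
        iter_ts += DAY
    # clamp the first window to the requested start
    ranges[0] = (max(ranges[0][0], timestamp_from), ranges[0][1])
    return ranges

start = datetime2ts('2013-09-01') + 3600
print(split_range_by_day(start, datetime2ts('2013-09-03') + 7200))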
def get_text(top_list, date, order):
    # input: [[mid1, no.1], [mid2, no.2], [mid3, no.3]]
    # output: [[text1, no.1], [text2, no.2], [text3, no.3]]
    # row fields: mid, retweeted, comment, text, geo, timestamp, sentiment, mid_url
    results = []
    index_flow_text = pre_text_index + date
    #index_list = get_text_index(date)
    if len(top_list) != 0:  # skip the search entirely when top_list is empty
        mid_list = []
        for item in top_list:
            mid_list.append(item[0])
        search_result = es.mget(index=index_flow_text, doc_type=flow_text_index_type, body={"ids":mid_list})["docs"]
        for i in range(len(top_list)):
            temp = []
            temp.extend(top_list[i])
            if search_result[i]['found']:
                source = search_result[i]['_source']
                temp.append(source["text"])
                temp.append(source["geo"])
                temp.append(ts2date(source["timestamp"]))
                temp.append(source["sentiment"])
                temp.append(weiboinfo2url(source['uid'], source['mid']))
                temp.append(uid_url+source['uid'])
                temp.append(source['uid'])
                try:
                    uname = es_profile.get(index=profile_index_name, doc_type=profile_index_type, id=source['uid'])["_source"]["nick_name"]
                    temp.append(uname)
                except:
                    temp.append(source['uid'])
                #temp.append(source['sensitive_words_string'].replace("&", " "))
                temp.append(int(source["timestamp"]))
            else:
                temp.extend(["", "", "", "", "", "", "", "", time.time()])

            results.append(temp)

    if results:
        if int(order) == 1:    # sensitive
            results = sorted(results, key=lambda x:x[3], reverse=True)
        elif int(order) == 2:  # retweet
            results = sorted(results, key=lambda x:x[1], reverse=True)
        elif int(order) == 3:  # comment
            results = sorted(results, key=lambda x:x[2], reverse=True)
        else:                  # oldest first
            results = sorted(results, key=lambda x:x[-1], reverse=False)

    return results
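The order switch in get_text re-sorts the assembled rows by a positional column. A dummy-data sketch of the three orderings (the column positions follow the comments above and are otherwise an assumption):

# rows: [mid, retweeted, comment, sensitive, ..., timestamp]
rows = [
    ['mid1', 10, 2, 0.9, 1500000000],
    ['mid2', 30, 7, 0.1, 1500000300],
    ['mid3', 20, 1, 0.5, 1500000600],
]

def sort_rows(rows, order):
    if int(order) == 1:    # sensitive
        return sorted(rows, key=lambda x: x[3], reverse=True)
    elif int(order) == 2:  # retweet
        return sorted(rows, key=lambda x: x[1], reverse=True)
    elif int(order) == 3:  # comment
        return sorted(rows, key=lambda x: x[2], reverse=True)
    return sorted(rows, key=lambda x: x[-1])  # oldest first

print([r[0] for r in sort_rows(rows, 2)])  # ['mid2', 'mid3', 'mid1']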
def full_text_search(keywords, uid, start_time, end_time, size):
    results = []
    uid_list = []
    user_profile_list = []
    query_body = {
        "query": {
            "filtered":{
                "filter":{
                    "bool": {
                        "must": []
                    }
                }
            }
        },
        "size":size,
        "sort":{"timestamp":{"order": 'desc'}}
    }

    if RUN_TYPE:
        query_body["sort"] = {"user_fansnum":{"order": 'desc'}}

    if uid:
        query_body["query"]["filtered"]["filter"]["bool"]["must"].append({"term":{"uid":uid}})

    if keywords:
        keywords_list = keywords.split(',')
        for word in keywords_list:
            query_body["query"]["filtered"]["filter"]["bool"]["must"].append({'wildcard':{'text':{'wildcard':'*'+word+'*'}}})

    index_list = []
    exist_bool = es_flow_text.indices.exists(index="flow_text_"+end_time)
    if start_time:
        start_ts = datetime2ts(start_time)
        end_ts = datetime2ts(end_time)
        ts = end_ts
        while 1:
            index_name = "flow_text_"+ts2datetime(ts)
            exist_bool = es_flow_text.indices.exists(index=index_name)
            if exist_bool:
                index_list.append(index_name)
            if ts == start_ts:
                break
            else:
                ts -= 3600*24

    print index_list
    # no usable flow_text index in the requested range
    if not index_list:
        return [], []

    search_results = es_flow_text.search(index=index_list, doc_type="text", body=query_body)["hits"]["hits"]
    for item in search_results:
        uid_list.append(item['_source']['uid'])
    user_info = []
    bci_results = []
    sensitive_results = []
    if uid_list:
        history_max = get_history_max()
        personal_field = ["nick_name", "fansnum", "statusnum", "user_location"]
        user_info = get_user_profile(uid_list, personal_field)
        bci_results = ES_CLUSTER_FLOW1.mget(index="bci_history", doc_type="bci", body={"ids":uid_list}, _source=False, fields=["bci_day_last"])["docs"]
        sensitive_results = es_sensitive_history.mget(index="sensitive_history", doc_type="sensitive", body={"ids":uid_list}, _source=False, fields=["last_value"])["docs"]

    count = 0
    for item in search_results:
        item = item['_source']
        iter_item = []
        iter_item.append(item['uid'])
        iter_item.append(user_info[count][1])
        iter_item.append(item['text'])
        iter_item.append(ts2date(item['timestamp']))
        iter_item.append(item['geo'])
        if item.get("sensitive_words_string", ''):
            iter_item.append(item['sensitive_words_string'].split('&'))
        else:
            iter_item.append([])
        iter_item.append(item.get('retweeted', 0))
        iter_item.append(item.get('comment', 0))
        count += 1
        results.append(iter_item)

    user_set = set()
    count = 0
    for item in user_info:
        if item[0] in user_set:
            continue
        else:
            user_set.add(item[0])
        if bci_results[count]["found"]:
            bci_value = bci_results[count]["fields"]["bci_day_last"][0]
            item.append(normalize_index(bci_value, history_max["max_bci"]))
        else:
            item.append(0)
        if sensitive_results[count]["found"]:
            sensitive_value = sensitive_results[count]['fields']['last_value'][0]
            item.append(normalize_index(sensitive_value, history_max["max_sensitive"]))
        else:
            item.append(0)
        user_profile_list.append(item)

    return results, user_profile_list
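full_text_search ANDs an optional uid term with one wildcard clause per comma-separated keyword. A sketch that only builds and prints the ES 1.x 'filtered' body used by this variant:

import json

def build_query_body(keywords, uid, size=100):
    must = []
    if uid:
        must.append({'term': {'uid': uid}})
    if keywords:
        for word in keywords.split(','):
            # substring match; leading wildcards are expensive on big indices
            must.append({'wildcard': {'text': {'wildcard': '*' + word + '*'}}})
    return {
        'query': {'filtered': {'filter': {'bool': {'must': must}}}},
        'size': size,
        'sort': {'timestamp': {'order': 'desc'}},
    }

print(json.dumps(build_query_body('foo,bar', '12345'), indent=2))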
# variant of full_text_search using a plain bool query, with extra guards and an in-portrait flag
def full_text_search(keywords, uid, start_time, end_time, size):
    results = []
    uid_list = []
    user_profile_list = []
    query_body = {
        "query": {
                    "bool": {
                        "must": []
                    }
        },
        "size":size,
        "sort":{"timestamp":{"order": 'desc'}}
    }

    if RUN_TYPE:
        query_body["sort"] = {"user_fansnum":{"order": 'desc'}}

    if uid:
        query_body["query"]["bool"]["must"].append({"term":{"uid":uid}})

    if keywords:
        keywords_list = keywords.split(',')
        for word in keywords_list:
            query_body["query"]["bool"]["must"].append({'wildcard':{'text':{'wildcard':'*'+word+'*'}}})

    index_list = []
    exist_bool = es_flow_text.indices.exists(index="flow_text_"+end_time)
    if start_time:
        start_ts = datetime2ts(start_time)
        end_ts = datetime2ts(end_time)
        ts = end_ts
        while 1:
            index_name = "flow_text_"+ts2datetime(ts)
            exist_bool = es_flow_text.indices.exists(index=index_name)
            if exist_bool:
                index_list.append(index_name)
            if ts == start_ts:
                break
            else:
                ts -= 3600*24

    print index_list
    # no usable flow_text index in the requested range
    if not index_list:
        return [], []

    search_results = es_flow_text.search(index=index_list, doc_type="text", body=query_body)["hits"]["hits"]
    for item in search_results:
        uid_list.append(item['_source']['uid'])
    user_info = []
    if uid_list:
        history_max = get_history_max()
        personal_field = ["nick_name", "fansnum", "statusnum","user_location"]
        user_info = get_user_profile(uid_list, personal_field)
        bci_results = ES_CLUSTER_FLOW1.mget(index="bci_history", doc_type="bci", body={"ids":uid_list}, _source=False, fields=["bci_day_last"])["docs"]
        in_portrait = es_user_portrait.mget(index="sensitive_user_portrait", doc_type="user", body={"ids":uid_list}, _source=False)["docs"]
        sensitive_results = es_sensitive_history.mget(index="sensitive_history", doc_type="sensitive", body={"ids":uid_list}, _source=False, fields=["last_value"])["docs"]
    print "len search: ", len(search_results)

    count = 0
    # uid uname text date geo sensitive_words retweeted comment
    for item in search_results:
        item = item['_source']
        iter_item = []
        iter_item.append(item['uid'])
        iter_item.append(user_info[count][1])
        iter_item.append(item['text'])
        iter_item.append(ts2date(item['timestamp']))
        iter_item.append(item['geo'])
        if item.get("sensitive_words_string", ''):
            iter_item.append(item['sensitive_words_string'].split('&'))
        else:
            iter_item.append([])
        iter_item.append(item.get('retweeted', 0))
        iter_item.append(item.get('comment', 0))
        count += 1
        results.append(iter_item)

    user_set = set()
    count = 0
    # uid "nick_name", "fansnum", "statusnum","user_location", bci, sensitive
    for item in user_info:
        if item[0] in user_set:
            continue
        else:
            user_set.add(item[0])
        if bci_results[count]["found"]:
            if bci_results[count].has_key("fields"):
                bci_value = bci_results[count]["fields"]["bci_day_last"][0]
            else:
                bci_value = 0
            item.append(normalize_index(bci_value, history_max["max_bci"]))
        else:
            item.append(0)
        if sensitive_results[count]["found"]:
            if sensitive_results[count].has_key("fields"):
                sensitive_value = sensitive_results[count]['fields']['last_value'][0]
            else:
                sensitive_value = 0
            item.append(normalize_index(sensitive_value, history_max["max_sensitive"]))
        else:
            item.append(0)
        if in_portrait[count]["found"]:
            item.append("1")
        else:
            item.append("0")
        user_profile_list.append(item)

    return results, user_profile_list
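Both variants walk backwards one day at a time from end_time to start_time and keep only index names that actually exist. A sketch of the walk with the existence check left out (stand-in date helpers as before):

import time

def ts2datetime(ts):
    return time.strftime('%Y-%m-%d', time.localtime(ts))

def datetime2ts(date):
    return int(time.mktime(time.strptime(date, '%Y-%m-%d')))

def candidate_indices(start_time, end_time, prefix='flow_text_'):
    ts = datetime2ts(end_time)
    start_ts = datetime2ts(start_time)
    names = []
    while ts >= start_ts:          # newest index first, same as the loop above
        names.append(prefix + ts2datetime(ts))
        ts -= 3600 * 24
    return names

print(candidate_indices('2013-09-01', '2013-09-03'))
# ['flow_text_2013-09-03', 'flow_text_2013-09-02', 'flow_text_2013-09-01']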
def compute_mid_result(task_name, task_submit_date):
    result = {'count_0':{}, 'count_1':{}, 'sentiment_0_126':{}, 'sentiment_0_127':{}, 'sentiment_0_128':{},\
            'sentiment_0_129':{}, 'sentiment_0_130':{}, 'sensitive_score':{}, 'geo_0':{}, 'geo_1':{},\
            'hashtag_0':{}, 'hashtag_1':{}, 'sentiment_1_126':{}, 'sentiment_1_127':{}, \
            'sentiment_1_128':{}, 'sentiment_1_129':{}, 'sentiment_1_130':{}}
    #geo & hashtag: day
    #other: 15min
    search_time_segment = 3600 * 4
    #start_ts = datetime2ts(task_submit_date)
    start_ts = date2ts(task_submit_date)
    now_ts = time.time()
    now_date = ts2datetime(now_ts)
    #test
    now_ts = datetime2ts('2013-09-08')
    date_ts = datetime2ts(now_date)
    segment = int((now_ts - date_ts) / 900) + 1
    end_ts = date_ts + segment * 900
    #every search time-range: 4 hour----bulk action to search
    begin_ts = start_ts

    while True:
        if begin_ts >= end_ts:
            break
        compute_ts = ts2date(begin_ts)
        #print 'compute ts:', compute_ts
        query_body = {'range':{'timestamp':{'from': begin_ts, 'to':begin_ts+search_time_segment}}}
        try:
            mid_result_list = es.search(index=monitor_index_name, doc_type=task_name, body={'query':query_body, 'size':100000, 'sort':[{'timestamp':{'order': 'asc'}}]})['hits']['hits']
        except Exception, e:
            raise e
        if mid_result_list:
            for mid_result_item in mid_result_list:
                result_item = mid_result_item['_source']
                timestamp = result_item['timestamp']
                #attr_count
                #print 'compute_count'
                count_dict = json.loads(result_item['count'])
                for sensitive in count_dict:
                    count_key = 'count_' + sensitive
                    result[count_key][str(timestamp)] = count_dict[sensitive]
                #attr_sentiment
                #print 'compute_sentiment'
                sensitive_sentiment_dict = json.loads(result_item['sentiment'])
                for sensitive in sensitive_sentiment_dict:
                    sentiment_dict = sensitive_sentiment_dict[sensitive]
                    for sentiment in sentiment_dict:
                        sentiment_key = 'sentiment_'+sensitive+'_'+sentiment
                        result[sentiment_key][str(timestamp)] = sentiment_dict[sentiment]
                #attr_sensitive_score
                #print 'compute_sensitive_word'
                if 'sensitive_word' in result_item:
                    sensitive_word_dict = json.loads(result_item['sensitive_word'])
                else:
                    sensitive_word_dict = {}
                ts_word_score = 0
                for word in sensitive_word_dict:
                    #print 'word:', json.dumps(word.encode('utf-8')), word.encode('utf-8'), type(word.encode('utf-8'))
                    search_word = word.encode('utf-8')
                    #print 'search_word:', search_word, type(search_word)
                    try:
                        word_identify = json.loads(word_r.hget('sensitive_words', search_word))
                    except:
                        word_identify = [2]
                    ts_word_score += sensitive_word_dict[word] * word_identify[0]
                result['sensitive_score'][str(timestamp)] = ts_word_score
                #attr_geo
                #print 'compute geo'
                timestamp_date = ts2datetime(timestamp)
                sensitive_geo_dict = json.loads(result_item['geo'])
                for sensitive in sensitive_geo_dict:
                    if timestamp_date not in result['geo_'+sensitive]:
                        result['geo_'+sensitive][timestamp_date] = {}
                    geo_dict = sensitive_geo_dict[sensitive]
                    for geo in geo_dict:
                        try:
                            result['geo_'+sensitive][timestamp_date][geo] += geo_dict[geo]
                        except KeyError:
                            result['geo_'+sensitive][timestamp_date][geo] = geo_dict[geo]

                #attr_hashtag
                #print 'compute hashtag'
                if 'hashtag' in result_item:
                    sensitive_hashtag_dict = json.loads(result_item['hashtag'])
                else:
                    sensitive_hashtag_dict = {}
                    result['hashtag_0'][timestamp_date] = {}
                    result['hashtag_1'][timestamp_date] = {}
                for sensitive in sensitive_hashtag_dict:
                    if timestamp_date not in result['hashtag_'+sensitive]:
                        result['hashtag_'+sensitive][timestamp_date] = {}
                    hashtag_dict = sensitive_hashtag_dict[sensitive]
                    for hashtag in hashtag_dict:
                        try:
                            result['hashtag_'+sensitive][timestamp_date][hashtag] += hashtag_dict[hashtag]
                        except KeyError:
                            result['hashtag_'+sensitive][timestamp_date][hashtag] = hashtag_dict[hashtag]

        begin_ts += search_time_segment

    return result
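The sensitive_score accumulation weights each word's count by the first element of its identify record from the 'sensitive_words' redis hash, defaulting to 2 for unknown words. A self-contained sketch with an in-memory stand-in for word_r.hget:

import json

# stand-in for the redis hash; values are json-encoded lists whose first
# element is the word's weight
fake_redis = {'foo': json.dumps([3]), 'bar': json.dumps([1])}

def sensitive_score(sensitive_word_dict):
    total = 0
    for word, count in sensitive_word_dict.items():
        try:
            word_identify = json.loads(fake_redis[word])
        except KeyError:
            word_identify = [2]  # default weight for unknown words
        total += count * word_identify[0]
    return total

print(sensitive_score({'foo': 2, 'bar': 5, 'baz': 1}))  # 2*3 + 5*1 + 1*2 = 13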

# show weibo when click sentiment node
def get_sentiment_weibo(task_name, sentiment, timestamp):
    result = []
    #step1: get task user
    task_exist = identify_task(task_name)
    if not task_exist:
        return 'the task does not exist'
    task_user = task_exist['uid_list']
    #step2: search weibo by condition: task_user, sensitive_status, sentiment, timestamp
    # (the lines deriving nest_body_list and sensitive_status were lost in extraction;
    # one uid term per task user is the minimal reconstruction, and sensitive_status
    # is assumed to come from the clicked node)
    nest_body_list = [{'term': {'uid': uid}} for uid in task_user]
    sensitive_status = sentiment
    query_body = []
    query_body.append({'bool':{'should':nest_body_list}})
    # range search: timestamp ~ timestamp+900
    query_body.append({'range':{'timestamp':{'from':timestamp, 'to':timestamp+900}}})
    # match search: sensitive_status
    query_body.append({'term':{'sensitive': sensitive_status}})
    try:
        weibo_result = es.search(index=text_index_name, doc_type=text_index_type, \
                body={'query':{'bool':{'must': query_body}}, 'sort':[{'timestamp':{'order':'asc'}}], 'size':10000})['hits']['hits']
    except Exception, e:
        raise e
    for weibo_item in weibo_result:
        weibo_dict = weibo_item['_source']
        uid = weibo_dict['uid']
        uname = uid2uname(uid)
        date = ts2date(weibo_dict['timestamp'])
        geo = weibo_dict['geo']
        text = weibo_dict['text']
        result.append([uid, uname, date, geo, text])

    return result
def get_origin_weibo_detail(ts, user, task_name, size, order, message_type=1):
    _id = user + '-' + task_name
    task_detail = es_user_portrait.get(index=index_sensing_task, doc_type=_id, id=ts)['_source']

    mid_value = json.loads(task_detail['mid_topic_value'])
    duplicate_dict = json.loads(task_detail['duplicate_dict'])
    tmp_duplicate_dict = dict()
    for k,v in duplicate_dict.iteritems():
        try:
            tmp_duplicate_dict[v].append(k)
        except:
            tmp_duplicate_dict[v] = [k, v]

        

    if message_type == 1:
        weibo_detail = json.loads(task_detail['origin_weibo_detail'])
    elif message_type == 2:
        weibo_detail = json.loads(task_detail['retweeted_weibo_detail'])
    else:
        weibo_detail = json.loads(task_detail['sensitive_weibo_detail'])
    weibo_detail_list = []
    if weibo_detail:
        for iter_mid, item in weibo_detail.iteritems():
            tmp = []
            tmp.append(iter_mid)
            tmp.append(item[iter_mid])
            tmp.append(item['retweeted'])
            tmp.append(item['comment'])
            weibo_detail_list.append(tmp)
    mid_list = weibo_detail.keys()

    results = []
    query_body = {
        "query":{
            "filtered":{
                "filter":{
                    "terms":{"mid": mid_list}
                }
            }
        },
        "size": 1000,
        "sort": {"timestamp": {"order": "desc"}}
    }


    index_list = []
    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts-DAY)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    if exist_es:
        index_list.append(index_name)
    index_name_1 = flow_text_index_name_pre + datetime_1
    exist_es_1 = es_text.indices.exists(index_name_1)
    if exist_es_1:
        index_list.append(index_name_1)

    if index_list and mid_list:
        search_results = es_text.search(index=index_list, doc_type=flow_text_index_type, body=query_body)["hits"]["hits"]
    else:
        search_results = []

    uid_list = []
    text_dict = dict() # text info, keyed by mid
    portrait_dict = dict() # user portrait info
    sort_results = []
    if search_results:
        for item in search_results:
            uid_list.append(item["_source"]['uid'])
            text_dict[item['_id']] = item['_source'] # _id is the mid
        if uid_list:
            portrait_result = es_profile.mget(index=profile_index_name, doc_type=profile_index_type, body={"ids":uid_list}, fields=['nick_name', 'photo_url'])["docs"]
            for item in portrait_result:
                if item['found']:
                    portrait_dict[item['_id']] = {"nick_name": item["fields"]["nick_name"][0], "photo_url": item["fields"]["photo_url"][0]}
                else:
                    portrait_dict[item['_id']] = {"nick_name": item['_id'], "photo_url":""}

        if order == "total":
            sorted_list = sorted(weibo_detail_list, key=lambda x:x[1], reverse=True)
        elif order == "retweeted":
            sorted_list = sorted(weibo_detail_list, key=lambda x:x[2], reverse=True)
        elif order == "comment":
            sorted_list = sorted(weibo_detail_list, key=lambda x:x[3], reverse=True)
        else:
            sorted_list = weibo_detail_list

        count_n = 0
        results_dict = dict()
        mid_index_dict = dict()
        for item in sorted_list: # size
            mid = item[0]
            iter_text = text_dict.get(mid, {})
            temp = []
            # uid, nick_name, photo_url, text, sentiment, timestamp, geo, common_keywords, message_type
            if iter_text:
                uid = iter_text['uid']
                temp.append(uid)
                iter_portrait = portrait_dict.get(uid, {})
                if iter_portrait:
                    temp.append(iter_portrait['nick_name'])
                    temp.append(iter_portrait['photo_url'])
                else:
                    temp.extend([uid,''])
                temp.append(iter_text["text"])
                temp.append(iter_text["sentiment"])
                temp.append(ts2date(iter_text['timestamp']))
                temp.append(iter_text['geo'])
                if message_type == 1:
                    temp.append(1)
                elif message_type == 2:
                    temp.append(3)
                else:
                    temp.append(iter_text['message_type'])
                temp.append(item[2])
                temp.append(item[3])
                temp.append(iter_text.get('sensitive', 0))
                temp.append(iter_text['timestamp'])
                temp.append(mid_value[mid])
                temp.append(mid)
                results.append(temp)
            count_n += 1

        results = sorted(results, key=operator.itemgetter(-4, -2, -6), reverse=True) # sensitive flag, topic value, retweet count
        sort_results = []
        count = 0
        for item in results:
            sort_results.append([item])
            mid_index_dict[item[-1]] = count
            count += 1

        
        if tmp_duplicate_dict:
            remove_list = []
            value_list = tmp_duplicate_dict.values() # [[mid, mid], ]
            for item in value_list:
                tmp = []
                for mid in item:
                    if mid in mid_index_dict:  # .get(mid, 0) would wrongly skip the top-ranked row at index 0
                        tmp.append(mid_index_dict[mid])
                if len(tmp) > 1:
                    tmp_min = min(tmp)
                else:
                    continue
                tmp.remove(tmp_min)
                for iter_count in tmp:
                    sort_results[tmp_min].extend(sort_results[iter_count])
                    remove_list.append(sort_results[iter_count])
            if remove_list:
                for item in remove_list:
                    sort_results.remove(item)
        

    return sort_results
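The tail of get_origin_weibo_detail inverts duplicate_dict (mid -> representative mid) and folds each duplicate's row into the best-ranked member's slot. A compact runnable sketch of that grouping with dummy mids:

duplicate_dict = {'m2': 'm1', 'm3': 'm1', 'm5': 'm4'}

# invert to representative -> members (the representative joins its own group)
tmp_duplicate_dict = {}
for k, v in duplicate_dict.items():
    tmp_duplicate_dict.setdefault(v, [v]).append(k)

ranked_mids = ['m1', 'm2', 'm3', 'm4', 'm5']     # already sorted by rank
sort_results = [[mid] for mid in ranked_mids]    # one slot per row
mid_index_dict = dict((mid, i) for i, mid in enumerate(ranked_mids))

remove_list = []
for members in tmp_duplicate_dict.values():
    positions = [mid_index_dict[m] for m in members if m in mid_index_dict]
    if len(positions) <= 1:
        continue
    keep = min(positions)            # fold into the best-ranked slot
    for pos in positions:
        if pos != keep:
            sort_results[keep].extend(sort_results[pos])
            remove_list.append(sort_results[pos])
for item in remove_list:
    sort_results.remove(item)

print(sort_results)  # [['m1', 'm2', 'm3'], ['m4', 'm5']]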
def get_positive_weibo_detail(ts, social_sensors, keywords_list, size, sentiment_type=1):
    former_mid_list = query_mid_list(ts-time_interval, keywords_list, time_segment, social_sensors) # weibo mid list from the preceding window
    current_mid_list = query_mid_list(ts, keywords_list, time_interval,  social_sensors)
    mid_list = []
    mid_list.extend(former_mid_list)
    mid_list.extend(current_mid_list)

    query_body = {
        "query":{
            "filtered":{
                "filter":{
                    "bool":{
                        "must":[
                            {"range":{
                                "timestamp":{
                                    "gte": ts - time_interval,
                                    "lt": ts
                                }
                            }},
                        ],
                        "should":[
                            {"terms": {"root_mid": mid_list}},
                            {"terms": {"mid": mid_list}},
                            {"terms":{"keywords_string": keywords_list}}
                        ]
                    }
                }
            }
        },
        "sort": {"timestamp": {"order": "desc"}},
        "size": 100
    }



    #if social_sensors and int(sentiment_type) == 1:
    #    query_body["query"]["filtered"]["filter"]["bool"]["must"].append({"terms":{"uid": social_sensors}})

    if int(sentiment_type) == 1 or int(sentiment_type) == 0:
        query_body["query"]["filtered"]["filter"]["bool"]["must"].append({"term":{"sentiment":sentiment_type}})
    else:
        query_body["query"]["filtered"]["filter"]["bool"]["must"] = [{"terms":{"sentiment": ["2", "3"]}}]

    # 判断当前ts和ts-time_interval是否属于同一天,确定查询哪个es
    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts-time_interval)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    index_name_1 = flow_text_index_name_pre + datetime_1
    exist_es_1 = es_text.indices.exists(index_name_1)

    # 1. aggregate the original weibo mid list
    if datetime == datetime_1 and exist_es:
        search_results = es_text.search(index=index_name, doc_type=flow_text_index_type, body=query_body)["hits"]["hits"]
    elif datetime != datetime_1 and exist_es_1:
        search_results = es_text.search(index=index_name_1, doc_type=flow_text_index_type, body=query_body)["hits"]["hits"]
    else:
        search_results = []

    uid_list = []
    results = []
    if search_results:
        for item in search_results:
            uid_list.append(item["_source"]['uid'])
        if uid_list:
            portrait_result = es_profile.mget(index=profile_index_name, doc_type=profile_index_type, body={"ids":uid_list}, fields=['nick_name', 'photo_url'])["docs"]

        for i in range(len(uid_list)):
            item = search_results[i]['_source']
            temp = []
            # uid, nick_name, photo_url, text, sentiment, timestamp, geo, common_keywords, message_type
            temp.append(item['uid'])
            if portrait_result[i]['found']:
                temp.append(portrait_result[i]["fields"]["nick_name"][0])
                temp.append(portrait_result[i]["fields"]["photo_url"][0])
            else:
                temp.append("unknown")
                temp.append("")
            temp.append(item["text"])
            temp.append(item["sentiment"])
            temp.append(ts2date(item['timestamp']))
            temp.append(item['geo'])
            keywords_set = set(item['keywords_string'].split('&'))
            common_keywords = set(keywords_list) & keywords_set
            temp.append(list(common_keywords))
            temp.append(item['message_type'])
            results.append(temp)

    return results
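Each hit carries an '&'-joined keywords_string; intersecting it with the query's keyword list yields the common_keywords column. In miniature, with dummy values:

keywords_list = ['storm', 'flood']
keywords_set = set('storm&rain&wind'.split('&'))
common_keywords = set(keywords_list) & keywords_set
print(list(common_keywords))  # ['storm']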
def get_retweet_weibo_detail(ts, user, task_name, size, text_type, type_value):
    _id = user + '-' + task_name
    task_detail = es_user_portrait.get(index=index_sensing_task, doc_type=_id, id=ts)['_source']
    origin_weibo_detail = json.loads(task_detail['origin_weibo_detail'])
    retweeted_weibo_detail = json.loads(task_detail['retweeted_weibo_detail'])

    mid_list = []
    mid_list.extend(origin_weibo_detail.keys())
    mid_list.extend(retweeted_weibo_detail.keys())

    query_body = {
        "query":{
            "filtered":{
                "filter":{
                    "bool":{
                        "must":[
                            {"range":{
                                "timestamp":{
                                    "gte": ts - time_interval,
                                    "lt": ts
                                }
                            }},
                            {"terms": {"root_mid": mid_list}}
                        ]
                    }
                }
            }
        },
        "sort": {"timestamp": {"order": "desc"}},
        "size": 100
    }

    if text_type == "message_type":
        query_body['query']['filtered']['filter']['bool']['must'].append({"term":{text_type: type_value}})
    if text_type == "sentiment":
        #if isinstance(type_value, str):
        if len(type_value) == 1:
            query_body['query']['filtered']['filter']['bool']['must'].append({"term":{text_type: type_value}})
        else:
            query_body['query']['filtered']['filter']['bool']['must'].append({"terms":{text_type: type_value}})

    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts-time_interval)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    index_name_1 = flow_text_index_name_pre + datetime_1
    exist_es_1 = es_text.indices.exists(index_name_1)

    # 1. query the weibo text index
    if datetime == datetime_1 and exist_es:
        search_results = es_text.search(index=index_name, doc_type=flow_text_index_type, body=query_body)["hits"]["hits"]
    elif datetime != datetime_1 and exist_es_1:
        search_results = es_text.search(index=index_name_1, doc_type=flow_text_index_type, body=query_body)["hits"]["hits"]
    else:
        search_results = []
    #print search_results
    # 2. assemble weibo-related info
    results = []
    uid_list = []
    if search_results:
        for item in search_results:
            uid_list.append(item["_source"]['uid'])
        if uid_list:
            portrait_result = es_profile.mget(index=profile_index_name, doc_type=profile_index_type, body={"ids":uid_list}, fields=['nick_name', 'photo_url'])["docs"]

        for i in range(len(uid_list)):
            item = search_results[i]['_source']
            temp = []
            # uid, nick_name, photo_url, text, sentiment, timestamp, geo, common_keywords, message_type
            temp.append(item['uid'])
            if portrait_result[i]['found']:
                temp.append(portrait_result[i]["fields"]["nick_name"][0])
                temp.append(portrait_result[i]["fields"]["photo_url"][0])
            else:
                temp.append(item['uid'])
                temp.append("")
            temp.append(item["text"])
            #print item['text']
            temp.append(item["sentiment"])
            temp.append(ts2date(item['timestamp']))
            temp.append(item['geo'])
            temp.append(item["message_type"])
            results.append(temp)

    return results
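get_retweet_weibo_detail (like get_positive_weibo_detail) picks a single daily index by checking whether ts and ts - time_interval fall on the same date; the real code additionally verifies that the index exists. A sketch of just the date branch, with a stand-in date helper:

import time

def ts2datetime(ts):
    return time.strftime('%Y-%m-%d', time.localtime(ts))

def pick_index(ts, time_interval, prefix='flow_text_'):
    datetime_now = ts2datetime(ts)
    datetime_prev = ts2datetime(ts - time_interval)
    if datetime_now == datetime_prev:
        return prefix + datetime_now   # window falls within one day
    return prefix + datetime_prev      # window crosses midnight: use the earlier day

print(pick_index(time.time(), 3600))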