def add_task_record_time(task_name, submit_date):
    # record the task start timestamp in redis so later jobs know when monitoring began
    status = 0
    #start_ts = datetime2ts(submit_date)
    start_ts = date2ts(submit_date)
    r_task.hset('monitor_task_time_record', task_name, start_ts)
    status = 1
    return status
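# Illustrative usage sketch (not part of the original module): the task name and
# date below are hypothetical; relies on r_task and date2ts defined elsewhere in this file.
def _demo_add_task_record_time():
    status = add_task_record_time('example_task', '2013-09-01')
    # on success the start timestamp is stored in the 'monitor_task_time_record' hash
    return status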
def get_network(task_exist):
    task_name = task_exist['task_name']
    submit_date = task_exist['submit_date']
    submit_ts = date2ts(submit_date)

    time_segment = 24*3600
    now_ts = time.time()
    now_date = ts2datetime(now_ts)
    now_date_ts = datetime2ts(now_date)
    #test
    now_date_ts = datetime2ts('2013-09-07')
    iter_date_ts = now_date_ts
    iter_count = 1
    date_list = []
    top_list_dict = {}
    while True:
        if iter_count >= 8 or iter_date_ts < submit_ts:
            break
        iter_date = ts2datetime(iter_date_ts)
        date_list.append(iter_date)
        key = 'inner_' + str(iter_date)
        try:
            task_date_result = es.get(index=monitor_index_name, doc_type=task_name, id=key)['_source']
        except:
            task_date_result = {}
        #print 'task_name, key, task_date_result:', task_name, key, task_date_result
        iter_field = ['top1', 'top2', 'top3', 'top4', 'top5']
        for field in iter_field:
            # skip days with no stored result to avoid a KeyError on missing fields
            if field not in task_date_result:
                continue
            user_count_item = json.loads(task_date_result[field])
            uid = user_count_item[0]
            uname = uid2uname(uid)
            count = user_count_item[1]
            try:
                top_list_dict[field].append([uid, uname, count])
            except:
                top_list_dict[field] = [[uid, uname, count]]
        
        iter_count += 1
        iter_date_ts -= time_segment
        # get inner-retweet group from es---field: inner_graph
        '''
        try:
            inner_graph = json.loads(task_date_result['inner_graph'])
        except:
            inner_graph = {}
        '''

    abnormal_index = compute_inner_polarization(top_list_dict)
    
    return [date_list, top_list_dict, abnormal_index]
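# Illustrative usage sketch (not from the original module): assumes a stored task
# only needs 'task_name' and 'submit_date' here; both values below are hypothetical.
def _demo_get_network():
    task_exist = {'task_name': 'example_task', 'submit_date': '2013-09-01'}
    date_list, top_list_dict, abnormal_index = get_network(task_exist)
    # date_list: the days walked back from now (at most 7, stopping at the submit date)
    # top_list_dict: {'top1': [[uid, uname, count], ...], ..., 'top5': [...]}
    return abnormal_index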
def compute_mid_result(task_name, task_submit_date):
    result = {'count_0':{}, 'count_1':{}, 'sentiment_0_126':{}, 'sentiment_0_127':{}, 'sentiment_0_128':{},\
            'sentiment_0_129':{}, 'sentiment_0_130':{}, 'sensitive_score':{}, 'geo_0':{}, 'geo_1':{},\
            'hashtag_0':{}, 'hashtag_1':{}, 'sentiment_1_126':{}, 'sentiment_1_127':{}, \
            'sentiment_1_128':{}, 'sentiment_1_129':{}, 'sentiment_1_130':{}}
    #geo & hashtag: day
    #other: 15min
    search_time_segment = 3600 * 4
    #start_ts = datetime2ts(task_submit_date)
    start_ts = date2ts(task_submit_date)
    now_ts = time.time()
    now_date = ts2datetime(now_ts)
    #test
    now_ts = datetime2ts('2013-09-08')
    date_ts = datetime2ts(now_date)
    segment = int((now_ts - date_ts) / 900) + 1
    end_ts = date_ts + segment * 900
    #every search time-range: 4 hour----bulk action to search
    begin_ts = start_ts

    while True:
        if begin_ts >= end_ts:
            break
        compute_ts = ts2date(begin_ts)
        #print 'compute ts:', compute_ts
        query_body = {'range':{'timestamp':{'from': begin_ts, 'to':begin_ts+search_time_segment}}}
        try:
            mid_result_list = es.search(index=monitor_index_name, doc_type=task_name, body={'query':query_body, 'size':100000, 'sort':[{'timestamp':{'order': 'asc'}}]})['hits']['hits']
        except Exception, e:
            raise e
        if mid_result_list:
            for mid_result_item in mid_result_list:
                result_item = mid_result_item['_source']
                timestamp = result_item['timestamp']
                #attr_count
                #print 'compute_count'
                count_dict = json.loads(result_item['count'])
                for sensitive in count_dict:
                    count_key = 'count_' + sensitive
                    result[count_key][str(timestamp)] = count_dict[sensitive]
                #attr_sentiment
                #print 'compute_sentiment'
                sensitive_sentiment_dict = json.loads(result_item['sentiment'])
                for sensitive in sensitive_sentiment_dict:
                    sentiment_dict = sensitive_sentiment_dict[sensitive]
                    for sentiment in sentiment_dict:
                        sentiment_key = 'sentiment_'+sensitive+'_'+sentiment
                        result[sentiment_key][str(timestamp)] = sentiment_dict[sentiment]
                #attr_sensitive_score
                #print 'compute_sensitive_word'
                if 'sensitive_word' in result_item:
                    sensitive_word_dict = json.loads(result_item['sensitive_word'])
                else:
                    sensitive_word_dict = {}
                ts_word_score = 0
                for word in sensitive_word_dict:
                    #print 'word:', json.dumps(word.encode('utf-8')), word.encode('utf-8'), type(word.encode('utf-8'))
                    search_word = word.encode('utf-8')
                    #print 'search_word:', search_word, type(search_word)
                    try:
                        word_identify = json.loads(word_r.hget('sensitive_words', search_word))
                    except:
                        word_identify = [2]
                    ts_word_score += sensitive_word_dict[word] * word_identify[0]
                result['sensitive_score'][str(timestamp)] = ts_word_score
                #attr_geo
                #print 'compute geo'
                timestamp_date = ts2datetime(timestamp)
                sensitive_geo_dict = json.loads(result_item['geo'])
                for sensitive in sensitive_geo_dict:
                    if timestamp_date not in result['geo_'+sensitive]:
                        result['geo_'+sensitive][timestamp_date] = {}
                        
                    geo_dict = sensitive_geo_dict[sensitive]
                    for geo in geo_dict:
                        try:
                            result['geo_'+sensitive][timestamp_date][geo] += geo_dict[geo]
                        except:
                            result['geo_'+sensitive][timestamp_date][geo] = geo_dict[geo]

                #attr_hashtag
                #print 'compute hashtag'
                if 'hashtag' in result_item:
                    sensitive_hashtag_dict = json.loads(result_item['hashtag'])
                else:
                    sensitive_hashtag_dict = {}
                    result['hashtag_0'][timestamp_date] = {}
                    result['hashtag_1'][timestamp_date] = {}
                for sensitive in sensitive_hashtag_dict:
                    if timestamp_date not in result['hashtag_'+sensitive]:
                        result['hashtag_'+sensitive][timestamp_date] = {}
                    hashtag_dict = sensitive_hashtag_dict[sensitive]
                    for hashtag in hashtag_dict:
                        try:
                            result['hashtag_'+sensitive][timestamp_date][hashtag] += hashtag_dict[hashtag]
                        except:
                            result['hashtag_'+sensitive][timestamp_date][hashtag] = hashtag_dict[hashtag]

        begin_ts += search_time_segment

    return result
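# The end-of-range arithmetic in compute_mid_result rounds "now" up to the next
# 15-minute boundary (900-second buckets); a minimal standalone sketch of that step:
def _demo_round_up_to_quarter_hour(now_ts, date_ts):
    # date_ts: midnight of the current day; now_ts: any moment within that day
    segment = int((now_ts - date_ts) / 900) + 1
    # first 15-minute boundary strictly after now_ts
    return date_ts + segment * 900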
def get_user_comment_retweet(task_exist):
    result = {} # result = {'uid1_comment': {ts: value}, 'uid1_retweet': {ts: value}, 'uid2_comment': {ts: value}, ...}
    submit_date = task_exist['submit_date']
    start_ts = date2ts(submit_date)
    task_status = task_exist['status']
    if task_status == 1:
        now_ts = time.time()
        now_date = ts2datetime(now_ts)
        now_date_ts = datetime2ts(now_date)
        segment = int((now_ts - now_date_ts) / 900) + 1
        end_ts = now_date_ts + segment * 900
        #test
        end_ts = datetime2ts('2013-09-02')
    else:
        end_ts = date2ts(task_exist['end_date'])

    task_user = task_exist['uid_list']

    select_top_dict = {} # {uid:[ave_retweet_count, ave_peak_retweet_count]}
    #select the union of the top5 users by ave_retweet_count and the top5 by ave_peak_retweet_count

    for user in task_user:
        result[user+'_comment'] = {}
        result[user+'_retweet'] = {}
        comment_retweet_dict = monitor_r.hgetall(user)
        for item in comment_retweet_dict:
            item_type_ts = item.split('_')
            item_type = item_type_ts[0]
            item_ts = item_type_ts[1]
            result[user+'_'+item_type][item_ts] = int(comment_retweet_dict[item])
        # use to detect peaks
        comment_dict = result[user+'_comment']
        complement_comment_dict = complement_ts(comment_dict, start_ts, end_ts)
        sort_comment_dict = sorted(complement_comment_dict.items(), key=lambda x:int(x[0]))
        detect_peaks_comment_input = [item[1] for item in sort_comment_dict]
        #print 'detect_peaks_comment_input:', detect_peaks_comment_input
        result[user+'_comment_peak'] = detect_peaks(detect_peaks_comment_input)

        retweet_dict = result[user+'_retweet']
        complement_retweet_dict = complement_ts(retweet_dict, start_ts, end_ts)
        sort_retweet_dict = sorted(complement_retweet_dict.items(), key=lambda x:int(x[0]))
        detect_peaks_retweet_input = [item[1] for item in sort_retweet_dict]
        result[user+'_retweet_peak'] = detect_peaks(detect_peaks_retweet_input)
        
        ave_retweet_count = sum(detect_peaks_retweet_input) / len(detect_peaks_retweet_input)
        peak_count_list = [detect_peaks_retweet_input[peak_location] for peak_location in result[user+'_retweet_peak']]
        # guard against an empty peak list so the average does not divide by zero
        if peak_count_list:
            ave_peak_count = sum(peak_count_list) / len(peak_count_list)
        else:
            ave_peak_count = 0
        select_top_dict[user] = [ave_retweet_count, ave_peak_count]
    
    #select union top5
    sort_select_top_count_dict = sorted(select_top_dict.items(), key=lambda x:x[1][0], reverse=True)
    top5_count_user_list = sort_select_top_count_dict[:5]
    top5_count_user = [item[0] for item in top5_count_user_list]
    sort_select_top_peak_dict = sorted(select_top_dict.items(), key=lambda x:x[1][1], reverse=True)
    top5_peak_user_list = sort_select_top_peak_dict[:5]
    top5_peak_user = [item[0] for item in top5_peak_user_list]
    union_user = list(set(top5_count_user) | set(top5_peak_user))
    new_result = {}
    for user in union_user:
        new_result[user+'_retweet'] = result[user+'_retweet']
        new_result[user+'_retweet_peak'] = result[user+'_retweet_peak']
        new_result[user+'_comment'] = result[user+'_comment']
        new_result[user+'_comment_peak'] = result[user+'_comment_peak']
    
    new_result['profile'] = get_top_user_profile(union_user)

    #compute abnormal index
    new_result['abnormal_index'] = compute_comment_retweet_abnormal(new_result, union_user)

    return new_result
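# complement_ts is not shown in this file; the code above relies on it to pad missing
# 15-minute buckets with zero counts between start_ts and end_ts. A minimal sketch
# under that assumption (not the project's actual implementation):
def _complement_ts_sketch(ts_value_dict, start_ts, end_ts):
    filled = {}
    ts = start_ts
    while ts < end_ts:
        # keep the recorded value when present, otherwise pad with 0
        filled[str(ts)] = int(ts_value_dict.get(str(ts), 0))
        ts += 900
    return filled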