예제 #1
0
def get_attr_geo_track(uid_list):
    """Build a 7-day geo track for the given uids.

    For each of the 7 days before the (test-fixed) reference date, sums the
    per-uid IP hit counts stored in the redis hash 'ip_<timestamp>', maps the
    IPs to geos, and keeps the top-2 geos per day.

    Returns {'geo_track': json string of [[date, [(geo, count), ...]], ...]}.
    """
    date_results = []  # [[date, [(geo1, count1), (geo2, count2)]], ...] for 7 days
    now_ts = time.time()
    now_date = ts2datetime(now_ts)
    # test: override "today" with a fixed date
    now_date = '2013-09-08'
    ts = datetime2ts(now_date)
    for i in range(7, 0, -1):
        timestamp = ts - i*24*3600
        ip_dict = dict()
        # one serialized {ip: count} dict per uid (or None when absent)
        results = r_cluster.hmget('ip_'+str(timestamp), uid_list)
        for item in results:
            if item:
                item_dict = json.loads(item)
                for ip_item in item_dict:
                    # accumulate counts across all uids for this day
                    # (dict.get instead of the original bare try/except)
                    ip_dict[ip_item] = ip_dict.get(ip_item, 0) + item_dict[ip_item]
        geo_dict = ip2geo(ip_dict)
        sort_geo_dict = sorted(geo_dict.items(), key=lambda x: x[1], reverse=True)
        date_key = ts2datetime(timestamp)
        # keep only the top-2 geos per day
        date_results.append([date_key, sort_geo_dict[:2]])
    return {'geo_track': json.dumps(date_results)}
예제 #2
0
File: update_day.py  Project: SwoJa/ruman
def update_day_hashtag(uid_list):
    """Aggregate each uid's hashtag usage over the past WEEK days.

    Reads one serialized {hashtag: count} dict per uid per day from the redis
    hash 'hashtag_<ts>' and counts, per uid, on how many records each hashtag
    appears.

    Returns {uid: {'hashtag': 'h1&h2&...', 'hashtag_dict': {hashtag: days}}}.
    """
    results = {}
    all_results = {}
    now_ts = time.time()
    # run_type: production uses today's midnight ts, test uses a fixed ts
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = test_ts

    for i in range(WEEK, 0, -1):
        ts = now_date_ts - DAY*i
        count = 0
        hashtag_results = r_cluster.hmget('hashtag_'+str(ts), uid_list)
        for uid in uid_list:
            if uid not in results:
                results[uid] = {}
            hashtag_item = hashtag_results[count]
            # BUG FIX: `count` was never incremented, so every uid read
            # hashtag_results[0]; advance the cursor per uid.
            count += 1
            if hashtag_item:
                hashtag_dict = json.loads(hashtag_item)
            else:
                hashtag_dict = {}
            for hashtag in hashtag_dict:
                results[uid][hashtag] = results[uid].get(hashtag, 0) + 1
    for uid in uid_list:
        user_hashtag_dict = results[uid]
        hashtag_string = '&'.join(user_hashtag_dict.keys())
        all_results[uid] = {'hashtag': hashtag_string, 'hashtag_dict': user_hashtag_dict}
    return all_results
예제 #3
0
def update_day_hashtag(uid_list):
    """Aggregate each uid's hashtag usage over the past WEEK days.

    Reads one serialized {hashtag: count} dict per uid per day from the redis
    hash 'hashtag_<ts>' and counts, per uid, on how many records each hashtag
    appears.

    Returns {uid: {'hashtag': 'h1&h2&...', 'hashtag_dict': {hashtag: days}}}.
    """
    results = {}
    all_results = {}
    now_ts = time.time()
    # run_type: production uses today's midnight ts, test uses a fixed ts
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = test_ts

    for i in range(WEEK, 0, -1):
        ts = now_date_ts - DAY * i
        count = 0
        hashtag_results = r_cluster.hmget('hashtag_' + str(ts), uid_list)
        for uid in uid_list:
            if uid not in results:
                results[uid] = {}
            hashtag_item = hashtag_results[count]
            # BUG FIX: `count` was never incremented, so every uid read
            # hashtag_results[0]; advance the cursor per uid.
            count += 1
            if hashtag_item:
                hashtag_dict = json.loads(hashtag_item)
            else:
                hashtag_dict = {}
            for hashtag in hashtag_dict:
                results[uid][hashtag] = results[uid].get(hashtag, 0) + 1
    for uid in uid_list:
        user_hashtag_dict = results[uid]
        hashtag_string = '&'.join(user_hashtag_dict.keys())
        all_results[uid] = {
            'hashtag': hashtag_string,
            'hashtag_dict': user_hashtag_dict
        }
    # BUG FIX: this variant built `all_results` but returned the intermediate
    # `results`; return the assembled output like the sibling implementation.
    return all_results
예제 #4
0
def update_day_geo(uid_list, user_info_list):
    """Update each uid's activity-geo history with yesterday's IP data.

    Reads yesterday's {ip: '&'-joined timestamps} dict per uid from the redis
    hash 'new_ip_<ts>', maps IPs to geos, appends the day's geo counts to the
    stored history (capped at 30 days), and rebuilds the 7-day geo strings.

    Returns {uid: {'activity_geo_dict': json list of per-day {geo: count},
                   'activity_geo': '&'-joined geo parts,
                   'activity_geo_aggs': '&'-joined last geo components}}.
    """
    results = {}
    now_ts = time.time()
    #run_type
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = test_ts
    # yesterday's per-uid IP records, aligned with uid_list order
    ip_results = r_cluster.hmget('new_ip_' + str(now_date_ts - DAY), uid_list)
    count = 0
    for uid in uid_list:
        if uid not in results:
            results[uid] = {'activity_geo': {}, 'activity_geo_dict': []}
        uid_ip_results = ip_results[count]
        count += 1
        if uid_ip_results:
            uid_ip_dict = json.loads(uid_ip_results)
        else:
            uid_ip_dict = {}
        day_results = {}
        for ip in uid_ip_dict:
            # number of timestamps recorded for this ip = activity count
            ip_count = len(uid_ip_dict[ip].split('&'))
            geo = ip2city(ip)
            # NOTE(review): a sibling variant of this function unpacks
            # `geo, school = ip2city(ip)` and guards `if geo:` before
            # decoding; here an unguarded .decode will raise if ip2city
            # returns None -- confirm which ip2city signature applies.
            geo = geo.decode('utf-8')
            try:
                day_results[geo] += ip_count
            except:
                day_results[geo] = ip_count
        #update the activity_geo_dict
        activity_geo_history_list = json.loads(
            user_info_list[uid]['activity_geo_dict'])
        activity_geo_history_list.append(day_results)
        # keep at most the 30 most recent days of history
        results[uid]['activity_geo_dict'] = json.dumps(
            activity_geo_history_list[-30:])
        #update the activity_geo
        # collect the distinct geos seen in the last 7 days
        week_activity_geo_list = activity_geo_history_list[-7:]
        week_geo_list = []
        for activity_geo_item in week_activity_geo_list:
            geo_list = activity_geo_item.keys()
            week_geo_list.extend(geo_list)
        week_geo_list = list(set(week_geo_list))
        # geos are tab-separated paths (e.g. country\tprovince\tcity);
        # flatten them into one '&'-joined string
        week_geo_string = '&'.join([
            '&'.join((item.encode('utf-8')).split('\t'))
            for item in week_geo_list
        ])
        try:
            # keep only the most specific (last) component of each geo path
            week_geo_aggs_string = '&'.join([
                (item.encode('utf-8')).split('\t')[-1]
                for item in week_geo_list
            ])
        except:
            week_geo_aggs_string = ''

        results[uid]['activity_geo'] = week_geo_string
        results[uid]['activity_geo_aggs'] = week_geo_aggs_string

    return results
예제 #5
0
def get_activity_time(uid_list):
    """Compute each uid's total status count and dominant activity frequency.

    For the past WEEK days, reads per-15-minute activity counts (96 slots per
    day) from the redis hash 'activity_<ts>', concatenates them into one
    series per uid, and keeps the positive FFT frequency with maximum power.

    Returns {uid: {'statusnum': int, 'activity_time': float}}.
    """
    results = {}
    now_ts = time.time()
    now_date = ts2datetime(now_ts)
    #run_type
    if RUN_TYPE == 1:
        timestamp = datetime2ts(now_date)
    else:
        timestamp = datetime2ts(RUN_TEST_TIME)
    activity_list_dict = {}  # {uid:[activity_list], uid:[]}
    for i in range(1, WEEK + 1):
        ts = timestamp - DAY * i
        print ts
        r_result = r_cluster.hmget('activity_' + str(ts), uid_list)
        #print r_result
        if r_result:
            for j in range(0, len(uid_list)):
                uid = uid_list[j]
                if uid not in activity_list_dict:
                    # NOTE(review): the series is seeded with 96 zeros and the
                    # per-day counts below are *appended*, so it grows by 96
                    # entries per day with data; slot-wise summation may have
                    # been the intent -- confirm before trusting the FFT.
                    activity_list_dict[uid] = [0 for i in range(0, 96)]
                user_r_result = r_result[j]
                if user_r_result:
                    user_activity_dict = json.loads(user_r_result)
                    # 96 slots = 24h / 15min; missing slots count as 0
                    # (inner `i` shadows the day loop variable -- harmless in
                    # Py2 since the outer `for` rebinds it each iteration)
                    for i in range(0, 96):
                        try:
                            count = user_activity_dict[str(i)]
                        except:
                            count = 0
                        activity_list_dict[uid].append(count)
    for uid in uid_list:
        activity_list = activity_list_dict[uid]
        statusnum = sum(activity_list)
        signal = np.array(activity_list)
        # power spectrum of the activity series
        fftResult = np.abs(np.fft.fft(signal))**2
        n = signal.size
        freq = np.fft.fftfreq(n, d=1)
        i = 0
        max_val = 0
        max_freq = 0
        # pick the positive frequency whose power is largest
        for val in fftResult:
            if val > max_val and freq[i] > 0:
                max_val = val
                max_freq = freq[i]
            i += 1
        results[uid] = {
            'statusnum': statusnum,
            'activity_time': math.log(max_freq + 1)
        }

    return results
def update_day_geo(uid_list, user_info_list):
    """Update each uid's activity-geo history with yesterday's IP data.

    Variant in which ip2city returns a (geo, school) pair and geo may be
    None. Appends yesterday's geo counts to the stored history (capped at 30
    days) and rebuilds the 7-day geo strings.

    Returns {uid: {'activity_geo_dict': json list of per-day {geo: count},
                   'activity_geo': '&'-joined geo parts,
                   'activity_geo_aggs': '&'-joined last geo components}}.
    """
    results = {}
    now_ts = time.time()
    #run_type
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = test_ts
    # yesterday's per-uid {ip: '&'-joined timestamps} records
    ip_results = r_cluster.hmget('new_ip_'+str(now_date_ts - DAY), uid_list)
    count = 0
    for uid in uid_list:
        if uid not in results:
            results[uid] = {'activity_geo':{}, 'activity_geo_dict':[]}
        uid_ip_results = ip_results[count]
        count += 1
        if uid_ip_results:
            uid_ip_dict = json.loads(uid_ip_results)
        else:
            uid_ip_dict = {}
        day_results = {}
        for ip in uid_ip_dict:
            # number of recorded timestamps for this ip = activity count
            ip_count = len(uid_ip_dict[ip].split('&'))
            # ip2city returns (geo, school) here; geo may be None
            geo, school = ip2city(ip)
            if geo:
                geo = geo.decode('utf-8')
                try:
                    day_results[geo] += ip_count
                except:
                    day_results[geo] = ip_count
        #update the activity_geo_dict
        activity_geo_history_list = json.loads(user_info_list[uid]['activity_geo_dict'])
        activity_geo_history_list.append(day_results)
        # keep at most the 30 most recent days of history
        results[uid]['activity_geo_dict'] = json.dumps(activity_geo_history_list[-30:])
        #update the activity_geo
        # distinct geos seen in the last 7 days
        week_activity_geo_list = activity_geo_history_list[-7:]
        week_geo_list = []
        for activity_geo_item in week_activity_geo_list:
            geo_list = activity_geo_item.keys()
            week_geo_list.extend(geo_list)
        week_geo_list = list(set(week_geo_list))
        # geos are tab-separated paths; flatten into one '&'-joined string
        week_geo_string = '&'.join(['&'.join(item.split('\t')) for item in week_geo_list])
        try:
            # keep only the most specific (last) component of each geo path
            week_geo_aggs_string = '&'.join([item.split('\t')[-1] for item in week_geo_list])
        except:
            week_geo_aggs_string = ''

        results[uid]['activity_geo'] = week_geo_string
        results[uid]['activity_geo_aggs'] = week_geo_aggs_string
    #print 'update geo results:', results
    return results
예제 #7
0
def get_activity_time(uid_list):
    results = {}
    now_ts = time.time()
    now_date = ts2datetime(now_ts)
    #run_type
    if RUN_TYPE == 1:
        timestamp = datetime2ts(now_date)
    else:
        timestamp = datetime2ts(RUN_TEST_TIME)
    activity_list_dict = {} # {uid:[activity_list], uid:[]}
    for i in range(1,WEEK+1):
        ts = timestamp - DAY*i
        print ts
        r_result = r_cluster.hmget('activity_'+str(ts), uid_list)
        #print r_result
        if r_result:
            for j in range(0, len(uid_list)):
                uid = uid_list[j]
                if uid not in activity_list_dict:
                    activity_list_dict[uid] = [0 for i in range(0, 96)]
                user_r_result = r_result[j]
                if user_r_result:
                    user_activity_dict = json.loads(user_r_result)
                    for i in range(0, 96):
                        try:
                            count = user_activity_dict[str(i)]
                        except:
                            count = 0
                        activity_list_dict[uid].append(count)
    for uid in uid_list:
        activity_list = activity_list_dict[uid]
        statusnum = sum(activity_list)
        signal = np.array(activity_list)
        fftResult = np.abs(np.fft.fft(signal))**2
        n = signal.size
        freq = np.fft.fftfreq(n, d=1)
        i = 0
        max_val = 0
        max_freq = 0
        for val in fftResult:
            if val>max_val and freq[i]>0:
                max_val = val
                max_freq = freq[i]
            i += 1
        results[uid] = {'statusnum': statusnum, 'activity_time': math.log(max_freq + 1)}
    
    return results
예제 #8
0
def get_attr_trend(uid_list):
    """Build a 7-day activity trend and per-timeslot totals for uid_list.

    Activity counts are stored per 15-minute segment (96 per day). Segments
    are grouped into 6 four-hour buckets: int(segment)/16 (Py2 floor
    division) picks the bucket, and 15*60*16 == 14400s (4 hours) converts a
    bucket index into a second offset.

    Returns {'activity_trend': json [(bucket_ts, count), ...] sorted by time,
             'activity_time': json {bucket_offset_seconds: count}}.
    """
    result = {}
    now_ts = time.time()
    date = ts2datetime(now_ts - 24*3600)
    timestamp = datetime2ts(date)
    #test: override with a fixed reference date
    timestamp = datetime2ts('2013-09-08')
    time_result = dict()     # absolute 4h-bucket timestamp -> count
    segment_result = dict()  # 4h-bucket offset within a day -> count
    for i in range(1, 8):
        ts = timestamp - i*24*3600
        r_result = r_cluster.hmget('activity_'+str(ts), uid_list)
        #print 'r_result:', r_result
        for item in r_result:
            if item:
                item = json.loads(item)
                for segment in item:
                    # bucket start = day ts + (segment // 16) * 4h
                    try:
                        time_result[int(segment)/16*15*60*16+ts] += item[segment]
                    except:
                        time_result[int(segment)/16*15*60*16+ts] = item[segment]
                    try:
                        segment_result[int(segment)/16*15*60*16] += item[segment]
                    except:
                        segment_result[int(segment)/16*15*60*16] = item[segment]
    trend_list = []
    # emit all 6 buckets for each of the 7 days, filling gaps with 0
    for i in range(1, 8):
        ts = timestamp - i*24*3600
        for j in range(0, 6):
            time_seg = ts + j*15*60*16
            if time_seg in time_result:
                trend_list.append((time_seg, time_result[time_seg]))
            else:
                trend_list.append((time_seg, 0))
    sort_trend_list = sorted(trend_list, key=lambda x:x[0], reverse=False)
    #print 'time_result:', time_result
    #print 'trend_list:', trend_list
    #print 'sort_trend_list:', sort_trend_list
    result['activity_trend'] = json.dumps(sort_trend_list)
    result['activity_time'] = json.dumps(segment_result)
    return result
예제 #9
0
def get_school(uid_list):
    """Aggregate school hits from each uid's IP records over the past WEEK days.

    Returns {uid: {'is_school': '0'/'1', 'school_string': 's1&s2&...',
                   'school_dict': json {school: hit_count}}}.
    """
    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    school_results = {}
    for day in range(WEEK, 0, -1):
        ts = now_date_ts - DAY * day
        day_ip_items = r_cluster.hmget('new_ip_' + str(ts), uid_list)
        for idx, uid in enumerate(uid_list):
            if uid not in school_results:
                school_results[uid] = {}
            raw_item = day_ip_items[idx]
            uid_ip_dict = json.loads(raw_item) if raw_item else {}
            for ip in uid_ip_dict:
                # one '&'-joined timestamp per hit
                hit_count = len(uid_ip_dict[ip].split('&'))
                school = ip2school(ip)
                if not school:
                    continue
                user_schools = school_results[uid]
                user_schools[school] = user_schools.get(school, 0) + hit_count
    results = {}
    for uid in uid_list:
        school_dict = school_results[uid]
        results[uid] = {
            'is_school': '1' if school_dict else '0',
            'school_string': '&'.join(school_dict.keys()),
            'school_dict': json.dumps(school_dict)
        }
    return results
예제 #10
0
def update_day_sensitive(uid_list):
    """Aggregate sensitive-word hits per uid over the past WEEK days.

    Counts, per uid, on how many daily records each sensitive word appears,
    then weights the counts by the word's stage from the 'sensitive_words'
    redis hash via sensitive_score_dict.

    Returns {uid: {'sensitive_string': 'w1&w2&...',
                   'sensitive_dict': {word: days},
                   'sensitive': weighted score}}.
    """
    results = {}
    all_results = {}
    now_ts = time.time()
    # run_type: production uses today's midnight ts, test uses a fixed ts
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = test_ts

    for i in range(WEEK,0,-1):
        ts = now_date_ts - DAY*i
        count = 0
        sensitive_results = r_cluster.hmget('sensitive_'+str(ts), uid_list)
        for uid in uid_list:
            if uid not in results:
                results[uid] = {}
            # BUG FIX: this lookup used to sit inside the init branch above,
            # and `count` was never advanced, so already-seen uids reused a
            # stale item and everyone read index 0.
            sensitive_item = sensitive_results[count]
            count += 1
            if sensitive_item:
                sensitive_dict = json.loads(sensitive_item)
            else:
                sensitive_dict = {}
            for sensitive in sensitive_dict:
                results[uid][sensitive] = results[uid].get(sensitive, 0) + 1
    for uid in uid_list:
        user_sensitive_dict = results[uid]
        sensitive_score = 0
        for k in user_sensitive_dict:
            v = user_sensitive_dict[k]
            tmp_stage = r_sensitive.hget('sensitive_words', k)
            if tmp_stage:
                sensitive_score += v * sensitive_score_dict[str(tmp_stage)]
        sensitive_string = '&'.join(user_sensitive_dict.keys())
        all_results[uid] = {'sensitive_string': sensitive_string, 'sensitive_dict':user_sensitive_dict,\
                'sensitive': sensitive_score}
    return all_results
예제 #11
0
def update_day_sensitive(uid_list):
    """Aggregate sensitive-word hits per uid over the past WEEK days.

    Counts, per uid, on how many daily records each sensitive word appears,
    then weights the counts by the word's stage from the 'sensitive_words'
    redis hash via sensitive_score_dict.

    Returns {uid: {'sensitive_string': 'w1&w2&...',
                   'sensitive_dict': {word: days},
                   'sensitive': weighted score}}.
    """
    results = {}
    all_results = {}
    now_ts = time.time()
    # run_type: production uses today's midnight ts, test uses a fixed ts
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = test_ts

    for i in range(WEEK, 0, -1):
        ts = now_date_ts - DAY * i
        count = 0
        sensitive_results = r_cluster.hmget('sensitive_' + str(ts), uid_list)
        for uid in uid_list:
            if uid not in results:
                results[uid] = {}
            # BUG FIX: this lookup used to sit inside the init branch above,
            # and `count` was never advanced, so already-seen uids reused a
            # stale item and everyone read index 0.
            sensitive_item = sensitive_results[count]
            count += 1
            if sensitive_item:
                sensitive_dict = json.loads(sensitive_item)
            else:
                sensitive_dict = {}
            for sensitive in sensitive_dict:
                results[uid][sensitive] = results[uid].get(sensitive, 0) + 1
    for uid in uid_list:
        user_sensitive_dict = results[uid]
        sensitive_score = 0
        for k in user_sensitive_dict:
            v = user_sensitive_dict[k]
            tmp_stage = r_sensitive.hget('sensitive_words', k)
            if tmp_stage:
                sensitive_score += v * sensitive_score_dict[str(tmp_stage)]
        sensitive_string = '&'.join(user_sensitive_dict.keys())
        all_results[uid] = {'sensitive_string': sensitive_string, 'sensitive_dict':user_sensitive_dict,\
                'sensitive': sensitive_score}
    return all_results
예제 #12
0
def get_school(uid_list):
    """Collect school occurrence counts for each uid across the past WEEK days.

    Returns {uid: {'is_school': '0'/'1', 'school_string': 's1&s2&...',
                   'school_dict': json {school: hit_count}}}.
    """
    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    school_results = {}
    for offset in range(WEEK, 0, -1):
        day_ts = now_date_ts - DAY * offset
        day_records = r_cluster.hmget('new_ip_' + str(day_ts), uid_list)
        pos = 0
        for uid in uid_list:
            if uid not in school_results:
                school_results[uid] = {}
            raw_record = day_records[pos]
            pos += 1
            if raw_record:
                ip_record = json.loads(raw_record)
            else:
                ip_record = {}
            for ip in ip_record:
                # each '&'-joined timestamp is one hit
                occurrences = len(ip_record[ip].split('&'))
                school = ip2school(ip)
                if not school:
                    continue
                try:
                    school_results[uid][school] += occurrences
                except KeyError:
                    school_results[uid][school] = occurrences
    results = {}
    for uid in uid_list:
        school_dict = school_results[uid]
        school_string = '&'.join(school_dict.keys())
        is_school = '1' if school_dict else '0'
        results[uid] = {
            'is_school': is_school,
            'school_string': school_string,
            'school_dict': json.dumps(school_dict)
        }
    return results
def get_flow_information(uid_list):
    """Collect one week of per-uid flow data (hashtags, IPs, sensitive words).

    Updated daily; walks the 7 days before yesterday, reading per-day redis
    hashes keyed by YYYYMMDD, and groups each uid's records by date.

    Returns {uid: {'hashtag_string', 'hashtag_dict',
                   'sensitive_hashtag_string', 'sensitive_hashtag_dict',
                   'geo_activity', 'geo_string', 'ip',
                   'sensitive_geo_activity', 'sensitive_geo_string',
                   'sensitive_words_string', 'sensitive_words_dict'}}.
    """
    result_dict = {}
    now_ts = time.time() - 3600 * 24
    now_date = ts2datetime(now_ts)  # date: 2013-09-01
    #now_date = "2013-09-08"

    hashtag_results = {}
    geo_results = {}
    # BUG FIX: `ts` was only initialized in a commented-out line, so the
    # `ts = ts - 3600 * 24` below raised NameError on the first iteration;
    # seed it from now_date as that comment intended.
    ts = datetime2ts(now_date)
    user_hashtag_result = {}
    user_sensitive_hashtag = {}
    sensitive_words = {}
    user_ip_result = {}
    user_sensitive_ip = {}
    for i in range(1, 8):
        ts = ts - 3600 * 24
        date = ts2datetime(ts).replace('-', '')
        # per-day redis hashes, one serialized dict per uid (or None)
        hashtag_results = r_cluster.hmget('hashtag_' + str(date), uid_list)
        sensitive_hashtag = r_cluster.hmget('sensitive_hashtag_' + str(date),
                                            uid_list)
        ip_results = r_cluster.hmget('ip_' + str(date), uid_list)
        sensitive_ip = r_cluster.hmget('sensitive_ip_' + str(date), uid_list)
        sensitive_results = r_cluster.hmget('sensitive_' + str(date), uid_list)

        for j in range(0, len(uid_list)):
            uid = uid_list[j]
            # group each record under its YYYYMMDD date per uid
            if sensitive_results[j]:
                sensitive_words_results = json.loads(sensitive_results[j])
                if sensitive_words.has_key(uid):
                    sensitive_words[uid].update(
                        {date: sensitive_words_results})
                else:
                    sensitive_words[uid] = {date: sensitive_words_results}
            if hashtag_results[j]:
                hashtag_dict = json.loads(hashtag_results[j])
                if user_hashtag_result.has_key(uid):
                    user_hashtag_result[uid].update({date: hashtag_dict})
                else:
                    user_hashtag_result[uid] = {date: hashtag_dict}
            if sensitive_hashtag[j]:
                sensitive_hashtag_dict = json.loads(sensitive_hashtag[j])
                if user_sensitive_hashtag.has_key(uid):
                    user_sensitive_hashtag[uid].update(
                        {date: sensitive_hashtag_dict})
                else:
                    user_sensitive_hashtag[uid] = {
                        date: sensitive_hashtag_dict
                    }
            if ip_results[j]:
                ip_dict = json.loads(ip_results[j])
                if user_ip_result.has_key(uid):
                    user_ip_result[uid].update({date: ip_dict})
                else:
                    user_ip_result[uid] = {date: ip_dict}
            if sensitive_ip[j]:
                sensitive_ip_result = json.loads(sensitive_ip[j])
                if user_sensitive_ip.has_key(uid):
                    user_sensitive_ip[uid].update({date: sensitive_ip_result})
                else:
                    user_sensitive_ip[uid] = {date: sensitive_ip_result}

    # flatten the grouped data into strings / json blobs per uid
    for uid in uid_list:
        hashtag_string = ''
        sensitive_hashtag_string = ''
        ip_string = ''
        ip_all = ""
        sensitive_ip_string = ''
        hashtag_dict = {}
        sensitive_hashtag_dict = {}
        ip_dict = {}
        sensitive_ip_dict = {}
        sensitive_words_string = ''
        sensitive_words_dict = {}

        if sensitive_words.has_key(uid):
            sensitive_words_string = extract_string(sensitive_words[uid])
            sensitive_words_dict = json.dumps(sensitive_words[uid])
        if user_hashtag_result.has_key(uid):
            hashtag_string = extract_string(user_hashtag_result[uid])
            hashtag_dict = json.dumps(user_hashtag_result[uid])
        if user_sensitive_hashtag.has_key(uid):
            sensitive_hashtag_string = extract_string(
                user_sensitive_hashtag[uid])
            sensitive_hashtag_dict = json.dumps(user_sensitive_hashtag[uid])
        if user_ip_result.has_key(uid):
            ip_string = extract_geo(user_ip_result[uid])
            ip_dict = json.dumps(ip_to_geo(user_ip_result[uid]))
            ip_all = json.dumps(user_ip_result[uid])
        if user_sensitive_ip.has_key(uid):
            sensitive_ip_string = extract_geo(user_sensitive_ip[uid])
            sensitive_ip_dict = json.dumps(ip_to_geo(user_sensitive_ip[uid]))

        result_dict[uid] = {"hashtag_string": hashtag_string, "hashtag_dict": hashtag_dict, \
                            "sensitive_hashtag_string": sensitive_hashtag_string, "sensitive_hashtag_dict": sensitive_hashtag_dict, \
                            "geo_activity": ip_dict, "geo_string": ip_string, 'ip': ip_all, \
                             "sensitive_geo_activity": sensitive_ip_dict, "sensitive_geo_string":sensitive_ip_string, \
                             'sensitive_words_string': sensitive_words_string, 'sensitive_words_dict': sensitive_words_dict}
    return result_dict
def update_flow_information(user_info):
    """Update per-uid flow attributes (hashtag / online pattern / activity geo).

    NOTE(review): this function appears broken in several places (flagged
    inline below) and likely cannot run to completion as written -- treat it
    as dead/legacy code until verified.
    """
    results = {} # results ={uid: {'activity_geo_dict':'', 'activity_geo':'', 'hashtag_dict':'', 'hashtag':'', 'online_pattern_dict':'', 'online_pattern':''}}
    uid_list = user_info.keys()
    now_ts = time.time()
    now_date = ts2datetime(now_ts)
    timestamp = datetime2ts(now_date)
    #test: fixed reference date override
    timestamp = datetime2ts('2013-09-08')
    user_hashtag_dict = dict()
    user_online_dict = dict()
    ip_user_count_dict = {}
    new_day_ip_dict = dict()
    for i in range(7,0,-1):
        ts = timestamp - 24*3600*i
        print 'iter date:', ts2date(ts)
        # NOTE(review): this clobbers the `results` dict initialized above
        # with a redis list; the final loop writes to `result` (singular),
        # which is never defined -- confirm intended names.
        results = r_cluster.hmget('hashtag_'+str(ts), uid_list)
        online_pattern_results = r_cluster.hmget('online_'+str(ts), uid_list)

        # NOTE(review): range(7,0,-1) never yields 0, so this branch (and the
        # i==0 geo branch below) is unreachable; also `r_cluater` is a typo
        # for `r_cluster`, and the key fetched is 'hashtag_' not an ip key.
        if i==0:
            ip_result = r_cluater.hmget('hashtag_'+str(ts), uid_list)

        for j in range(0, len(uid_list)):
            uid = uid_list[j]
            #attr: hashtag
            if results[j]:
                hashtag_dict = json.loads(results[j])
                for hashtag in hashtag_dict:
                    if uid in user_hashtag_dict:
                        try:
                            user_hashtag_dict[uid][hashtag] += hashtag_dict[hashtag]
                        except:
                            user_hashtag_dict[uid][hashtag] = hashtag_dict[hashtag]
                    else:
                        user_hashtag_dict[uid] = {hashtag: hashtag_dict[hashtag]}
            '''
            #attr: online_pattern
            if online_pattern_results[j]:
                online_pattern_dict = json.loads(online_pattern_results[j])
                for online_pattern in online_pattern_dict:
                    if uid in user_online_dict:
                        try:
                            user_online_dict[uid][online_pattern] += online_pattern_dict[online_pattern]
                        except:
                            user_online_dict[uid][online_pattern] = online_pattern_dict[online_pattern]
                    else:
                        user_online_dict[uid] = {online_pattern: online_pattern_dict[online_pattern]}
            '''
            
            #attr: activity_geo by ip-timestamp
            # NOTE(review): unreachable (i is never 0). If it ever ran:
            # new_day_ip_dict[uid] is never initialized (KeyError),
            # list.append returns None (so new_day_geo_list becomes None),
            # and `new_day_geo` below is undefined.
            if i==0 and ip_result[j]:
                ip_timestamp_dict = json.loads(ip_result[j])
                old_flow_information = user_info[uid]
                old_day_geo_list = json.loads(old_flow_information['activity_geo_dict'])
                for ip in ip_timestamp_dict:
                    ip_count = len(ip_timestamp_dict[ip].split('&'))
                    new_day_ip_dict[uid][ip] = ip_count
                geo_dict = ip2city(new_day_ip_dict[uid])
                if len(old_day_geo_list)>=30:
                    new_day_geo_list = old_day_geo_list[1:].append(geo_dict)
                else:
                    new_day_geo_list = old_day_geo_list.append(geo_dict)
                week_geo_list = []
                week_day_geo_list = new_day_geo[-7:]
                for day_geo_dict in week_day_geo_list:
                    week_geo_list.extend(day_geo_dict.keys())
                week_geo_list = list(set(week_geo_list))
                activity_geo_string = ''
                new_week_geo_list = []
                for geo_string in week_geo_list:
                    day_geo_string = '&'.join(geo_string.split('\t'))
                    new_week_geo_list.append(day_geo_string)
                activity_geo_string = '&'.join(new_week_geo_list)
                print 'activity_geo_string:', activity_geo_string
                

    for uid in uid_list:
        #attr: hashtag
        try:
            hashtag_dict = user_hashtag_dict[uid]
            hashtag_string = json.dumps(hashtag_dict)
            hashtag_list = '&'.join(hashtag_dict.keys())
        except KeyError:
            hashtag_string = ''
            hashtag_list = ''
        '''
        #attr: online_pattern
        try:
            online_dict = user_online_dict[uid]
            online_string = json.dumps(online_dict)
            online_list = '&'.join(online_dict.keys())
        except KeyError:
            online_string = ''
            online_list = ''
        '''
        # NOTE(review): `result`, `new_day_geo_list`, `activity_geo_string`,
        # `online_pattern_string` and `online_pattern_list` are all undefined
        # here in the normal code path -- this raises NameError as written.
        result[uid] = {'hashtag_dict':hashtag_string, 'hashtag':hashtag_list, \
                       'activity_geo_dict': json.loads(new_day_geo_list), 'activity_geo': activity_geo_string, \
                       'online_pattern_dict': online_pattern_string, 'online_pattern': online_pattern_list}
    return result
예제 #15
0
   uhlist = zip(uidlist, hashtag_list)
   uhtlist = []
   for uh in uhlist:
       uh = list(uh)
       uh.append(ts)
       uhtlist.append(uh)
   data.extend(uhtlist)

# Dump the collected hashtag rows ([uid, hashtag_dict, ts]) as one JSON line
# each, skipping uids with no hashtag data for the timestamp.
# NOTE(review): `data`, `tss`, `uidlist` and R_CLUSTER_FLOW2 are defined
# earlier in the script (outside this excerpt) -- confirm before reuse.
with open("hashtag_0521.txt", "w") as fw:
    for d in data:
        if d[1] != None:
            fw.write("%s\n" % json.dumps(d))

# Same extraction for the 'at_<ts>' (mentions) hashes: one [uid, dict, ts]
# row per uid per timestamp.
at_data = []
for ts in tss:
   ns = "at_" + str(ts)
   hashtag_list = R_CLUSTER_FLOW2.hmget(ns, uidlist)
   # deserialize present entries; absent uids stay None
   hashtag_list = [json.loads(h) if h else None for h in hashtag_list]
   uhlist = zip(uidlist, hashtag_list)
   uhtlist = []
   for uh in uhlist:
       uh = list(uh)
       uh.append(ts)
       uhtlist.append(uh)
   at_data.extend(uhtlist)

# Write the mention rows, again skipping uids with no data.
with open("at_0521.txt", "w") as fw:
    for a in at_data:
        if a[1] != None:
            fw.write("%s\n" % json.dumps(a))
예제 #16
0
def get_flow_information(uid_list):
    """Aggregate one week of per-uid flow features.

    Walks the 7 days before the (test-fixed) reference date, accumulating
    hashtags, geos (from IP records), sensitive words and flow-text keywords
    per uid, then flattens everything into serialized result fields.

    Returns {uid: {'hashtag_dict', 'hashtag', 'sensitive_dict',
                   'sensitive_string', 'sensitive', 'activity_geo_dict',
                   'activity_geo', 'keywords', 'keywords_string',
                   'filter_keywords', 'filter_keywords_string'}}.
    """
    results = {}
    # iter_results = {uid: {'hashtag': {...}, 'geo': {...}, 'geo_track': [...],
    #                       'keywords': {...}, 'sensitive': {...},
    #                       'filter_keywords': {...}}}
    iter_results = {}
    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    # test: override with a fixed timestamp
    now_date_ts = test_ts
    for i in range(7, 0, -1):
        ts = now_date_ts - DAY * i
        iter_date = ts2datetime(ts)
        flow_text_index_name = flow_text_index_name_pre + iter_date
        uid_day_geo = {}
        # per-day redis hashes: one serialized dict per uid (or None)
        hashtag_results = r_cluster.hmget('hashtag_' + str(ts), uid_list)
        ip_results = r_cluster.hmget('new_ip_' + str(ts), uid_list)
        sensitive_results = r_cluster.hmget('sensitive_' + str(ts), uid_list)
        count = 0
        for uid in uid_list:
            if uid not in iter_results:
                # BUG FIX: 'filter_keywords' was missing from this init dict,
                # so the filtered-keyword accumulation below raised an
                # uncaught KeyError inside its except branch.
                iter_results[uid] = {
                    'hashtag': {},
                    'geo': {},
                    'geo_track': [],
                    'keywords': {},
                    'sensitive': {},
                    'filter_keywords': {}
                }
            #compute hashtag
            hashtag_item = hashtag_results[count]
            uid_hashtag_dict = json.loads(hashtag_item) if hashtag_item else {}
            for hashtag in uid_hashtag_dict:
                iter_results[uid]['hashtag'][hashtag] = \
                    iter_results[uid]['hashtag'].get(hashtag, 0) + \
                    uid_hashtag_dict[hashtag]
            #compute sensitive
            sensitive_item = sensitive_results[count]
            uid_sensitive_dict = json.loads(sensitive_item) if sensitive_item else {}
            for sensitive_word in uid_sensitive_dict:
                iter_results[uid]['sensitive'][sensitive_word] = \
                    iter_results[uid]['sensitive'].get(sensitive_word, 0) + \
                    uid_sensitive_dict[sensitive_word]
            #compute geo
            uid_day_geo[uid] = {}
            ip_item = ip_results[count]
            uid_ip_dict = json.loads(ip_item) if ip_item else {}
            for ip in uid_ip_dict:
                # one '&'-joined timestamp per hit
                ip_count = len(uid_ip_dict[ip].split('&'))
                geo = ip2city(ip)
                if geo:
                    iter_results[uid]['geo'][geo] = \
                        iter_results[uid]['geo'].get(geo, 0) + ip_count
                    uid_day_geo[uid][geo] = \
                        uid_day_geo[uid].get(geo, 0) + ip_count
            iter_results[uid]['geo_track'].append(uid_day_geo[uid])
            count += 1

        #compute keywords from the day's flow-text index
        try:
            text_results = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type, \
                                               body={'query':{'filtered':{'filter':{'terms':{'uid': uid_list}}}}, 'size':MAX_VALUE}, _source=True, fields=['uid', 'keywords_dict','text'])['hits']['hits']
        except:
            text_results = {}
        for item in text_results:
            uid = item['fields']['uid'][0]
            uid_keywords_dict = json.loads(item['fields']['keywords_dict'][0])
            for keywords in uid_keywords_dict:
                iter_results[uid]['keywords'][keywords] = \
                    iter_results[uid]['keywords'].get(keywords, 0) + \
                    uid_keywords_dict[keywords]

            #jln filter keyword 2016/11/08
            weibo_text = json.loads(item['fields']['text'][0])
            filter_keywords_dict = get_weibo_single(weibo_text)

            for keywords in filter_keywords_dict:
                # BUG FIX: counts were read from uid_keywords_dict while
                # iterating filter_keywords_dict, raising KeyError for any
                # keyword present only in the filtered dict.
                iter_results[uid]['filter_keywords'][keywords] = \
                    iter_results[uid]['filter_keywords'].get(keywords, 0) + \
                    filter_keywords_dict[keywords]

    #get keywords top
    for uid in uid_list:
        results[uid] = {}
        hashtag_dict = iter_results[uid]['hashtag']
        results[uid]['hashtag_dict'] = json.dumps(hashtag_dict)
        results[uid]['hashtag'] = '&'.join(hashtag_dict.keys())
        #sensitive words and weighted score
        sensitive_word_dict = iter_results[uid]['sensitive']
        results[uid]['sensitive_dict'] = json.dumps(sensitive_word_dict)
        results[uid]['sensitive_string'] = '&'.join(sensitive_word_dict.keys())
        sensitive_score = 0
        for k in sensitive_word_dict:
            v = sensitive_word_dict[k]
            tmp_stage = r_sensitive.hget('sensitive_words', k)
            if tmp_stage:
                sensitive_score += v * sensitive_score_dict[str(tmp_stage)]
        results[uid]['sensitive'] = sensitive_score
        #geo: full track plus the flattened distinct geo string
        geo_dict = iter_results[uid]['geo']
        geo_track_list = iter_results[uid]['geo_track']
        results[uid]['activity_geo_dict'] = json.dumps(geo_track_list)
        geo_dict_keys = geo_dict.keys()
        results[uid]['activity_geo'] = '&'.join(
            ['&'.join(item.split('\t')) for item in geo_dict_keys])

        keywords_dict = iter_results[uid]['keywords']
        keywords_top50 = sorted(keywords_dict.items(),
                                key=lambda x: x[1],
                                reverse=True)[:50]
        keywords_top50_string = '&'.join(
            [keyword_item[0] for keyword_item in keywords_top50])

        filter_keywords_dict = iter_results[uid]['filter_keywords']
        f_keywords_top50 = sorted(filter_keywords_dict.items(),
                                  key=lambda x: x[1],
                                  reverse=True)[:50]
        # BUG FIX: previously joined `filter_keywords_dict[0]` (KeyError /
        # wrong value); join the keyword of each top-50 pair instead.
        f_keywords_top50_string = '&'.join(
            [keyword_item[0] for keyword_item in f_keywords_top50])

        results[uid]['keywords'] = json.dumps(keywords_top50)
        results[uid]['keywords_string'] = keywords_top50_string

        results[uid]['filter_keywords'] = json.dumps(f_keywords_top50)
        results[uid]['filter_keywords_string'] = f_keywords_top50_string
    return results
예제 #17
0
def get_flow_information_v2(uid_list, all_user_keywords_dict):
    results = {}
    #results = {uid:{'hashtag_dict':{},'hashtag':'', 'keywords_dict':{}, 'keywords_string':'', 'activity_geo':'', 'activity_geo_dict':dict, 'activity_geo_aggs':''}}
    iter_results = {
    }  # iter_results = {uid:{'hashtag': hashtag_dict, 'geo':geo_dict, 'keywords':keywords_dict}}
    now_ts = time.time()
    #run_type
    today_sensitive_results = {}
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = test_ts
    print 'run_type:', RUN_TYPE
    for i in range(WEEK, 0, -1):
        ts = now_date_ts - DAY * i
        uid_day_geo = {}
        #compute hashtag and geo
        hashtag_results = r_cluster_3.hmget('hashtag_' + str(ts), uid_list)
        ip_results = r_cluster.hmget('new_ip_' + str(ts), uid_list)
        #print 'ip_results:', ip_results
        #compute sensitive_words
        sensitive_results = r_cluster_3.hmget('sensitive_' + str(ts), uid_list)
        count = 0
        for uid in uid_list:
            #init iter_results[uid]
            if uid not in iter_results:
                iter_results[uid] = {
                    'hashtag': {},
                    'geo': {},
                    'geo_track': [],
                    'keywords': {},
                    'sensitive': {},
                    'school': {},
                    'week_ip': {
                        0: {},
                        1: {},
                        2: {},
                        3: {},
                        4: {},
                        5: {}
                    },
                    'ip': {}
                }
            if uid not in today_sensitive_results:
                today_sensitive_results[uid] = {}
            #compute hashtag
            hashtag_item = hashtag_results[count]
            if hashtag_item:
                uid_hashtag_dict = json.loads(hashtag_item)
            else:
                uid_hashtag_dict = {}
            for hashtag in uid_hashtag_dict:
                try:
                    iter_results[uid]['hashtag'][hashtag] += uid_hashtag_dict[
                        hashtag]
                except:
                    iter_results[uid]['hashtag'][hashtag] = uid_hashtag_dict[
                        hashtag]
            #compute sensitive
            sensitive_item = sensitive_results[count]
            if sensitive_item:
                uid_sensitive_dict = json.loads(sensitive_item)
            else:
                uid_sensitive_dict = {}
            for sensitive_word in uid_sensitive_dict:
                try:
                    iter_results[uid]['sensitive'][
                        sensitive_word] += uid_sensitive_dict[sensitive_word]
                except:
                    iter_results[uid]['sensitive'][
                        sensitive_word] = uid_sensitive_dict[sensitive_word]
                if ts == now_date_ts - DAY:
                    try:
                        today_sensitive_results[uid][
                            sensitive_word] += uid_sensitive_dict[
                                sensitive_word]
                    except:
                        today_sensitive_results[uid][
                            sensitive_word] = uid_sensitive_dict[
                                sensitive_word]
            #compute geo
            uid_day_geo[uid] = {}
            ip_item = ip_results[count]
            if ip_item:
                uid_ip_dict = json.loads(ip_item)
            else:
                uid_ip_dict = {}
            for ip in uid_ip_dict:
                ip_count = len(uid_ip_dict[ip].split('&'))
                geo, school = ip2city(ip)
                if geo:
                    try:
                        iter_results[uid]['geo'][geo] += ip_count
                    except:
                        iter_results[uid]['geo'][geo] = ip_count
                    try:
                        uid_day_geo[uid][geo] += ip_count
                    except:
                        uid_day_geo[uid][geo] = ip_count
                if school:
                    try:
                        iter_results[uid]['school'][school] += ip_count
                    except:
                        iter_results[uid]['school'][school] = ip_count
                #deal ip: job_ip&home_ip&active_ip
                ip_time_list = uid_ip_dict[ip].split('&')
                try:
                    iter_results[uid]['ip'][ip] += ip_count
                except:
                    iter_results[uid]['ip'] = {ip: ip_count}
                for ip_time_item in ip_time_list:
                    ip_timesegment = (int(ip_time_item) - ts) / IP_TIME_SEGMENT
                    try:
                        iter_results[uid]['week_ip'][ip_timesegment][ip] += 1
                    except:
                        iter_results[uid]['week_ip'][ip_timesegment][ip] = 1
                #end deal ip
            iter_results[uid]['geo_track'].append(uid_day_geo[uid])
            count += 1
    #get keywords top
    for uid in uid_list:
        #print 'test iter_results_ip:', iter_results[uid]['week_ip']
        results[uid] = {}
        #hashtag
        hashtag_dict = iter_results[uid]['hashtag']
        results[uid]['hashtag_dict'] = json.dumps(hashtag_dict)
        results[uid]['hashtag'] = '&'.join(hashtag_dict.keys())
        #sensitive words
        sensitive_word_dict = iter_results[uid]['sensitive']
        results[uid]['sensitive_dict'] = json.dumps(sensitive_word_dict)
        results[uid]['sensitive_string'] = '&'.join(sensitive_word_dict.keys())
        sensitive_score = 0
        today_sensitive_results_user = today_sensitive_results[uid]
        for sensitive_item in today_sensitive_results_user:
            k = sensitive_item
            v = today_sensitive_results_user[sensitive_item]
            tmp_stage = r_sensitive.hget('sensitive_words', k)
            if tmp_stage:
                sensitive_score += v * sensitive_score_dict[str(tmp_stage)]
        results[uid]['sensitive'] = sensitive_score
        #geo
        geo_dict = iter_results[uid]['geo']
        geo_track_list = iter_results[uid]['geo_track']
        results[uid]['activity_geo_dict'] = json.dumps(geo_track_list)
        geo_dict_keys = geo_dict.keys()
        results[uid]['activity_geo'] = '&'.join(
            ['&'.join(item.split('\t')) for item in geo_dict_keys])
        try:
            results[uid]['activity_geo_aggs'] = '&'.join(
                [item.split('\t')[-1] for item in geo_dict_keys])
        except:
            results[uid]['activity_geo_aggs'] = ''
        #keywords
        keywords_dict = all_user_keywords_dict[uid]
        keywords_top50 = sorted(keywords_dict.items(),
                                key=lambda x: x[1],
                                reverse=True)[:50]
        keywords_top50_string = '&'.join(
            [keyword_item[0] for keyword_item in keywords_top50])
        results[uid]['keywords'] = json.dumps(keywords_top50)
        results[uid]['keywords_string'] = keywords_top50_string
        #school dict
        school_dict = iter_results[uid]['school']
        school_string = '&'.join(school_dict.keys())
        if school_dict != {}:
            is_school = '1'
        else:
            is_school = '0'
        results[uid]['is_school'] = is_school
        results[uid]['school_string'] = school_string
        results[uid]['school_dict'] = json.dumps(school_dict)
        #ip: job_ip&home_ip&activity_ip
        #activity_ip
        all_ip_dict = iter_results[uid]['ip']
        sort_all_ip = sorted(all_ip_dict.items(),
                             key=lambda x: x[1],
                             reverse=True)
        try:
            activity_ip = sort_all_ip[0][0]
        except:
            activity_ip = ''
        results[uid]['activity_ip'] = str(activity_ip)
        #job_ip & home_ip
        week_time_ip_dict = iter_results[uid]['week_ip']
        for i in range(0, 6):
            try:
                segment_dict = week_time_ip_dict[i]
            except:
                week_time_ip_dict[i] = {}
        home_ip, job_ip = get_ip_description(week_time_ip_dict)
        results[uid]['home_ip'] = str(home_ip)
        results[uid]['job_ip'] = str(job_ip)

    return results
def get_flow_information(uid_list):
    '''Collect the previous week's per-user flow data from redis.

    Runs daily and only looks at data up to yesterday. For each of the
    7 days before `now_date` it reads the per-day hashes (hashtag_,
    sensitive_hashtag_, ip_, sensitive_ip_, sensitive_) keyed by the
    compact yyyymmdd date, then builds per-uid strings/json blobs.

    uid_list: list of user ids (hmget results are aligned with this order)
    returns: {uid: {field_name: string-or-json value}}
    '''
    result_dict = {}
    # anchor at yesterday's date
    now_ts = time.time() - 3600 * 24
    now_date = ts2datetime(now_ts)  # date: 2013-09-01
    # BUGFIX: `ts` was never initialized (the assignment was commented
    # out), so `ts = ts - 3600*24` below raised NameError.
    ts = datetime2ts(now_date)
    user_hashtag_result = {}
    user_sensitive_hashtag = {}
    sensitive_words = {}
    user_ip_result = {}
    user_sensitive_ip = {}
    for i in range(1, 8):
        ts = ts - 3600 * 24
        # redis keys for this function use the compact yyyymmdd form
        date = ts2datetime(ts).replace('-', '')
        hashtag_results = r_cluster.hmget('hashtag_' + str(date), uid_list)
        sensitive_hashtag = r_cluster.hmget('sensitive_hashtag_' + str(date), uid_list)
        ip_results = r_cluster.hmget('ip_' + str(date), uid_list)
        sensitive_ip = r_cluster.hmget('sensitive_ip_' + str(date), uid_list)
        sensitive_results = r_cluster.hmget('sensitive_' + str(date), uid_list)

        for j in range(0, len(uid_list)):
            uid = uid_list[j]
            # each accumulator maps uid -> {date: parsed-day-dict}
            if sensitive_results[j]:
                sensitive_words.setdefault(uid, {})[date] = \
                    json.loads(sensitive_results[j])
            if hashtag_results[j]:
                user_hashtag_result.setdefault(uid, {})[date] = \
                    json.loads(hashtag_results[j])
            if sensitive_hashtag[j]:
                user_sensitive_hashtag.setdefault(uid, {})[date] = \
                    json.loads(sensitive_hashtag[j])
            if ip_results[j]:
                user_ip_result.setdefault(uid, {})[date] = \
                    json.loads(ip_results[j])
            if sensitive_ip[j]:
                user_sensitive_ip.setdefault(uid, {})[date] = \
                    json.loads(sensitive_ip[j])

    for uid in uid_list:
        # defaults for users with no activity in the window
        hashtag_string = ''
        sensitive_hashtag_string = ''
        ip_string = ''
        ip_all = ""
        sensitive_ip_string = ''
        hashtag_dict = {}
        sensitive_hashtag_dict = {}
        ip_dict = {}
        sensitive_ip_dict = {}
        sensitive_words_string = ''
        sensitive_words_dict = {}

        if uid in sensitive_words:
            sensitive_words_string = extract_string(sensitive_words[uid])
            sensitive_words_dict = json.dumps(sensitive_words[uid])
        if uid in user_hashtag_result:
            hashtag_string = extract_string(user_hashtag_result[uid])
            hashtag_dict = json.dumps(user_hashtag_result[uid])
        if uid in user_sensitive_hashtag:
            sensitive_hashtag_string = extract_string(user_sensitive_hashtag[uid])
            sensitive_hashtag_dict = json.dumps(user_sensitive_hashtag[uid])
        if uid in user_ip_result:
            ip_string = extract_geo(user_ip_result[uid])
            ip_dict = json.dumps(ip_to_geo(user_ip_result[uid]))
            ip_all = json.dumps(user_ip_result[uid])
        if uid in user_sensitive_ip:
            sensitive_ip_string = extract_geo(user_sensitive_ip[uid])
            sensitive_ip_dict = json.dumps(ip_to_geo(user_sensitive_ip[uid]))

        result_dict[uid] = {"hashtag_string": hashtag_string, "hashtag_dict": hashtag_dict, \
                            "sensitive_hashtag_string": sensitive_hashtag_string, "sensitive_hashtag_dict": sensitive_hashtag_dict, \
                            "geo_activity": ip_dict, "geo_string": ip_string, 'ip': ip_all, \
                             "sensitive_geo_activity": sensitive_ip_dict, "sensitive_geo_string":sensitive_ip_string, \
                             'sensitive_words_string': sensitive_words_string, 'sensitive_words_dict': sensitive_words_dict}
    return result_dict
예제 #19
0
def get_flow_information_v2(uid_list, all_user_keywords_dict):
    results = {}
    #results = {uid:{'hashtag_dict':{},'hashtag':'', 'keywords_dict':{}, 'keywords_string':'', 'activity_geo':'', 'activity_geo_dict':dict, 'activity_geo_aggs':''}}
    iter_results = {
    }  # iter_results = {uid:{'hashtag': hashtag_dict, 'geo':geo_dict, 'keywords':keywords_dict}}
    now_ts = time.time()
    #run_type
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = test_ts
    for i in range(WEEK, 0, -1):
        ts = now_date_ts - DAY * i
        print ts
        uid_day_geo = {}
        #compute hashtag and geo
        hashtag_results = r_cluster.hmget('hashtag_' + str(ts), uid_list)
        ip_results = r_cluster.hmget('new_ip_' + str(ts), uid_list)
        #compute sensitive_words
        sensitive_results = r_cluster.hmget('sensitive_' + str(ts), uid_list)
        count = 0
        for uid in uid_list:
            #init iter_results[uid]
            if uid not in iter_results:
                iter_results[uid] = {
                    'hashtag': {},
                    'geo': {},
                    'geo_track': [],
                    'keywords': {},
                    'sensitive': {}
                }
            #compute hashtag
            hashtag_item = hashtag_results[count]
            if hashtag_item:
                uid_hashtag_dict = json.loads(hashtag_item)
            else:
                uid_hashtag_dict = {}
            for hashtag in uid_hashtag_dict:
                try:
                    iter_results[uid]['hashtag'][hashtag] += uid_hashtag_dict[
                        hashtag]
                except:
                    iter_results[uid]['hashtag'][hashtag] = uid_hashtag_dict[
                        hashtag]
            #compute sensitive
            sensitive_item = sensitive_results[count]
            if sensitive_item:
                uid_sensitive_dict = json.loads(sensitive_item)
            else:
                uid_sensitive_dict = {}
            for sensitive_word in uid_sensitive_dict:
                try:
                    iter_results[uid]['sensitive'][
                        sensitive_word] += uid_sensitive_dict[sensitive_word]
                except:
                    iter_results[uid]['sensitive'][
                        sensitive_word] = uid_sensitive_dict[sensitive_word]
            #compute geo
            uid_day_geo[uid] = {}
            ip_item = ip_results[count]
            if ip_item:
                uid_ip_dict = json.loads(ip_item)
            else:
                uid_ip_dict = {}
            for ip in uid_ip_dict:
                ip_count = len(uid_ip_dict[ip].split('&'))
                geo = ip2city(ip)
                if geo:
                    try:
                        iter_results[uid]['geo'][geo] += ip_count
                    except:
                        iter_results[uid]['geo'][geo] = ip_count
                    try:
                        uid_day_geo[uid][geo] += ip_count
                    except:
                        uid_day_geo[uid][geo] = ip_count
            iter_results[uid]['geo_track'].append(uid_day_geo[uid])
            count += 1

    #get keywords top
    for uid in uid_list:
        results[uid] = {}
        #hashtag
        hashtag_dict = iter_results[uid]['hashtag']
        results[uid]['hashtag_dict'] = json.dumps(hashtag_dict)
        results[uid]['hashtag'] = '&'.join(hashtag_dict.keys())
        #sensitive words
        sensitive_word_dict = iter_results[uid]['sensitive']
        results[uid]['sensitive_dict'] = json.dumps(sensitive_word_dict)
        results[uid]['sensitive_string'] = '&'.join(sensitive_word_dict.keys())
        sensitive_score = 0
        for sensitive_item in sensitive_word_dict:
            k = sensitive_item
            v = sensitive_word_dict[sensitive_item]
            tmp_stage = r_sensitive.hget('sensitive_words', k)
            if tmp_stage:
                sensitive_score += v * sensitive_score_dict[str(tmp_stage)]
        results[uid]['sensitive'] = sensitive_score
        #print 'sensitive_dict:', results[uid]['sensitive_dict']
        #print 'sensitive_string:', results[uid]['sensitive_string']
        #print 'sensitive:', results[uid]['sensitive']
        #geo
        geo_dict = iter_results[uid]['geo']
        geo_track_list = iter_results[uid]['geo_track']
        results[uid]['activity_geo_dict'] = json.dumps(geo_track_list)
        geo_dict_keys = geo_dict.keys()
        results[uid]['activity_geo'] = '&'.join(
            ['&'.join(item.split('\t')) for item in geo_dict_keys])
        try:
            results[uid]['activity_geo_aggs'] = '&'.join(
                [item.split('\t')[-1] for item in geo_dict_keys])
        except:
            results[uid]['activity_geo_aggs'] = ''

        keywords_dict = all_user_keywords_dict[uid]
        keywords_top50 = sorted(keywords_dict.items(),
                                key=lambda x: x[1],
                                reverse=True)[:50]
        keywords_top50_string = '&'.join(
            [keyword_item[0] for keyword_item in keywords_top50])
        results[uid]['keywords'] = json.dumps(keywords_top50)
        results[uid]['keywords_string'] = keywords_top50_string

    return results
def get_flow_information_v2(uid_list, all_user_keywords_dict):
    '''Aggregate one week of per-user flow features from redis.

    For each uid, sums hashtag / sensitive-word counters and ip-derived
    geo + school activity over the last WEEK days, then flattens the
    accumulators into json/string fields. The sensitive score is
    computed from the words seen on the most recent day only.

    uid_list: list of user ids (hmget results are aligned with this order)
    all_user_keywords_dict: {uid: {keyword: weight}} computed elsewhere
    returns: {uid: {field_name: json-or-string value}}
    '''
    results = {}      
    #results = {uid:{'hashtag_dict':{},'hashtag':'', 'keywords_dict':{}, 'keywords_string':'', 'activity_geo':'', 'activity_geo_dict':dict, 'activity_geo_aggs':''}}
    iter_results = {} # iter_results = {uid:{'hashtag': hashtag_dict, 'geo':geo_dict, 'keywords':keywords_dict}}
    now_ts = time.time()
    #run_type
    # words seen yesterday only; feeds the sensitive score below
    today_sensitive_results = {}
    # RUN_TYPE == 1: live run anchored at today's midnight; otherwise
    # replay from the fixed test timestamp
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = test_ts
    for i in range(WEEK,0,-1):
        ts = now_date_ts - DAY*i
        uid_day_geo = {}
        #compute hashtag and geo
        # hmget returns values aligned with uid_list order; `count` indexes them
        hashtag_results = r_cluster_3.hmget('hashtag_'+str(ts), uid_list)
        ip_results = r_cluster.hmget('new_ip_'+str(ts), uid_list)
        #compute sensitive_words
        sensitive_results = r_cluster_3.hmget('sensitive_'+str(ts), uid_list)
        count = 0 
        for uid in uid_list:
            #init iter_results[uid]
            if uid not in iter_results:
                iter_results[uid] = {'hashtag':{}, 'geo':{},'geo_track':[],'keywords':{}, 'sensitive':{}, 'school':{}}
            if uid not in today_sensitive_results:
                today_sensitive_results[uid] = {}
            #compute hashtag
            hashtag_item = hashtag_results[count]
            if hashtag_item:
                uid_hashtag_dict = json.loads(hashtag_item)
            else:
                uid_hashtag_dict = {}
            # EAFP accumulation: KeyError on first sight of a hashtag
            for hashtag in uid_hashtag_dict:
                try:
                    iter_results[uid]['hashtag'][hashtag] += uid_hashtag_dict[hashtag]
                except:
                    iter_results[uid]['hashtag'][hashtag] = uid_hashtag_dict[hashtag]
            #compute sensitive
            sensitive_item = sensitive_results[count]
            if sensitive_item:
                uid_sensitive_dict = json.loads(sensitive_item)
            else:
                uid_sensitive_dict = {}
            for sensitive_word in uid_sensitive_dict:
                try:
                    iter_results[uid]['sensitive'][sensitive_word] += uid_sensitive_dict[sensitive_word]
                except:
                    iter_results[uid]['sensitive'][sensitive_word] = uid_sensitive_dict[sensitive_word]
                # the most recent day of the window (yesterday) is tracked
                # separately for the score computation
                if ts == now_date_ts - DAY:
                    try:
                        today_sensitive_results[uid][sensitive_word] += uid_sensitive_dict[sensitive_word]
                    except:
                        today_sensitive_results[uid][sensitive_word] = uid_sensitive_dict[sensitive_word]
            #compute geo
            uid_day_geo[uid] = {}
            ip_item = ip_results[count]
            if ip_item:
                uid_ip_dict = json.loads(ip_item)
            else:
                uid_ip_dict = {}
            for ip in uid_ip_dict:
                # value is a '&'-joined timestamp list; length = visit count
                ip_count = len(uid_ip_dict[ip].split('&'))
                # ip2city returns (geo, school); either may be empty
                geo, school = ip2city(ip)
                if geo:
                    try:
                        iter_results[uid]['geo'][geo] += ip_count
                    except:
                        iter_results[uid]['geo'][geo] = ip_count
                    try:
                        uid_day_geo[uid][geo] += ip_count
                    except:
                        uid_day_geo[uid][geo] = ip_count
                if school:
                    try:
                        iter_results[uid]['school'][school] += ip_count
                    except:
                        iter_results[uid]['school'][school] = ip_count
            iter_results[uid]['geo_track'].append(uid_day_geo[uid])
            count += 1
               
    #get keywords top
    # flatten the accumulators into the output schema
    for uid in uid_list:
        results[uid] = {}
        #hashtag
        hashtag_dict = iter_results[uid]['hashtag']
        results[uid]['hashtag_dict'] = json.dumps(hashtag_dict)
        results[uid]['hashtag'] = '&'.join(hashtag_dict.keys())
        #sensitive words
        sensitive_word_dict = iter_results[uid]['sensitive']
        results[uid]['sensitive_dict'] = json.dumps(sensitive_word_dict)
        results[uid]['sensitive_string'] = '&'.join(sensitive_word_dict.keys())
        # score: yesterday's word counts weighted by their stage in
        # the r_sensitive hash (words with no stage contribute nothing)
        sensitive_score = 0
        today_sensitive_results_user = today_sensitive_results[uid]
        for sensitive_item in today_sensitive_results_user:
            k = sensitive_item
            v = today_sensitive_results_user[sensitive_item]
            tmp_stage = r_sensitive.hget('sensitive_words', k)
            if tmp_stage:
                sensitive_score += v * sensitive_score_dict[str(tmp_stage)]
        results[uid]['sensitive'] = sensitive_score
        #print 'sensitive_dict:', results[uid]['sensitive_dict']
        #print 'sensitive_string:', results[uid]['sensitive_string']
        #print 'sensitive:', results[uid]['sensitive']
        #geo
        # geo names appear to be '\t'-separated place levels; the aggs
        # field keeps only the last (finest) level
        geo_dict = iter_results[uid]['geo']
        geo_track_list = iter_results[uid]['geo_track']
        results[uid]['activity_geo_dict'] = json.dumps(geo_track_list)
        geo_dict_keys = geo_dict.keys()
        results[uid]['activity_geo'] = '&'.join(['&'.join(item.split('\t')) for item in geo_dict_keys])
        try:
            results[uid]['activity_geo_aggs'] = '&'.join([item.split('\t')[-1] for item in geo_dict_keys])
        except:
            results[uid]['activity_geo_aggs'] = ''
        #keywords
        # top-50 keywords by weight from the caller-provided dict
        keywords_dict = all_user_keywords_dict[uid]
        keywords_top50 = sorted(keywords_dict.items(), key=lambda x:x[1], reverse=True)[:50]
        keywords_top50_string = '&'.join([keyword_item[0] for keyword_item in keywords_top50])
        results[uid]['keywords'] = json.dumps(keywords_top50)
        results[uid]['keywords_string'] = keywords_top50_string
        #school dict
        school_dict = iter_results[uid]['school']
        school_string = '&'.join(school_dict.keys())
        if school_dict != {}:
            is_school = '1'
        else:
            is_school = '0'
        results[uid]['is_school'] = is_school
        results[uid]['school_string'] = school_string
        results[uid]['school_dict'] = json.dumps(school_dict)
        
    return results
예제 #21
0
def update_flow_information(user_info):
    results = {
    }  # results ={uid: {'activity_geo_dict':'', 'activity_geo':'', 'hashtag_dict':'', 'hashtag':'', 'online_pattern_dict':'', 'online_pattern':''}}
    uid_list = user_info.keys()
    now_ts = time.time()
    now_date = ts2datetime(now_ts)
    timestamp = datetime2ts(now_date)
    #test
    timestamp = datetime2ts('2013-09-08')
    user_hashtag_dict = dict()
    user_online_dict = dict()
    ip_user_count_dict = {}
    new_day_ip_dict = dict()
    for i in range(7, 0, -1):
        ts = timestamp - 24 * 3600 * i
        print 'iter date:', ts2date(ts)
        results = r_cluster.hmget('hashtag_' + str(ts), uid_list)
        online_pattern_results = r_cluster.hmget('online_' + str(ts), uid_list)

        if i == 0:
            ip_result = r_cluater.hmget('hashtag_' + str(ts), uid_list)

        for j in range(0, len(uid_list)):
            uid = uid_list[j]
            #attr: hashtag
            if results[j]:
                hashtag_dict = json.loads(results[j])
                for hashtag in hashtag_dict:
                    if uid in user_hashtag_dict:
                        try:
                            user_hashtag_dict[uid][hashtag] += hashtag_dict[
                                hashtag]
                        except:
                            user_hashtag_dict[uid][hashtag] = hashtag_dict[
                                hashtag]
                    else:
                        user_hashtag_dict[uid] = {
                            hashtag: hashtag_dict[hashtag]
                        }
            '''
            #attr: online_pattern
            if online_pattern_results[j]:
                online_pattern_dict = json.loads(online_pattern_results[j])
                for online_pattern in online_pattern_dict:
                    if uid in user_online_dict:
                        try:
                            user_online_dict[uid][online_pattern] += online_pattern_dict[online_pattern]
                        except:
                            user_online_dict[uid][online_pattern] = online_pattern_dict[online_pattern]
                    else:
                        user_online_dict[uid] = {online_pattern: online_pattern_dict[online_pattern]}
            '''

            #attr: activity_geo by ip-timestamp
            if i == 0 and ip_result[j]:
                ip_timestamp_dict = json.loads(ip_result[j])
                old_flow_information = user_info[uid]
                old_day_geo_list = json.loads(
                    old_flow_information['activity_geo_dict'])
                for ip in ip_timestamp_dict:
                    ip_count = len(ip_timestamp_dict[ip].split('&'))
                    new_day_ip_dict[uid][ip] = ip_count
                geo_dict = ip2city(new_day_ip_dict[uid])
                if len(old_day_geo_list) >= 30:
                    new_day_geo_list = old_day_geo_list[1:].append(geo_dict)
                else:
                    new_day_geo_list = old_day_geo_list.append(geo_dict)
                week_geo_list = []
                week_day_geo_list = new_day_geo[-7:]
                for day_geo_dict in week_day_geo_list:
                    week_geo_list.extend(day_geo_dict.keys())
                week_geo_list = list(set(week_geo_list))
                activity_geo_string = ''
                new_week_geo_list = []
                for geo_string in week_geo_list:
                    day_geo_string = '&'.join(geo_string.split('\t'))
                    new_week_geo_list.append(day_geo_string)
                activity_geo_string = '&'.join(new_week_geo_list)
                print 'activity_geo_string:', activity_geo_string

    for uid in uid_list:
        #attr: hashtag
        try:
            hashtag_dict = user_hashtag_dict[uid]
            hashtag_string = json.dumps(hashtag_dict)
            hashtag_list = '&'.join(hashtag_dict.keys())
        except KeyError:
            hashtag_string = ''
            hashtag_list = ''
        '''
        #attr: online_pattern
        try:
            online_dict = user_online_dict[uid]
            online_string = json.dumps(online_dict)
            online_list = '&'.join(online_dict.keys())
        except KeyError:
            online_string = ''
            online_list = ''
        '''
        result[uid] = {'hashtag_dict':hashtag_string, 'hashtag':hashtag_list, \
                       'activity_geo_dict': json.loads(new_day_geo_list), 'activity_geo': activity_geo_string, \
                       'online_pattern_dict': online_pattern_string, 'online_pattern': online_pattern_list}
    return result
def get_flow_information(uid_list):
    """Aggregate one week of flow statistics for each uid.

    For every day in the past week, per user, this merges:
      - hashtag counts      (redis hash 'hashtag_<ts>' on r_cluster_3)
      - sensitive-word hits (redis hash 'sensitive_<ts>' on r_cluster_3)
      - activity geo counts (redis hash 'new_ip_<ts>' on r_cluster,
                             each ip translated to a place via ip2city)
      - keyword counts      (daily flow-text ES index, field 'keywords_dict')

    Parameters:
        uid_list: list of uid strings to aggregate.

    Returns:
        dict keyed by uid:
        {uid: {'hashtag_dict': json str, 'hashtag': '&'-joined keys,
               'sensitive_dict': json str, 'sensitive_string': '&'-joined keys,
               'sensitive': weighted sensitivity score,
               'activity_geo_dict': json str of per-day geo dicts,
               'activity_geo': '&'-joined flattened place names,
               'keywords': json str of top-50 (keyword, count) pairs,
               'keywords_string': '&'-joined top-50 keywords}}
    """
    results = {}
    # iter_results = {uid: {'hashtag': {...}, 'geo': {...},
    #                       'geo_track': [day_geo_dict, ...],
    #                       'keywords': {...}, 'sensitive': {...}}}
    iter_results = {}
    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    #test
    now_date_ts = test_ts
    for i in range(7, 0, -1):
        ts = now_date_ts - DAY * i
        iter_date = ts2datetime(ts)
        flow_text_index_name = flow_text_index_name_pre + iter_date
        uid_day_geo = {}
        # bulk-fetch the day's hashes; hmget preserves uid_list order
        hashtag_results = r_cluster_3.hmget('hashtag_'+str(ts), uid_list)
        ip_results = r_cluster.hmget('new_ip_'+str(ts), uid_list)
        sensitive_results = r_cluster_3.hmget('sensitive_'+str(ts), uid_list)
        for count, uid in enumerate(uid_list):
            if uid not in iter_results:
                iter_results[uid] = {'hashtag':{}, 'geo':{}, 'geo_track':[], 'keywords':{}, 'sensitive':{}}
            # accumulate hashtag counts
            hashtag_item = hashtag_results[count]
            uid_hashtag_dict = json.loads(hashtag_item) if hashtag_item else {}
            hashtag_acc = iter_results[uid]['hashtag']
            for hashtag, num in uid_hashtag_dict.items():
                hashtag_acc[hashtag] = hashtag_acc.get(hashtag, 0) + num
            # accumulate sensitive-word counts
            sensitive_item = sensitive_results[count]
            uid_sensitive_dict = json.loads(sensitive_item) if sensitive_item else {}
            sensitive_acc = iter_results[uid]['sensitive']
            for word, num in uid_sensitive_dict.items():
                sensitive_acc[word] = sensitive_acc.get(word, 0) + num
            # accumulate geo counts: weekly total and the per-day track
            day_geo = {}
            uid_day_geo[uid] = day_geo
            ip_item = ip_results[count]
            uid_ip_dict = json.loads(ip_item) if ip_item else {}
            geo_acc = iter_results[uid]['geo']
            for ip in uid_ip_dict:
                # stored value is a '&'-joined list of timestamps per ip
                ip_count = len(uid_ip_dict[ip].split('&'))
                geo = ip2city(ip)
                if geo:
                    geo_acc[geo] = geo_acc.get(geo, 0) + ip_count
                    day_geo[geo] = day_geo.get(geo, 0) + ip_count
            iter_results[uid]['geo_track'].append(day_geo)

        # accumulate keyword counts from the day's flow-text index
        try:
            text_results = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type, \
                                               body={'query':{'filtered':{'filter':{'terms':{'uid': uid_list}}}}, 'size':MAX_VALUE}, _source=True, fields=['uid', 'keywords_dict'])['hits']['hits']
        except Exception:
            # the index for that day may not exist yet; treat as no documents
            text_results = []
        for item in text_results:
            uid = item['fields']['uid'][0]
            uid_keywords_dict = json.loads(item['fields']['keywords_dict'][0])
            keywords_acc = iter_results[uid]['keywords']
            for keyword, num in uid_keywords_dict.items():
                keywords_acc[keyword] = keywords_acc.get(keyword, 0) + num

    # serialize the per-uid aggregates
    for uid in uid_list:
        results[uid] = {}
        hashtag_dict = iter_results[uid]['hashtag']
        results[uid]['hashtag_dict'] = json.dumps(hashtag_dict)
        results[uid]['hashtag'] = '&'.join(hashtag_dict.keys())
        # sensitive words and the weighted sensitivity score
        sensitive_word_dict = iter_results[uid]['sensitive']
        results[uid]['sensitive_dict'] = json.dumps(sensitive_word_dict)
        results[uid]['sensitive_string'] = '&'.join(sensitive_word_dict.keys())
        sensitive_score = 0
        for word, num in sensitive_word_dict.items():
            tmp_stage = r_sensitive.hget('sensitive_words', word)
            if tmp_stage:
                # weight each hit by the word's configured stage score
                sensitive_score += num * sensitive_score_dict[str(tmp_stage)]
        results[uid]['sensitive'] = sensitive_score
        # geo: daily track plus the distinct place list
        geo_dict = iter_results[uid]['geo']
        results[uid]['activity_geo_dict'] = json.dumps(iter_results[uid]['geo_track'])
        # geo keys are '\t'-separated place strings; flatten them with '&'
        results[uid]['activity_geo'] = '&'.join(['&'.join(item.split('\t')) for item in geo_dict.keys()])
        # keywords: keep the top 50 by count
        keywords_dict = iter_results[uid]['keywords']
        keywords_top50 = sorted(keywords_dict.items(), key=lambda x: x[1], reverse=True)[:50]
        results[uid]['keywords'] = json.dumps(keywords_top50)
        results[uid]['keywords_string'] = '&'.join([keyword_item[0] for keyword_item in keywords_top50])

    return results
예제 #23
0
    uhlist = zip(uidlist, hashtag_list)
    uhtlist = []
    for uh in uhlist:
        uh = list(uh)
        uh.append(ts)
        uhtlist.append(uh)
    data.extend(uhtlist)

# Write one JSON line per (uid, hashtag_dict, ts) row; skip users that
# had no hashtag data for the timestamp.
with open("hashtag_0521.txt", "w") as fw:
    for row in data:
        if row[1] != None:
            fw.write("%s\n" % json.dumps(row))

# Build the same kind of (uid, value, ts) rows from the 'at_<ts>' hashes.
at_data = []
for ts in tss:
    ns = "at_" + str(ts)
    hashtag_list = R_CLUSTER_FLOW2.hmget(ns, uidlist)
    hashtag_list = [json.loads(h) if h else None for h in hashtag_list]
    # pair each uid with its parsed value and tag the row with the timestamp
    at_data.extend([list(pair) + [ts] for pair in zip(uidlist, hashtag_list)])

# Write one JSON line per at row; skip users with no at data.
with open("at_0521.txt", "w") as fw:
    for row in at_data:
        if row[1] != None:
            fw.write("%s\n" % json.dumps(row))
예제 #24
0
def get_flow_information_v2(uid_list, all_user_keywords_dict):
    """Aggregate one week of hashtag/geo activity and attach keyword tops.

    For every day in the past WEEK days, per user, this merges:
      - hashtag counts      (redis hash 'hashtag_<ts>' on r_cluster)
      - activity geo counts (redis hash 'new_ip_<ts>' on r_cluster,
                             each ip translated to a place via ip2city)
    Keyword statistics are taken from the caller-supplied dict instead of
    being recomputed.

    Parameters:
        uid_list: list of uid strings to aggregate.
        all_user_keywords_dict: {uid: {keyword: count}} precomputed elsewhere.

    Returns:
        dict keyed by uid:
        {uid: {'hashtag_dict': json str, 'hashtag': '&'-joined keys,
               'activity_geo_dict': json str of per-day geo dicts,
               'activity_geo': '&'-joined flattened place names,
               'activity_geo_aggs': '&'-joined last path component per place,
               'keywords': json str of top-50 (keyword, count) pairs,
               'keywords_string': '&'-joined top-50 keywords}}
    """
    results = {}
    # iter_results = {uid: {'hashtag': {...}, 'geo': {...},
    #                       'geo_track': [day_geo_dict, ...], 'keywords': {...}}}
    iter_results = {}
    now_ts = time.time()
    # run_type: live mode anchors on today, test mode on the fixed test_ts
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = test_ts
    for i in range(WEEK, 0, -1):
        ts = now_date_ts - DAY * i
        uid_day_geo = {}
        # bulk-fetch the day's hashes; hmget preserves uid_list order
        hashtag_results = r_cluster.hmget('hashtag_'+str(ts), uid_list)
        ip_results = r_cluster.hmget('new_ip_'+str(ts), uid_list)
        for count, uid in enumerate(uid_list):
            if uid not in iter_results:
                iter_results[uid] = {'hashtag':{}, 'geo':{}, 'geo_track':[], 'keywords':{}}
            # accumulate hashtag counts
            hashtag_item = hashtag_results[count]
            uid_hashtag_dict = json.loads(hashtag_item) if hashtag_item else {}
            hashtag_acc = iter_results[uid]['hashtag']
            for hashtag, num in uid_hashtag_dict.items():
                hashtag_acc[hashtag] = hashtag_acc.get(hashtag, 0) + num
            # accumulate geo counts: weekly total and the per-day track
            day_geo = {}
            uid_day_geo[uid] = day_geo
            ip_item = ip_results[count]
            uid_ip_dict = json.loads(ip_item) if ip_item else {}
            geo_acc = iter_results[uid]['geo']
            for ip in uid_ip_dict:
                # stored value is a '&'-joined list of timestamps per ip
                ip_count = len(uid_ip_dict[ip].split('&'))
                geo = ip2city(ip)
                if geo:
                    geo_acc[geo] = geo_acc.get(geo, 0) + ip_count
                    day_geo[geo] = day_geo.get(geo, 0) + ip_count
            iter_results[uid]['geo_track'].append(day_geo)

    # serialize the per-uid aggregates
    for uid in uid_list:
        results[uid] = {}
        hashtag_dict = iter_results[uid]['hashtag']
        results[uid]['hashtag_dict'] = json.dumps(hashtag_dict)
        results[uid]['hashtag'] = '&'.join(hashtag_dict.keys())
        geo_dict_keys = iter_results[uid]['geo'].keys()
        results[uid]['activity_geo_dict'] = json.dumps(iter_results[uid]['geo_track'])
        # geo keys are '\t'-separated place strings; flatten them with '&'
        results[uid]['activity_geo'] = '&'.join(['&'.join(item.split('\t')) for item in geo_dict_keys])
        try:
            # last '\t' component of each place string (presumably the city --
            # TODO confirm against ip2city's output format)
            results[uid]['activity_geo_aggs'] = '&'.join([item.split('\t')[-1] for item in geo_dict_keys])
        except Exception:
            results[uid]['activity_geo_aggs'] = ''
        # keyword tops come from the caller-supplied keyword dict
        keywords_dict = all_user_keywords_dict[uid]
        keywords_top50 = sorted(keywords_dict.items(), key=lambda x: x[1], reverse=True)[:50]
        results[uid]['keywords'] = json.dumps(keywords_top50)
        results[uid]['keywords_string'] = '&'.join([keyword_item[0] for keyword_item in keywords_top50])

    return results