Пример #1
0
def save_activity(uid, ts, time_segment, sensitive):
    """Increment the per-time-segment activity counter for a user.

    Counts live in the Redis hash ('sensitive_activity_' or 'activity_')
    + str(ts), field str(uid), value a JSON dict {time_segment: count}.
    """
    hname = ('sensitive_activity_' if sensitive else 'activity_') + str(ts)
    field = str(uid)
    seg = str(time_segment)
    try:
        # json.loads(None) raises when the field does not exist yet,
        # dropping us into the outer except to create a fresh record.
        counts = json.loads(r_cluster.hget(hname, field))
        try:
            counts[seg] += 1
        except:
            # First activity seen in this time segment today.
            counts[seg] = 1
        r_cluster.hset(hname, field, json.dumps(counts))
    except:
        r_cluster.hset(hname, field, json.dumps({seg: 1}))
def cal_text_work(item):
    """Extract #hashtags# from one weibo and merge their counts into the
    per-day Redis hash 'hashtag_<day_ts>' under field <uid>.

    item: dict with at least 'uid', 'timestamp' (epoch seconds), 'text'.
    Stored values are JSON-encoded {hashtag: count} dicts.
    """
    uid = item['uid']
    timestamp = item['timestamp']
    # Normalize the timestamp to the start of its day so every weibo from
    # the same day lands in the same Redis hash.
    date = ts2datetime(timestamp)
    ts = datetime2ts(date)
    text = item['text']
    # Python 2: decode byte strings so the CJK-aware regex below matches.
    if isinstance(text, str):
        text = text.decode('utf-8', 'ignore')
    RE = re.compile(u'#([a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+)#', re.UNICODE)
    hashtag_list = RE.findall(text)
    if hashtag_list:
        # Hashtags are unicode from here on.
        hashtag_dict = dict()
        for hashtag in hashtag_list:
            try:
                hashtag_dict[hashtag] += 1
            except:
                # First occurrence of this hashtag in the current text.
                hashtag_dict[hashtag] = 1
        try:
            hashtag_count_string = r_cluster.hget('hashtag_'+str(ts), str(uid))
            # json.loads(None) raises when no record exists yet, jumping to
            # the outer except which writes a fresh dict.
            hashtag_count_dict = json.loads(hashtag_count_string)
            for hashtag in hashtag_dict:
                count = hashtag_dict[hashtag]
                try:
                    hashtag_count_dict[hashtag] += count
                except:
                    hashtag_count_dict[hashtag] = count
            r_cluster.hset('hashtag_'+str(ts), str(uid), json.dumps(hashtag_count_dict))
        except:
            r_cluster.hset('hashtag_'+str(ts), str(uid), json.dumps(hashtag_dict))
Пример #3
0
def save_at(uid, at_uid, timestamp, sensitive):
    """Count one @-mention of `at_uid` by `uid` on the day of `timestamp`.

    Counts live in the Redis hash ('sensitive_at_' or 'at_') + YYYYMMDD,
    field str(uid), value a JSON dict {at_uid: count}.
    """
    day = ts2datetime(timestamp).replace('-', '')
    hname = ('sensitive_at_' if sensitive else 'at_') + str(day)
    field = str(uid)
    mention = str(at_uid)
    try:
        # json.loads(None) raises when no record exists yet -> outer except.
        counts = json.loads(r_cluster.hget(hname, field))
        try:
            counts[mention] += 1
        except:
            # First mention of this user today.
            counts[mention] = 1
        r_cluster.hset(hname, field, json.dumps(counts))
    except:
        r_cluster.hset(hname, field, json.dumps({mention: 1}))
def save_city(uid, ip, timestamp, sensitive):
    """Count one sighting of `ip` for user `uid` on the day of `timestamp`.

    Counts live in the Redis hash ('sensitive_ip_' or 'ip_') + YYYYMMDD,
    field str(uid), value a JSON dict {ip: count}.
    """
    day = ts2datetime(timestamp).replace('-', '')
    hname = ('sensitive_ip_' if sensitive else 'ip_') + str(day)
    field = str(uid)
    ip_key = str(ip)
    try:
        # json.loads(None) raises when no record exists yet -> outer except.
        counts = json.loads(r_cluster.hget(hname, field))
        try:
            counts[ip_key] += 1
        except:
            # First sighting of this ip today.
            counts[ip_key] = 1
        r_cluster.hset(hname, field, json.dumps(counts))
    except:
        r_cluster.hset(hname, field, json.dumps({ip_key: 1}))
Пример #5
0
def cal_text_sensitive(item):
    """Count SENSITIVE_WORD hits in one weibo's text and merge them into
    the per-day Redis hash 'sensitive_<day_ts>' under field <uid>.
    """
    text = item['text']
    uid = item['uid']
    timestamp = item['timestamp']
    # Bucket by the start-of-day timestamp.
    date = ts2datetime(timestamp)
    ts = datetime2ts(date)
    # Python 2: decode byte strings so substring tests against the
    # sensitive-word list behave consistently.
    if isinstance(text, str):
        text = text.decode('utf-8', 'ignore')
    sensitive_result = [word for word in SENSITIVE_WORD if word in text]
    if sensitive_result:
        sensitive_dict = dict()
        # Each word appears at most once in sensitive_result (one entry per
        # SENSITIVE_WORD element), so every local count ends up as 1.
        for word in sensitive_result:
            try:
                sensitive_dict[word] += 1
            except:
                sensitive_dict[word] = 1
        try:
            sensitive_count_string = r_cluster.hget('sensitive_'+str(ts), str(uid))
            # json.loads(None) raises when the field is missing, jumping to
            # the outer except which writes a fresh dict.
            sensitive_count_dict = json.loads(sensitive_count_string)
            for word in sensitive_dict:
                count = sensitive_dict[word]
                try:
                    sensitive_count_dict[word] += count
                except:
                    sensitive_count_dict[word] = count
            r_cluster.hset('sensitive_'+str(ts), str(uid), json.dumps(sensitive_count_dict))
        except:
            r_cluster.hset('sensitive_'+str(ts), str(uid), json.dumps(sensitive_dict))
def cal_class_ratio():
    """Scan one day's 'activity_<ts>' hash and bucket every user into one
    of eight classes keyed by three binary indicators:

      indic_1: retweeted >= 8 distinct users
      indic_2: retweeted by >= 9 distinct users
      indic_3: posted >= 6 weibos that day

    Prints the resulting {key: user_count} histogram.
    """
    ratio_results = {}
    date = '2013-09-07'  # hard-coded analysis date
    ts = datetime2ts(date)
    scan_count = 0
    scan_cursor = 0
    all_count = 0
    while 1:
        # Stop after a scan budget of 1,000,000 fields.
        if scan_count == 1000000:
            break
        results = r_cluster.hscan('activity_' + str(ts),
                                  scan_cursor,
                                  count=1000)
        # hscan returns (next_cursor, {field: value}).
        scan_cursor = results[0]
        scan_count += 1000
        for uid in results[1]:
            activity_dict_string = r_cluster.hget('activity_' + str(ts), uid)
            activity_dict = json.loads(activity_dict_string)
            weibo_count = 0
            # Day total = sum over all time-segment counters.
            for time_seg in activity_dict:
                weibo_count += int(activity_dict[time_seg])
            if weibo_count >= 6:
                indic_3 = '1'
            else:
                indic_3 = '0'
            retweet_results = r.hgetall('retweet_' + str(uid))
            retweet_count = len(retweet_results)
            if retweet_count >= 8:
                indic_1 = '1'
            else:
                indic_1 = '0'
            be_retweet_results = r.hgetall('be_retweet_' + str(uid))
            be_retweet_count = len(be_retweet_results)
            #print 'be_retweet_count:', be_retweet_count
            if be_retweet_count >= 9:
                indic_2 = '1'
            else:
                indic_2 = '0'
            #print 'indic_2:', indic_2
            # Three-character class key, e.g. '101'.
            key = indic_1 + indic_2 + indic_3
            try:
                ratio_results[key] += 1
            except:
                ratio_results[key] = 1
            # write eight type users
            '''
            if key=='001':
                writer1.writerow([uid, retweet_count, be_retweet_count, weibo_count])
            elif key=='111':
                writer2.writerow([uid, retweet_count, be_retweet_count, weibo_count])
            elif key=='101':
                writer3.writerow([uid, retweet_count, be_retweet_count, weibo_count])
            elif key=='011':
                writer4.writerow([uid, retweet_count, be_retweet_count, weibo_count])
            elif key=='110':
                writer5.writerow([uid, retweet_count, be_retweet_count, weibo_count])
            if key=='010':
                writer6.writerow([uid, retweet_count, be_retweet_count, weibo_count])
            '''
    print 'ratio_results:', ratio_results
def cal_text_sensitive(item):
    """Count SENSITIVE_WORD hits in one weibo's text and merge them into
    the per-day Redis hash 'sensitive_<day_ts>' under field <uid>.
    """
    text = item['text']
    uid = item['uid']
    timestamp = item['timestamp']
    # Bucket by the start-of-day timestamp.
    date = ts2datetime(timestamp)
    ts = datetime2ts(date)
    # Python 2: decode byte strings before substring matching.
    if isinstance(text, str):
        text = text.decode('utf-8', 'ignore')
    sensitive_result = [word for word in SENSITIVE_WORD if word in text]
    if sensitive_result:
        sensitive_dict = dict()
        # One entry per matched SENSITIVE_WORD element, so counts are 1.
        for word in sensitive_result:
            try:
                sensitive_dict[word] += 1
            except:
                sensitive_dict[word] = 1
        try:
            sensitive_count_string = r_cluster.hget('sensitive_'+str(ts), str(uid))
            # json.loads(None) raises if no record exists -> outer except.
            sensitive_count_dict = json.loads(sensitive_count_string)
            for word in sensitive_dict:
                count = sensitive_dict[word]
                try:
                    sensitive_count_dict[word] += count
                except:
                    sensitive_count_dict[word] = count
            r_cluster.hset('sensitive_'+str(ts), str(uid), json.dumps(sensitive_count_dict))
        except:
            r_cluster.hset('sensitive_'+str(ts), str(uid), json.dumps(sensitive_dict))
Пример #8
0
def cal_text_work(item):
    """Extract #hashtags# from one weibo and merge their counts into the
    per-day Redis hash 'hashtag_<day_ts>' under field <uid>.
    """
    uid = item['uid']
    timestamp = item['timestamp']
    # Normalize to the start-of-day timestamp used as the hash suffix.
    date = ts2datetime(timestamp)
    ts = datetime2ts(date)
    text = item['text']
    # Python 2: decode byte strings so the CJK-aware regex matches.
    if isinstance(text, str):
        text = text.decode('utf-8', 'ignore')
    RE = re.compile(u'#([a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+)#', re.UNICODE)
    hashtag_list = RE.findall(text)
    if hashtag_list:
        # Hashtags are unicode from here on.
        hashtag_dict = dict()
        for hashtag in hashtag_list:
            try:
                hashtag_dict[hashtag] += 1
            except:
                hashtag_dict[hashtag] = 1
        try:
            hashtag_count_string = r_cluster.hget('hashtag_'+str(ts), str(uid))
            # json.loads(None) raises if no record exists -> outer except.
            hashtag_count_dict = json.loads(hashtag_count_string)
            for hashtag in hashtag_dict:
                count = hashtag_dict[hashtag]
                try:
                    hashtag_count_dict[hashtag] += count
                except:
                    hashtag_count_dict[hashtag] = count
            r_cluster.hset('hashtag_'+str(ts), str(uid), json.dumps(hashtag_count_dict))
        except:
            r_cluster.hset('hashtag_'+str(ts), str(uid), json.dumps(hashtag_dict))
def cal_sensitive_words_work(item, sw_list):
    """Merge counts of the sensitive words hit by one weibo into Redis.

    item: dict with 'timestamp' (epoch seconds) and 'uid'.
    sw_list: iterable of words, each encoded as a list of byte values
             which are reassembled and decoded as UTF-8 here.

    Counts are stored in hash 'sensitive_<YYYYMMDD>', field str(uid),
    value a JSON dict {word: count}.

    Fixes: no longer shadows the builtin `map`; uses the `in` operator
    instead of calling `__contains__` directly.
    """
    uid = item['uid']
    day = ts2datetime(item['timestamp']).replace('-', '')
    # Rebuild each word from its byte values and count occurrences.
    word_counts = {}
    for w in sw_list:
        word = "".join([chr(x) for x in w]).decode('utf-8')
        if word in word_counts:
            word_counts[word] += 1
        else:
            word_counts[word] = 1
    try:
        sensitive_count_string = r_cluster.hget('sensitive_' + str(day),
                                                str(uid))
        # json.loads(None) raises when no record exists -> outer except
        # (kept broad deliberately: Redis errors also fall back to a
        # best-effort overwrite, matching the sibling helpers).
        sensitive_count_dict = json.loads(sensitive_count_string)
        for word in word_counts:
            count = word_counts[word]
            if word in sensitive_count_dict:
                sensitive_count_dict[word] += count
            else:
                sensitive_count_dict[word] = count
        r_cluster.hset('sensitive_' + str(day), str(uid),
                       json.dumps(sensitive_count_dict))
    except:
        r_cluster.hset('sensitive_' + str(day), str(uid),
                       json.dumps(word_counts))
Пример #10
0
def save_city(uid, ip, timestamp, sensitive):
    """Record one sighting of `ip` for user `uid` on the day of
    `timestamp`, in hash ('sensitive_ip_' or 'ip_') + YYYYMMDD."""
    day = ts2datetime(timestamp).replace('-', '')
    if sensitive:
        hname = 'sensitive_ip_' + str(day)
    else:
        hname = 'ip_' + str(day)
    field = str(uid)
    ip_key = str(ip)
    try:
        # Missing field -> hget returns None -> json.loads raises ->
        # outer except writes a fresh one-entry dict.
        counts = json.loads(r_cluster.hget(hname, field))
        try:
            counts[ip_key] += 1
        except:
            counts[ip_key] = 1
        r_cluster.hset(hname, field, json.dumps(counts))
    except:
        r_cluster.hset(hname, field, json.dumps({ip_key: 1}))
def cal_ave_weibo():
    """Scan one day's 'activity_<ts>' hash, sum every user's weibo count,
    and print the average per scanned field.
    """
    date = '2013-09-07'  # hard-coded analysis date
    timestamp = datetime2ts(date)
    scan_count = 0
    scan_cursor = 0
    all_count = 0
    while 1:
        # Stop after a scan budget of 1,000,000 fields.
        if scan_count == 1000000:
            break
        results = r_cluster.hscan('activity_' + str(timestamp),
                                  scan_cursor,
                                  count=1000)
        # hscan returns (next_cursor, {field: value}).
        scan_cursor = results[0]
        scan_count += 1000
        for uid in results[1]:
            # NOTE(review): range(0, 1) covers only i == 0, i.e. just the
            # anchor day -- looks like it was meant to span several days;
            # confirm before changing.
            for i in range(0, 1):
                ts = timestamp - 24 * 3600 * i
                activity_dict_string = r_cluster.hget('activity_' + str(ts),
                                                      uid)
                if activity_dict_string:
                    activity_dict = json.loads(activity_dict_string)
                    weibo_count = 0
                    for time_seg in activity_dict:
                        weibo_count += int(activity_dict[time_seg])
                    all_count += weibo_count
    # NOTE(review): divides by the scan budget consumed (a multiple of
    # 1000), not the number of users actually seen -- verify intended.
    ave_count = float(all_count) / scan_count
    print 'ave_count:', ave_count
def cal_sensitive_words_work(item, sw_list):
    """Merge counts of the sensitive words hit by one weibo into Redis.

    item: dict with 'timestamp' (epoch seconds) and 'uid'.
    sw_list: iterable of words, each encoded as a list of byte values
             which are reassembled and decoded as UTF-8 here.

    Counts live in hash 'sensitive_<YYYYMMDD>', field str(uid), value a
    JSON dict {word: count}.

    Fixes: no longer shadows the builtin `map`; uses the `in` operator
    instead of calling `__contains__` directly.
    """
    uid = item['uid']
    day = ts2datetime(item['timestamp']).replace('-', '')
    # Rebuild each word from its byte values and count occurrences.
    word_counts = {}
    for w in sw_list:
        word = "".join([chr(x) for x in w]).decode('utf-8')
        if word in word_counts:
            word_counts[word] += 1
        else:
            word_counts[word] = 1
    try:
        sensitive_count_string = r_cluster.hget('sensitive_' + str(day),
                                                str(uid))
        # json.loads(None) raises when no record exists -> outer except
        # (kept broad on purpose: best-effort overwrite on any failure).
        sensitive_count_dict = json.loads(sensitive_count_string)
        for word in word_counts:
            count = word_counts[word]
            if word in sensitive_count_dict:
                sensitive_count_dict[word] += count
            else:
                sensitive_count_dict[word] = count
        r_cluster.hset('sensitive_' + str(day), str(uid),
                       json.dumps(sensitive_count_dict))
    except:
        r_cluster.hset('sensitive_' + str(day), str(uid),
                       json.dumps(word_counts))
def cal_class_ratio():
    """Scan one day's 'activity_<ts>' hash and bucket every user into one
    of eight classes keyed by three binary indicators:

      indic_1: retweeted >= 8 distinct users
      indic_2: retweeted by >= 9 distinct users
      indic_3: posted >= 6 weibos that day

    Prints the resulting {key: user_count} histogram.
    """
    ratio_results = {}
    date = '2013-09-07'  # hard-coded analysis date
    ts = datetime2ts(date)
    scan_count = 0
    scan_cursor = 0
    all_count = 0
    while 1:
        # Stop after a scan budget of 1,000,000 fields.
        if scan_count == 1000000:
            break
        results = r_cluster.hscan('activity_'+str(ts), scan_cursor, count=1000)
        # hscan returns (next_cursor, {field: value}).
        scan_cursor = results[0]
        scan_count += 1000
        for uid in results[1]:
            activity_dict_string = r_cluster.hget('activity_'+str(ts), uid)
            activity_dict = json.loads(activity_dict_string)
            weibo_count = 0
            # Day total = sum over all time-segment counters.
            for time_seg in activity_dict:
                weibo_count += int(activity_dict[time_seg])
            if weibo_count >= 6:
                indic_3 = '1'
            else:
                indic_3 = '0'
            retweet_results = r.hgetall('retweet_'+str(uid))
            retweet_count = len(retweet_results)
            if retweet_count >= 8:
                indic_1 = '1'
            else:
                indic_1 = '0'
            be_retweet_results = r.hgetall('be_retweet_'+str(uid))
            be_retweet_count = len(be_retweet_results)
            #print 'be_retweet_count:', be_retweet_count
            if be_retweet_count >= 9:
                indic_2 = '1'
            else:
                indic_2 = '0'
            #print 'indic_2:', indic_2
            # Three-character class key, e.g. '101'.
            key = indic_1 + indic_2 + indic_3
            try:
                ratio_results[key] += 1
            except:
                ratio_results[key] = 1
            # write eight type users
            '''
            if key=='001':
                writer1.writerow([uid, retweet_count, be_retweet_count, weibo_count])
            elif key=='111':
                writer2.writerow([uid, retweet_count, be_retweet_count, weibo_count])
            elif key=='101':
                writer3.writerow([uid, retweet_count, be_retweet_count, weibo_count])
            elif key=='011':
                writer4.writerow([uid, retweet_count, be_retweet_count, weibo_count])
            elif key=='110':
                writer5.writerow([uid, retweet_count, be_retweet_count, weibo_count])
            if key=='010':
                writer6.writerow([uid, retweet_count, be_retweet_count, weibo_count])
            '''
    print 'ratio_results:', ratio_results
Пример #14
0
def save_activity(uid, ts, time_segment):
    """Increment the counter for `time_segment` of user `uid` in the
    Redis hash 'activity_<ts>' (value: JSON {segment: count})."""
    hname = 'activity_' + str(ts)
    field = str(uid)
    seg = str(time_segment)
    try:
        # Missing field -> hget returns None -> json.loads raises ->
        # outer except writes a fresh one-entry dict.
        counts = json.loads(r_cluster.hget(hname, field))
        try:
            counts[seg] += 1
        except:
            counts[seg] = 1
        r_cluster.hset(hname, field, json.dumps(counts))
    except:
        r_cluster.hset(hname, field, json.dumps({seg: 1}))
Пример #15
0
def save_city(uid, ip, timestamp):
    """Count one sighting of `ip` for user `uid` on the day of
    `timestamp`, in Redis hash 'ip_<day_ts>'."""
    day_ts = datetime2ts(ts2datetime(timestamp))
    hname = 'ip_' + str(day_ts)
    field = str(uid)
    ip_key = str(ip)
    try:
        # json.loads(None) raises when no record exists -> outer except.
        counts = json.loads(r_cluster.hget(hname, field))
        try:
            counts[ip_key] += 1
        except:
            counts[ip_key] = 1
        r_cluster.hset(hname, field, json.dumps(counts))
    except:
        r_cluster.hset(hname, field, json.dumps({ip_key: 1}))
Пример #16
0
def save_at(uid, at_uid, timestamp):
    """Count one @-mention of `at_uid` by `uid` on the day of
    `timestamp`, in Redis hash 'at_<day_ts>'."""
    day_ts = datetime2ts(ts2datetime(timestamp))
    hname = 'at_' + str(day_ts)
    field = str(uid)
    mention = str(at_uid)
    try:
        # json.loads(None) raises when no record exists -> outer except.
        counts = json.loads(r_cluster.hget(hname, field))
        try:
            counts[mention] += 1
        except:
            counts[mention] = 1
        r_cluster.hset(hname, field, json.dumps(counts))
    except:
        r_cluster.hset(hname, field, json.dumps({mention: 1}))
Пример #17
0
def save_at(uid, at_uid, timestamp):
    """Record one @-mention: bump counts[at_uid] in the JSON dict stored
    at hash 'at_<day_ts>', field str(uid)."""
    day_ts = datetime2ts(ts2datetime(timestamp))
    redis_key = 'at_' + str(day_ts)
    user_field = str(uid)
    target = str(at_uid)
    try:
        stored = r_cluster.hget(redis_key, user_field)
        # stored is None when no record exists; json.loads then raises
        # and the outer except seeds a fresh dict.
        mention_counts = json.loads(stored)
        try:
            mention_counts[target] += 1
        except:
            mention_counts[target] = 1
        r_cluster.hset(redis_key, user_field, json.dumps(mention_counts))
    except:
        r_cluster.hset(redis_key, user_field, json.dumps({target: 1}))
def save_city_timestamp(uid, ip, timestamp):
    """Append `timestamp` to the '&'-joined sighting history of `ip` for
    user `uid`, stored as JSON in hash 'new_ip_<day_ts>'."""
    day_ts = datetime2ts(ts2datetime(timestamp))
    hname = 'new_ip_' + str(day_ts)
    field = str(uid)
    ip_key = str(ip)
    try:
        # json.loads(None) raises when no record exists -> outer except.
        history = json.loads(r_cluster.hget(hname, field))
        try:
            history[ip_key] += '&' + str(timestamp)
        except:
            # First sighting of this ip today.
            history[ip_key] = str(timestamp)
        r_cluster.hset(hname, field, json.dumps(history))

    except:
        r_cluster.hset(hname, field, json.dumps({ip_key: str(timestamp)}))
Пример #19
0
def filter_activity(user_set):
    """Return the subset of user_set with no over-active day in the past
    week; over-active users are logged to the module-level csv `writer`.

    A user is dropped when any single time-segment count on any of the 7
    previous days exceeds activity_threshold.
    """
    results = []
    #run_type
    if RUN_TYPE == 1:
        now_date = ts2datetime(time.time())
    else:
        now_date = RUN_TEST_TIME
    # Anchor on yesterday, normalized to the start of the day.
    ts = datetime2ts(now_date) - DAY
    date = ts2datetime(ts)
    timestamp = datetime2ts(date)
    for user in user_set:
        over_count = 0
        for i in range(0, 7):
            ts = timestamp - DAY * i
            result = r_cluster.hget('activity_' + str(ts), str(user))
            if result:
                items_dict = json.loads(result)
                # items_dict maps time segment -> weibo count.
                for item in items_dict:
                    weibo_count = items_dict[item]
                    if weibo_count > activity_threshold:
                        over_count += 1
        if over_count == 0:
            results.append(user)
        else:
            writer.writerow([user, 'activity'])

    return results
Пример #20
0
def filter_activity(user_set):
    """Return the subset of user_set with no over-active day in the past
    week; over-active users are logged to the module-level csv `writer`.

    A user is dropped when any single time-segment count on any of the 7
    previous days exceeds activity_threshold.
    """
    results = []
    now_date = ts2datetime(time.time())
    # test
    now_date = '2013-09-08'  # hard-coded test date overrides the real one
    # Anchor on yesterday, normalized to the start of the day.
    ts = datetime2ts(now_date) - 24*3600
    date = ts2datetime(ts)
    #print 'date:', date
    timestamp = datetime2ts(date)
    for user in user_set:
        over_count = 0
        for i in range(0,7):
            ts = timestamp - 3600*24*i
            result = r_cluster.hget('activity_'+str(ts), str(user))
            if result:
                items_dict = json.loads(result)
                # items_dict maps time segment -> weibo count.
                for item in items_dict:
                    weibo_count = items_dict[item]
                    if weibo_count > activity_threshold:
                        over_count += 1
        if over_count == 0:
            results.append(user)
        else:
            writer.writerow([user, 'activity'])

    print 'after filter activity:', len(results)
    return results
def filter_activity(user_set):
    results = []
    now_date = ts2datetime(time.time())
    now_date = '2013-09-08'
    ts = datetime2ts(now_date) - 24 * 3600
    date = ts2datetime(ts)
    timestamp = datetime2ts(date)
    ts = ts.replace('-', '')
    for user in user_set:
        over_count = 0
        for i in range(0, 7):
            ts = timestamp - 3600 * 24 * i
            result = r_cluster.hget('activity_' + str(ts), str(user))
            if result:
                item_dict = json.loads(result)
                sorted_dict = sorted(item_dict.iteritems(),
                                     key=lambda asd: asd[1],
                                     reverse=True)
                if sorted_dict[0][1] > activity_threshold:
                    over_count = 1
        if over_count == 0:
            results.append(user)
        else:
            writer.writerow([user, 'activity'])

    print 'after filter activity: ', len(results)
    return results
def filter_activity(user_set):
    results = []
    now_date = ts2datetime(time.time())
    now_date = '2013-09-08'
    ts = datetime2ts(now_date) - 24*3600
    date = ts2datetime(ts)
    timestamp = datetime2ts(date)
    ts = ts.replace('-','')
    for user in user_set:
        over_count = 0
        for i in range(0,7):
            ts = timestamp - 3600*24*i
            result = r_cluster.hget('activity_'+str(ts), str(user))
            if result:
                item_dict = json.loads(result)
                sorted_dict = sorted(item_dict.iteritems(), key=lambda asd:asd[1], reverse=True)
                if sorted_dict[0][1] > activity_threshold:
                    over_count = 1
        if over_count == 0:
            results.append(user)
        else:
            writer.writerow([user, 'activity'])

    print 'after filter activity: ', len(results)
    return results
Пример #23
0
def get_attr_geo_track(uid_list):
    """Build a 7-day geo track for the given users.

    For each of the 7 days before the (hard-coded) anchor date, merges
    every user's per-ip counts from hash 'ip_<day_ts>', maps ips to geo
    names via ip2geo, and keeps the top-2 geos per day.

    Returns {'geo_track': <JSON of [[date, [(geo, count), ...]], ...]>}.
    """
    date_results = [] # results = {'2013-09-01':[(geo1, count1), (geo2, track2)], '2013-09-02'...} 7day
    now_ts = time.time()
    now_date = ts2datetime(now_ts)
    # Hard-coded test date overrides the real "now".
    now_date = '2013-09-08'
    ts = datetime2ts(now_date)
    for i in range(7, 0, -1):
        timestamp = ts - i*24*3600
        #print 'timestamp:', ts2datetime(timestamp)
        ip_dict = dict()
        # One bulk fetch of all users' ip records for this day.
        results = r_cluster.hmget('ip_'+str(timestamp), uid_list)
        #print 'results:',results
        for item in results:
            if item:
                item_dict = json.loads(item)
                #print 'item_dict:', item_dict
                # Merge this user's ip counts into the day-wide totals.
                for ip_item in item_dict:
                    try:
                        ip_dict[ip_item] += item_dict[ip_item]
                    except:
                        ip_dict[ip_item] = item_dict[ip_item]
        geo_dict = ip2geo(ip_dict)
        # Keep only the two most frequent geos for the day.
        sort_geo_dict = sorted(geo_dict.items(), key=lambda x:x[1], reverse=True)
        date_key = ts2datetime(timestamp)
        date_results.append([date_key, sort_geo_dict[:2]])
    #print 'results:', date_results
    return {'geo_track': json.dumps(date_results)}
Пример #24
0
def update_day_hashtag(uid_list):
    """Aggregate each user's hashtags over the past WEEK days.

    Returns {uid: {hashtag: days_seen}} -- each hashtag is counted once
    per day it appears, regardless of its within-day count.

    Fix: the original never advanced the ``count`` cursor into the hmget
    result, so every uid read position 0; enumerate() now pairs each uid
    with its own record.
    """
    results = {}
    all_results = {}
    now_ts = time.time()
    #run_type
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = test_ts

    for i in range(WEEK, 0, -1):
        ts = now_date_ts - DAY * i
        # hmget returns values in the same order as uid_list.
        hashtag_results = r_cluster.hmget('hashtag_' + str(ts), uid_list)
        for count, uid in enumerate(uid_list):
            if uid not in results:
                results[uid] = {}
            hashtag_item = hashtag_results[count]
            if hashtag_item:
                hashtag_dict = json.loads(hashtag_item)
            else:
                hashtag_dict = {}
            for hashtag in hashtag_dict:
                try:
                    results[uid][hashtag] += 1
                except:
                    results[uid][hashtag] = 1
    # NOTE(review): all_results is built but this variant returns
    # ``results``; a sibling variant returns all_results. Confirm which
    # shape callers expect before unifying -- return kept as-is here.
    for uid in uid_list:
        user_hashtag_dict = results[uid]
        hashtag_string = '&'.join(user_hashtag_dict.keys())
        all_results[uid] = {
            'hashtag': hashtag_string,
            'hashtag_dict': user_hashtag_dict
        }
    return results
Пример #25
0
def update_day_hashtag(uid_list):
    """Aggregate each user's hashtags over the past WEEK days.

    Returns {uid: {'hashtag': '&'-joined tags,
                   'hashtag_dict': {hashtag: days_seen}}} -- each hashtag
    is counted once per day it appears.

    Fix: the original never advanced the ``count`` cursor into the hmget
    result, so every uid read position 0; enumerate() now pairs each uid
    with its own record.
    """
    results = {}
    all_results = {}
    now_ts = time.time()
    #run_type
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = test_ts

    for i in range(WEEK,0,-1):
        ts = now_date_ts - DAY*i
        # hmget returns values in the same order as uid_list.
        hashtag_results = r_cluster.hmget('hashtag_'+str(ts), uid_list)
        for count, uid in enumerate(uid_list):
            if uid not in results:
                results[uid] = {}
            hashtag_item = hashtag_results[count]
            if hashtag_item:
                hashtag_dict = json.loads(hashtag_item)
            else:
                hashtag_dict = {}
            for hashtag in hashtag_dict:
                try:
                    results[uid][hashtag] += 1
                except:
                    results[uid][hashtag] = 1
    for uid in uid_list:
        user_hashtag_dict = results[uid]
        hashtag_string = '&'.join(user_hashtag_dict.keys())
        all_results[uid] = {'hashtag': hashtag_string, 'hashtag_dict':user_hashtag_dict}
    return all_results
def filter_activity(user_set):
    """Return the subset of user_set with no over-active day in the past
    week; over-active users are logged to the module-level csv `writer`.

    A user is dropped when any single time-segment count on any of the 7
    previous days exceeds activity_threshold.
    """
    results = []
    #run_type
    if RUN_TYPE == 1:
        now_date = ts2datetime(time.time())
    else:
        now_date = RUN_TEST_TIME
    # Anchor on yesterday, normalized to the start of the day.
    ts = datetime2ts(now_date) - DAY
    date = ts2datetime(ts)
    timestamp = datetime2ts(date)
    for user in user_set:
        over_count = 0
        for i in range(0,7):
            ts = timestamp - DAY*i
            result = r_cluster.hget('activity_'+str(ts), str(user))
            if result:
                items_dict = json.loads(result)
                # items_dict maps time segment -> weibo count.
                for item in items_dict:
                    weibo_count = items_dict[item]
                    if weibo_count > activity_threshold:
                        over_count += 1
        if over_count == 0:
            results.append(user)
        else:
            writer.writerow([user, 'activity'])

    return results
Пример #27
0
def save_city_timestamp(uid, ip, timestamp):
    """Append `timestamp` to the '&'-joined sighting history of `ip` for
    user `uid` (hash 'new_ip_<day_ts>', value JSON {ip: 'ts&ts&...'})."""
    day_ts = datetime2ts(ts2datetime(timestamp))
    redis_key = 'new_ip_' + str(day_ts)
    user_field = str(uid)
    ip_key = str(ip)
    stamp = str(timestamp)
    try:
        stored = r_cluster.hget(redis_key, user_field)
        # stored is None when no record exists; json.loads then raises
        # and the outer except seeds a fresh dict.
        sightings = json.loads(stored)
        try:
            sightings[ip_key] += '&' + stamp
        except:
            # First sighting of this ip today.
            sightings[ip_key] = stamp
        r_cluster.hset(redis_key, user_field, json.dumps(sightings))

    except:
        r_cluster.hset(redis_key, user_field, json.dumps({ip_key: stamp}))
def cal_hashtag_work(item, sensitive):
    """Extract #hashtags# from one weibo and merge their counts into the
    per-day Redis hash ('sensitive_hashtag_' or 'hashtag_') + YYYYMMDD,
    field <uid>, value a JSON {hashtag: count} dict.
    """
    text = item['text']
    uid = item['uid']
    timestamp = item['timestamp']
    # Day key in YYYYMMDD form.
    ts = ts2datetime(timestamp).replace('-', '')

    # Python 2: decode byte strings so the CJK-aware regex matches.
    if isinstance(text, str):
        text = text.decode('utf-8', 'ignore')
    RE = re.compile(u'#([a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+)#',
                    re.UNICODE)
    hashtag_list = RE.findall(text)
    if hashtag_list:
        hashtag_dict = {}
        for hashtag in hashtag_list:
            try:
                hashtag_dict[hashtag] += 1
            except:
                # First occurrence of this hashtag in the current text.
                hashtag_dict[hashtag] = 1

        try:
            if sensitive:
                hashtag_count_string = r_cluster.hget(
                    'sensitive_hashtag_' + str(ts), str(uid))
            else:
                hashtag_count_string = r_cluster.hget('hashtag_' + str(ts),
                                                      str(uid))
            # json.loads(None) raises when no record exists yet, jumping
            # to the outer except which writes a fresh dict.
            hashtag_count_dict = json.loads(hashtag_count_string)
            for hashtag in hashtag_dict:
                count = hashtag_dict[hashtag]
                try:
                    hashtag_count_dict[hashtag] += count
                except:
                    hashtag_count_dict[hashtag] = count
            if sensitive:
                r_cluster.hset('sensitive_hashtag_' + str(ts), str(uid),
                               json.dumps(hashtag_count_dict))
            else:
                r_cluster.hset('hashtag_' + str(ts), str(uid),
                               json.dumps(hashtag_count_dict))
        except:
            if sensitive:
                r_cluster.hset('sensitive_hashtag_' + str(ts), str(uid),
                               json.dumps(hashtag_dict))
            else:
                r_cluster.hset('hashtag_' + str(ts), str(uid),
                               json.dumps(hashtag_dict))
Пример #29
0
def update_day_geo(uid_list, user_info_list):
    """Update each user's geo history with yesterday's ip sightings.

    uid_list: users to process; user_info_list: existing per-uid records,
    each with an 'activity_geo_dict' JSON list of daily {geo: count} dicts.

    Returns {uid: {'activity_geo_dict': JSON of last 30 daily dicts,
                   'activity_geo': '&'-joined geo components (last 7 days),
                   'activity_geo_aggs': '&'-joined last geo component}}.
    """
    results = {}
    now_ts = time.time()
    #run_type
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = test_ts
    # Bulk-fetch yesterday's per-ip timestamp histories for all users.
    ip_results = r_cluster.hmget('new_ip_' + str(now_date_ts - DAY), uid_list)
    count = 0
    for uid in uid_list:
        if uid not in results:
            results[uid] = {'activity_geo': {}, 'activity_geo_dict': []}
        uid_ip_results = ip_results[count]
        count += 1
        if uid_ip_results:
            uid_ip_dict = json.loads(uid_ip_results)
        else:
            uid_ip_dict = {}
        day_results = {}
        for ip in uid_ip_dict:
            # Value is '&'-joined timestamps; its split length is the
            # number of sightings of this ip.
            ip_count = len(uid_ip_dict[ip].split('&'))
            geo = ip2city(ip)
            geo = geo.decode('utf-8')
            try:
                day_results[geo] += ip_count
            except:
                day_results[geo] = ip_count
        #update the activity_geo_dict
        activity_geo_history_list = json.loads(
            user_info_list[uid]['activity_geo_dict'])
        activity_geo_history_list.append(day_results)
        # Keep a rolling window of the most recent 30 days.
        results[uid]['activity_geo_dict'] = json.dumps(
            activity_geo_history_list[-30:])
        #update the activity_geo
        week_activity_geo_list = activity_geo_history_list[-7:]
        week_geo_list = []
        for activity_geo_item in week_activity_geo_list:
            geo_list = activity_geo_item.keys()
            week_geo_list.extend(geo_list)
        week_geo_list = list(set(week_geo_list))
        # Geo strings are tab-separated components; re-join everything
        # with '&' for storage.
        week_geo_string = '&'.join([
            '&'.join((item.encode('utf-8')).split('\t'))
            for item in week_geo_list
        ])
        try:
            # Aggregate string keeps only the last (finest) component.
            week_geo_aggs_string = '&'.join([
                (item.encode('utf-8')).split('\t')[-1]
                for item in week_geo_list
            ])
        except:
            week_geo_aggs_string = ''

        results[uid]['activity_geo'] = week_geo_string
        results[uid]['activity_geo_aggs'] = week_geo_aggs_string

    return results
def cal_core_class():
    """Classify candidate core users by three binary indicators and dump the result to CSV.

    Reads uids from core_list.csv, computes per-uid:
      - retweet_count / be_retweet_count (sizes of the redis hashes),
      - ave_weibo_count (mean daily weibo count over the 7 days before the
        hard-coded date '2013-09-07').
    Each count is thresholded into a '0'/'1' flag; the three flags form a
    3-char key. Rows sorted by be_retweet_count (desc) are written to
    core_class.csv, and the number of '011' users is printed.
    """
    # NOTE(review): date is hard-coded — presumably a one-off analysis run.
    date = '2013-09-07'
    timestamp = datetime2ts(date)
    f_r = open(
        '/home/ubuntu8/huxiaoqian/user_portrait/user_portrait/cron/recommentation_in/core_list.csv',
        'rb')
    reader = csv.reader(f_r)
    f_w = open(
        '/home/ubuntu8/huxiaoqian/user_portrait/user_portrait/cron/recommentation_in/core_class.csv',
        'wb')
    writer = csv.writer(f_w)
    result_list = []
    count011 = 0  # users matching pattern: low retweet, high be-retweet, high activity
    for line in reader:
        uid = line[0]
        # hash length == number of distinct retweet partners
        retweet_results = r.hgetall('retweet_' + str(uid))
        retweet_count = len(retweet_results)
        be_retweet_results = r.hgetall('be_retweet_' + str(uid))
        be_retweet_count = len(be_retweet_results)
        weibo_count = 0
        # sum activity counts over the 7 days ending at `timestamp`
        for i in range(0, 7):
            ts = timestamp - 24 * 3600 * i
            activity_string = r_cluster.hget('activity_' + str(ts), str(uid))
            if activity_string:
                activity_dict = json.loads(activity_string)
            else:
                activity_dict = {}
            for time_seg in activity_dict:
                count = activity_dict[time_seg]
                weibo_count += count
        ave_weibo_count = float(weibo_count) / 7
        # threshold each metric into a binary flag
        if retweet_count >= 8:
            indic_1 = '1'
        else:
            indic_1 = '0'
        if be_retweet_count >= 9:
            indic_2 = '1'
        else:
            indic_2 = '0'
        if ave_weibo_count >= 6:
            indic_3 = '1'
        else:
            indic_3 = '0'
        key = indic_1 + indic_2 + indic_3
        if key == '011':
            count011 += 1
        result_list.append(
            [uid, key, retweet_count, be_retweet_count, ave_weibo_count])
    f_r.close()
    # sort by be_retweet_count, most-retweeted users first
    sort_result = sorted(result_list, key=lambda x: x[3], reverse=True)
    for item in sort_result:
        writer.writerow(list(item))
    f_w.close()
    print 'count011:', count011
def main():
    """Purge redis records whose retention window (EXPIRE_TIME) has passed."""
    expire_point = time.time() - EXPIRE_TIME
    delete_date = ts2datetime(expire_point)
    delete_ts = datetime2ts(delete_date)  # midnight timestamp of the expired day

    # day-timestamp-keyed hashes on the cluster:
    # @-mentions, ip records, activity counts, hashtags, sensitive words
    for prefix in ('at_', 'new_ip_', 'activity_', 'hashtag_', 'sensitive_'):
        r_cluster.delete(prefix + str(delete_ts))

    # date-string-keyed recommendation record on the single instance
    r.delete('recomment_' + str(delete_date))
def cal_hashtag_work(item, sensitive):
    """Extract #hashtags# from one weibo and accumulate per-user daily counts.

    Counts live in the redis hash '<sensitive_>hashtag_<yyyymmdd>' under
    field uid, as a JSON dict {hashtag: count}.
    """
    text = item['text']
    uid = item['uid']
    timestamp = item['timestamp']
    ts = ts2datetime(timestamp).replace('-','')

    if isinstance(text, str):
        text = text.decode('utf-8', 'ignore')
    RE = re.compile(u'#([a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+)#', re.UNICODE)
    hashtag_list = RE.findall(text)
    if not hashtag_list:
        return

    # local tally for this single weibo
    hashtag_dict = {}
    for tag in hashtag_list:
        hashtag_dict[tag] = hashtag_dict.get(tag, 0) + 1

    if sensitive:
        hash_name = 'sensitive_hashtag_' + str(ts)
    else:
        hash_name = 'hashtag_' + str(ts)

    try:
        stored = r_cluster.hget(hash_name, str(uid))
        # json.loads raises on None when the field is absent, sending us
        # to the except branch to write the fresh counts instead.
        merged = json.loads(stored)
        for tag, cnt in hashtag_dict.items():
            merged[tag] = merged.get(tag, 0) + cnt
        r_cluster.hset(hash_name, str(uid), json.dumps(merged))
    except:
        r_cluster.hset(hash_name, str(uid), json.dumps(hashtag_dict))
Пример #33
0
def get_activity_time(uid_list):
    """Compute per-user activity stats from a week of 15-minute activity slots.

    For each uid returns {'statusnum': total posts over the window,
    'activity_time': log(1 + dominant positive FFT frequency)} — a measure
    of how periodic the user's posting pattern is.
    """
    results = {}
    now_ts = time.time()
    now_date = ts2datetime(now_ts)
    #run_type
    if RUN_TYPE == 1:
        timestamp = datetime2ts(now_date)
    else:
        timestamp = datetime2ts(RUN_TEST_TIME)
    activity_list_dict = {}  # {uid:[activity_list], uid:[]}
    for i in range(1, WEEK + 1):
        ts = timestamp - DAY * i
        print ts
        # one hmget per day: positional results align with uid_list
        r_result = r_cluster.hmget('activity_' + str(ts), uid_list)
        #print r_result
        if r_result:
            for j in range(0, len(uid_list)):
                uid = uid_list[j]
                if uid not in activity_list_dict:
                    # NOTE(review): list is seeded with 96 zeros and then the
                    # real counts are *appended* below, so the series starts
                    # with a zero day — possibly unintended; confirm intent.
                    activity_list_dict[uid] = [0 for i in range(0, 96)]
                user_r_result = r_result[j]
                if user_r_result:
                    user_activity_dict = json.loads(user_r_result)
                    # 96 slots of 15 minutes; missing slots count as 0
                    for i in range(0, 96):
                        try:
                            count = user_activity_dict[str(i)]
                        except:
                            count = 0
                        activity_list_dict[uid].append(count)
    for uid in uid_list:
        activity_list = activity_list_dict[uid]
        statusnum = sum(activity_list)
        # power spectrum of the activity series; pick the strongest
        # strictly-positive frequency as the dominant posting rhythm
        signal = np.array(activity_list)
        fftResult = np.abs(np.fft.fft(signal))**2
        n = signal.size
        freq = np.fft.fftfreq(n, d=1)
        i = 0
        max_val = 0
        max_freq = 0
        for val in fftResult:
            if val > max_val and freq[i] > 0:
                max_val = val
                max_freq = freq[i]
            i += 1
        results[uid] = {
            'statusnum': statusnum,
            'activity_time': math.log(max_freq + 1)
        }

    return results
Пример #34
0
def update_day_geo(uid_list, user_info_list):
    """Roll yesterday's IP records into each user's geo history.

    Appends a {geo: visit_count} dict for yesterday to the (at most 30-day)
    'activity_geo_dict' history and rebuilds the 7-day 'activity_geo' /
    'activity_geo_aggs' strings.
    """
    results = {}
    now_ts = time.time()
    #run_type
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = test_ts
    # yesterday's ip hash, fetched for all uids at once (positional reply)
    ip_results = r_cluster.hmget('new_ip_' + str(now_date_ts - DAY), uid_list)
    for idx, uid in enumerate(uid_list):
        if uid not in results:
            results[uid] = {'activity_geo': {}, 'activity_geo_dict': []}
        raw = ip_results[idx]
        uid_ip_dict = json.loads(raw) if raw else {}
        # tally yesterday's visits per geo location
        day_results = {}
        for ip, visit_string in uid_ip_dict.items():
            ip_count = len(visit_string.split('&'))
            geo, school = ip2city(ip)
            if geo:
                geo = geo.decode('utf-8')
                day_results[geo] = day_results.get(geo, 0) + ip_count
        # append to history, keeping only the latest 30 days
        history = json.loads(user_info_list[uid]['activity_geo_dict'])
        history.append(day_results)
        results[uid]['activity_geo_dict'] = json.dumps(history[-30:])
        # distinct geos over the trailing 7 days
        week_geo_list = []
        for day_geo in history[-7:]:
            week_geo_list.extend(day_geo.keys())
        week_geo_list = list(set(week_geo_list))
        week_geo_string = '&'.join('&'.join(geo.split('\t')) for geo in week_geo_list)
        try:
            week_geo_aggs_string = '&'.join(geo.split('\t')[-1] for geo in week_geo_list)
        except:
            week_geo_aggs_string = ''

        results[uid]['activity_geo'] = week_geo_string
        results[uid]['activity_geo_aggs'] = week_geo_aggs_string

    return results
def cal_core_class():
    date = '2013-09-07'
    timestamp = datetime2ts(date)
    f_r = open('/home/ubuntu8/huxiaoqian/user_portrait/user_portrait/cron/recommentation_in/core_list.csv', 'rb')
    reader = csv.reader(f_r)
    f_w = open('/home/ubuntu8/huxiaoqian/user_portrait/user_portrait/cron/recommentation_in/core_class.csv', 'wb')
    writer = csv.writer(f_w)
    result_list = []
    count011 = 0
    for line in reader:
        uid = line[0]
        retweet_results = r.hgetall('retweet_'+str(uid))
        retweet_count = len(retweet_results)
        be_retweet_results = r.hgetall('be_retweet_'+str(uid))
        be_retweet_count = len(be_retweet_results)
        weibo_count = 0
        for i in range(0,7):
            ts = timestamp - 24*3600*i
            activity_string = r_cluster.hget('activity_'+str(ts), str(uid))
            if activity_string:
                activity_dict = json.loads(activity_string)
            else:
                activity_dict = {}
            for time_seg in activity_dict:
                count = activity_dict[time_seg]
                weibo_count += count
        ave_weibo_count = float(weibo_count) / 7
        if retweet_count >= 8:
            indic_1 = '1'
        else:
            indic_1 = '0'
        if be_retweet_count >= 9:
            indic_2 = '1'
        else:
            indic_2 = '0'
        if ave_weibo_count >= 6:
            indic_3 = '1'
        else:
            indic_3 = '0'
        key = indic_1 + indic_2 + indic_3
        if key=='011':
            count011 += 1
        result_list.append([uid, key, retweet_count, be_retweet_count, ave_weibo_count])
    f_r.close()
    sort_result = sorted(result_list, key=lambda x:x[3], reverse=True)
    for item in sort_result:
        writer.writerow(list(item))
    f_w.close()
    print 'count011:', count011
def cal_ave_weibo():
    date = '2013-09-07'
    timestamp = datetime2ts(date)
    scan_count = 0
    scan_cursor = 0
    all_count = 0
    while 1:
        if scan_count == 1000000:
            break
        results = r_cluster.hscan('activity_'+str(timestamp), scan_cursor, count=1000)
        scan_cursor = results[0]
        scan_count += 1000
        for uid in results[1]:
            for i in range(0,1):
                ts = timestamp - 24*3600*i
                activity_dict_string = r_cluster.hget('activity_'+str(ts), uid)
                if activity_dict_string:
                    activity_dict = json.loads(activity_dict_string)
                    weibo_count = 0
                    for time_seg in activity_dict:
                        weibo_count += int(activity_dict[time_seg])
                    all_count += weibo_count
    ave_count = float(all_count) / scan_count
    print 'ave_count:', ave_count
def save_at(uid, at_uid, timestamp, sensitive):
    """Increment user `uid`'s daily @-mention count of `at_uid`.

    Counts live in the redis hash '<sensitive_>at_<yyyymmdd>' under field
    uid, stored as a JSON dict {at_uid: count}.

    Fixes vs. original: removed the unused local `key = str(uid)` and the
    four duplicated hash-name expressions (name computed once up front).
    """
    ts = ts2datetime(timestamp).replace('-','')
    if sensitive:
        hash_name = 'sensitive_at_' + str(ts)
    else:
        hash_name = 'at_' + str(ts)
    try:
        ruid_count_string = r_cluster.hget(hash_name, str(uid))
        # json.loads raises on None when the field is absent, which drops
        # us into the except branch to seed a fresh dict.
        ruid_count_dict = json.loads(ruid_count_string)
        ruid_count_dict[str(at_uid)] = ruid_count_dict.get(str(at_uid), 0) + 1
        r_cluster.hset(hash_name, str(uid), json.dumps(ruid_count_dict))
    except:
        # first mention recorded for this uid today
        r_cluster.hset(hash_name, str(uid), json.dumps({str(at_uid): 1}))
Пример #38
0
def get_activity_time(uid_list):
    results = {}
    now_ts = time.time()
    now_date = ts2datetime(now_ts)
    #run_type
    if RUN_TYPE == 1:
        timestamp = datetime2ts(now_date)
    else:
        timestamp = datetime2ts(RUN_TEST_TIME)
    activity_list_dict = {} # {uid:[activity_list], uid:[]}
    for i in range(1,WEEK+1):
        ts = timestamp - DAY*i
        print ts
        r_result = r_cluster.hmget('activity_'+str(ts), uid_list)
        #print r_result
        if r_result:
            for j in range(0, len(uid_list)):
                uid = uid_list[j]
                if uid not in activity_list_dict:
                    activity_list_dict[uid] = [0 for i in range(0, 96)]
                user_r_result = r_result[j]
                if user_r_result:
                    user_activity_dict = json.loads(user_r_result)
                    for i in range(0, 96):
                        try:
                            count = user_activity_dict[str(i)]
                        except:
                            count = 0
                        activity_list_dict[uid].append(count)
    for uid in uid_list:
        activity_list = activity_list_dict[uid]
        statusnum = sum(activity_list)
        signal = np.array(activity_list)
        fftResult = np.abs(np.fft.fft(signal))**2
        n = signal.size
        freq = np.fft.fftfreq(n, d=1)
        i = 0
        max_val = 0
        max_freq = 0
        for val in fftResult:
            if val>max_val and freq[i]>0:
                max_val = val
                max_freq = freq[i]
            i += 1
        results[uid] = {'statusnum': statusnum, 'activity_time': math.log(max_freq + 1)}
    
    return results
Пример #39
0
def get_user_at():
    """Dump the last 7 days of @-mention records for every portrait uid
    to a line-delimited JSON file (one {'ts', 'result'} object per line)."""
    uid_list = get_uid_list()
    ts = datetime2ts(ts2datetime(time.time()))
    out = open('/home/user_portrait_0320/revised_user_portrait/user_portrait/user_portrait/attribute/uid_at.txt', 'w')
    for _ in range(1, 8):
        ts = ts - DAY
        day_key = 'at_' + str(ts)
        for uid in uid_list:
            record = r_cluster.hget(day_key, uid)
            if record:
                out.write('%s\n' % json.dumps({'ts': ts, 'result': record}))
    out.close()
def get_activeness(uid, activity_geo):
    """Compute a weighted activeness score for one user.

    Combines (log-scaled): the dominant positive FFT frequency of a week of
    15-minute activity slots, the number of geo locations in the
    '&'-separated `activity_geo` string, and the total status count.
    """
    result = 0
    now_ts = time.time()
    now_date = ts2datetime(now_ts)
    timestamp = datetime2ts(now_date)
    #test
    # NOTE(review): this test override is ACTIVE — it discards the real
    # "today" computed above. The sibling definition below comments it out.
    timestamp = datetime2ts('2013-09-08')
    # deal activity_time fft and statusnum
    activity_list = []
    statusnum = 0
    for i in range(1,8):
        ts = timestamp - 24*3600*i
        r_result = r_cluster.hget('activity_'+str(ts), uid)
        if r_result:
            r_result = json.loads(r_result)
        #print 'r_result:', r_result
        # 96 slots of 15 minutes; missing slot (or missing day: r_result is
        # None and indexing raises) counts as 0 via the bare except
        for i in range(0,96):
            try:
                count = r_result[str(i)]
            except:
                count = 0
            activity_list.append(float(count))
    #print 'activity_list:', activity_list
    statusnum = sum(activity_list)
    # power spectrum; keep the strongest strictly-positive frequency
    signal = np.array(activity_list)
    fftResult = np.abs(np.fft.fft(signal)) ** 2
    n = signal.size
    freq = np.fft.fftfreq(n, d=1)
    i = 0
    max_val = 0
    max_freq = 0
    for val in fftResult:
        #print str(1/freq[i]) + ',' + str(val)
        if val>max_val and freq[i]>0:
            max_val = val
            max_freq = freq[i]
        i = i + 1
    #print 'i:', i
    #print 'max_freq, max_val:', max_freq, max_val
    # deal avtivity_geo input: 'geo&geo'
    activity_geo_count = len(activity_geo.split('&'))
    # weighted log combination of the three signals
    result = activeness_weight_dict['activity_time'] * math.log(max_freq  + 1) + \
             activeness_weight_dict['activity_geo'] * math.log(activity_geo_count + 1) +\
             activeness_weight_dict['statusnum'] * math.log(statusnum + 1)
    #print 'activeness:', result
    return result
def get_activeness(uid, activity_geo):
    """Compute a weighted activeness score for one user.

    Combines (log-scaled): the dominant positive FFT frequency of a week of
    15-minute activity slots, the geo-location count from the '&'-separated
    `activity_geo` string, and the total status count.
    """
    now_date = ts2datetime(time.time())
    timestamp = datetime2ts(now_date)
    #test
    #timestamp = datetime2ts('2013-09-08')
    # build a week-long series of 96 daily 15-minute slots
    activity_list = []
    for day in range(1, 8):
        ts = timestamp - 24 * 3600 * day
        raw = r_cluster.hget('activity_' + str(ts), uid)
        day_dict = json.loads(raw) if raw else None
        for seg in range(0, 96):
            # missing slot (or whole missing day) counts as 0
            try:
                count = day_dict[str(seg)]
            except:
                count = 0
            activity_list.append(float(count))
    statusnum = sum(activity_list)
    # power spectrum; keep the strongest strictly-positive frequency
    signal = np.array(activity_list)
    power = np.abs(np.fft.fft(signal)) ** 2
    freq = np.fft.fftfreq(signal.size, d=1)
    max_val = 0
    max_freq = 0
    for idx, val in enumerate(power):
        if val > max_val and freq[idx] > 0:
            max_val = val
            max_freq = freq[idx]
    # deal avtivity_geo input: 'geo&geo'
    activity_geo_count = len(activity_geo.split('&'))
    # weighted log combination of the three signals
    result = activeness_weight_dict['activity_time'] * math.log(max_freq + 1) + \
             activeness_weight_dict['activity_geo'] * math.log(activity_geo_count + 1) + \
             activeness_weight_dict['statusnum'] * math.log(statusnum + 1)
    return result
def get_activity_geo(uid):
    """Collect the user's distinct IPs over the 7 days before the (test)
    date and return their geo locations as a '&'-joined string."""
    ip_result = []
    now_ts = time.time()
    now_date = ts2datetime(now_ts)
    ts = datetime2ts(now_date)
    geo_result = {}
    # test
    # NOTE(review): this override is ACTIVE — the real "today" above is discarded.
    ts = datetime2ts('2013-09-08')
    for i in range(1,8):
        ts = ts - 24*3600
        r_result = r_cluster.hget('ip_'+str(ts), uid)
        if r_result:
            # stored value is a JSON dict keyed by ip address
            ip_list = json.loads(r_result).keys()
            ip_result.extend(ip_list)
    # dedupe before geo lookup
    ip_list = set(ip_result)
    geo_string = '&'.join(ip2geo(ip_list))
    #print 'geo_string:', geo_string
    return geo_string
def get_activity_geo(uid):
    """Collect the user's distinct IPs over the 7 days before the (test)
    date and return their geo locations as a '&'-joined string."""
    now_date = ts2datetime(time.time())
    ts = datetime2ts(now_date)
    # test
    ts = datetime2ts('2013-09-08')
    seen_ips = []
    for _ in range(1, 8):
        ts = ts - 24 * 3600
        raw = r_cluster.hget('ip_' + str(ts), uid)
        if raw:
            # stored value is a JSON dict keyed by ip address
            seen_ips.extend(json.loads(raw).keys())
    geo_string = '&'.join(ip2geo(set(seen_ips)))
    return geo_string
Пример #44
0
def get_user_at():
    """Dump the last 7 days of @-mention records for every portrait uid
    to a line-delimited JSON file (one {'ts', 'result'} object per line)."""
    #step1: get_uid_list
    uid_list = get_uid_list()
    date = ts2datetime(time.time())
    ts = datetime2ts(date)
    f = open(
        '/home/user_portrait_0320/revised_user_portrait/user_portrait/user_portrait/attribute/uid_at.txt',
        'w')
    for i in range(1, 8):
        ts = ts - DAY
        for uid in uid_list:
            #try:
            result_string = r_cluster.hget('at_' + str(ts), uid)
            #except:
            #    result_string = ''
            if result_string:
                # keep the raw JSON blob; consumers re-parse 'result'
                save_dict = {'ts': ts, 'result': result_string}
                f.write('%s\n' % json.dumps(save_dict))
    f.close()
Пример #45
0
def attr_hash(uid):
    """Aggregate one user's hashtag counts over the 7 days before the
    (test) date; returns {hashtag: total_count}."""
    hashtag_results = {}
    now_ts = time.time()
    # test
    now_ts = datetime2ts('2013-09-08')
    ts = datetime2ts(ts2datetime(now_ts))
    for _ in range(1, 8):
        ts = ts - 24 * 3600
        raw = r_cluster.hget('hashtag_' + str(ts), str(uid))
        if not raw:
            continue
        for hashtag, count in json.loads(raw).items():
            hashtag_results[hashtag] = hashtag_results.get(hashtag, 0) + count
    return hashtag_results
def attr_hash(uid):
    """Aggregate one user's hashtag counts over the 7 days before the
    (test) date; returns {hashtag: total_count}."""
    hashtag_results = {}
    now_ts = time.time()
    # test
    # NOTE(review): this override is ACTIVE — the wall-clock value above is discarded.
    now_ts = datetime2ts('2013-09-08')
    date = ts2datetime(now_ts)
    ts = datetime2ts(date)
    for i in range(1,8):
        ts = ts - 24*3600
        result_string = r_cluster.hget('hashtag_'+str(ts), str(uid))
        if result_string:
            # stored value is a JSON dict {hashtag: daily_count}
            result_dict = json.loads(result_string)
            for hashtag in result_dict:
                count = result_dict[hashtag]
                try:
                    hashtag_results[hashtag] += count
                except:
                    hashtag_results[hashtag] = count
    return hashtag_results
def filter_mention(user_set):
    results = []
    now_date = ts2datetime(time.time())
    now_date = '2013-09-08'
    timestamp = datetime2ts(now_date) - 24*3600
    for user in user_set:
        mention_set = set()
        for i in range(0,7):
            ts = timestamp - 3600*24*i
            result = r_cluster.hget('at_'+str(ts), str(user))
            if result:
                item_dict = json.loads(result)
                for at_user in item_dict:
                    mention_set.add(at_user)
        if at_count < mention_threshold:
            results.append(user)
        else:
            writer.writerow([user, 'mention'])
    print 'after filter mention: ', len(results)
    return results
def filter_mention(user_set):
    results = []
    now_date = ts2datetime(time.time())
    now_date = '2013-09-08'
    timestamp = datetime2ts(now_date) - 24 * 3600
    for user in user_set:
        mention_set = set()
        for i in range(0, 7):
            ts = timestamp - 3600 * 24 * i
            result = r_cluster.hget('at_' + str(ts), str(user))
            if result:
                item_dict = json.loads(result)
                for at_user in item_dict:
                    mention_set.add(at_user)
        if at_count < mention_threshold:
            results.append(user)
        else:
            writer.writerow([user, 'mention'])
    print 'after filter mention: ', len(results)
    return results
Пример #49
0
def get_school(uid_list):
    """Detect school affiliation from WEEK days of per-user IP records.

    For each uid returns {'is_school': '1'/'0', 'school_string':
    '&'-joined school names, 'school_dict': JSON {school: visit_count}}.
    """
    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    school_results = {}
    for i in range(WEEK, 0, -1):
        ts = now_date_ts - DAY * i
        # one hmget per day: positional results align with uid_list
        ip_results = r_cluster.hmget('new_ip_' + str(ts), uid_list)
        count = 0
        for uid in uid_list:
            if uid not in school_results:
                school_results[uid] = {}
            ip_item = ip_results[count]
            if ip_item:
                uid_ip_dict = json.loads(ip_item)
            else:
                uid_ip_dict = {}
            for ip in uid_ip_dict:
                # '&'-separated visit record: its length is the visit count
                ip_count = len(uid_ip_dict[ip].split('&'))

                school = ip2school(ip)
                if school:
                    try:
                        school_results[uid][school] += ip_count
                    except:
                        school_results[uid][school] = ip_count

            count += 1
    results = {}
    for uid in uid_list:
        school_dict = school_results[uid]
        school_string = '&'.join(school_dict.keys())
        # any school hit at all marks the user as school-affiliated
        if school_dict != {}:
            is_school = '1'
        else:
            is_school = '0'
        results[uid] = {
            'is_school': is_school,
            'school_string': school_string,
            'school_dict': json.dumps(school_dict)
        }
    return results
Пример #50
0
def get_attr_trend(uid_list):
    """Build a 7-day activity trend for a set of users.

    Buckets the 96 daily 15-minute segments into 6 four-hour buckets
    (Python 2 floor division: int(segment)/16 is the bucket index,
    15*60*16 = 14400 s = 4 h is the bucket width) and returns
    {'activity_trend': JSON [(bucket_ts, count), ...] sorted by time,
     'activity_time': JSON {bucket_offset_seconds: count}}.
    """
    result = {}
    now_ts = time.time()
    date = ts2datetime(now_ts - 24*3600)
    timestamp = datetime2ts(date)
    #test
    # NOTE(review): this override is ACTIVE — the real "yesterday" is discarded.
    timestamp = datetime2ts('2013-09-08')
    time_result = dict()     # absolute bucket timestamp -> count
    segment_result = dict()  # bucket offset within the day -> count
    for i in range(1, 8):
        ts = timestamp - i*24*3600
        r_result = r_cluster.hmget('activity_'+str(ts), uid_list)
        #print 'r_result:', r_result
        for item in r_result:
            if item:
                item = json.loads(item)
                for segment in item:
                    # int(segment)/16 * 14400 = start-of-bucket offset (py2 floor div)
                    try:
                        time_result[int(segment)/16*15*60*16+ts] += item[segment]
                    except:
                        time_result[int(segment)/16*15*60*16+ts] = item[segment]
                    try:
                        segment_result[int(segment)/16*15*60*16] += item[segment]
                    except:
                        segment_result[int(segment)/16*15*60*16] = item[segment]
    # materialize all 7 days x 6 buckets, filling gaps with 0
    trend_list = []
    for i in range(1, 8):
        ts = timestamp - i*24*3600
        for j in range(0, 6):
            time_seg = ts + j*15*60*16
            if time_seg in time_result:
                trend_list.append((time_seg, time_result[time_seg]))
            else:
                trend_list.append((time_seg, 0))
    sort_trend_list = sorted(trend_list, key=lambda x:x[0], reverse=False)
    #print 'time_result:', time_result
    #print 'trend_list:', trend_list
    #print 'sort_trend_list:', sort_trend_list
    result['activity_trend'] = json.dumps(sort_trend_list)
    result['activity_time'] = json.dumps(segment_result)
    return result
Пример #51
0
def update_day_sensitive(uid_list):
    """Aggregate each user's sensitive-word hits over the last WEEK days
    and compute a weighted sensitivity score.

    Returns {uid: {'sensitive_string': '&'-joined words,
                   'sensitive_dict': {word: day_count},
                   'sensitive': weighted score}}.

    Fixes vs. original:
    - `sensitive_item` was fetched only inside the `uid not in results`
      branch, so from the second day on every uid reused a stale value;
    - `count` was never incremented, so every uid read index 0 of the
      hmget reply (cf. the correct pattern in get_school).
    """
    results = {}
    all_results = {}
    now_ts = time.time()
    #run_type
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = test_ts

    for i in range(WEEK, 0, -1):
        ts = now_date_ts - DAY * i
        count = 0
        # one hmget per day: positional results align with uid_list
        sensitive_results = r_cluster.hmget('sensitive_' + str(ts), uid_list)
        for uid in uid_list:
            if uid not in results:
                results[uid] = {}
            sensitive_item = sensitive_results[count]
            count += 1
            if sensitive_item:
                sensitive_dict = json.loads(sensitive_item)
            else:
                sensitive_dict = {}
            # count the number of days each word appeared
            for sensitive in sensitive_dict:
                try:
                    results[uid][sensitive] += 1
                except KeyError:
                    results[uid][sensitive] = 1
    for uid in uid_list:
        user_sensitive_dict = results[uid]
        sensitive_score = 0
        for k in user_sensitive_dict:
            v = user_sensitive_dict[k]
            # stage weight looked up from the global sensitive-words table
            tmp_stage = r_sensitive.hget('sensitive_words', k)
            if tmp_stage:
                sensitive_score += v * sensitive_score_dict[str(tmp_stage)]
        sensitive_string = '&'.join(user_sensitive_dict.keys())
        all_results[uid] = {'sensitive_string': sensitive_string,
                            'sensitive_dict': user_sensitive_dict,
                            'sensitive': sensitive_score}
    return all_results
Пример #52
0
def update_day_sensitive(uid_list):
    """Aggregate each user's sensitive-word hits over the last WEEK days
    and compute a weighted sensitivity score.

    Returns {uid: {'sensitive_string': '&'-joined words,
                   'sensitive_dict': {word: day_count},
                   'sensitive': weighted score}}.

    Fixes vs. original:
    - `sensitive_item` was fetched only inside the `uid not in results`
      branch, so from the second day on every uid reused a stale value;
    - `count` was never incremented, so every uid read index 0 of the
      hmget reply (cf. the correct pattern in get_school).
    """
    results = {}
    all_results = {}
    now_ts = time.time()
    #run_type
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = test_ts

    for i in range(WEEK, 0, -1):
        ts = now_date_ts - DAY * i
        count = 0
        # one hmget per day: positional results align with uid_list
        sensitive_results = r_cluster.hmget('sensitive_' + str(ts), uid_list)
        for uid in uid_list:
            if uid not in results:
                results[uid] = {}
            sensitive_item = sensitive_results[count]
            count += 1
            if sensitive_item:
                sensitive_dict = json.loads(sensitive_item)
            else:
                sensitive_dict = {}
            # count the number of days each word appeared
            for sensitive in sensitive_dict:
                try:
                    results[uid][sensitive] += 1
                except KeyError:
                    results[uid][sensitive] = 1
    for uid in uid_list:
        user_sensitive_dict = results[uid]
        sensitive_score = 0
        for k in user_sensitive_dict:
            v = user_sensitive_dict[k]
            # stage weight looked up from the global sensitive-words table
            tmp_stage = r_sensitive.hget('sensitive_words', k)
            if tmp_stage:
                sensitive_score += v * sensitive_score_dict[str(tmp_stage)]
        sensitive_string = '&'.join(user_sensitive_dict.keys())
        all_results[uid] = {'sensitive_string': sensitive_string,
                            'sensitive_dict': user_sensitive_dict,
                            'sensitive': sensitive_score}
    return all_results
def filter_ip(user_set):
    results = []
    now_date = ts2datetime(time.time())
    now_date = '2013-09-08'
    ts = datetime2ts(now_date) - 24 * 3600
    for user in user_set:
        ip_set = set()
        for i in range(0, 7):
            timestamp = ts - 3600 * 24 * i
            ip_result = r_cluster.hget('ip_' + str(ts), str(user))
            if ip_result:
                result_dict = json.loads(ip_result)
            else:
                result_dict = {}
            for ip in result_dict:
                ip_set.add(ip)
        if len(result_dict) < ip_threshold:
            results.append(user)
        else:
            writer.writerow([user, 'ip'])
    print 'after filter ip: ', len(results)
    return results
def filter_ip(user_set):
    results = []
    now_date = ts2datetime(time.time())
    now_date = '2013-09-08'
    ts = datetime2ts(now_date) - 24*3600
    for user in user_set:
        ip_set = set()
        for i in range(0,7):
            timestamp = ts - 3600*24*i
            ip_result = r_cluster.hget('ip_'+str(ts), str(user))
            if ip_result:
                result_dict = json.loads(ip_result)
            else:
                result_dict = {}
            for ip in result_dict:
                ip_set.add(ip)
        if len(result_dict) < ip_threshold:
            results.append(user)
        else:
            writer.writerow([user, 'ip'])
    print 'after filter ip: ', len(results)
    return results
Пример #55
0
def get_school(uid_list):
    """Detect school affiliation from WEEK days of per-user IP records.

    For each uid returns {'is_school': '1'/'0', 'school_string':
    '&'-joined school names, 'school_dict': JSON {school: visit_count}}.
    """
    now_date_ts = datetime2ts(ts2datetime(time.time()))
    school_results = {}
    for day in range(WEEK, 0, -1):
        ts = now_date_ts - DAY * day
        # one hmget per day: positional results align with uid_list
        ip_results = r_cluster.hmget('new_ip_' + str(ts), uid_list)
        for idx, uid in enumerate(uid_list):
            if uid not in school_results:
                school_results[uid] = {}
            raw = ip_results[idx]
            uid_ip_dict = json.loads(raw) if raw else {}
            for ip, visit_string in uid_ip_dict.items():
                # '&'-separated visit record: its length is the visit count
                ip_count = len(visit_string.split('&'))
                school = ip2school(ip)
                if school:
                    school_results[uid][school] = \
                        school_results[uid].get(school, 0) + ip_count
    results = {}
    for uid in uid_list:
        school_dict = school_results[uid]
        results[uid] = {
            'is_school': '1' if school_dict else '0',
            'school_string': '&'.join(school_dict.keys()),
            'school_dict': json.dumps(school_dict),
        }
    return results
Пример #56
0
def filter_ip(user_set):
    """Keep users whose distinct source-IP count over 7 days is below
    `ip_threshold`; log filtered users via the module-level csv writer.

    Fix vs. original: the redis key was built from the fixed `ts` while
    the per-day `timestamp` went unused, so the same day's hash was read
    7 times instead of the trailing week.
    """
    results = []
    #run_type
    if RUN_TYPE == 1:
        now_date = ts2datetime(time.time())
    else:
        now_date = RUN_TEST_TIME
    ts = datetime2ts(now_date) - DAY
    for user in user_set:
        ip_set = set()
        for i in range(0, 7):
            timestamp = ts - DAY * i
            ip_result = r_cluster.hget('ip_' + str(timestamp), str(user))
            if ip_result:
                result_dict = json.loads(ip_result)
            else:
                result_dict = {}
            for ip in result_dict:
                ip_set.add(ip)
        if len(ip_set) < ip_threshold:
            results.append(user)
        else:
            writer.writerow([user, 'ip'])
    return results
Пример #57
0
def filter_mention(user_set):
    """Keep users who @-mentioned fewer than ``mention_threshold``
    distinct users over the last 7 days.

    For each user, the per-day redis hashes ``at_<ts>`` for the 7 days
    ending yesterday are merged into one set of mentioned users.  Users
    under the threshold are returned; users at/above it are logged to
    the CSV ``writer`` with reason 'mention' and dropped.

    Args:
        user_set: iterable of user ids.

    Returns:
        list of user ids that pass the filter.
    """
    results = []
    #run_type: production uses today's date, test mode a fixed date
    if RUN_TYPE == 1:
        now_date = ts2datetime(time.time())
    else:
        now_date = RUN_TEST_TIME
    timestamp = datetime2ts(now_date) - DAY
    # (removed unused local: date = ts2datetime(timestamp))
    for user in user_set:
        mention_set = set()
        for i in range(0, 7):
            ts = timestamp - DAY * i
            result = r_cluster.hget('at_' + str(ts), str(user))
            if result:
                item_dict = json.loads(result)
                for at_user in item_dict:
                    mention_set.add(at_user)
        at_count = len(mention_set)
        if at_count < mention_threshold:
            results.append(user)
        else:
            writer.writerow([user, 'mention'])
    return results
Пример #58
0
                    text.encode('utf-8', 'ignore'), DFA)
                if sensitive_words_dict:
                    item['sensitive_words_string'] = "&".join(
                        sensitive_words_dict.keys())
                    item['sensitive_words_dict'] = json.dumps(
                        sensitive_words_dict)
                else:
                    item['sensitive_words_string'] = ""
                    item['sensitive_words_dict'] = json.dumps({})

                timestamp = item['timestamp']
                date = ts2datetime(timestamp)
                ts = datetime2ts(date)
                if sensitive_words_dict:
                    #print 'sensitive_words_dict...keys[0]...',sensitive_words_dict.keys()[0]
                    sensitive_count_string = r_cluster.hget(
                        'sensitive_' + str(ts), str(uid))
                    if sensitive_count_string:  #redis取空
                        sensitive_count_dict = json.loads(
                            sensitive_count_string)
                        for word in sensitive_words_dict.keys():
                            if sensitive_count_dict.has_key(word):
                                sensitive_count_dict[
                                    word] += sensitive_words_dict[word]
                            else:
                                sensitive_count_dict[
                                    word] = sensitive_words_dict[word]
                        r_cluster.hset('sensitive_' + str(ts), str(uid),
                                       json.dumps(sensitive_count_dict))
                    else:
                        r_cluster.hset('sensitive_' + str(ts), str(uid),
                                       json.dumps(sensitive_words_dict))
Пример #59
0
def test(ft_type):
    """Backfill derived fields on flow-text documents missing keywords.

    Scans every dated flow-text index (facebook or twitter, selected by
    *ft_type*) for documents that have no ``keywords_string`` field,
    computes sentiment, keywords, sensitive-word data, the retweet-root
    (directed) user and hashtags, and bulk-updates the documents back
    into ES.  Per-user sensitive-word counts are also accumulated into
    the ``sensitive_<ts>`` redis hashes as a side effect.

    Args:
        ft_type: 'facebook' selects the facebook index/type constants;
            any other value selects the twitter ones.
    """
    print ft_type
    # Select per-platform index/type constants.
    if ft_type == 'facebook':
        index_name_pre = facebook_flow_text_index_name_pre
        index_type = facebook_flow_text_index_type
        user_index_name = facebook_user_index_name
        user_index_type = facebook_user_index_type
    else:
        index_name_pre = twitter_flow_text_index_name_pre
        index_type = twitter_flow_text_index_type
        user_index_name = twitter_user_index_name
        user_index_type = twitter_user_index_type

    # date_list = load_date_list(True)
    date_list = load_date_list()

    # DFA word tree used by searchWord() for sensitive-word matching.
    DFA = createWordTree()
    # Match only documents with no 'keywords_string' field and
    # flag_ch >= -1 (i.e. not yet processed).
    query_body = {
        'post_filter': {
            'missing': {
                'field': 'keywords_string'
            }
        },
        'query': {
            'filtered': {
                'filter': {
                    'bool': {
                        'must': [{
                            'range': {
                                'flag_ch': {
                                    'gte': -1
                                }
                            }
                        }]
                    }
                }
            }
        }
    }
    for date in date_list:
        count = 0
        bulk_action = []
        index_name = index_name_pre + date
        try:
            es_scan_results = scan(es,
                                   query=query_body,
                                   size=1000,
                                   index=index_name,
                                   doc_type=index_type)
            while True:
                try:
                    scan_data = es_scan_results.next()
                    item = scan_data['_source']
                    text = item['text_ch']
                    uid = item['uid']
                    # The document-id field name differs per platform.
                    if ft_type == 'facebook':
                        _id = item['fid']
                    else:
                        _id = item['tid']

                    ts = datetime2ts(date)
                    #add sentiment field to weibo

                    sentiment, keywords_list = triple_classifier(item)

                    #add key words to weibo
                    keywords_dict, keywords_string = get_weibo_keywords(
                        keywords_list)

                    #sensitive_words_dict
                    sensitive_words_dict = searchWord(
                        text.encode('utf-8', 'ignore'), DFA)
                    if sensitive_words_dict:
                        sensitive_words_string_data = "&".join(
                            sensitive_words_dict.keys())
                        sensitive_words_dict_data = json.dumps(
                            sensitive_words_dict)
                    else:
                        sensitive_words_string_data = ""
                        sensitive_words_dict_data = json.dumps({})

                    #redis: merge this document's sensitive-word counts into
                    # the user's per-day hash.
                    if sensitive_words_dict:
                        sensitive_count_string = r_cluster.hget(
                            'sensitive_' + str(ts), str(uid))
                        if sensitive_count_string:  # redis may return empty
                            sensitive_count_dict = json.loads(
                                sensitive_count_string)
                            for word in sensitive_words_dict.keys():
                                if sensitive_count_dict.has_key(word):
                                    sensitive_count_dict[
                                        word] += sensitive_words_dict[word]
                                else:
                                    sensitive_count_dict[
                                        word] = sensitive_words_dict[word]
                            r_cluster.hset('sensitive_' + str(ts), str(uid),
                                           json.dumps(sensitive_count_dict))
                        else:
                            r_cluster.hset('sensitive_' + str(ts), str(uid),
                                           json.dumps(sensitive_words_dict))

                    #sensitive: score = sum(count * stage weight) per word.
                    sensitive_score = 0
                    if sensitive_words_dict:
                        for k, v in sensitive_words_dict.iteritems():
                            tmp_stage = r_sensitive.hget("sensitive_words", k)
                            if tmp_stage:
                                sensitive_score += v * sensitive_score_dict[
                                    str(tmp_stage)]

                    #directed_uid: root user of the retweet chain, 0 if none.
                    directed_uid_data = 0
                    directed_uid, directed_uname = get_root_retweet(
                        text, uid, ft_type)
                    if directed_uid:
                        directed_uid_data = long(directed_uid)

                    # hashtag: extract '#tag' tokens terminated by a
                    # delimiter; join multiple tags with '&'.
                    hashtag = ''
                    RE = re.compile(
                        u'#([0-9a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+)[ ," =.。: :、]'
                    )
                    hashtag_list = re.findall(RE, text)
                    if hashtag_list:
                        hashtag = '&'.join(hashtag_list)

                    #action
                    action = {'update': {'_id': _id}}

                    # action_data
                    action_data = {
                        'sentiment': str(sentiment),
                        'keywords_dict': json.dumps(keywords_dict),
                        'keywords_string': keywords_string,
                        'sensitive_words_string': sensitive_words_string_data,
                        'sensitive_words_dict': sensitive_words_dict_data,
                        'sensitive': sensitive_score,
                        'directed_uid': directed_uid_data,
                        'directed_uname': directed_uname,
                        'hashtag': hashtag,
                    }

                    bulk_action.extend([action, {'doc': action_data}])
                    count += 1

                    # Flush to ES every 1000 actions.
                    # NOTE(review): doc_type is hard-coded to the facebook
                    # flow-text type here and below, even on the twitter
                    # branch — confirm whether index_type was intended.
                    if count % 1000 == 0 and count != 0:
                        if bulk_action:
                            es.bulk(bulk_action,
                                    index=index_name,
                                    doc_type=facebook_flow_text_index_type,
                                    timeout=600)
                        bulk_action = []
                        count = 0
                except StopIteration:
                    break
            if bulk_action:

                es.bulk(bulk_action,
                        index=index_name,
                        doc_type=facebook_flow_text_index_type,
                        timeout=600)
        except Exception, e:  # e.g. the ES index for this date does not exist
            print e