def save_city(uid, ip, timestamp, sensitive):
    ts = ts2datetime(timestamp).replace('-','')
    key = str(uid)
    try:
        if sensitive:
            ip_count_string = r_cluster.hget('sensitive_ip_'+str(ts), str(uid))
        else:
            ip_count_string = r_cluster.hget('ip_'+str(ts), str(uid))

        ip_count_dict = json.loads(ip_count_string)

        try:
            ip_count_dict[str(ip)] += 1
        except:
            ip_count_dict[str(ip)] = 1

        if sensitive:
            r_cluster.hset('sensitive_ip_'+str(ts), str(uid), json.dumps(ip_count_dict))
        else:
            r_cluster.hset('ip_'+str(ts), str(uid), json.dumps(ip_count_dict))

    except:
        if sensitive:
            r_cluster.hset('sensitive_ip_'+str(ts), str(uid), json.dumps({str(ip):1}))
        else:
            r_cluster.hset('ip_'+str(ts), str(uid), json.dumps({str(ip):1}))
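# Every save_*/cal_* helper in these examples repeats the same read-modify-write
# cycle on a Redis hash: hget a JSON-encoded counter dict for the uid, bump one
# field, hset it back, and fall back to a fresh {field: 1} dict when nothing is
# stored yet. A minimal sketch of that cycle as one reusable function follows;
# the helper name is hypothetical and it only assumes a redis-py style client
# like r_cluster.

import json

def incr_json_counter(redis_client, hash_key, uid, field, amount=1):
    # hypothetical helper, not part of the original project
    raw = redis_client.hget(hash_key, str(uid))
    counter = json.loads(raw) if raw else {}
    counter[str(field)] = counter.get(str(field), 0) + amount
    redis_client.hset(hash_key, str(uid), json.dumps(counter))

# with it, the body of save_city() above reduces to roughly:
#     prefix = 'sensitive_ip_' if sensitive else 'ip_'
#     incr_json_counter(r_cluster, prefix + str(ts), uid, ip)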
Example #2
def save_city(uid, ip, timestamp, sensitive):
    ts = ts2datetime(timestamp).replace('-', '')
    key = str(uid)
    try:
        if sensitive:
            ip_count_string = r_cluster.hget('sensitive_ip_' + str(ts),
                                             str(uid))
        else:
            ip_count_string = r_cluster.hget('ip_' + str(ts), str(uid))

        ip_count_dict = json.loads(ip_count_string)

        try:
            ip_count_dict[str(ip)] += 1
        except:
            ip_count_dict[str(ip)] = 1

        if sensitive:
            r_cluster.hset('sensitive_ip_' + str(ts), str(uid),
                           json.dumps(ip_count_dict))
        else:
            r_cluster.hset('ip_' + str(ts), str(uid),
                           json.dumps(ip_count_dict))

    except:
        if sensitive:
            r_cluster.hset('sensitive_ip_' + str(ts), str(uid),
                           json.dumps({str(ip): 1}))
        else:
            r_cluster.hset('ip_' + str(ts), str(uid), json.dumps({str(ip): 1}))
Example #3
def save_activity(uid, ts, time_segment, sensitive):
    key = str(ts)
    try:
        if sensitive:
            activity_count_dict = r_cluster.hget('sensitive_activity_' + key,
                                                 str(uid))
        else:
            activity_count_dict = r_cluster.hget('activity_' + key, str(uid))
        activity_count_dict = json.loads(activity_count_dict)
        try:
            activity_count_dict[str(time_segment)] += 1
        except:
            activity_count_dict[str(time_segment)] = 1
        if sensitive:
            r_cluster.hset('sensitive_activity_' + key, str(uid),
                           json.dumps(activity_count_dict))
        else:
            r_cluster.hset('activity_' + key, str(uid),
                           json.dumps(activity_count_dict))
    except:
        if sensitive:
            r_cluster.hset('sensitive_activity_' + key, str(uid),
                           json.dumps({str(time_segment): 1}))
        else:
            r_cluster.hset('activity_' + key, str(uid),
                           json.dumps({str(time_segment): 1}))
Example #4
def save_at(uid, at_uid, timestamp, sensitive):
    ts = ts2datetime(timestamp).replace('-', '')
    key = str(uid)
    try:
        if sensitive:
            ruid_count_string = r_cluster.hget('sensitive_at_' + str(ts),
                                               str(uid))
        else:
            ruid_count_string = r_cluster.hget('at_' + str(ts), str(uid))

        ruid_count_dict = json.loads(ruid_count_string)
        try:
            ruid_count_dict[str(at_uid)] += 1
        except:
            ruid_count_dict[str(at_uid)] = 1
        if sensitive:
            r_cluster.hset('sensitive_at_' + str(ts), str(uid),
                           json.dumps(ruid_count_dict))
        else:
            r_cluster.hset('at_' + str(ts), str(uid),
                           json.dumps(ruid_count_dict))

    except:
        if sensitive:
            r_cluster.hset('sensitive_at_' + str(ts), str(uid),
                           json.dumps({str(at_uid): 1}))
        else:
            r_cluster.hset('at_' + str(ts), str(uid),
                           json.dumps({str(at_uid): 1}))
def cal_sensitive_words_work(item, sw_list):
    timestamp = item['timestamp']
    uid = item['uid']
    timestamp = ts2datetime(timestamp).replace('-','')
    ts = timestamp
    map = {}
    for w in sw_list:
        word = "".join([chr(x) for x in w])
        word = word.decode('utf-8')
        if not map.__contains__(word):
            map[word] = 1
        else:
            map[word] += 1
    try:
        sensitive_count_string = r_cluster.hget('sensitive_'+str(ts), str(uid))
        sensitive_count_dict = json.loads(sensitive_count_string)
        for word in map:
            count = map[word]
            if sensitive_count_dict.__contains__(word):
                sensitive_count_dict[word] += count
            else:
                sensitive_count_dict[word] = count
        r_cluster.hset('sensitive_'+str(ts), str(uid), json.dumps(sensitive_count_dict))
    except:
        r_cluster.hset('sensitive_'+str(ts), str(uid), json.dumps(map))
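# Note on cal_sensitive_words_work(): each w in sw_list appears to be a sequence
# of byte values for one UTF-8 encoded sensitive word (an assumption inferred
# from the decode step above). A quick Python 2 illustration:
#     w = (228, 184, 173, 230, 150, 135)              # UTF-8 bytes of u'中文'
#     "".join([chr(x) for x in w])                    # -> '\xe4\xb8\xad\xe6\x96\x87'
#     "".join([chr(x) for x in w]).decode('utf-8')    # -> u'中文'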
def cal_class_ratio():
    ratio_results = {}
    date = '2013-09-07'
    ts = datetime2ts(date)
    scan_count = 0
    scan_cursor = 0
    all_count = 0
    while 1:
        if scan_count == 1000000:
            break
        results = r_cluster.hscan('activity_' + str(ts),
                                  scan_cursor,
                                  count=1000)
        scan_cursor = results[0]
        scan_count += 1000
        for uid in results[1]:
            activity_dict_string = r_cluster.hget('activity_' + str(ts), uid)
            activity_dict = json.loads(activity_dict_string)
            weibo_count = 0
            for time_seg in activity_dict:
                weibo_count += int(activity_dict[time_seg])
            if weibo_count >= 6:
                indic_3 = '1'
            else:
                indic_3 = '0'
            retweet_results = r.hgetall('retweet_' + str(uid))
            retweet_count = len(retweet_results)
            if retweet_count >= 8:
                indic_1 = '1'
            else:
                indic_1 = '0'
            be_retweet_results = r.hgetall('be_retweet_' + str(uid))
            be_retweet_count = len(be_retweet_results)
            #print 'be_retweet_count:', be_retweet_count
            if be_retweet_count >= 9:
                indic_2 = '1'
            else:
                indic_2 = '0'
            #print 'indic_2:', indic_2
            key = indic_1 + indic_2 + indic_3
            try:
                ratio_results[key] += 1
            except:
                ratio_results[key] = 1
            # write eight type users
            '''
            if key=='001':
                writer1.writerow([uid, retweet_count, be_retweet_count, weibo_count])
            elif key=='111':
                writer2.writerow([uid, retweet_count, be_retweet_count, weibo_count])
            elif key=='101':
                writer3.writerow([uid, retweet_count, be_retweet_count, weibo_count])
            elif key=='011':
                writer4.writerow([uid, retweet_count, be_retweet_count, weibo_count])
            elif key=='110':
                writer5.writerow([uid, retweet_count, be_retweet_count, weibo_count])
            if key=='010':
                writer6.writerow([uid, retweet_count, be_retweet_count, weibo_count])
            '''
    print 'ratio_results:', ratio_results
def cal_ave_weibo():
    date = '2013-09-07'
    timestamp = datetime2ts(date)
    scan_count = 0
    scan_cursor = 0
    all_count = 0
    while 1:
        if scan_count == 1000000:
            break
        results = r_cluster.hscan('activity_' + str(timestamp),
                                  scan_cursor,
                                  count=1000)
        scan_cursor = results[0]
        scan_count += 1000
        for uid in results[1]:
            for i in range(0, 1):
                ts = timestamp - 24 * 3600 * i
                activity_dict_string = r_cluster.hget('activity_' + str(ts),
                                                      uid)
                if activity_dict_string:
                    activity_dict = json.loads(activity_dict_string)
                    weibo_count = 0
                    for time_seg in activity_dict:
                        weibo_count += int(activity_dict[time_seg])
                    all_count += weibo_count
    ave_count = float(all_count) / scan_count
    print 'ave_count:', ave_count
def cal_text_work(item):
    uid = item['uid']
    timestamp = item['timestamp']
    date = ts2datetime(timestamp)
    ts = datetime2ts(date)
    text = item['text']
    if isinstance(text, str):
        text = text.decode('utf-8', 'ignore')
    RE = re.compile(u'#([a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+)#', re.UNICODE)
    hashtag_list = RE.findall(text)
    if hashtag_list:
        # all hashtags here are unicode
        hashtag_dict = dict()
        for hashtag in hashtag_list:
            try:
                hashtag_dict[hashtag] += 1
            except:
                hashtag_dict[hashtag] = 1
        try:
            hashtag_count_string = r_cluster.hget('hashtag_'+str(ts), str(uid))
            hashtag_count_dict = json.loads(hashtag_count_string)
            for hashtag in hashtag_dict:
                count = hashtag_dict[hashtag]
                try:
                    hashtag_count_dict[hashtag] += count
                except:
                    hashtag_count_dict[hashtag] = count
            r_cluster.hset('hashtag_'+str(ts), str(uid), json.dumps(hashtag_count_dict))
        except:
            r_cluster.hset('hashtag_'+str(ts), str(uid), json.dumps(hashtag_dict))
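# Usage note for cal_text_work(): the RE above captures Weibo-style topics that
# are wrapped in a pair of '#' characters, e.g.
#     RE.findall(u'转发微博 #用户画像# #user_portrait#')
#     -> [u'用户画像', u'user_portrait']
# so hashtag_dict counts occurrences per topic and the per-day totals are merged
# into the 'hashtag_<day timestamp>' Redis hash keyed by uid.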
def filter_activity(user_set):
    results = []
    #run_type
    if RUN_TYPE == 1:
        now_date = ts2datetime(time.time())
    else:
        now_date = RUN_TEST_TIME
    ts = datetime2ts(now_date) - DAY
    date = ts2datetime(ts)
    timestamp = datetime2ts(date)
    for user in user_set:
        over_count = 0
        for i in range(0,7):
            ts = timestamp - DAY*i
            result = r_cluster.hget('activity_'+str(ts), str(user))
            if result:
                items_dict = json.loads(result)
                for item in items_dict:
                    weibo_count = items_dict[item]
                    if weibo_count > activity_threshold:
                        over_count += 1
        if over_count == 0:
            results.append(user)
        else:
            writer.writerow([user, 'activity'])
            
    return results
Example #10
def cal_text_sensitive(item):
    text = item['text']
    uid = item['uid']
    timestamp = item['timestamp']
    date = ts2datetime(timestamp)
    ts = datetime2ts(date)
    if isinstance(text, str):
        text = text.decode('utf-8', 'ignore')
    sensitive_result = [word for word in SENSITIVE_WORD if word in text]
    if sensitive_result:
        sensitive_dict = dict()
        for word in sensitive_result:
            try:
                sensitive_dict[word] += 1
            except:
                sensitive_dict[word] = 1
        try:
            sensitive_count_string = r_cluster.hget('sensitive_'+str(ts), str(uid))
            sensitive_count_dict = json.loads(sensitive_count_string)
            for word in sensitive_dict:
                count = sensitive_dict[word]
                try:
                    sensitive_count_dict[word] += count
                except:
                    sensitive_count_dict[word] = count
            r_cluster.hset('sensitive_'+str(ts), str(uid), json.dumps(sensitive_count_dict))
        except:
            r_cluster.hset('sensitive_'+str(ts), str(uid), json.dumps(sensitive_dict))
Example #11
def cal_text_work(item):
    uid = item['uid']
    timestamp = item['timestamp']
    date = ts2datetime(timestamp)
    ts = datetime2ts(date)
    text = item['text']
    if isinstance(text, str):
        text = text.decode('utf-8', 'ignore')
    RE = re.compile(u'#([a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+)#', re.UNICODE)
    hashtag_list = RE.findall(text)
    if hashtag_list:
        # all hashtags here are unicode
        hashtag_dict = dict()
        for hashtag in hashtag_list:
            try:
                hashtag_dict[hashtag] += 1
            except:
                hashtag_dict[hashtag] = 1
        try:
            hashtag_count_string = r_cluster.hget('hashtag_'+str(ts), str(uid))
            hashtag_count_dict = json.loads(hashtag_count_string)
            for hashtag in hashtag_dict:
                count = hashtag_dict[hashtag]
                try:
                    hashtag_count_dict[hashtag] += count
                except:
                    hashtag_count_dict[hashtag] = count
            r_cluster.hset('hashtag_'+str(ts), str(uid), json.dumps(hashtag_count_dict))
        except:
            r_cluster.hset('hashtag_'+str(ts), str(uid), json.dumps(hashtag_dict))
Example #12
def filter_activity(user_set):
    results = []
    now_date = ts2datetime(time.time())
    # test
    now_date = '2013-09-08'
    ts = datetime2ts(now_date) - 24*3600
    date = ts2datetime(ts)
    #print 'date:', date
    timestamp = datetime2ts(date)
    for user in user_set:
        over_count = 0
        for i in range(0,7):
            ts = timestamp - 3600*24*i
            result = r_cluster.hget('activity_'+str(ts), str(user))
            if result:
                items_dict = json.loads(result)
                for item in items_dict:
                    weibo_count = items_dict[item]
                    if weibo_count > activity_threshold:
                        over_count += 1
        if over_count == 0:
            results.append(user)
        else:
            writer.writerow([user, 'activity'])
            
    print 'after filter activity:', len(results)    
    return results
Example #13
def filter_activity(user_set):
    results = []
    #run_type
    if RUN_TYPE == 1:
        now_date = ts2datetime(time.time())
    else:
        now_date = RUN_TEST_TIME
    ts = datetime2ts(now_date) - DAY
    date = ts2datetime(ts)
    timestamp = datetime2ts(date)
    for user in user_set:
        over_count = 0
        for i in range(0, 7):
            ts = timestamp - DAY * i
            result = r_cluster.hget('activity_' + str(ts), str(user))
            if result:
                items_dict = json.loads(result)
                for item in items_dict:
                    weibo_count = items_dict[item]
                    if weibo_count > activity_threshold:
                        over_count += 1
        if over_count == 0:
            results.append(user)
        else:
            writer.writerow([user, 'activity'])

    return results
def filter_activity(user_set):
    results = []
    now_date = ts2datetime(time.time())
    now_date = '2013-09-08'
    ts = datetime2ts(now_date) - 24 * 3600
    date = ts2datetime(ts)
    timestamp = datetime2ts(date)
    for user in user_set:
        over_count = 0
        for i in range(0, 7):
            ts = timestamp - 3600 * 24 * i
            result = r_cluster.hget('activity_' + str(ts), str(user))
            if result:
                item_dict = json.loads(result)
                sorted_dict = sorted(item_dict.iteritems(),
                                     key=lambda asd: asd[1],
                                     reverse=True)
                if sorted_dict[0][1] > activity_threshold:
                    over_count = 1
        if over_count == 0:
            results.append(user)
        else:
            writer.writerow([user, 'activity'])

    print 'after filter activity: ', len(results)
    return results
def filter_activity(user_set):
    results = []
    now_date = ts2datetime(time.time())
    now_date = '2013-09-08'
    ts = datetime2ts(now_date) - 24*3600
    date = ts2datetime(ts)
    timestamp = datetime2ts(date)
    for user in user_set:
        over_count = 0
        for i in range(0,7):
            ts = timestamp - 3600*24*i
            result = r_cluster.hget('activity_'+str(ts), str(user))
            if result:
                item_dict = json.loads(result)
                sorted_dict = sorted(item_dict.iteritems(), key=lambda asd:asd[1], reverse=True)
                if sorted_dict[0][1] > activity_threshold:
                    over_count = 1
        if over_count == 0:
            results.append(user)
        else:
            writer.writerow([user, 'activity'])

    print 'after filter activity: ', len(results)
    return results
def cal_text_sensitive(item):
    text = item['text']
    uid = item['uid']
    timestamp = item['timestamp']
    date = ts2datetime(timestamp)
    ts = datetime2ts(date)
    if isinstance(text, str):
        text = text.decode('utf-8', 'ignore')
    sensitive_result = [word for word in SENSITIVE_WORD if word in text]
    if sensitive_result:
        sensitive_dict = dict()
        for word in sensitive_result:
            try:
                sensitive_dict[word] += 1
            except:
                sensitive_dict[word] = 1
        try:
            sensitive_count_string = r_cluster.hget('sensitive_'+str(ts), str(uid))
            sensitive_count_dict = json.loads(sensitive_count_string)
            for word in sensitive_dict:
                count = sensitive_dict[word]
                try:
                    sensitive_count_dict[word] += count
                except:
                    sensitive_count_dict[word] = count
            r_cluster.hset('sensitive_'+str(ts), str(uid), json.dumps(sensitive_count_dict))
        except:
            r_cluster.hset('sensitive_'+str(ts), str(uid), json.dumps(sensitive_dict))
def cal_sensitive_words_work(item, sw_list):
    timestamp = item['timestamp']
    uid = item['uid']
    timestamp = ts2datetime(timestamp).replace('-', '')
    ts = timestamp
    map = {}
    for w in sw_list:
        word = "".join([chr(x) for x in w])
        word = word.decode('utf-8')
        if not map.__contains__(word):
            map[word] = 1
        else:
            map[word] += 1
    try:
        sensitive_count_string = r_cluster.hget('sensitive_' + str(ts),
                                                str(uid))
        sensitive_count_dict = json.loads(sensitive_count_string)
        for word in map:
            count = map[word]
            if sensitive_count_dict.__contains__(word):
                sensitive_count_dict[word] += count
            else:
                sensitive_count_dict[word] = count
        r_cluster.hset('sensitive_' + str(ts), str(uid),
                       json.dumps(sensitive_count_dict))
    except:
        r_cluster.hset('sensitive_' + str(ts), str(uid), json.dumps(map))
def cal_hashtag_work(item, sensitive):
    text = item['text']
    uid = item['uid']
    timestamp = item['timestamp']
    ts = ts2datetime(timestamp).replace('-', '')

    if isinstance(text, str):
        text = text.decode('utf-8', 'ignore')
    RE = re.compile(u'#([a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+)#',
                    re.UNICODE)
    hashtag_list = RE.findall(text)
    if hashtag_list:
        hashtag_dict = {}
        for hashtag in hashtag_list:
            try:
                hashtag_dict[hashtag] += 1
            except:
                hashtag_dict[hashtag] = 1

        try:
            if sensitive:
                hashtag_count_string = r_cluster.hget(
                    'sensitive_hashtag_' + str(ts), str(uid))
            else:
                hashtag_count_string = r_cluster.hget('hashtag_' + str(ts),
                                                      str(uid))
            hashtag_count_dict = json.loads(hashtag_count_string)
            for hashtag in hashtag_dict:
                count = hashtag_dict[hashtag]
                try:
                    hashtag_count_dict[hashtag] += count
                except:
                    hashtag_count_dict[hashtag] = count
            if sensitive:
                r_cluster.hset('sensitive_hashtag_' + str(ts), str(uid),
                               json.dumps(hashtag_count_dict))
            else:
                r_cluster.hset('hashtag_' + str(ts), str(uid),
                               json.dumps(hashtag_count_dict))
        except:
            if sensitive:
                r_cluster.hset('sensitive_hashtag_' + str(ts), str(uid),
                               json.dumps(hashtag_dict))
            else:
                r_cluster.hset('hashtag_' + str(ts), str(uid),
                               json.dumps(hashtag_dict))
def cal_class_ratio():
    ratio_results = {}
    date = '2013-09-07'
    ts = datetime2ts(date)
    scan_count = 0
    scan_cursor = 0
    all_count = 0
    while 1:
        if scan_count == 1000000:
            break
        results = r_cluster.hscan('activity_'+str(ts), scan_cursor, count=1000)
        scan_cursor = results[0]
        scan_count += 1000
        for uid in results[1]:
            activity_dict_string = r_cluster.hget('activity_'+str(ts), uid)
            activity_dict = json.loads(activity_dict_string)
            weibo_count = 0
            for time_seg in activity_dict:
                weibo_count += int(activity_dict[time_seg])
            if weibo_count >= 6:
                indic_3 = '1'
            else:
                indic_3 = '0'
            retweet_results = r.hgetall('retweet_'+str(uid))
            retweet_count = len(retweet_results)
            if retweet_count >= 8:
                indic_1 = '1'
            else:
                indic_1 = '0'
            be_retweet_results = r.hgetall('be_retweet_'+str(uid))
            be_retweet_count = len(be_retweet_results)
            #print 'be_retweet_count:', be_retweet_count
            if be_retweet_count >= 9:
                indic_2 = '1'
            else:
                indic_2 = '0'
            #print 'indic_2:', indic_2
            key = indic_1 + indic_2 + indic_3
            try:
                ratio_results[key] += 1
            except:
                ratio_results[key] = 1
            # write eight type users
            '''
            if key=='001':
                writer1.writerow([uid, retweet_count, be_retweet_count, weibo_count])
            elif key=='111':
                writer2.writerow([uid, retweet_count, be_retweet_count, weibo_count])
            elif key=='101':
                writer3.writerow([uid, retweet_count, be_retweet_count, weibo_count])
            elif key=='011':
                writer4.writerow([uid, retweet_count, be_retweet_count, weibo_count])
            elif key=='110':
                writer5.writerow([uid, retweet_count, be_retweet_count, weibo_count])
            if key=='010':
                writer6.writerow([uid, retweet_count, be_retweet_count, weibo_count])
            '''
    print 'ratio_results:', ratio_results
def cal_core_class():
    date = '2013-09-07'
    timestamp = datetime2ts(date)
    f_r = open(
        '/home/ubuntu8/huxiaoqian/user_portrait/user_portrait/cron/recommentation_in/core_list.csv',
        'rb')
    reader = csv.reader(f_r)
    f_w = open(
        '/home/ubuntu8/huxiaoqian/user_portrait/user_portrait/cron/recommentation_in/core_class.csv',
        'wb')
    writer = csv.writer(f_w)
    result_list = []
    count011 = 0
    for line in reader:
        uid = line[0]
        retweet_results = r.hgetall('retweet_' + str(uid))
        retweet_count = len(retweet_results)
        be_retweet_results = r.hgetall('be_retweet_' + str(uid))
        be_retweet_count = len(be_retweet_results)
        weibo_count = 0
        for i in range(0, 7):
            ts = timestamp - 24 * 3600 * i
            activity_string = r_cluster.hget('activity_' + str(ts), str(uid))
            if activity_string:
                activity_dict = json.loads(activity_string)
            else:
                activity_dict = {}
            for time_seg in activity_dict:
                count = activity_dict[time_seg]
                weibo_count += count
        ave_weibo_count = float(weibo_count) / 7
        if retweet_count >= 8:
            indic_1 = '1'
        else:
            indic_1 = '0'
        if be_retweet_count >= 9:
            indic_2 = '1'
        else:
            indic_2 = '0'
        if ave_weibo_count >= 6:
            indic_3 = '1'
        else:
            indic_3 = '0'
        key = indic_1 + indic_2 + indic_3
        if key == '011':
            count011 += 1
        result_list.append(
            [uid, key, retweet_count, be_retweet_count, ave_weibo_count])
    f_r.close()
    sort_result = sorted(result_list, key=lambda x: x[3], reverse=True)
    for item in sort_result:
        writer.writerow(list(item))
    f_w.close()
    print 'count011:', count011
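# In cal_core_class() (and cal_class_ratio() above) the 3-character key packs
# three binary indicators in order: retweet_count >= 8, be_retweet_count >= 9,
# and (average) daily weibo count >= 6. So key == '011' marks users who retweet
# few others but are retweeted often and post heavily, which is what count011
# tallies.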
def save_activity(uid, ts, time_segment):
    key = str(ts)
    try:
        activity_count_dict = r_cluster.hget('activity_' + key, str(uid))
        activity_count_dict = json.loads(activity_count_dict)
        try:
            activity_count_dict[str(time_segment)] += 1
        except:
            activity_count_dict[str(time_segment)] = 1
        r_cluster.hset('activity_' + key, str(uid), json.dumps(activity_count_dict))
    except:
        r_cluster.hset('activity_' + key, str(uid), json.dumps({str(time_segment): 1}))
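# save_activity() is keyed by a time_segment; get_activeness() further below
# reads segments 0..95 per day, which suggests 96 fifteen-minute slots. A sketch
# of how a caller might derive both arguments (the 900-second slot width is an
# assumption, not something defined in these snippets):
#     date = ts2datetime(timestamp)                    # 'YYYY-MM-DD'
#     day_ts = datetime2ts(date)                       # midnight timestamp of that day
#     time_segment = int((timestamp - day_ts) / 900)   # 0..95
#     save_activity(uid, day_ts, time_segment)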
def cal_hashtag_work(item, sensitive):
    text = item['text']
    uid = item['uid']
    timestamp = item['timestamp']
    ts = ts2datetime(timestamp).replace('-','')

    if isinstance(text, str):
        text = text.decode('utf-8', 'ignore')
    RE = re.compile(u'#([a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+)#', re.UNICODE)
    hashtag_list = RE.findall(text)
    if hashtag_list:
        hashtag_dict = {}
        for hashtag in hashtag_list:
            try:
                hashtag_dict[hashtag] += 1
            except:
                hashtag_dict[hashtag] = 1

        try:
            if sensitive:
                hashtag_count_string = r_cluster.hget('sensitive_hashtag_'+str(ts), str(uid))
            else:
                hashtag_count_string = r_cluster.hget('hashtag_'+str(ts), str(uid))
            hashtag_count_dict = json.loads(hashtag_count_string)
            for hashtag in hashtag_dict:
                count = hashtag_dict[hashtag]
                try:
                    hashtag_count_dict[hashtag] += count
                except:
                    hashtag_count_dict[hashtag] = count
            if sensitive:
                r_cluster.hset('sensitive_hashtag_'+str(ts), str(uid), json.dumps(hashtag_count_dict))
            else:
                r_cluster.hset('hashtag_'+str(ts), str(uid), json.dumps(hashtag_count_dict))
        except:
            if sensitive:
                r_cluster.hset('sensitive_hashtag_'+str(ts), str(uid), json.dumps(hashtag_dict))
            else:
                r_cluster.hset('hashtag_'+str(ts), str(uid), json.dumps(hashtag_dict))
Example #23
def save_at(uid, at_uid, timestamp):
    date = ts2datetime(timestamp)
    ts = datetime2ts(date)
    key = str(uid)
    try:
        ruid_count_string = r_cluster.hget('at_' + str(ts), str(uid))
        ruid_count_dict = json.loads(ruid_count_string)
        try:
            ruid_count_dict[str(at_uid)] += 1
        except:
            ruid_count_dict[str(at_uid)] = 1
        r_cluster.hset('at_' + str(ts), str(uid), json.dumps(ruid_count_dict))
    except:
        r_cluster.hset('at_' + str(ts), str(uid), json.dumps({str(at_uid): 1}))
def save_at(uid, at_uid, timestamp):
    date = ts2datetime(timestamp)
    ts = datetime2ts(date)
    key = str(uid)
    try:
        ruid_count_string = r_cluster.hget('at_'+str(ts), str(uid))
        ruid_count_dict = json.loads(ruid_count_string)
        try:
            ruid_count_dict[str(at_uid)] += 1
        except:
            ruid_count_dict[str(at_uid)] = 1
        r_cluster.hset('at_'+str(ts), str(uid), json.dumps(ruid_count_dict))
    except:
        r_cluster.hset('at_'+str(ts), str(uid), json.dumps({str(at_uid):1}))
def save_city(uid, ip, timestamp):
    date = ts2datetime(timestamp)
    ts = datetime2ts(date)
    key = str(uid)
    try:
        ip_count_string = r_cluster.hget('ip_'+str(ts), str(uid))
        ip_count_dict = json.loads(ip_count_string)
        try:
            ip_count_dict[str(ip)] += 1
        except:
            ip_count_dict[str(ip)] = 1
        r_cluster.hset('ip_'+str(ts), str(uid), json.dumps(ip_count_dict))
    except:
        r_cluster.hset('ip_'+str(ts), str(uid), json.dumps({str(ip):1}))
def cal_core_class():
    date = '2013-09-07'
    timestamp = datetime2ts(date)
    f_r = open('/home/ubuntu8/huxiaoqian/user_portrait/user_portrait/cron/recommentation_in/core_list.csv', 'rb')
    reader = csv.reader(f_r)
    f_w = open('/home/ubuntu8/huxiaoqian/user_portrait/user_portrait/cron/recommentation_in/core_class.csv', 'wb')
    writer = csv.writer(f_w)
    result_list = []
    count011 = 0
    for line in reader:
        uid = line[0]
        retweet_results = r.hgetall('retweet_'+str(uid))
        retweet_count = len(retweet_results)
        be_retweet_results = r.hgetall('be_retweet_'+str(uid))
        be_retweet_count = len(be_retweet_results)
        weibo_count = 0
        for i in range(0,7):
            ts = timestamp - 24*3600*i
            activity_string = r_cluster.hget('activity_'+str(ts), str(uid))
            if activity_string:
                activity_dict = json.loads(activity_string)
            else:
                activity_dict = {}
            for time_seg in activity_dict:
                count = activity_dict[time_seg]
                weibo_count += count
        ave_weibo_count = float(weibo_count) / 7
        if retweet_count >= 8:
            indic_1 = '1'
        else:
            indic_1 = '0'
        if be_retweet_count >= 9:
            indic_2 = '1'
        else:
            indic_2 = '0'
        if ave_weibo_count >= 6:
            indic_3 = '1'
        else:
            indic_3 = '0'
        key = indic_1 + indic_2 + indic_3
        if key=='011':
            count011 += 1
        result_list.append([uid, key, retweet_count, be_retweet_count, ave_weibo_count])
    f_r.close()
    sort_result = sorted(result_list, key=lambda x:x[3], reverse=True)
    for item in sort_result:
        writer.writerow(list(item))
    f_w.close()
    print 'count011:', count011
def save_city_timestamp(uid, ip, timestamp):
    date = ts2datetime(timestamp)
    ts = datetime2ts(date)
    try:
        ip_timestamp_string = r_cluster.hget('new_ip_'+str(ts), str(uid))
        ip_timestamp_string_dict = json.loads(ip_timestamp_string)
        try:
            add_string = '&'+str(timestamp)
            ip_timestamp_string_dict[str(ip)] += add_string
        except:
            ip_timestamp_string_dict[str(ip)] = str(timestamp)
        r_cluster.hset('new_ip_'+str(ts), str(uid), json.dumps(ip_timestamp_string_dict))

    except:
        r_cluster.hset('new_ip_'+str(ts), str(uid), json.dumps({str(ip): str(timestamp)}))
def save_at(uid, at_uid, timestamp, sensitive):
    ts = ts2datetime(timestamp).replace('-','')
    key = str(uid)
    try:
        if sensitive:
            ruid_count_string = r_cluster.hget('sensitive_at_'+str(ts), str(uid))
        else:
            ruid_count_string = r_cluster.hget('at_'+str(ts), str(uid))

        ruid_count_dict = json.loads(ruid_count_string)
        try:
            ruid_count_dict[str(at_uid)] += 1
        except:
            ruid_count_dict[str(at_uid)] = 1
        if sensitive:
            r_cluster.hset('sensitive_at_'+str(ts), str(uid), json.dumps(ruid_count_dict))
        else:
            r_cluster.hset('at_'+str(ts), str(uid), json.dumps(ruid_count_dict))

    except:
        if sensitive:
            r_cluster.hset('sensitive_at_'+str(ts), str(uid), json.dumps({str(at_uid):1}))
        else:
            r_cluster.hset('at_'+str(ts), str(uid), json.dumps({str(at_uid):1}))
Example #29
def save_city_timestamp(uid, ip, timestamp):
    date = ts2datetime(timestamp)
    ts = datetime2ts(date)
    try:
        ip_timestamp_string = r_cluster.hget('new_ip_' + str(ts), str(uid))
        ip_timestamp_string_dict = json.loads(ip_timestamp_string)
        try:
            add_string = '&' + str(timestamp)
            ip_timestamp_string_dict[str(ip)] += add_string
        except:
            ip_timestamp_string_dict[str(ip)] = str(timestamp)
        r_cluster.hset('new_ip_' + str(ts), str(uid),
                       json.dumps(ip_timestamp_string_dict))

    except:
        r_cluster.hset('new_ip_' + str(ts), str(uid),
                       json.dumps({str(ip): str(timestamp)}))
def get_activeness(uid, activity_geo):
    result = 0
    now_ts = time.time()
    now_date = ts2datetime(now_ts)
    timestamp = datetime2ts(now_date)
    #test
    #timestamp = datetime2ts('2013-09-08')
    # deal activity_time fft and statusnum
    activity_list = []
    statusnum = 0
    for i in range(1,8):
        ts = timestamp - 24*3600*i
        r_result = r_cluster.hget('activity_'+str(ts), uid)
        if r_result:
            r_result = json.loads(r_result)
        #print 'r_result:', r_result
        for i in range(0,96):
            try:
                count = r_result[str(i)]
            except:
                count = 0
            activity_list.append(float(count))
    #print 'activity_list:', activity_list
    statusnum = sum(activity_list)
    signal = np.array(activity_list)
    fftResult = np.abs(np.fft.fft(signal)) ** 2
    n = signal.size
    freq = np.fft.fftfreq(n, d=1)
    i = 0
    max_val = 0
    max_freq = 0
    for val in fftResult:
        #print str(1/freq[i]) + ',' + str(val)
        if val>max_val and freq[i]>0:
            max_val = val
            max_freq = freq[i]
        i = i + 1
    #print 'i:', i
    #print 'max_freq, max_val:', max_freq, max_val
    # deal activity_geo input: 'geo&geo'
    activity_geo_count = len(activity_geo.split('&'))
    result = activeness_weight_dict['activity_time'] * math.log(max_freq  + 1) + \
             activeness_weight_dict['activity_geo'] * math.log(activity_geo_count + 1) +\
             activeness_weight_dict['statusnum'] * math.log(statusnum + 1)
    #print 'activeness:', result
    return result
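# The activeness score above uses the dominant FFT frequency of a week of
# 15-minute activity counts as a rough periodicity signal. A self-contained
# sketch of that step on synthetic data (numpy only; the made-up signal posts
# in the same three slots every day, so the dominant period comes out at one
# day, i.e. 96 slots):
import numpy as np

activity_list = [1.0 if (slot % 96) in (40, 41, 42) else 0.0
                 for slot in range(7 * 96)]
signal = np.array(activity_list)
fft_power = np.abs(np.fft.fft(signal)) ** 2
freq = np.fft.fftfreq(signal.size, d=1)          # cycles per 15-minute slot
positive = freq > 0
max_freq = freq[positive][np.argmax(fft_power[positive])]
print('dominant period in slots: %.1f' % (1.0 / max_freq))   # ~96.0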
Example #31
File: test_0520.py  Project: SwoJa/ruman
def get_user_at():
    #step1: get_uid_list
    uid_list = get_uid_list()
    date = ts2datetime(time.time())
    ts = datetime2ts(date)
    f = open('/home/user_portrait_0320/revised_user_portrait/user_portrait/user_portrait/attribute/uid_at.txt', 'w')
    for i in range(1, 8):
        ts = ts - DAY
        for uid in uid_list:
            #try:
            result_string = r_cluster.hget('at_' + str(ts), uid)
            #except:
            #    result_string = ''
            if result_string:
                save_dict = {'ts': ts, 'result': result_string}
                f.write('%s\n' % json.dumps(save_dict))
    f.close()
def get_activeness(uid, activity_geo):
    result = 0
    now_ts = time.time()
    now_date = ts2datetime(now_ts)
    timestamp = datetime2ts(now_date)
    #test
    timestamp = datetime2ts('2013-09-08')
    # deal activity_time fft and statusnum
    activity_list = []
    statusnum = 0
    for i in range(1,8):
        ts = timestamp - 24*3600*i
        r_result = r_cluster.hget('activity_'+str(ts), uid)
        if r_result:
            r_result = json.loads(r_result)
        #print 'r_result:', r_result
        for i in range(0,96):
            try:
                count = r_result[str(i)]
            except:
                count = 0
            activity_list.append(float(count))
    #print 'activity_list:', activity_list
    statusnum = sum(activity_list)
    signal = np.array(activity_list)
    fftResult = np.abs(np.fft.fft(signal)) ** 2
    n = signal.size
    freq = np.fft.fftfreq(n, d=1)
    i = 0
    max_val = 0
    max_freq = 0
    for val in fftResult:
        #print str(1/freq[i]) + ',' + str(val)
        if val>max_val and freq[i]>0:
            max_val = val
            max_freq = freq[i]
        i = i + 1
    #print 'i:', i
    #print 'max_freq, max_val:', max_freq, max_val
    # deal activity_geo input: 'geo&geo'
    activity_geo_count = len(activity_geo.split('&'))
    result = activeness_weight_dict['activity_time'] * math.log(max_freq  + 1) + \
             activeness_weight_dict['activity_geo'] * math.log(activity_geo_count + 1) +\
             activeness_weight_dict['statusnum'] * math.log(statusnum + 1)
    #print 'activeness:', result
    return result
def get_activity_geo(uid):
    ip_result = []
    now_ts = time.time()
    now_date = ts2datetime(now_ts)
    ts = datetime2ts(now_date)
    geo_result = {}
    # test
    ts = datetime2ts('2013-09-08')
    for i in range(1,8):
        ts = ts - 24*3600
        r_result = r_cluster.hget('ip_'+str(ts), uid)
        if r_result:
            ip_list = json.loads(r_result).keys()
            ip_result.extend(ip_list)
    ip_list = set(ip_result)
    geo_string = '&'.join(ip2geo(ip_list))
    #print 'geo_string:', geo_string
    return geo_string
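# get_activity_geo() gathers the distinct source IPs seen for uid over the
# previous 7 days and joins their ip2geo() locations with '&'; that string is
# the activity_geo argument consumed by get_activeness() above, which only
# counts the number of '&'-separated locations.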
Example #35
File: test_0520.py  Project: SwoJa/ruman
def get_user_at():
    #step1: get_uid_list
    uid_list = get_uid_list()
    date = ts2datetime(time.time())
    ts = datetime2ts(date)
    f = open(
        '/home/user_portrait_0320/revised_user_portrait/user_portrait/user_portrait/attribute/uid_at.txt',
        'w')
    for i in range(1, 8):
        ts = ts - DAY
        for uid in uid_list:
            #try:
            result_string = r_cluster.hget('at_' + str(ts), uid)
            #except:
            #    result_string = ''
            if result_string:
                save_dict = {'ts': ts, 'result': result_string}
                f.write('%s\n' % json.dumps(save_dict))
    f.close()
Example #36
def attr_hash(uid):
    hashtag_results = {}
    now_ts = time.time()
    # test
    now_ts = datetime2ts('2013-09-08')
    date = ts2datetime(now_ts)
    ts = datetime2ts(date)
    for i in range(1, 8):
        ts = ts - 24 * 3600
        result_string = r_cluster.hget('hashtag_' + str(ts), str(uid))
        if result_string:
            result_dict = json.loads(result_string)
            for hashtag in result_dict:
                count = result_dict[hashtag]
                try:
                    hashtag_results[hashtag] += count
                except:
                    hashtag_results[hashtag] = count
    return hashtag_results
def attr_hash(uid):
    hashtag_results = {}
    now_ts = time.time()
    # test
    now_ts = datetime2ts('2013-09-08')
    date = ts2datetime(now_ts)
    ts = datetime2ts(date)
    for i in range(1,8):
        ts = ts - 24*3600
        result_string = r_cluster.hget('hashtag_'+str(ts), str(uid))
        if result_string:
            result_dict = json.loads(result_string)
            for hashtag in result_dict:
                count = result_dict[hashtag]
                try:
                    hashtag_results[hashtag] += count
                except:
                    hashtag_results[hashtag] = count
    return hashtag_results
def filter_mention(user_set):
    results = []
    now_date = ts2datetime(time.time())
    now_date = '2013-09-08'
    timestamp = datetime2ts(now_date) - 24*3600
    for user in user_set:
        mention_set = set()
        for i in range(0,7):
            ts = timestamp - 3600*24*i
            result = r_cluster.hget('at_'+str(ts), str(user))
            if result:
                item_dict = json.loads(result)
                for at_user in item_dict:
                    mention_set.add(at_user)
        at_count = len(mention_set)
        if at_count < mention_threshold:
            results.append(user)
        else:
            writer.writerow([user, 'mention'])
    print 'after filter mention: ', len(results)
    return results
def filter_mention(user_set):
    results = []
    now_date = ts2datetime(time.time())
    now_date = '2013-09-08'
    timestamp = datetime2ts(now_date) - 24 * 3600
    for user in user_set:
        mention_set = set()
        for i in range(0, 7):
            ts = timestamp - 3600 * 24 * i
            result = r_cluster.hget('at_' + str(ts), str(user))
            if result:
                item_dict = json.loads(result)
                for at_user in item_dict:
                    mention_set.add(at_user)
        at_count = len(mention_set)
        if at_count < mention_threshold:
            results.append(user)
        else:
            writer.writerow([user, 'mention'])
    print 'after filter mention: ', len(results)
    return results
def filter_ip(user_set):
    results = []
    now_date = ts2datetime(time.time())
    now_date = '2013-09-08'
    ts = datetime2ts(now_date) - 24*3600
    for user in user_set:
        ip_set = set()
        for i in range(0,7):
            timestamp = ts - 3600*24*i
            ip_result = r_cluster.hget('ip_'+str(timestamp), str(user))
            if ip_result:
                result_dict = json.loads(ip_result)
            else:
                result_dict = {}
            for ip in result_dict:
                ip_set.add(ip)
        if len(ip_set) < ip_threshold:
            results.append(user)
        else:
            writer.writerow([user, 'ip'])
    print 'after filter ip: ', len(results)
    return results
def filter_ip(user_set):
    results = []
    now_date = ts2datetime(time.time())
    now_date = '2013-09-08'
    ts = datetime2ts(now_date) - 24 * 3600
    for user in user_set:
        ip_set = set()
        for i in range(0, 7):
            timestamp = ts - 3600 * 24 * i
            ip_result = r_cluster.hget('ip_' + str(timestamp), str(user))
            if ip_result:
                result_dict = json.loads(ip_result)
            else:
                result_dict = {}
            for ip in result_dict:
                ip_set.add(ip)
        if len(ip_set) < ip_threshold:
            results.append(user)
        else:
            writer.writerow([user, 'ip'])
    print 'after filter ip: ', len(results)
    return results
def filter_ip(user_set):
    results = []
    #run_type
    if RUN_TYPE == 1:
        now_date = ts2datetime(time.time())
    else:
        now_date = RUN_TEST_TIME
    ts = datetime2ts(now_date) - DAY
    for user in user_set:
        ip_set = set()
        for i in range(0,7):
            timestamp = ts - DAY*i
            ip_result = r_cluster.hget('ip_'+str(timestamp), str(user))
            if ip_result:
                result_dict = json.loads(ip_result)
            else:
                result_dict = {}
            for ip in result_dict:
                ip_set.add(ip)
        if len(ip_set) < ip_threshold:
            results.append(user)
        else:
            writer.writerow([user, 'ip'])
    return results
Example #43
def filter_ip(user_set):
    results = []
    #run_type
    if RUN_TYPE == 1:
        now_date = ts2datetime(time.time())
    else:
        now_date = RUN_TEST_TIME
    ts = datetime2ts(now_date) - DAY
    for user in user_set:
        ip_set = set()
        for i in range(0, 7):
            timestamp = ts - DAY * i
            ip_result = r_cluster.hget('ip_' + str(timestamp), str(user))
            if ip_result:
                result_dict = json.loads(ip_result)
            else:
                result_dict = {}
            for ip in result_dict:
                ip_set.add(ip)
        if len(ip_set) < ip_threshold:
            results.append(user)
        else:
            writer.writerow([user, 'ip'])
    return results
def cal_ave_weibo():
    date = '2013-09-07'
    timestamp = datetime2ts(date)
    scan_count = 0
    scan_cursor = 0
    all_count = 0
    while 1:
        if scan_count == 1000000:
            break
        results = r_cluster.hscan('activity_'+str(timestamp), scan_cursor, count=1000)
        scan_cursor = results[0]
        scan_count += 1000
        for uid in results[1]:
            for i in range(0,1):
                ts = timestamp - 24*3600*i
                activity_dict_string = r_cluster.hget('activity_'+str(ts), uid)
                if activity_dict_string:
                    activity_dict = json.loads(activity_dict_string)
                    weibo_count = 0
                    for time_seg in activity_dict:
                        weibo_count += int(activity_dict[time_seg])
                    all_count += weibo_count
    ave_count = float(all_count) / scan_count
    print 'ave_count:', ave_count
Example #45
def filter_mention(user_set):
    results = []
    #run_type
    if RUN_TYPE == 1:
        now_date = ts2datetime(time.time())
    else:
        now_date = RUN_TEST_TIME
    timestamp = datetime2ts(now_date) - DAY
    date = ts2datetime(timestamp)
    for user in user_set:
        mention_set = set()
        for i in range(0, 7):
            ts = timestamp - DAY * i
            result = r_cluster.hget('at_' + str(ts), str(user))
            if result:
                item_dict = json.loads(result)
                for at_user in item_dict:
                    mention_set.add(at_user)
        at_count = len(mention_set)
        if at_count < mention_threshold:
            results.append(user)
        else:
            writer.writerow([user, 'mention'])
    return results
def filter_mention(user_set):
    results = []
    #run_type
    if RUN_TYPE == 1:
        now_date = ts2datetime(time.time())
    else:
        now_date = RUN_TEST_TIME
    timestamp = datetime2ts(now_date) - DAY
    date = ts2datetime(timestamp)
    for user in user_set:
        mention_set = set()
        for i in range(0,7):
            ts = timestamp - DAY*i
            result = r_cluster.hget('at_'+str(ts), str(user))
            if result:
                item_dict = json.loads(result)
                for at_user in item_dict:
                    mention_set.add(at_user)
        at_count = len(mention_set)
        if at_count < mention_threshold:
            results.append(user)
        else:
            writer.writerow([user, 'mention'])
    return results
Example #47
def test(ft_type):
    print ft_type
    if ft_type == 'facebook':
        index_name_pre = facebook_flow_text_index_name_pre
        index_type = facebook_flow_text_index_type
        user_index_name = facebook_user_index_name
        user_index_type = facebook_user_index_type
    else:
        index_name_pre = twitter_flow_text_index_name_pre
        index_type = twitter_flow_text_index_type
        user_index_name = twitter_user_index_name
        user_index_type = twitter_user_index_type

    # date_list = load_date_list(True)
    date_list = load_date_list()

    DFA = createWordTree()
    query_body = {
        'post_filter': {
            'missing': {
                'field': 'keywords_string'
            }
        },
        'query': {
            'filtered': {
                'filter': {
                    'bool': {
                        'must': [{
                            'range': {
                                'flag_ch': {
                                    'gte': -1
                                }
                            }
                        }]
                    }
                }
            }
        }
    }
    for date in date_list:
        count = 0
        bulk_action = []
        index_name = index_name_pre + date
        try:
            es_scan_results = scan(es,
                                   query=query_body,
                                   size=1000,
                                   index=index_name,
                                   doc_type=index_type)
            while True:
                try:
                    scan_data = es_scan_results.next()
                    item = scan_data['_source']
                    text = item['text_ch']
                    uid = item['uid']
                    if ft_type == 'facebook':
                        _id = item['fid']
                    else:
                        _id = item['tid']

                    ts = datetime2ts(date)
                    #add sentiment field to weibo

                    sentiment, keywords_list = triple_classifier(item)

                    #add key words to weibo
                    keywords_dict, keywords_string = get_weibo_keywords(
                        keywords_list)

                    #sensitive_words_dict
                    sensitive_words_dict = searchWord(
                        text.encode('utf-8', 'ignore'), DFA)
                    if sensitive_words_dict:
                        sensitive_words_string_data = "&".join(
                            sensitive_words_dict.keys())
                        sensitive_words_dict_data = json.dumps(
                            sensitive_words_dict)
                    else:
                        sensitive_words_string_data = ""
                        sensitive_words_dict_data = json.dumps({})

                    #redis
                    if sensitive_words_dict:
                        sensitive_count_string = r_cluster.hget(
                            'sensitive_' + str(ts), str(uid))
                        if sensitive_count_string:  # redis may return nothing for this uid/day
                            sensitive_count_dict = json.loads(
                                sensitive_count_string)
                            for word in sensitive_words_dict.keys():
                                if sensitive_count_dict.has_key(word):
                                    sensitive_count_dict[
                                        word] += sensitive_words_dict[word]
                                else:
                                    sensitive_count_dict[
                                        word] = sensitive_words_dict[word]
                            r_cluster.hset('sensitive_' + str(ts), str(uid),
                                           json.dumps(sensitive_count_dict))
                        else:
                            r_cluster.hset('sensitive_' + str(ts), str(uid),
                                           json.dumps(sensitive_words_dict))

                    #sensitive
                    sensitive_score = 0
                    if sensitive_words_dict:
                        for k, v in sensitive_words_dict.iteritems():
                            tmp_stage = r_sensitive.hget("sensitive_words", k)
                            if tmp_stage:
                                sensitive_score += v * sensitive_score_dict[
                                    str(tmp_stage)]

                    #directed_uid
                    directed_uid_data = 0
                    directed_uid, directed_uname = get_root_retweet(
                        text, uid, ft_type)
                    if directed_uid:
                        directed_uid_data = long(directed_uid)

                    # hashtag
                    hashtag = ''
                    RE = re.compile(
                        u'#([0-9a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+)[ ," =.。: :、]'
                    )
                    hashtag_list = re.findall(RE, text)
                    if hashtag_list:
                        hashtag = '&'.join(hashtag_list)

                    #action
                    action = {'update': {'_id': _id}}

                    # action_data
                    action_data = {
                        'sentiment': str(sentiment),
                        'keywords_dict': json.dumps(keywords_dict),
                        'keywords_string': keywords_string,
                        'sensitive_words_string': sensitive_words_string_data,
                        'sensitive_words_dict': sensitive_words_dict_data,
                        'sensitive': sensitive_score,
                        'directed_uid': directed_uid_data,
                        'directed_uname': directed_uname,
                        'hashtag': hashtag,
                    }

                    bulk_action.extend([action, {'doc': action_data}])
                    count += 1

                    if count % 1000 == 0 and count != 0:
                        if bulk_action:
                            es.bulk(bulk_action,
                                    index=index_name,
                                    doc_type=index_type,
                                    timeout=600)
                        bulk_action = []
                        count = 0
                except StopIteration:
                    break
            if bulk_action:

                es.bulk(bulk_action,
                        index=index_name,
                        doc_type=index_type,
                        timeout=600)
        except Exception, e:  # the ES index for this date may not exist
            print e
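# bulk_action in test() is an elasticsearch partial-update bulk body: a flat
# list alternating an action header with its partial document,
#     [{'update': {'_id': 'fid-or-tid'}}, {'doc': {'sentiment': '0', ...}},
#      {'update': {'_id': '...'}},        {'doc': {...}}]
# and each es.bulk(bulk_action, index=index_name, doc_type=index_type) call
# flushes at most 1000 such pairs.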
def cal_propage_work(item, sensitive_words):
    cluster_redis = R_CLUSTER_FLOW1
    user = str(item['uid'])
    uid = str(item['uid'])
    followers_count = item['user_fansnum']
    friends_count = item.get("user_friendsnum", 0)
    cluster_redis.hset(user, 'user_fansnum', followers_count)
    cluster_redis.hset(user, 'user_friendsnum', friends_count)

    retweeted_uid = str(item['root_uid'])
    retweeted_mid = str(item['root_mid'])

    message_type = int(item['message_type'])
    mid = str(item['mid'])
    timestamp = item['timestamp']
    text = item['text']

    sw_list = searchWord(text.encode('utf-8'))
    sensitive_result = len(sw_list)
    if sensitive_result:
        ts = ts2datetime(timestamp).replace('-','')
        map = {}
        for w in sw_list:
            word = "".join([chr(x) for x in w])
            word = word.decode('utf-8')
            if not map.__contains__(word):
                map[word] = 1
            else:
                map[word] += 1
        try:
            sensitive_count_string = r_cluster.hget('sensitive_'+str(ts), str(uid))
            sensitive_count_dict = json.loads(sensitive_count_string)
            for word in map:
                count = map[word]
                if sensitive_count_dict.__contains__(word):
                    sensitive_count_dict[word] += count
                else:
                    sensitive_count_dict[word] = count
            r_cluster.hset('sensitive_'+str(ts), str(uid), json.dumps(sensitive_count_dict))
        except:
            r_cluster.hset('sensitive_'+str(ts), str(uid), json.dumps(map))


    if message_type == 1: # origin weibo
        cluster_redis.sadd('user_set', user)
        if sensitive_result:
            cluster_redis.hset('s_'+user, mid + '_origin_weibo_timestamp', timestamp)
        else:
            cluster_redis.hset(user, mid + '_origin_weibo_timestamp', timestamp)

    elif message_type == 2: # comment weibo
        cluster_redis.sadd('user_set', user)
        if cluster_redis.sismember(user + '_comment_weibo', retweeted_mid) or cluster_redis.sismember('s_' + user + '_comment_weibo', retweeted_mid):
            return

        #RE = re.compile(u'//@([a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+):', re.UNICODE)
        #nicknames = RE.findall(text)

        if not sensitive_result:
            cluster_redis.sadd(user + '_comment_weibo', retweeted_mid)
            queue_index = get_queue_index(timestamp)
            cluster_redis.hincrby(user, 'comment_weibo', 1)

            if 1:
            #if len(nicknames) == 0:
                cluster_redis.hincrby(retweeted_uid, retweeted_mid + '_origin_weibo_comment', 1) 
                cluster_redis.hincrby(retweeted_uid, 'origin_weibo_comment_timestamp_%s' % queue_index, 1)
                cluster_redis.hset(retweeted_uid, retweeted_mid + '_origin_weibo_comment_timestamp', timestamp)
            """
            else:
                nick_id_ = nicknames[0]
                _id = single_redis.hget(NICK_UID_NAMESPACE, nick_id_)
                print _id
                single_redis.hset(ACTIVE_NICK_UID_NAMESPACE, nick_id_, _id)
                if _id:
                    cluster_redis.hincrby(str(_id), retweeted_mid + '_retweeted_weibo_comment', 1) 
                    cluster_redis.hincrby(str(_id), 'retweeted_weibo_comment_timestamp_%s' % queue_index, 1)
                    cluster_redis.hset(str(_id), retweeted_mid + '_retweeted_weibo_comment_timestamp', timestamp)
            """
        else:
            cluster_redis.sadd('s_' + user + '_comment_weibo', retweeted_mid)
            queue_index = get_queue_index(timestamp)
            cluster_redis.hincrby('s_'+user, 'comment_weibo', 1)

            if 1:
            #if len(nicknames) == 0:
                cluster_redis.hincrby('s_' + retweeted_uid, retweeted_mid + '_origin_weibo_comment', 1) 
                cluster_redis.hincrby('s_' + retweeted_uid, 'origin_weibo_comment_timestamp_%s' % queue_index, 1)
                cluster_redis.hset('s_' + retweeted_uid, retweeted_mid + '_origin_weibo_comment_timestamp', timestamp)
            """
            else:
                nick_id_ = nicknames[0]
                _id = single_redis.hget(NICK_UID_NAMESPACE, nick_id_)
                print _id
                single_redis.hset(ACTIVE_NICK_UID_NAMESPACE, nick_id_, _id)
                if _id:
                    cluster_redis.hincrby('s_' + str(_id), retweeted_mid + '_retweeted_weibo_comment', 1) 
                    cluster_redis.hincrby('s_' + str(_id), 'retweeted_weibo_comment_timestamp_%s' % queue_index, 1)
                    cluster_redis.hset('s_' + str(_id), retweeted_mid + '_retweeted_weibo_comment_timestamp', timestamp)
            """

    elif message_type == 3:
        cluster_redis.sadd('user_set', user)
        if cluster_redis.sismember(user + '_retweeted_weibo', retweeted_mid) or cluster_redis.sismember('s_' + user + '_retweeted_weibo', retweeted_mid):
            return
        """
        RE = re.compile(u'//@([a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+):', re.UNICODE)
        nicknames = RE.findall(text)
        """
        if not sensitive_result:
            cluster_redis.sadd(user + '_retweeted_weibo', retweeted_mid)
            cluster_redis.hset(user, retweeted_mid + '_retweeted_weibo_timestamp', timestamp) 
            queue_index = get_queue_index(timestamp)
            cluster_redis.hincrby(retweeted_uid, 'origin_weibo_retweeted_timestamp_%s' % queue_index, 1)    
            cluster_redis.hincrby(retweeted_uid, retweeted_mid + '_origin_weibo_retweeted', 1) 
            """
            if len(nicknames) != 0:
                for nick_id in nicknames:
                    _id = single_redis.hget(NICK_UID_NAMESPACE, nick_id)
                    print _id
                    single_redis.hset(ACTIVE_NICK_UID_NAMESPACE, nick_id, _id)
                    if _id:
                        cluster_redis.hincrby(str(_id), retweeted_mid+'_retweeted_weibo_retweeted', 1) 
                        cluster_redis.hset(str(_id), 'retweeted_weibo_retweeted_timestamp', timestamp)
                        cluster_redis.hincrby(str(_id), 'retweeted_weibo_retweeted_timestamp_%s' % queue_index, 1)
            """
        else:
            cluster_redis.sadd('s_' + user + '_retweeted_weibo', retweeted_mid)
            cluster_redis.hset('s_' + user, retweeted_mid + '_retweeted_weibo_timestamp', timestamp)
            queue_index = get_queue_index(timestamp)
            cluster_redis.hincrby('s_' +retweeted_uid, 'origin_weibo_retweeted_timestamp_%s' %queue_index, 1)
            cluster_redis.hincrby('s_' +retweeted_uid, retweeted_mid + '_origin_weibo_retweeted', 1)
            """
def cal_propage_work(item, sensitive_words):
    cluster_redis = R_CLUSTER_FLOW1
    user = str(item['uid'])
    followers_count = item['user_fansnum']
    friends_count = item.get("user_friendsnum", 0)
    cluster_redis.hset(user, 'user_fansnum', followers_count)
    cluster_redis.hset(user, 'user_friendsnum', friends_count)

    retweeted_uid = str(item['root_uid'])
    retweeted_mid = str(item['root_mid'])

    message_type = int(item['message_type'])
    mid = str(item['mid'])
    timestamp = item['timestamp']
    text = item['text']

    sw_list = searchWord(text.encode('utf-8'))
    sensitive_result = len(sw_list)
    if sensitive_result:
        date = ts2datetime(timestamp)
        ts = datetime2ts(date)
        word_count = {}
        # tally how many times each sensitive word appears in this weibo
        for w in sw_list:
            word = "".join([chr(x) for x in w])
            word = word.decode('utf-8')
            print word
            if word not in word_count:
                word_count[word] = 1
            else:
                word_count[word] += 1
        try:
            # merge today's counts into the per-user hash field
            sensitive_count_string = r_cluster.hget('sensitive_'+str(ts), user)
            sensitive_count_dict = json.loads(sensitive_count_string)
            for word, count in word_count.iteritems():
                if word in sensitive_count_dict:
                    sensitive_count_dict[word] += count
                else:
                    sensitive_count_dict[word] = count
            r_cluster.hset('sensitive_'+str(ts), user, json.dumps(sensitive_count_dict))
        except:
            # no existing counts for this user today (or the field held no valid JSON)
            r_cluster.hset('sensitive_'+str(ts), user, json.dumps(word_count))

    if message_type == 1:
        cluster_redis.sadd('user_set', user)
        if sensitive_result:
            cluster_redis.hset('s_'+user, mid + '_origin_weibo_timestamp', timestamp)
        else:
            cluster_redis.hset(user, mid + '_origin_weibo_timestamp', timestamp)

    elif message_type == 2: # comment weibo
        cluster_redis.sadd('user_set', user)
        if cluster_redis.sismember(user + '_comment_weibo', retweeted_mid) or cluster_redis.sismember('s_' + user + '_comment_weibo', retweeted_mid):
            return

        #RE = re.compile(u'//@([a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+):', re.UNICODE)
        #nicknames = RE.findall(text)

        if not sensitive_result:
            cluster_redis.sadd(user + '_comment_weibo', retweeted_mid)
            queue_index = get_queue_index(timestamp)
            cluster_redis.hincrby(user, 'comment_weibo', 1)

            if 1:
            #if len(nicknames) == 0:
                cluster_redis.hincrby(retweeted_uid, retweeted_mid + '_origin_weibo_comment', 1) 
                cluster_redis.hincrby(retweeted_uid, 'origin_weibo_comment_timestamp_%s' % queue_index, 1)
                cluster_redis.hset(retweeted_uid, retweeted_mid + '_origin_weibo_comment_timestamp', timestamp)
            """
            else:
                nick_id_ = nicknames[0]
                _id = single_redis.hget(NICK_UID_NAMESPACE, nick_id_)
                print _id
                single_redis.hset(ACTIVE_NICK_UID_NAMESPACE, nick_id_, _id)
                if _id:
                    cluster_redis.hincrby(str(_id), retweeted_mid + '_retweeted_weibo_comment', 1) 
                    cluster_redis.hincrby(str(_id), 'retweeted_weibo_comment_timestamp_%s' % queue_index, 1)
                    cluster_redis.hset(str(_id), retweeted_mid + '_retweeted_weibo_comment_timestamp', timestamp)
            """
        else:
            cluster_redis.sadd('s_' + user + '_comment_weibo', retweeted_mid)
            queue_index = get_queue_index(timestamp)
            cluster_redis.hincrby('s_'+user, 'comment_weibo', 1)

            if 1:
            #if len(nicknames) == 0:
                cluster_redis.hincrby('s_' + retweeted_uid, retweeted_mid + '_origin_weibo_comment', 1) 
                cluster_redis.hincrby('s_' + retweeted_uid, 'origin_weibo_comment_timestamp_%s' % queue_index, 1)
                cluster_redis.hset('s_' + retweeted_uid, retweeted_mid + '_origin_weibo_comment_timestamp', timestamp)
            """
            else:
                nick_id_ = nicknames[0]
                _id = single_redis.hget(NICK_UID_NAMESPACE, nick_id_)
                print _id
                single_redis.hset(ACTIVE_NICK_UID_NAMESPACE, nick_id_, _id)
                if _id:
                    cluster_redis.hincrby('s_' + str(_id), retweeted_mid + '_retweeted_weibo_comment', 1) 
                    cluster_redis.hincrby('s_' + str(_id), 'retweeted_weibo_comment_timestamp_%s' % queue_index, 1)
                    cluster_redis.hset('s_' + str(_id), retweeted_mid + '_retweeted_weibo_comment_timestamp', timestamp)
            """

    elif message_type == 3:
        cluster_redis.sadd('user_set', user)
        if cluster_redis.sismember(user + '_retweeted_weibo', retweeted_mid) or cluster_redis.sismember('s_' + user + '_retweeted_weibo', retweeted_mid):
            return
        """
        RE = re.compile(u'//@([a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+):', re.UNICODE)
        nicknames = RE.findall(text)
        """
        if not sensitive_result:
            cluster_redis.sadd(user + '_retweeted_weibo', retweeted_mid)
            cluster_redis.hset(user, retweeted_mid + '_retweeted_weibo_timestamp', timestamp) 
            queue_index = get_queue_index(timestamp)
            cluster_redis.hincrby(retweeted_uid, 'origin_weibo_retweeted_timestamp_%s' % queue_index, 1)    
            cluster_redis.hincrby(retweeted_uid, retweeted_mid + '_origin_weibo_retweeted', 1) 
            """
            if len(nicknames) != 0:
                for nick_id in nicknames:
                    _id = single_redis.hget(NICK_UID_NAMESPACE, nick_id)
                    print _id
                    single_redis.hset(ACTIVE_NICK_UID_NAMESPACE, nick_id, _id)
                    if _id:
                        cluster_redis.hincrby(str(_id), retweeted_mid+'_retweeted_weibo_retweeted', 1) 
                        cluster_redis.hset(str(_id), 'retweeted_weibo_retweeted_timestamp', timestamp)
                        cluster_redis.hincrby(str(_id), 'retweeted_weibo_retweeted_timestamp_%s' % queue_index, 1)
            """
        else:
            cluster_redis.sadd('s_' + user + '_retweeted_weibo', retweeted_mid)
            cluster_redis.hset('s_' + user, retweeted_mid + '_retweeted_weibo_timestamp', timestamp)
            queue_index = get_queue_index(timestamp)
            cluster_redis.hincrby('s_' +retweeted_uid, 'origin_weibo_retweeted_timestamp_%s' %queue_index, 1)
            cluster_redis.hincrby('s_' +retweeted_uid, retweeted_mid + '_origin_weibo_retweeted', 1)
            """
示例#50
            item['keywords_string'] = keywords_string         # use to search

            sensitive_words_dict = searchWord(text.encode('utf-8', 'ignore'), DFA)
            if sensitive_words_dict:
                item['sensitive_words_string'] = "&".join(sensitive_words_dict.keys())
                item['sensitive_words_dict'] = json.dumps(sensitive_words_dict)
            else:
                item['sensitive_words_string'] = ""
                item['sensitive_words_dict'] = json.dumps({})

            timestamp = item['timestamp']
            date = ts2datetime(timestamp)
            ts = datetime2ts(date)
            if sensitive_words_dict:
                print sensitive_words_dict.keys()[0]
                sensitive_count_string = r_cluster.hget('sensitive_'+str(ts), str(uid))
                if sensitive_count_string:  # redis may have nothing stored for this user yet
                    sensitive_count_dict = json.loads(sensitive_count_string)
                    for word in sensitive_words_dict.keys():
                        if word in sensitive_count_dict:
                            sensitive_count_dict[word] += sensitive_words_dict[word]
                        else:
                            sensitive_count_dict[word] = sensitive_words_dict[word]
                    r_cluster.hset('sensitive_'+str(ts), str(uid), json.dumps(sensitive_count_dict))
                else:
                    r_cluster.hset('sensitive_'+str(ts), str(uid), json.dumps(sensitive_words_dict))

            # decide whether this weibo should be mapped to a new ES index
            weibo_timestamp = item['timestamp']
            should_index_name_date = ts2datetime(weibo_timestamp)
            if should_index_name_date != now_index_name_date:
示例#51
                sensitive_words_dict = searchWord(
                    text.encode('utf-8', 'ignore'), DFA)
                if sensitive_words_dict:
                    item['sensitive_words_string'] = "&".join(
                        sensitive_words_dict.keys())
                    item['sensitive_words_dict'] = json.dumps(
                        sensitive_words_dict)
                else:
                    item['sensitive_words_string'] = ""
                    item['sensitive_words_dict'] = json.dumps({})

                timestamp = item['timestamp']
                date = ts2datetime(timestamp)
                ts = datetime2ts(date)
                if sensitive_words_dict:
                    #print 'sensitive_words_dict...keys[0]...',sensitive_words_dict.keys()[0]
                    sensitive_count_string = r_cluster.hget(
                        'sensitive_' + str(ts), str(uid))
                    if sensitive_count_string:  # redis may have nothing stored for this user yet
                        sensitive_count_dict = json.loads(
                            sensitive_count_string)
                        for word in sensitive_words_dict.keys():
                            if word in sensitive_count_dict:
                                sensitive_count_dict[word] += sensitive_words_dict[word]
                            else:
                                sensitive_count_dict[word] = sensitive_words_dict[word]
                        r_cluster.hset('sensitive_' + str(ts), str(uid),
                                       json.dumps(sensitive_count_dict))
                    else:
                        r_cluster.hset('sensitive_' + str(ts), str(uid),
                                       json.dumps(sensitive_words_dict))