# -*- coding: utf-8 -*-
# Python 2 code. Besides the stdlib modules imported below, these helpers rely
# on project-level globals defined elsewhere in the repository: r_cluster /
# R_CLUSTER_FLOW1 (redis-py style clients), ts2datetime / datetime2ts
# (epoch <-> 'YYYY-MM-DD' converters), and utilities such as searchWord,
# get_queue_index, SENSITIVE_WORD and triple_classifier.
import re
import json


def cal_sensitive_words_work(item, sw_list):
    # Accumulate per-user sensitive-word counts for the day the weibo was posted.
    timestamp = item['timestamp']
    uid = item['uid']
    # Day key in 'YYYYMMDD' form.
    ts = ts2datetime(timestamp).replace('-', '')
    map = {}
    for w in sw_list:
        # searchWord yields each hit as a sequence of byte values; rebuild the
        # word and decode it to unicode before counting.
        word = "".join([chr(x) for x in w])
        word = word.decode('utf-8')
        if word not in map:
            map[word] = 1
        else:
            map[word] += 1
    try:
        # Merge today's counts into the JSON blob stored at hash
        # 'sensitive_<YYYYMMDD>', field <uid>.
        sensitive_count_string = r_cluster.hget('sensitive_' + str(ts), str(uid))
        sensitive_count_dict = json.loads(sensitive_count_string)
        for word in map:
            count = map[word]
            if word in sensitive_count_dict:
                sensitive_count_dict[word] += count
            else:
                sensitive_count_dict[word] = count
        r_cluster.hset('sensitive_' + str(ts), str(uid), json.dumps(sensitive_count_dict))
    except:
        # First write of the day for this user (hget returned None).
        r_cluster.hset('sensitive_' + str(ts), str(uid), json.dumps(map))
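# Every counter in this listing repeats the same read-modify-write pattern:
# hget a JSON dict, add the new counts, hset it back, and if the field was
# empty just write the fresh counts. A minimal sketch of that pattern, assuming
# `client` is any redis-py style client such as r_cluster (this helper is
# illustrative and not part of the original code):
def merge_counts_in_hash(client, hash_key, field, new_counts):
    raw = client.hget(hash_key, field)
    counts = json.loads(raw) if raw else {}
    for key, value in new_counts.items():
        counts[key] = counts.get(key, 0) + value
    client.hset(hash_key, field, json.dumps(counts))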
def cal_text_work(item):
    # Extract #hashtag# marks from the weibo text and accumulate per-user,
    # per-day counts in the 'hashtag_<ts>' hash (ts = the day's midnight epoch).
    uid = item['uid']
    timestamp = item['timestamp']
    date = ts2datetime(timestamp)
    ts = datetime2ts(date)
    text = item['text']
    if isinstance(text, str):
        text = text.decode('utf-8', 'ignore')
    # Hashtags are wrapped in '#...#'; the character class covers ASCII letters,
    # '-', '_' and the common CJK ranges.
    RE = re.compile(u'#([a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+)#', re.UNICODE)
    hashtag_list = RE.findall(text)
    if hashtag_list:
        # all hashtags are unicode strings from here on
        hashtag_dict = dict()
        for hashtag in hashtag_list:
            try:
                hashtag_dict[hashtag] += 1
            except:
                hashtag_dict[hashtag] = 1
        try:
            hashtag_count_string = r_cluster.hget('hashtag_'+str(ts), str(uid))
            hashtag_count_dict = json.loads(hashtag_count_string)
            for hashtag in hashtag_dict:
                count = hashtag_dict[hashtag]
                try:
                    hashtag_count_dict[hashtag] += count
                except:
                    hashtag_count_dict[hashtag] = count
            r_cluster.hset('hashtag_'+str(ts), str(uid), json.dumps(hashtag_count_dict))
        except:
            # no existing counts for this user/day
            r_cluster.hset('hashtag_'+str(ts), str(uid), json.dumps(hashtag_dict))
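# A quick, made-up illustration of what the pattern above extracts (RE is the
# compiled expression from cal_text_work; the sample text is invented):
#   RE.findall(u'转发微博 #春节# some text #test_tag#')
#   -> [u'春节', u'test_tag']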
def cal_text_sensitive(item):
    # Naive substring scan of the weibo text against the global SENSITIVE_WORD
    # list; matched words are counted per user per day under 'sensitive_<ts>'.
    text = item['text']
    uid = item['uid']
    timestamp = item['timestamp']
    date = ts2datetime(timestamp)
    ts = datetime2ts(date)
    if isinstance(text, str):
        text = text.decode('utf-8', 'ignore')
    sensitive_result = [word for word in SENSITIVE_WORD if word in text]
    if sensitive_result:
        sensitive_dict = dict()
        for word in sensitive_result:
            try:
                sensitive_dict[word] += 1
            except:
                sensitive_dict[word] = 1
        try:
            sensitive_count_string = r_cluster.hget('sensitive_'+str(ts), str(uid))
            sensitive_count_dict = json.loads(sensitive_count_string)
            for word in sensitive_dict:
                count = sensitive_dict[word]
                try:
                    sensitive_count_dict[word] += count
                except:
                    sensitive_count_dict[word] = count
            r_cluster.hset('sensitive_'+str(ts), str(uid), json.dumps(sensitive_count_dict))
        except:
            r_cluster.hset('sensitive_'+str(ts), str(uid), json.dumps(sensitive_dict))
def save_activity(uid, ts, time_segment):
    # Count one post in the user's activity histogram for the given day (ts is
    # the day's timestamp) and intra-day time_segment; stored as a JSON dict at
    # hash 'activity_<ts>', field <uid>.
    key = str(ts)
    try:
        activity_count_dict = r_cluster.hget('activity_' + key, str(uid))
        activity_count_dict = json.loads(activity_count_dict)
        try:
            activity_count_dict[str(time_segment)] += 1
        except:
            activity_count_dict[str(time_segment)] = 1
        r_cluster.hset('activity_' + key, str(uid), json.dumps(activity_count_dict))
    except:
        r_cluster.hset('activity_' + key, str(uid), json.dumps({str(time_segment): 1}))
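# Reading the histogram back is symmetric; a minimal sketch (this reader is not
# in the original code, and `client` stands for the same r_cluster):
def load_activity(client, day_ts, uid):
    raw = client.hget('activity_' + str(day_ts), str(uid))
    # keys are time segments (as strings), values are post counts
    return json.loads(raw) if raw else {}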
def save_city(uid, ip, timestamp, sensitive):
    # Count the posting IP for this user and day; sensitive weibos go to the
    # parallel 'sensitive_ip_<YYYYMMDD>' hash, all others to 'ip_<YYYYMMDD>'.
    ts = ts2datetime(timestamp).replace('-', '')
    key = str(uid)
    try:
        if sensitive:
            ip_count_string = r_cluster.hget('sensitive_ip_'+str(ts), str(uid))
        else:
            ip_count_string = r_cluster.hget('ip_'+str(ts), str(uid))

        ip_count_dict = json.loads(ip_count_string)

        try:
            ip_count_dict[str(ip)] += 1
        except:
            ip_count_dict[str(ip)] = 1

        if sensitive:
            r_cluster.hset('sensitive_ip_'+str(ts), str(uid), json.dumps(ip_count_dict))
        else:
            r_cluster.hset('ip_'+str(ts), str(uid), json.dumps(ip_count_dict))

    except:
        if sensitive:
            r_cluster.hset('sensitive_ip_'+str(ts), str(uid), json.dumps({str(ip):1}))
        else:
            r_cluster.hset('ip_'+str(ts), str(uid), json.dumps({str(ip):1}))
def save_at(uid, at_uid, timestamp, sensitive):
    # Count who this user @-mentions; mentions made in sensitive weibos are
    # tracked separately under 'sensitive_at_<YYYYMMDD>'.
    ts = ts2datetime(timestamp).replace('-', '')
    key = str(uid)
    try:
        if sensitive:
            ruid_count_string = r_cluster.hget('sensitive_at_' + str(ts),
                                               str(uid))
        else:
            ruid_count_string = r_cluster.hget('at_' + str(ts), str(uid))

        ruid_count_dict = json.loads(ruid_count_string)
        try:
            ruid_count_dict[str(at_uid)] += 1
        except:
            ruid_count_dict[str(at_uid)] = 1
        if sensitive:
            r_cluster.hset('sensitive_at_' + str(ts), str(uid),
                           json.dumps(ruid_count_dict))
        else:
            r_cluster.hset('at_' + str(ts), str(uid),
                           json.dumps(ruid_count_dict))

    except:
        if sensitive:
            r_cluster.hset('sensitive_at_' + str(ts), str(uid),
                           json.dumps({str(at_uid): 1}))
        else:
            r_cluster.hset('at_' + str(ts), str(uid),
                           json.dumps({str(at_uid): 1}))
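# The 'sensitive_' prefix convention above is shared by every counter in this
# listing; a tiny helper like this (illustrative, not in the original code)
# captures the key scheme for the @-mention hashes:
def at_hash_key(ts, sensitive):
    return ('sensitive_at_' if sensitive else 'at_') + str(ts)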
def save_city(uid, ip, timestamp):
    # Variant of save_city keyed by the day's midnight epoch instead of 'YYYYMMDD'.
    date = ts2datetime(timestamp)
    ts = datetime2ts(date)
    key = str(uid)
    try:
        ip_count_string = r_cluster.hget('ip_'+str(ts), str(uid))
        ip_count_dict = json.loads(ip_count_string)
        try:
            ip_count_dict[str(ip)] += 1
        except:
            ip_count_dict[str(ip)] = 1
        r_cluster.hset('ip_'+str(ts), str(uid), json.dumps(ip_count_dict))
    except:
        r_cluster.hset('ip_'+str(ts), str(uid), json.dumps({str(ip):1}))
def save_at(uid, at_uid, timestamp):
    # Non-sensitive variant: @-mention counts keyed by the day's midnight epoch.
    date = ts2datetime(timestamp)
    ts = datetime2ts(date)
    key = str(uid)
    try:
        ruid_count_string = r_cluster.hget('at_'+str(ts), str(uid))
        ruid_count_dict = json.loads(ruid_count_string)
        try:
            ruid_count_dict[str(at_uid)] += 1
        except:
            ruid_count_dict[str(at_uid)] = 1
        r_cluster.hset('at_'+str(ts), str(uid), json.dumps(ruid_count_dict))
    except:
        r_cluster.hset('at_'+str(ts), str(uid), json.dumps({str(at_uid):1}))
def save_city_timestamp(uid, ip, timestamp):
    # Record every posting timestamp per IP (as an '&'-joined string) so a
    # user's IP history for the day can be replayed later.
    date = ts2datetime(timestamp)
    ts = datetime2ts(date)
    try:
        ip_timestamp_string = r_cluster.hget('new_ip_'+str(ts), str(uid))
        ip_timestamp_string_dict = json.loads(ip_timestamp_string)
        try:
            add_string = '&'+str(timestamp)
            ip_timestamp_string_dict[str(ip)] += add_string
        except:
            ip_timestamp_string_dict[str(ip)] = str(timestamp)
        r_cluster.hset('new_ip_'+str(ts), str(uid), json.dumps(ip_timestamp_string_dict))

    except:
        r_cluster.hset('new_ip_'+str(ts), str(uid), json.dumps({str(ip): str(timestamp)}))
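# Decoding the stored value back (not part of the original code): each field
# holds a JSON dict mapping ip -> '&'-joined epoch timestamps.
def load_ip_timestamps(raw):
    if not raw:
        return {}
    data = json.loads(raw)
    return dict((ip, ts_string.split('&')) for ip, ts_string in data.items())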
def cal_hashtag_work(item, sensitive):
    # Same hashtag extraction as cal_text_work, but with a sensitive/normal
    # split: counts go to 'sensitive_hashtag_<YYYYMMDD>' or 'hashtag_<YYYYMMDD>'.
    text = item['text']
    uid = item['uid']
    timestamp = item['timestamp']
    ts = ts2datetime(timestamp).replace('-', '')

    if isinstance(text, str):
        text = text.decode('utf-8', 'ignore')
    RE = re.compile(u'#([a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+)#',
                    re.UNICODE)
    hashtag_list = RE.findall(text)
    if hashtag_list:
        hashtag_dict = {}
        for hashtag in hashtag_list:
            try:
                hashtag_dict[hashtag] += 1
            except:
                hashtag_dict[hashtag] = 1

        try:
            if sensitive:
                hashtag_count_string = r_cluster.hget(
                    'sensitive_hashtag_' + str(ts), str(uid))
            else:
                hashtag_count_string = r_cluster.hget('hashtag_' + str(ts),
                                                      str(uid))
            hashtag_count_dict = json.loads(hashtag_count_string)
            for hashtag in hashtag_dict:
                count = hashtag_dict[hashtag]
                try:
                    hashtag_count_dict[hashtag] += count
                except:
                    hashtag_count_dict[hashtag] = count
            if sensitive:
                r_cluster.hset('sensitive_hashtag_' + str(ts), str(uid),
                               json.dumps(hashtag_count_dict))
            else:
                r_cluster.hset('hashtag_' + str(ts), str(uid),
                               json.dumps(hashtag_count_dict))
        except:
            if sensitive:
                r_cluster.hset('sensitive_hashtag_' + str(ts), str(uid),
                               json.dumps(hashtag_dict))
            else:
                r_cluster.hset('hashtag_' + str(ts), str(uid),
                               json.dumps(hashtag_dict))
# --- excerpt from a larger stream-processing loop; the enclosing function and
# the lines preceding this point are not included in the listing ---
                ts = datetime2ts(date)
                if sensitive_words_dict:
                    #print 'sensitive_words_dict...keys[0]...',sensitive_words_dict.keys()[0]
                    sensitive_count_string = r_cluster.hget(
                        'sensitive_' + str(ts), str(uid))
                    if sensitive_count_string:  # hget returns None when the field is missing
                        sensitive_count_dict = json.loads(
                            sensitive_count_string)
                        for word in sensitive_words_dict.keys():
                            if sensitive_count_dict.has_key(word):
                                sensitive_count_dict[
                                    word] += sensitive_words_dict[word]
                            else:
                                sensitive_count_dict[
                                    word] = sensitive_words_dict[word]
                        r_cluster.hset('sensitive_' + str(ts), str(uid),
                                       json.dumps(sensitive_count_dict))
                    else:
                        r_cluster.hset('sensitive_' + str(ts), str(uid),
                                       json.dumps(sensitive_words_dict))

                # decide whether the document should be routed to a new ES index
                weibo_timestamp = item['timestamp']
                #should_index_name_date = ts2datetime(weibo_timestamp)
                # if should_index_name_date != now_index_name_date:
                if action != [] and xdata != []:
                    #index_name = index_name_pre + now_index_name_date
                    if bulk_action:
                        es.bulk(bulk_action,
                                index=index_name,
                                doc_type=index_type,
                                timeout=60)
def cal_propage_work(item, sensitive_words):
    # Propagation bookkeeping for one weibo: per-user fan/friend counts plus
    # origin / comment / retweet counters, with an 's_'-prefixed shadow set of
    # keys used when the weibo contains sensitive words.
    cluster_redis = R_CLUSTER_FLOW1
    user = str(item['uid'])
    uid = user
    followers_count = item['user_fansnum']
    friends_count = item.get("user_friendsnum", 0)
    cluster_redis.hset(user, 'user_fansnum', followers_count)
    cluster_redis.hset(user, 'user_friendsnum', friends_count)

    retweeted_uid = str(item['root_uid'])
    retweeted_mid = str(item['root_mid'])

    message_type = int(item['message_type'])
    mid = str(item['mid'])
    timestamp = item['timestamp']
    text = item['text']

    sw_list = searchWord(text.encode('utf-8'))
    sensitive_result = len(sw_list)
    if sensitive_result:
        date = ts2datetime(timestamp)
        ts = datetime2ts(date)
        map = {}
        for w in sw_list:
            word = "".join([chr(x) for x in w])
            word = word.decode('utf-8')
            print word
            if not map.__contains__(word):
                map[word] = 1
            else:
                map[word] += 1
        try:
            sensitive_count_string = r_cluster.hget('sensitive_'+str(ts), str(uid))
            sensitive_count_dict = json.loads(sensitive_count_string)
            for word in map:
                count = map[word]
                if sensitive_count_dict.__contains__(word):
                    sensitive_count_dict[word] += count
                else:
                    sensitive_count_dict[word] = count
            r_cluster.hset('sensitive_'+str(ts), str(uid), json.dumps(sensitive_count_dict))
        except:
            r_cluster.hset('sensitive_'+str(ts), str(uid), json.dumps(map))

    if message_type == 1:  # original weibo
        cluster_redis.sadd('user_set', user)
        if sensitive_result:
            cluster_redis.hset('s_'+user, mid + '_origin_weibo_timestamp', timestamp)
        else:
            cluster_redis.hset(user, mid + '_origin_weibo_timestamp', timestamp)

    elif message_type == 2: # comment weibo
        cluster_redis.sadd('user_set', user)
        if cluster_redis.sismember(user + '_comment_weibo', retweeted_mid) or cluster_redis.sismember('s_' + user + '_comment_weibo', retweeted_mid):
            return

        #RE = re.compile(u'//@([a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+):', re.UNICODE)
        #nicknames = RE.findall(text)

        if not sensitive_result:
            cluster_redis.sadd(user + '_comment_weibo', retweeted_mid)
            queue_index = get_queue_index(timestamp)
            cluster_redis.hincrby(user, 'comment_weibo', 1)

            if 1:
            #if len(nicknames) == 0:
                cluster_redis.hincrby(retweeted_uid, retweeted_mid + '_origin_weibo_comment', 1) 
                cluster_redis.hincrby(retweeted_uid, 'origin_weibo_comment_timestamp_%s' % queue_index, 1)
                cluster_redis.hset(retweeted_uid, retweeted_mid + '_origin_weibo_comment_timestamp', timestamp)
            """
            else:
                nick_id_ = nicknames[0]
                _id = single_redis.hget(NICK_UID_NAMESPACE, nick_id_)
                print _id
                single_redis.hset(ACTIVE_NICK_UID_NAMESPACE, nick_id_, _id)
                if _id:
                    cluster_redis.hincrby(str(_id), retweeted_mid + '_retweeted_weibo_comment', 1) 
                    cluster_redis.hincrby(str(_id), 'retweeted_weibo_comment_timestamp_%s' % queue_index, 1)
                    cluster_redis.hset(str(_id), retweeted_mid + '_retweeted_weibo_comment_timestamp', timestamp)
            """
        else:
            cluster_redis.sadd('s_' + user + '_comment_weibo', retweeted_mid)
            queue_index = get_queue_index(timestamp)
            cluster_redis.hincrby('s_'+user, 'comment_weibo', 1)

            if 1:
            #if len(nicknames) == 0:
                cluster_redis.hincrby('s_' + retweeted_uid, retweeted_mid + '_origin_weibo_comment', 1) 
                cluster_redis.hincrby('s_' + retweeted_uid, 'origin_weibo_comment_timestamp_%s' % queue_index, 1)
                cluster_redis.hset('s_' + retweeted_uid, retweeted_mid + '_origin_weibo_comment_timestamp', timestamp)
            """
            else:
                nick_id_ = nicknames[0]
                _id = single_redis.hget(NICK_UID_NAMESPACE, nick_id_)
                print _id
                single_redis.hset(ACTIVE_NICK_UID_NAMESPACE, nick_id_, _id)
                if _id:
                    cluster_redis.hincrby('s_' + str(_id), retweeted_mid + '_retweeted_weibo_comment', 1) 
                    cluster_redis.hincrby('s_' + str(_id), 'retweeted_weibo_comment_timestamp_%s' % queue_index, 1)
                    cluster_redis.hset('s_' + str(_id), retweeted_mid + '_retweeted_weibo_comment_timestamp', timestamp)
            """

    elif message_type == 3:  # retweet
        cluster_redis.sadd('user_set', user)
        if cluster_redis.sismember(user + '_retweeted_weibo', retweeted_mid) or cluster_redis.sismember('s_' + user + '_retweeted_weibo', retweeted_mid):
            return
        """
        RE = re.compile(u'//@([a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+):', re.UNICODE)
        nicknames = RE.findall(text)
        """
        if not sensitive_result:
            cluster_redis.sadd(user + '_retweeted_weibo', retweeted_mid)
            cluster_redis.hset(user, retweeted_mid + '_retweeted_weibo_timestamp', timestamp) 
            queue_index = get_queue_index(timestamp)
            cluster_redis.hincrby(retweeted_uid, 'origin_weibo_retweeted_timestamp_%s' % queue_index, 1)    
            cluster_redis.hincrby(retweeted_uid, retweeted_mid + '_origin_weibo_retweeted', 1) 
            """
            if len(nicknames) != 0:
                for nick_id in nicknames:
                    _id = single_redis.hget(NICK_UID_NAMESPACE, nick_id)
                    print _id
                    single_redis.hset(ACTIVE_NICK_UID_NAMESPACE, nick_id, _id)
                    if _id:
                        cluster_redis.hincrby(str(_id), retweeted_mid+'_retweeted_weibo_retweeted', 1) 
                        cluster_redis.hset(str(_id), 'retweeted_weibo_retweeted_timestamp', timestamp)
                        cluster_redis.hincrby(str(_id), 'retweeted_weibo_retweeted_timestamp_%s' % queue_index, 1)
            """
        else:
            cluster_redis.sadd('s_' + user + '_retweeted_weibo', retweeted_mid)
            cluster_redis.hset('s_' + user, retweeted_mid + '_retweeted_weibo_timestamp', timestamp)
            queue_index = get_queue_index(timestamp)
            cluster_redis.hincrby('s_' +retweeted_uid, 'origin_weibo_retweeted_timestamp_%s' %queue_index, 1)
            cluster_redis.hincrby('s_' +retweeted_uid, retweeted_mid + '_origin_weibo_retweeted', 1)
            """
def test(ft_type):
    # Backfill job: for each dated flow-text index (facebook or twitter), scan
    # documents that still lack 'keywords_string', compute sentiment, keywords,
    # sensitive words, hashtags and the directed uid, bulk-update ES, and
    # refresh the per-user sensitive-word counters in redis.
    print ft_type
    if ft_type == 'facebook':
        index_name_pre = facebook_flow_text_index_name_pre
        index_type = facebook_flow_text_index_type
        user_index_name = facebook_user_index_name
        user_index_type = facebook_user_index_type
    else:
        index_name_pre = twitter_flow_text_index_name_pre
        index_type = twitter_flow_text_index_type
        user_index_name = twitter_user_index_name
        user_index_type = twitter_user_index_type

    # date_list = load_date_list(True)
    date_list = load_date_list()

    DFA = createWordTree()
    query_body = {
        'post_filter': {
            'missing': {
                'field': 'keywords_string'
            }
        },
        'query': {
            'filtered': {
                'filter': {
                    'bool': {
                        'must': [{
                            'range': {
                                'flag_ch': {
                                    'gte': -1
                                }
                            }
                        }]
                    }
                }
            }
        }
    }
    for date in date_list:
        count = 0
        bulk_action = []
        index_name = index_name_pre + date
        try:
            es_scan_results = scan(es,
                                   query=query_body,
                                   size=1000,
                                   index=index_name,
                                   doc_type=index_type)
            while True:
                try:
                    scan_data = es_scan_results.next()
                    item = scan_data['_source']
                    text = item['text_ch']
                    uid = item['uid']
                    if ft_type == 'facebook':
                        _id = item['fid']
                    else:
                        _id = item['tid']

                    ts = datetime2ts(date)
                    #add sentiment field to weibo

                    sentiment, keywords_list = triple_classifier(item)

                    #add key words to weibo
                    keywords_dict, keywords_string = get_weibo_keywords(
                        keywords_list)

                    #sensitive_words_dict
                    sensitive_words_dict = searchWord(
                        text.encode('utf-8', 'ignore'), DFA)
                    if sensitive_words_dict:
                        sensitive_words_string_data = "&".join(
                            sensitive_words_dict.keys())
                        sensitive_words_dict_data = json.dumps(
                            sensitive_words_dict)
                    else:
                        sensitive_words_string_data = ""
                        sensitive_words_dict_data = json.dumps({})

                    #redis
                    if sensitive_words_dict:
                        sensitive_count_string = r_cluster.hget(
                            'sensitive_' + str(ts), str(uid))
                        if sensitive_count_string:  # hget returns None when the field is missing
                            sensitive_count_dict = json.loads(
                                sensitive_count_string)
                            for word in sensitive_words_dict.keys():
                                if sensitive_count_dict.has_key(word):
                                    sensitive_count_dict[
                                        word] += sensitive_words_dict[word]
                                else:
                                    sensitive_count_dict[
                                        word] = sensitive_words_dict[word]
                            r_cluster.hset('sensitive_' + str(ts), str(uid),
                                           json.dumps(sensitive_count_dict))
                        else:
                            r_cluster.hset('sensitive_' + str(ts), str(uid),
                                           json.dumps(sensitive_words_dict))

                    #sensitive
                    sensitive_score = 0
                    if sensitive_words_dict:
                        for k, v in sensitive_words_dict.iteritems():
                            tmp_stage = r_sensitive.hget("sensitive_words", k)
                            if tmp_stage:
                                sensitive_score += v * sensitive_score_dict[
                                    str(tmp_stage)]

                    #directed_uid
                    directed_uid_data = 0
                    directed_uid, directed_uname = get_root_retweet(
                        text, uid, ft_type)
                    if directed_uid:
                        directed_uid_data = long(directed_uid)

                    # hashtag
                    hashtag = ''
                    RE = re.compile(
                        u'#([0-9a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+)[ ," =.。: :、]'
                    )
                    hashtag_list = re.findall(RE, text)
                    if hashtag_list:
                        hashtag = '&'.join(hashtag_list)

                    #action
                    action = {'update': {'_id': _id}}

                    # action_data
                    action_data = {
                        'sentiment': str(sentiment),
                        'keywords_dict': json.dumps(keywords_dict),
                        'keywords_string': keywords_string,
                        'sensitive_words_string': sensitive_words_string_data,
                        'sensitive_words_dict': sensitive_words_dict_data,
                        'sensitive': sensitive_score,
                        'directed_uid': directed_uid_data,
                        'directed_uname': directed_uname,
                        'hashtag': hashtag,
                    }

                    bulk_action.extend([action, {'doc': action_data}])
                    count += 1

                    if count % 1000 == 0 and count != 0:
                        if bulk_action:
                            es.bulk(bulk_action,
                                    index=index_name,
                                    doc_type=index_type,
                                    timeout=600)
                        bulk_action = []
                        count = 0
                except StopIteration:
                    break
            if bulk_action:

                es.bulk(bulk_action,
                        index=index_name,
                        doc_type=index_type,
                        timeout=600)
        except Exception, e:  # the index for this date may not exist in ES
            print e
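# The sensitivity score in test() weights every matched word by the stage it is
# registered under. A self-contained sketch of that aggregation (illustrative;
# in the source the stage comes from r_sensitive.hget('sensitive_words', word)
# and the weights from sensitive_score_dict):
def score_sensitive_words(word_counts, word_stage, stage_weight):
    score = 0
    for word, count in word_counts.items():
        stage = word_stage.get(word)
        if stage:
            score += count * stage_weight[str(stage)]
    return score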
# --- excerpt from a similar processing loop; leading context omitted ---
                item['sensitive_words_dict'] = json.dumps({})

            timestamp = item['timestamp']
            date = ts2datetime(timestamp)
            ts = datetime2ts(date)
            if sensitive_words_dict:
                print sensitive_words_dict.keys()[0]
                sensitive_count_string = r_cluster.hget('sensitive_'+str(ts), str(uid))
                if sensitive_count_string:  # hget returns None when the field is missing
                    sensitive_count_dict = json.loads(sensitive_count_string)
                    for word in sensitive_words_dict.keys():
                        if sensitive_count_dict.has_key(word):
                            sensitive_count_dict[word] += sensitive_words_dict[word]
                        else:
                            sensitive_count_dict[word] = sensitive_words_dict[word]
                    r_cluster.hset('sensitive_'+str(ts), str(uid), json.dumps(sensitive_count_dict))
                else:
                    r_cluster.hset('sensitive_'+str(ts), str(uid), json.dumps(sensitive_words_dict))

            # decide whether the document should be routed to a new ES index
            weibo_timestamp = item['timestamp']
            should_index_name_date = ts2datetime(weibo_timestamp)
            if should_index_name_date != now_index_name_date:
                if action != [] and xdata != []:
                    index_name = index_name_pre + now_index_name_date
                    if bulk_action:
                        es.bulk(bulk_action, index=index_name, doc_type=index_type, timeout=60)
                    bulk_action = []
                    count = 0
                    now_index_name_date = should_index_name_date
                    index_name = index_name_pre + now_index_name_date
def cal_propage_work(item, sensitive_words):
    # Variant of cal_propage_work keyed by 'YYYYMMDD' instead of the day's
    # midnight epoch; otherwise the same propagation bookkeeping as above.
    cluster_redis = R_CLUSTER_FLOW1
    user = str(item['uid'])
    uid = str(item['uid'])
    followers_count = item['user_fansnum']
    friends_count = item.get("user_friendsnum", 0)
    cluster_redis.hset(user, 'user_fansnum', followers_count)
    cluster_redis.hset(user, 'user_friendsnum', friends_count)

    retweeted_uid = str(item['root_uid'])
    retweeted_mid = str(item['root_mid'])

    message_type = int(item['message_type'])
    mid = str(item['mid'])
    timestamp = item['timestamp']
    text = item['text']

    sw_list = searchWord(text.encode('utf-8'))
    sensitive_result = len(sw_list)
    if sensitive_result:
        ts = ts2datetime(timestamp).replace('-','')
        map = {}
        for w in sw_list:
            word = "".join([chr(x) for x in w])
            word = word.decode('utf-8')
            if not map.__contains__(word):
                map[word] = 1
            else:
                map[word] += 1
        try:
            sensitive_count_string = r_cluster.hget('sensitive_'+str(ts), str(uid))
            sensitive_count_dict = json.loads(sensitive_count_string)
            for word in map:
                count = map[word]
                if sensitive_count_dict.__contains__(word):
                    sensitive_count_dict[word] += count
                else:
                    sensitive_count_dict[word] = count
            r_cluster.hset('sensitive_'+str(ts), str(uid), json.dumps(sensitive_count_dict))
        except:
            r_cluster.hset('sensitive_'+str(ts), str(uid), json.dumps(map))


    if message_type == 1:  # original weibo
        cluster_redis.sadd('user_set', user)
        if sensitive_result:
            cluster_redis.hset('s_'+user, mid + '_origin_weibo_timestamp', timestamp)
        else:
            cluster_redis.hset(user, mid + '_origin_weibo_timestamp', timestamp)

    elif message_type == 2: # comment weibo
        cluster_redis.sadd('user_set', user)
        if cluster_redis.sismember(user + '_comment_weibo', retweeted_mid) or cluster_redis.sismember('s_' + user + '_comment_weibo', retweeted_mid):
            return

        #RE = re.compile(u'//@([a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+):', re.UNICODE)
        #nicknames = RE.findall(text)

        if not sensitive_result:
            cluster_redis.sadd(user + '_comment_weibo', retweeted_mid)
            queue_index = get_queue_index(timestamp)
            cluster_redis.hincrby(user, 'comment_weibo', 1)

            if 1:
            #if len(nicknames) == 0:
                cluster_redis.hincrby(retweeted_uid, retweeted_mid + '_origin_weibo_comment', 1) 
                cluster_redis.hincrby(retweeted_uid, 'origin_weibo_comment_timestamp_%s' % queue_index, 1)
                cluster_redis.hset(retweeted_uid, retweeted_mid + '_origin_weibo_comment_timestamp', timestamp)
            """
            else:
                nick_id_ = nicknames[0]
                _id = single_redis.hget(NICK_UID_NAMESPACE, nick_id_)
                print _id
                single_redis.hset(ACTIVE_NICK_UID_NAMESPACE, nick_id_, _id)
                if _id:
                    cluster_redis.hincrby(str(_id), retweeted_mid + '_retweeted_weibo_comment', 1) 
                    cluster_redis.hincrby(str(_id), 'retweeted_weibo_comment_timestamp_%s' % queue_index, 1)
                    cluster_redis.hset(str(_id), retweeted_mid + '_retweeted_weibo_comment_timestamp', timestamp)
            """
        else:
            cluster_redis.sadd('s_' + user + '_comment_weibo', retweeted_mid)
            queue_index = get_queue_index(timestamp)
            cluster_redis.hincrby('s_'+user, 'comment_weibo', 1)

            if 1:
            #if len(nicknames) == 0:
                cluster_redis.hincrby('s_' + retweeted_uid, retweeted_mid + '_origin_weibo_comment', 1) 
                cluster_redis.hincrby('s_' + retweeted_uid, 'origin_weibo_comment_timestamp_%s' % queue_index, 1)
                cluster_redis.hset('s_' + retweeted_uid, retweeted_mid + '_origin_weibo_comment_timestamp', timestamp)
            """
            else:
                nick_id_ = nicknames[0]
                _id = single_redis.hget(NICK_UID_NAMESPACE, nick_id_)
                print _id
                single_redis.hset(ACTIVE_NICK_UID_NAMESPACE, nick_id_, _id)
                if _id:
                    cluster_redis.hincrby('s_' + str(_id), retweeted_mid + '_retweeted_weibo_comment', 1) 
                    cluster_redis.hincrby('s_' + str(_id), 'retweeted_weibo_comment_timestamp_%s' % queue_index, 1)
                    cluster_redis.hset('s_' + str(_id), retweeted_mid + '_retweeted_weibo_comment_timestamp', timestamp)
            """

    elif message_type == 3:  # retweet
        cluster_redis.sadd('user_set', user)
        if cluster_redis.sismember(user + '_retweeted_weibo', retweeted_mid) or cluster_redis.sismember('s_' + user + '_retweeted_weibo', retweeted_mid):
            return
        """
        RE = re.compile(u'//@([a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+):', re.UNICODE)
        nicknames = RE.findall(text)
        """
        if not sensitive_result:
            cluster_redis.sadd(user + '_retweeted_weibo', retweeted_mid)
            cluster_redis.hset(user, retweeted_mid + '_retweeted_weibo_timestamp', timestamp) 
            queue_index = get_queue_index(timestamp)
            cluster_redis.hincrby(retweeted_uid, 'origin_weibo_retweeted_timestamp_%s' % queue_index, 1)    
            cluster_redis.hincrby(retweeted_uid, retweeted_mid + '_origin_weibo_retweeted', 1) 
            """
            if len(nicknames) != 0:
                for nick_id in nicknames:
                    _id = single_redis.hget(NICK_UID_NAMESPACE, nick_id)
                    print _id
                    single_redis.hset(ACTIVE_NICK_UID_NAMESPACE, nick_id, _id)
                    if _id:
                        cluster_redis.hincrby(str(_id), retweeted_mid+'_retweeted_weibo_retweeted', 1) 
                        cluster_redis.hset(str(_id), 'retweeted_weibo_retweeted_timestamp', timestamp)
                        cluster_redis.hincrby(str(_id), 'retweeted_weibo_retweeted_timestamp_%s' % queue_index, 1)
            """
        else:
            cluster_redis.sadd('s_' + user + '_retweeted_weibo', retweeted_mid)
            cluster_redis.hset('s_' + user, retweeted_mid + '_retweeted_weibo_timestamp', timestamp)
            queue_index = get_queue_index(timestamp)
            cluster_redis.hincrby('s_' +retweeted_uid, 'origin_weibo_retweeted_timestamp_%s' %queue_index, 1)
            cluster_redis.hincrby('s_' +retweeted_uid, retweeted_mid + '_origin_weibo_retweeted', 1)
            """