def main():
    scan_cursor = 0
    count = 0
    bulk_action = []
    number = r.scard('user_set')
    print number

    if RUN_TYPE:
        ts = time.time() - DAY
        date = ts2datetime(ts)
        start_time = str(ts2datetime(time.time()))
        print "/cron/push_mid2redis.py&start&%s" %start_time
    else:
        date = '2013-09-05'
    index_name = flow_text_index_name_pre+date
    print index_name

    ts = time.time()
    while 1:
        re_scan = r.sscan("user_set", scan_cursor, count=3000)
        scan_cursor = re_scan[0]
        uid_list = re_scan[1]  # the actual data in this batch
        if len(uid_list):
            for uid in uid_list:
                detail_dict = r.hgetall(uid)
                for k,v in detail_dict.iteritems():
                    update_dict = dict()
                    if "_origin_weibo_retweeted" in k and v:
                        mid = k.split('_')[0]
                        update_dict["retweeted"] = int(v)
                    elif "_origin_weibo_comment" in k and v:
                        mid = k.split('_')[0]
                        update_dict["comment"] = int(v)
                    elif '_retweeted_weibo_comment' in k and v:
                        mid = k.split('_')[0]
                        update_dict["comment"] = int(v)
                    elif '_retweeted_weibo_retweeted' in k and v:
                        mid = k.split('_')[0]
                        update_dict["retweeted"] = int(v)
                    else:
                        pass
                    if update_dict:
                        action = {"update": {"_id": mid}}
                        xdata = {"doc": update_dict}
                        bulk_action.extend([action, xdata])
                        count += 1
                        if count % 400 == 0:
                            r_flow.lpush('update_mid_list', json.dumps(bulk_action))
                            bulk_action = []
                            tp = time.time()
                            #print "%s cost %s" %(count, tp-ts)
                            ts = tp
        if int(scan_cursor) == 0:
            break

    if bulk_action:
        r_flow.lpush('update_mid_list', json.dumps(bulk_action))

    print count
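
The worker that drains 'update_mid_list' is not part of this snippet. A minimal sketch of the consumer side, assuming the elasticsearch-py bulk API, an `es` client, and a hypothetical doc type 'text' for the flow-text index:

def consume_update_mid_list():
    # pop one serialized batch and replay the action/doc pairs against ES
    data = r_flow.rpop('update_mid_list')
    while data:
        bulk_action = json.loads(data)
        es.bulk(bulk_action, index=index_name, doc_type='text')  # es client and doc_type are assumptions
        data = r_flow.rpop('update_mid_list')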
Example #2
def main():
    scan_cursor = 0
    count = 0
    bulk_action = []
    number = r.scard('user_set')
    print number

    if RUN_TYPE:
        ts = time.time() - DAY
        date = ts2datetime(ts)
    else:
        date = '2013-09-05'
    index_name = flow_text_index_name_pre + date

    ts = time.time()
    while 1:
        re_scan = r.sscan("user_set", scan_cursor, count=3000)
        scan_cursor = re_scan[0]
        uid_list = re_scan[1]  # the actual data in this batch
        if len(uid_list):
            for uid in uid_list:
                detail_dict = r.hgetall(uid)
                for k, v in detail_dict.iteritems():
                    update_dict = dict()
                    if "_origin_weibo_retweeted" in k and int(v):
                        mid = k.split('_')[0]
                        update_dict["retweeted"] = int(v)
                    elif "_origin_weibo_comment" in k and int(v):
                        mid = k.split('_')[0]
                        update_dict["comment"] = int(v)
                    else:
                        pass
                    if update_dict:
                        action = {"update": {"_id": mid}}
                        xdata = {"doc": update_dict}
                        bulk_action.extend([action, xdata])
                        count += 1
                        if count % 1000 == 0:
                            #print bulk_action
                            r_flow.lpush('update_mid_list',
                                         json.dumps(bulk_action))
                            bulk_action = []
                            tp = time.time()
                            print "%s cost %s" % (count, tp - ts)
                            ts = tp
        if int(scan_cursor) == 0:
            break

    if bulk_action:
        r_flow.lpush('update_mid_list', json.dumps(bulk_action))

    print count
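
Both variants of main() above assume the same Redis layout: 'user_set' is a set of uids, and each uid is itself a hash whose fields are named '<mid>_origin_weibo_retweeted' / '<mid>_origin_weibo_comment' (and, in the first variant, the '_retweeted_weibo_*' counterparts) with stringified counts as values. A hypothetical seeding snippet for exercising main(); the ids are made up:

uid = '1234567890'
mid = '3456789012345678'
r.sadd('user_set', uid)
r.hset(uid, mid + '_origin_weibo_retweeted', '12')
r.hset(uid, mid + '_origin_weibo_comment', '3')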
def scan_offline_task():

    query = {
        "query": {
            "bool": {
                "must": [{
                    "term": {
                        "status": 0
                    }
                }]
            }
        },
        "size": 1000
    }
    results = es_user_portrait.search(index=USER_RANK_KEYWORD_TASK_INDEX,
                                      doc_type=USER_RANK_KEYWORD_TASK_TYPE,
                                      body=query)['hits']['hits']
    if results:
        for item in results:
            task_id = item['_id']
            iter_item = item['_source']
            search_type = iter_item['search_type']
            pre = iter_item['pre']
            during = iter_item['during']
            start_time = iter_item['start_time']
            keyword = json.loads(iter_item['keyword'])
            search_key = iter_item['user_ts']
            number = iter_item['number']
            sort_norm = iter_item['sort_norm']
            sort_scope = iter_item['sort_scope']
            time = iter_item['time']
            isall = iter_item['isall']
            redis_task.lpush(
                "task_user_rank",
                json.dumps([
                    task_id, search_type, pre, during, start_time, keyword,
                    search_key, sort_norm, sort_scope, time, isall, number
                ]))
            iter_item['status'] = -1
            task_id = item['_id']
            #print item
            es_user_portrait.index(index=USER_RANK_KEYWORD_TASK_INDEX,
                                   doc_type=USER_RANK_KEYWORD_TASK_TYPE,
                                   id=task_id,
                                   body=iter_item)
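
scan_offline_task() only consumes pending task documents (status 0) and marks them queued (status -1). The producer is not shown here; a sketch of how such a task document might be created, with purely illustrative field values:

# hypothetical task producer; all field values are examples only
task = {
    'search_type': 'keyword',
    'pre': 0.3,
    'during': 1,
    'start_time': '2013-09-01',
    'keyword': json.dumps(['test']),
    'user_ts': 'admin-1378000000',
    'number': 100,
    'sort_norm': 'bci',
    'sort_scope': 'all_user',
    'time': 1,
    'isall': False,
    'status': 0,  # 0 = pending; flipped to -1 once pushed onto task_user_rank
}
es_user_portrait.index(index=USER_RANK_KEYWORD_TASK_INDEX,
                       doc_type=USER_RANK_KEYWORD_TASK_TYPE,
                       id='admin-1378000000', body=task)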
def scan_offline_task():

    query = {"query": {"bool": {"must": [{"term": {"status": -1}}]}}, "size": 1000}
    results = es_user_portrait.search(index=USER_RANK_KEYWORD_TASK_INDEX,
                                      doc_type=USER_RANK_KEYWORD_TASK_TYPE,
                                      body=query)['hits']['hits']
    if results:
        for item in results:
            task_id = item['_id']
            iter_item = item['_source']
            search_type = iter_item['search_type']
            pre = iter_item['pre']
            during = iter_item['during']
            start_time = iter_item['start_time']
            keyword = json.loads(iter_item['keyword'])
            search_key = iter_item['user_ts']
            number = iter_item['number']
            sort_norm = iter_item['sort_norm']
            sort_scope = iter_item['sort_scope']
            time = iter_item['time']
            isall = iter_item['isall']
            redis_task.lpush(
                "task_user_rank",
                json.dumps([
                    task_id, search_type, pre, during, start_time, keyword,
                    search_key, sort_norm, sort_scope, time, isall, number
                ]))
            iter_item['status'] = -1
            task_id = item['_id']
            #print item
            es_user_portrait.index(index=USER_RANK_KEYWORD_TASK_INDEX,
                                   doc_type=USER_RANK_KEYWORD_TASK_TYPE,
                                   id=task_id,
                                   body=iter_item)
Example #5
def get_temporal_rank(timestamp):
    index = get_queue_index(timestamp)
    index_ts = "influence_timestamp_" + str(index)
    
    uid_list = r.zrange(index_ts, 0, 10000, desc=True)
    user_info = []
    in_portrait = []  # uids already stored in the portrait index
    if uid_list:
        search_result = es_user_portrait.mget(index=portrait_index_name,
                                              doc_type=portrait_index_type,
                                              body={"ids": uid_list},
                                              _source=SOCIAL_SENSOR_INFO)["docs"]
        for item in search_result:
            if item["found"]:
                temp = []
                in_portrait.append(item['_id'])
                for iter_key in SOCIAL_SENSOR_INFO:
                    # the loop body is truncated in the source; presumably each
                    # requested field is collected for the user, e.g.:
                    temp.append(item['_source'].get(iter_key, ''))
                user_info.append(temp)
    return in_portrait, user_info  # assumed return value; the original snippet is cut off here
Example #6
"""
Abandon the originally planned redis cluster scheme and use a single redis instance instead.
"""

import sys
import redis
from redis import StrictRedis
reload(sys)
sys.path.append('../../')
from global_utils import R_CLUSTER_FLOW1 as r

if __name__ == '__main__':

    """
    startup_nodes = [{"host": '219.224.135.91', "port": "6379"}]
    weibo_redis = RedisCluster(startup_nodes = startup_nodes)
    weibo_redis.flushall()

    startup_nodes = [{"host": '219.224.135.91', "port": "6380"}]
    weibo_redis = RedisCluster(startup_nodes = startup_nodes)
    weibo_redis.flushall()

    startup_nodes = [{"host": '219.224.135.93', "port": "6380"}]
    weibo_redis = RedisCluster(startup_nodes = startup_nodes)
    weibo_redis.flushall()

    print "finish flushing!"
    """
    r.flushdb()
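
global_utils is not included here; after dropping the cluster plan, R_CLUSTER_FLOW1 is presumably just a single-node connection along these lines (host, port, and db are placeholders):

# hypothetical definition inside global_utils
R_CLUSTER_FLOW1 = StrictRedis(host='127.0.0.1', port=6379, db=0)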
                    "should":[
                    ]
                }
            }
        }
    },
    "size": 10000
}


if __name__ == "__main__":
    scan_cursor = 0
    sensitive_uid_list = []
    count = 0
    while 1:
        re_scan = r_cluster.sscan('s_user_set', scan_cursor, count=10000)
        scan_cursor = re_scan[0]
        sensitive_uid_list.extend(re_scan[1])
        count += len(re_scan[1])  # count=10000 is only a hint to SSCAN, so count the actual batch
        if int(scan_cursor) == 0:
            print count
            break

    temp_list = sensitive_uid_list
    count = 0
    partition = 100
    number = int(math.ceil(len(temp_list) / float(partition)))
    print number
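
The snippet is truncated here; with partition-sized slices, the `number` batches would presumably be processed like this:

    for i in range(number):
        batch = temp_list[i * partition:(i + 1) * partition]
        # ... one downstream request (mget, bulk update, etc.) per batch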


def cron_task(data):
    key_words_search(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7], data[8], data[9], data[10], data[11])
    

if __name__ == "__main__":

    scan_offline_task()
    while 1:
        data = redis_task.rpop("task_user_rank")
        print data
        #"""
        if data:
            cron_task(json.loads(data))
        else:
            break
        #"""
            
    
def get_queue_index(timestamp):
    # the head of this helper is truncated in the source; judging from its call
    # sites it maps a timestamp to the index of the TIME_INTERVAL-sized slot the
    # timestamp falls into, presumably along these lines:
    date_ts = datetime2ts(ts2datetime(timestamp))
    index = (timestamp - date_ts) / TIME_INTERVAL
    return int(index)

if __name__ == "__main__":
    now_ts = time.time()
    date_ts = datetime2ts(ts2datetime(now_ts))
    if now_ts - TIME_INTERVAL < date_ts:
        sys.exit(0)

    tmp_date = ts2date(now_ts)
    print "cron_influence_start_" + tmp_date
    index = get_queue_index(now_ts)  # the time slot the current timestamp falls into
    influence_ts = "influence_timestamp_" + str(index)
    scan_cursor = 0
    count = 0
    while 1:
        re_scan = r.hscan(influence_ts, scan_cursor, count=1000)
        scan_cursor = re_scan[0]
        detail = re_scan[1]
        if len(detail):
            for k,v in detail.iteritems():
                r.zadd(influence_ts, v, k)
                count += 1
        if int(scan_cursor) == 0:
            break

    tmp_date = ts2date(time.time())
    print count
    print "cron_influence_end_" + tmp_date