# Example #1
# 0
def cal_forever(receiver, controller, poller, sender=None, fill_field_funcs=[]):
    count = 0
    ts = time.time()
    tb = ts
    receive_kill = False

    # prepare
    item = receiver.recv_json()
    item_timestamp = item["timestamp"]

    now_db_no = get_now_db_no(item_timestamp)
    print "redis db no now", now_db_no
    global_profile_r = _default_redis(host=PROFILE_REDIS_HOST, port=PROFILE_REDIS_PORT, db=now_db_no)

    set_now_accepted_tsrange(item_timestamp)

    while 1:
        evts = poller.poll(XAPIAN_ZMQ_POLL_TIMEOUT)
        if evts:
            socks = dict(poller.poll(XAPIAN_ZMQ_POLL_TIMEOUT))
        elif receive_kill and time.time() - tb > XAPIAN_ZMQ_WORK_KILL_INTERVAL:
            """
            定期kill,可以记录work开启的时间
            然后收到kill的时候判断一下当前时间减去work开启的时间
            是否超过某个阈值,是则执行kill操作
            配套的prod模式下,应该在每隔XAPIAN_ZMQ_WORK_KILL_INTERVAL新开work
            """
            print 'receive "KILL", worker stop, cost: %ss' % (time.time() - tb)
            break
        else:
            socks = None

        if socks and socks.get(receiver) == zmq.POLLIN:
            item = receiver.recv_json()
            if fill_field_funcs:
                for func in fill_field_funcs:
                    item = func(item)

            item_timestamp = item["timestamp"]

            now_a_start_ts, now_a_end_ts = get_now_accepted_tsrange()
            if int(item_timestamp) < now_a_start_ts or int(item_timestamp) >= now_a_end_ts:
                # 超出接受范围,抛弃该条微博
                continue

            new_db_no = get_now_db_no(item_timestamp)
            if new_db_no != now_db_no:
                now_db_no = new_db_no
                print "redis db no now", now_db_no
                global_profile_r = _default_redis(db=now_db_no)
                set_now_accepted_tsrange(item_timestamp)

            realtime_profile_keywords_cal(item, global_profile_r)

            count += 1
            if count % XAPIAN_FLUSH_DB_SIZE == 0:
                te = time.time()
                cost = te - ts
                ts = te
                print "[%s] total profile calc: %s, %s sec/per %s" % (
                    datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                    count,
                    cost,
                    XAPIAN_FLUSH_DB_SIZE,
                )

        # Any waiting controller command acts as 'KILL'
        if socks and socks.get(controller) == zmq.POLLIN:
            controller.recv()
            receive_kill = True
# Example #2
# 0
        # Start timestamp of the last fully-processed 15-minute window,
        # persisted in redis db 0 under LAST_COMPLETE_START_TS.
        last_complete_start_ts = global_r0.get(LAST_COMPLETE_START_TS)

        if last_complete_start_ts:
            last_complete_start_ts = int(last_complete_start_ts)
            print 'last_complete_start_ts', last_complete_start_ts

            # Normal case: last_complete_start_ts is the start of the unit
            # at now_db - 3 (or earlier), so fetch the data of the next
            # 15-minute unit after it, advancing one unit per pass.
            if last_complete_start_ts <= now_db_start_ts - 60 * 15 * 3:
                # Advance and persist last_complete_start_ts first.
                last_complete_start_ts += 60 * 15
                global_r0.set(LAST_COMPLETE_START_TS, last_complete_start_ts)

                # Run the calculations for the window
                # [last_complete_start_ts, end_ts).
                end_ts = last_complete_start_ts + 60 * 15
                now_db_no = get_now_db_no(last_complete_start_ts)
                r = _default_redis(db=now_db_no)
                calc_sentiment()
                calc_profile()
                clear_current_redis()

            # When last_complete_start_ts reaches 23:15 (or 23:30), flush
            # the remaining 15-minute units of the day as well.
            elif last_complete_start_ts < now_db_start_ts and (datetime.datetime.fromtimestamp(last_complete_start_ts).strftime("%H:%M:%S") ==  '23:15:00' or datetime.datetime.fromtimestamp(last_complete_start_ts).strftime("%H:%M:%S") == '23:30:00'):
                # Advance and persist last_complete_start_ts first.
                last_complete_start_ts += 60 * 15
                global_r0.set(LAST_COMPLETE_START_TS, last_complete_start_ts)

                # Same window setup as the branch above (the snippet is cut
                # off here; presumably the same calc/clear calls follow).
                end_ts = last_complete_start_ts + 60 * 15
                now_db_no = get_now_db_no(last_complete_start_ts)
                r = _default_redis(db=now_db_no)
# Example #3
# 0
            # Snapshot this user's full keyword zset (keyword -> score) from
            # redis and persist it into the daily leveldb bucket, keyed by
            # uid, pickled and zlib-compressed at best compression.
            keywords_with_count = r.zrange(USER_KEYWORDS % uid, 0, -1, withscores=True)
            daily_profile_keywords_bucket.Put(str(uid), zlib.compress(pickle.dumps(keywords_with_count, pickle.HIGHEST_PROTOCOL), zlib.Z_BEST_COMPRESSION))

        # Fetch the next batch of uids from the user set via SSCAN.
        cursor, members = r.sscan(USER_SET, cursor=cursor, count=10000)


def get_now_leveldb_no():
    """Return the current 15-minute slot number of the local day (1..96).

    Seconds elapsed since local midnight are divided into 15-minute
    buckets. Floor division (``//``) keeps the result an int under both
    Python 2 and Python 3 (the original ``/`` became float division on
    Python 3).

    NOTE(review): ``time.timezone`` ignores DST; around DST transitions
    the slot may be off by the DST offset -- confirm acceptable.
    """
    local_ts = time.time() - time.timezone
    return int(local_ts) % (24 * 60 * 60) // (15 * 60) + 1


def get_now_datestr():
    """Current local date as a compact YYYYMMDD string."""
    return format(datetime.datetime.now(), "%Y%m%d")
        

if __name__ == '__main__':
    # init redis: pick the redis db for the current time bucket.
    now_db_no = get_now_db_no()
    print "redis db no now", now_db_no
    r = _default_redis(db=now_db_no)

    # init leveldb: open (or create) today's bucket for the current
    # 15-minute slot.
    now_datestr = get_now_datestr()
    now_leveldb_no = get_now_leveldb_no()
    print "leveldb no now", now_leveldb_no, now_datestr
    # NOTE(review): path mixes os.path.join with a './'-prefixed relative
    # component -- works, but looks accidental; confirm intended layout.
    # Cache/buffer sizes: 8 * (2 << 25) = 512 MiB each.
    daily_profile_keywords_bucket = leveldb.LevelDB(os.path.join(LEVELDBPATH, './keywords/linhao_profile_keywords_%s_%s' % (now_datestr, now_leveldb_no)),
                                                    block_cache_size=8 * (2 << 25), write_buffer_size=8 * (2 << 25))

    # Dump every user's keyword profile from redis into the leveldb bucket.
    profile_keywords_redis2leveldb()