def get_sp_duration_active_userid(begin, end):
    '''
    Collect the ids of all users that have at least one row in the
    cy_real_time_event HBase table with a timestamp inside [begin, end].

    The whole table is scanned. Row keys are split on '|' into
    (uid, timestamp, event_type); rows whose uid part is not an integer
    are skipped.

    :param begin: window start, normalized with ensure_second_timestamp
    :param end: window end (inclusive), normalized the same way
    :return: list of distinct integer user ids, in no particular order
    '''
    # Normalize both bounds so they are comparable with the row timestamps.
    begin = ensure_second_timestamp(begin)
    end = ensure_second_timestamp(end)

    connection = happybase.Connection('hbase_server',
                                      compat='0.90',
                                      port=19090,
                                      timeout=BIG_TIMEOUT)
    uids = set()
    try:
        table = connection.table("cy_real_time_event")
        for key, _data in table.scan():
            uid, timestamp, _event_type = key.split('|')
            try:
                uid = int(uid)
            except ValueError:
                # Non-numeric uid part: not a real user row, ignore it.
                continue
            ts = ensure_second_timestamp(timestamp)
            if begin <= ts <= end:
                uids.add(uid)
    finally:
        # Release the connection even when the scan fails midway.
        connection.close()
    return list(uids)
def get_user_recent_views(uid, now=None, lookback=3 * 24 * 86400.0, num=None):
    '''
    Fetch a user's recent news/topic view events from cy_real_time_event.

    Rows are read for the prefix "<uid>|" and walked in reverse scan order
    (presumably newest first, assuming row keys sort by timestamp -- confirm
    against the table's key design).

    :param uid: user id whose rows are scanned
    :param now: end of the window; falls back to time.time() when falsy
    :param lookback: window length in seconds (default three days)
    :param num: optional cap on the number of returned actions
    :return: list of [timestamp, action_type, material_id] lists
    '''
    end = ensure_second_timestamp(now) if now else time.time()
    begin = end - lookback

    focused_type = ('view_news', 'view_topics', 'view_topic')

    connection = happybase.Connection('hbase_server',
                                      compat='0.90',
                                      port=19090,
                                      timeout=SMALL_TIMEOUT)
    try:
        table = connection.table("cy_real_time_event")
        rows = list(table.scan(row_prefix=str(uid) + '|'))
    finally:
        # The original never closed this connection.
        connection.close()

    actions = []
    for key, data in reversed(rows):
        if num and len(actions) >= num:
            # Only keep the requested number of actions.
            break
        _, ts, action_type = key.split('|')
        ts = ensure_second_timestamp(ts)
        # Time filter. The original tested `end < ts < begin`, which can
        # never hold because begin <= end, so nothing was ever filtered.
        if ts < begin or ts > end:
            continue
        if action_type in focused_type:
            actions.append([
                ts, action_type,
                int(data[CY_REAL_TIME_EVENT_ATTR_MAP[action_type]])
            ])
    return actions
Пример #3
0
def get_user_qa_content(uid, begin, end):
    '''
    Return the text of the first content entry of every problem the user
    created strictly between begin and end (read from the medical DB).

    Marked as obsolete by the original author: the data is meant to be
    fetched from HBase instead.

    :param uid: user id used in the ask_problem query
    :param begin: window start (normalized with ensure_second_timestamp)
    :param end: window end (normalized with ensure_second_timestamp)
    :return: list of text strings; empty when the user has no problems
    '''
    window_start = timestamp2datetime(ensure_second_timestamp(begin))
    window_end = timestamp2datetime(ensure_second_timestamp(end))

    problem_query = (
        'select id from ask_problem where user_id=%s and created_time>"%s" and created_time<"%s";'
        % (uid, window_start, window_end))
    problem_rows = get_medicaldb_handler().do_one(problem_query)

    texts = []
    if problem_rows is None or len(problem_rows) == 0:
        return texts

    for row in problem_rows:
        content_query = 'select content from ask_problemcontent where problem_id=%s;' % row[0]
        content_rows = get_medicaldb_handler().do_one(content_query)
        if content_rows is not None and len(content_rows) > 0:
            # The content column holds a JSON list; only its first element
            # is inspected, and only when it is of type 'text'.
            first_part = json.loads(content_rows[0][0])[0]
            if first_part['type'] == 'text':
                texts.append(first_part['text'])
    return texts
def get_sp_duration_valid_user_id(begin, end):
    '''
    Collect the ids of all users with a "big_search" or
    "free_problem_create" event in cy_real_time_event inside [begin, end].

    :param begin: window start, normalized with ensure_second_timestamp
    :param end: window end (inclusive), normalized the same way
    :return: set of integer user ids
    '''
    # Normalize both bounds so they are comparable with the row timestamps.
    begin = ensure_second_timestamp(begin)
    end = ensure_second_timestamp(end)

    event_types = ("big_search", "free_problem_create")

    connection = happybase.Connection('hbase_server',
                                      compat='0.90',
                                      port=19090,
                                      timeout=BIG_TIMEOUT)
    valid_uids = set()
    try:
        table = connection.table("cy_real_time_event")
        for key, _data in table.scan():
            uid, timestamp, event_type = key.split('|')
            if event_type not in event_types:
                continue
            ts = ensure_second_timestamp(timestamp)
            if ts < begin or ts > end:
                continue
            try:
                valid_uids.add(int(uid))
            except ValueError:
                # Same policy as get_sp_duration_active_userid: a row with
                # a non-numeric uid is skipped instead of aborting the scan.
                continue
    finally:
        # Release the connection even when the scan fails midway.
        connection.close()
    return valid_uids
Пример #5
0
def get_feed_showlist_dict(begin, end):
    '''
    Build a {date: [news_id, ...]} mapping of the online news whose date
    lies between the dates of begin and end (both inclusive) and whose
    view_times reaches the display threshold.

    :param begin: window start timestamp (normalized with
        ensure_second_timestamp)
    :param end: window end timestamp (normalized the same way)
    :return: defaultdict(list) keyed by the date value returned by the DB
    '''
    # Below this many views a news item is treated as never displayed.
    view_time_th = 5000

    begin = ensure_second_timestamp(begin)
    end = ensure_second_timestamp(end)

    # Convert both bounds to dates for the SQL range filter.
    begin_d = timestamp2date(begin)
    end_d = timestamp2date(end)

    sql = 'select id,date,view_times from news_healthnews where is_online=1 and date<="%s" and date>="%s";' % (
        end_d, begin_d)

    rows = get_newsdb_handler().dbhandler.do_one(sql)

    date_newsid_dict = defaultdict(list)
    if rows is None:
        # Other callers of do_one in this file guard against a None
        # result; treat it as "no rows" here as well instead of crashing.
        return date_newsid_dict

    for item in rows:
        news_id = int(item[0])
        date = item[1]
        if int(item[2]) >= view_time_th:
            date_newsid_dict[date].append(news_id)

    return date_newsid_dict
def get_qa_uids(begin, end):
    '''
    Return the distinct user ids of all ask_problem rows created strictly
    between begin and end.

    :param begin: window start (normalized with ensure_second_timestamp)
    :param end: window end (normalized with ensure_second_timestamp)
    :return: set of integer user ids (empty when the query yields nothing)
    '''
    begin_dt = timestamp2datetime(ensure_second_timestamp(begin))
    end_dt = timestamp2datetime(ensure_second_timestamp(end))
    sql = 'select distinct user_id from ask_problem where created_time>"%s" and created_time<"%s";' % (
        begin_dt, end_dt)
    rows = get_medicaldb_handler().dbhandler.do_one(sql)
    if rows is None:
        # Other callers of do_one in this file guard against a None
        # result; treat it as "no rows" instead of crashing on iteration.
        return set()
    return set(int(item[0]) for item in rows)
Пример #7
0
def test3():
    from collections import defaultdict
    # 2017-11-12 17:05:33,289 INFO recommend_resource.Recommend Line:52  failed in recommend==user_info is None ===uid=128243057===========
    filename = sys.argv[2]
    lookback_list = [5, 10, 15, 20, 30, 60, 120]
    res = defaultdict(list)
    trigger_count = {"big_search": 0, "free_problem_create": 0}
    with open(filename, 'r') as f:
        for l in f:
            dt = l.split(',')[0]
            uid = l.split("==uid=")[1].split('=')[0]
            end = ensure_second_timestamp(dt)
            begin_list = [end - x * 61.0 for x in lookback_list]

            end += 5.0
            index, trigger = user_time_event2(uid, end, begin_list)
            if trigger:
                trigger_count[trigger] += 1
            # print uid, index, trigger
            res[index].append([uid, dt, trigger])
    lookback_list.append(0)
    print res[0]
    for index in res.keys():
        print lookback_list[index], "分钟内可以召回", len(res[index])

    for trigger in trigger_count:
        print trigger, trigger_count[trigger]
Пример #8
0
def user_time_event(uid, begin, end):
    begin = ensure_second_timestamp(begin)
    end = ensure_second_timestamp(end)
    connection = happybase.Connection('hbase_server',
                                      compat='0.90',
                                      port=19090,
                                      timeout=30000)
    table = connection.table("cy_real_time_event")
    for key, value in table.scan(row_prefix=str(uid) + '|'):
        print "all key", key
        _, ts, event_type = key.split('|')
        ts = ensure_second_timestamp(ts)
        print "all time", timestamp2datetime(ts)
        if ts >= begin and ts <= end:
            print "shoot key", key, value
            print "shoot time", timestamp2datetime(ts)
Пример #9
0
def user_time_event2(uid, end, begin_list):
    '''
    Find the tightest lookback window that contains a trigger event of
    the user.

    The user's rows are walked in reverse scan order; rows that are not
    "big_search"/"free_problem_create" or that are later than `end` are
    skipped. For each remaining row the first index i with
    ts >= begin_list[i] is returned.

    Callers build the arguments like this (lookback_list must be
    increasing, so begin_list is decreasing):
        end = ensure_second_timestamp(end)
        begin_list = [end - x * 61.0 for x in lookback_list]
        end += 5.0

    :param uid: user id whose rows ("<uid>|..." prefix) are scanned
    :param end: upper bound for event timestamps, already normalized
    :param begin_list: window start timestamps, tightest window first
    :return: (index into begin_list, event_type), or
        (len(begin_list), '') when no event falls in any window
    '''
    connection = happybase.Connection('hbase_server',
                                      compat='0.90',
                                      port=19090,
                                      timeout=30000)
    try:
        table = connection.table("cy_real_time_event")
        rows = list(table.scan(row_prefix=str(uid) + '|'))
    finally:
        # All rows are materialized, so the connection can go right away.
        # The original never closed it.
        connection.close()

    for key, _value in reversed(rows):
        _, ts, event_type = key.split('|')
        if event_type not in ("big_search", "free_problem_create"):
            continue
        ts = ensure_second_timestamp(ts)
        if ts > end:  # happened after the moment of interest
            continue
        for i, begin in enumerate(begin_list):
            if ts >= begin:
                return i, event_type
    return len(begin_list), ''
def cy_time_event_one_user_kernel(uid, begin, end, event_type_list=None):
    '''
    Collect one user's events from cy_real_time_event inside [begin, end].

    Used online: given a uid and a (typically short) time window, return
    the trigger type and trigger details.

    :param uid: user id whose rows ("<uid>|..." prefix) are scanned
    :param begin: window start; compared as-is with normalized row
        timestamps, so the caller must pass a normalized value
    :param end: window end (inclusive), same requirement as begin
    :param event_type_list: event types to keep; defaults to
        ["big_search", "free_problem_create"] when falsy
    :return: dict with "last_event" ([event_type, info] of the newest
        matching row, or None), "last_event_time" (its timestamp, or 0)
        and one key per event type returned by event_info2, each holding
        a list of `info + [timestamp]` entries
    '''
    info = {"last_event": None, "last_event_time": 0}

    if not event_type_list:
        event_type_list = ["big_search", "free_problem_create"]

    connection = happybase.Connection('hbase_server',
                                      compat='0.90',
                                      port=19090,
                                      timeout=SMALL_TIMEOUT)
    try:
        table = connection.table("cy_real_time_event")
        for key, data in table.scan(row_prefix=str(uid) + '|'):
            # The uid part of the key is ignored so the `uid` parameter is
            # not overwritten (the original reassigned it in the loop).
            _, timestamp, event_type = key.split('|')

            if event_type not in event_type_list:
                continue
            timestamp = ensure_second_timestamp(timestamp)
            if not (begin <= timestamp <= end):
                continue

            # event_info2 may rename the event type; use what it returns.
            event_type, t_info = event_info2(data, event_type)
            info.setdefault(event_type, []).append(t_info + [timestamp])
            if timestamp > info['last_event_time']:
                info['last_event_time'] = timestamp
                info['last_event'] = [event_type, t_info]
    finally:
        # Release the connection even when parsing a row fails.
        connection.close()
    return info
def get_last_login_uids(begin, end):
    '''
    Query solr for the ids of users whose last_login lies in [begin, end].

    Speed is not a concern here (per the original author); up to 1000000
    rows are requested in one query.

    :param begin: window start (normalized with ensure_second_timestamp)
    :param end: window end (normalized with ensure_second_timestamp)
    :return: list of the 'id' field of every hit
    '''
    # The last_login filter is built from the bounds multiplied by 1000.
    window_start_ms = int(1000 * ensure_second_timestamp(begin))
    window_end_ms = int(1000 * ensure_second_timestamp(end))

    solr_query = SolrQuery()
    solr_query.set('q', '*:*')
    solr_query.set('fl', ['id'])
    solr_query.add('fq', 'last_login:[%s TO %s]' % (window_start_ms,
                                                   window_end_ms))
    solr_query.set('rows', 1000000)

    uids = []
    for doc in solr_up.search(**solr_query.get_query_dict()):
        uids.append(doc['id'])
    return uids
def get_row_key_from_solr2(uid, begin, end, col_name):
    '''
    Return the row keys of a user's cy_event entries of one kind whose
    event_time lies inside [begin, end].

    :param uid: user id passed to the per-kind lookup helper
    :param begin: window start (normalized with ensure_second_timestamp)
    :param end: window end, inclusive (normalized the same way)
    :param col_name: one of 'search_event', 'news_profile',
        'topic_profile'
    :return: list of the 'id' field of every matching entry
    :raises ValueError: if col_name is not one of the supported values
        (the original failed later with an unbound-variable error)
    '''
    if col_name == 'search_event':
        res = get_cy_event_row_key_search(uid)
    elif col_name == 'news_profile':
        res = get_cy_event_row_key_news(uid)
    elif col_name == 'topic_profile':
        res = get_cy_event_row_key_topic(uid)
    else:
        raise ValueError('unsupported col_name: %r' % (col_name,))

    begin = ensure_second_timestamp(begin)
    end = ensure_second_timestamp(end)

    selected_rowkey_list = []
    for item in res:
        ts = ensure_second_timestamp(item['event_time'])
        if begin <= ts <= end:
            selected_rowkey_list.append(item['id'])
    return selected_rowkey_list
def get_48h_data(now=None):
    '''
    Run cy_time_event_kernel over the window returned by
    get_48h_timestamps.

    :param now: optional reference timestamp; when given it is normalized
        with ensure_second_timestamp and passed to get_48h_timestamps
        (per the original note: the two days before `now`, excluding the
        day of `now` itself), otherwise get_48h_timestamps is called
        without arguments
    :return: whatever cy_time_event_kernel returns for that window
    '''
    if not now:
        begin, end = get_48h_timestamps()
    else:
        begin, end = get_48h_timestamps(ensure_second_timestamp(now))
    return cy_time_event_kernel(begin, end)
def get_sp_duration_valid_user_data(begin, end, test_uid=None):
    '''
    Collect the "big_search"/"free_problem_create" events of every active
    user in cy_real_time_event inside [begin, end].

    Per the original note the window must not be long, because
    cy_real_time_event only keeps about ten days of data.

    :param begin: window start (normalized with ensure_second_timestamp)
    :param end: window end, inclusive (normalized the same way)
    :param test_uid: optional single user id to restrict the scan to
    :return: {uid: {'big_search': [...], 'free_problem_create': [...]}}
        where each list entry is `info + [raw key timestamp]`
    '''
    # Normalize both bounds so they are comparable with the row timestamps.
    begin = ensure_second_timestamp(begin)
    end = ensure_second_timestamp(end)

    event_type_list = ("big_search", "free_problem_create")

    # Include the '|' key delimiter so the scan only touches this user's
    # rows; a bare uid prefix also matches every uid that starts with the
    # same digits (those rows were read and then discarded below).
    row_prefix = str(test_uid) + '|' if test_uid else None

    connection = happybase.Connection('hbase_server',
                                      compat='0.90',
                                      port=19090,
                                      timeout=BIG_TIMEOUT)
    user_info0 = {}
    try:
        table = connection.table("cy_real_time_event")
        for key, data in table.scan(row_prefix=row_prefix):
            uid, timestamp, event_type = key.split('|')
            uid = int(uid)
            if test_uid and uid != int(test_uid):
                continue
            if event_type not in event_type_list:
                continue
            ts = ensure_second_timestamp(timestamp)
            if ts < begin or ts > end:
                continue

            # Gather the user's events; no last_event is tracked here.
            if uid not in user_info0:
                user_info0[uid] = {'big_search': [], 'free_problem_create': []}

            event_type, t_info = event_info2(data, event_type)
            # NOTE(review): the raw key timestamp is stored here, while
            # cy_time_event_one_user_kernel stores the normalized one --
            # confirm which form downstream consumers expect.
            user_info0[uid][event_type].append(t_info + [timestamp])
    finally:
        # Release the connection even when the scan fails midway.
        connection.close()
    return user_info0
Пример #15
0
def test1():
    uid = sys.argv[2]
    end = sys.argv[3]
    interval = sys.argv[4]
    end = ensure_second_timestamp(end)
    begin = end - int(interval) * 61.0
    end += int(interval) * 61.0
    print "begin", timestamp2datetime(begin)
    print "end", timestamp2datetime(end)
    user_time_event(uid, begin, end)
Пример #16
0
def Recommend_list(uid, num, end=None, pid=None, lookback=5 * 61.0):
    '''
    Recommend up to `num` materials for a user.

    :param uid: user id; must be convertible to int and different from -1
    :param num: number of items, forwarded to Recommend_by_user_info
    :param end: end of the trigger window; defaults to time.time()
    :param pid: optional problem id; when given the user info comes from
        one_user_last_qa_info(pid) instead of the real-time event table
    :param lookback: window length in seconds before `end`
    :return: list of {'id': ..., 'title': ..., 'type': ...} dicts, e.g.
        [{'id':111,'type':'topic','title':'xxxx'}, ...]; an empty list on
        a bad uid or when nothing can be recommended
    '''
    bad_return = []
    log_mark = "recommend_topn"
    info_logger.info(
        "%s===============start=========uid=%s==============pid=%s===============",
        log_mark, uid, str(pid))

    # Validate the uid. Only conversion errors are caught; the original
    # bare `except:` also swallowed KeyboardInterrupt/SystemExit.
    try:
        uid = int(uid)
        uid_ok = uid != -1
    except (TypeError, ValueError, OverflowError):
        uid_ok = False
    if not uid_ok:
        info_logger.info("%s=====failed in recommend==bad uid=%s=========",
                         log_mark, uid)
        return bad_return

    # Time window.
    if not end:
        end = time.time()
    else:
        end = ensure_second_timestamp(end)

    begin = end - lookback

    # Extend the end by 5s in case the real-time row is not in hbase yet.
    end += 5.0
    if pid:  # qa trigger: look the info up by the given problem_id
        user_info0 = one_user_last_qa_info(pid)
    else:
        user_info0 = cy_time_event_one_user_kernel(uid, begin, end)

    res_dict = Recommend_by_user_info(user_info0,
                                      uid,
                                      log_mark=log_mark,
                                      num=num)
    res = res_dict['res']
    status = res_dict['status']
    if not res:
        info_logger.info("%s==failed in recommend==%s===uid=%s===========",
                         log_mark, status, uid)
        return bad_return

    # As before, only the last item is written to the success log (the
    # original looped over all items just to keep the final one).
    best_id, title, mtype = res[-1]
    info_logger.info(
        "%s==succeed in recommend===id=%s==title=%s====type=%s===uid=%s===========",
        log_mark, best_id, title, mtype, uid)
    return [{'id': item[0], 'title': item[1], 'type': item[2]} for item in res]
def cy_time_event_kernel_test(begin, end, test_uid=None):
    ############
    # query算15min内的所有,qa取最后一个,所以一个用户就取一次触发的例子,其他时间不要了
    connection = happybase.Connection('hbase_server',
                                      compat='0.90',
                                      port=19090,
                                      timeout=BIG_TIMEOUT)
    table = connection.table("cy_real_time_event")

    data_dict = {}

    interval = 5 * 60.0  # 15min
    caled_uid = set()
    max = 0.0

    begin = ensure_second_timestamp(begin)
    end = ensure_second_timestamp(end)
    row_prefix = str(test_uid) + '|' if test_uid else None
    for key, data in table.scan(row_prefix=row_prefix):
        uid, timestamp, event_type = key.split('|')
        if test_uid and int(uid) != int(test_uid):
            continue
        if event_type not in ("big_search", "free_problem_create"):
            continue

        timestamp = ensure_second_timestamp(timestamp)
        if timestamp > max:
            max = timestamp
        if timestamp > end or timestamp < begin:
            continue

        if uid in caled_uid:
            continue
        caled_uid.add(uid)

        end_t = timestamp + 1.0
        begin_t = end_t - interval
        user_info = cy_time_event_one_user_kernel(uid, begin_t, end_t)
        data_dict[uid] = user_info
    print "num of caled_uid", len(caled_uid)
    print "max timestamp", max
    return data_dict
def get_today_data(now=None):
    '''
    Run cy_time_event_kernel over the window returned by
    get_today_timestamp, i.e. the active-user data of the day of `now`.

    :param now: optional reference timestamp; when given it is normalized
        with ensure_second_timestamp and passed to get_today_timestamp,
        otherwise get_today_timestamp is called without arguments
    :return: whatever cy_time_event_kernel returns for that window
    '''
    if not now:
        begin, end = get_today_timestamp()
    else:
        begin, end = get_today_timestamp(ensure_second_timestamp(now))
    return cy_time_event_kernel(begin, end)
Пример #19
0
def get_user_qa_content2(uid, begin, end):
    '''
    Return all qa texts of the problems a user created strictly between
    begin and end. Problem ids come from the medical DB; the texts of each
    problem come from get_qa_texts_by_pid (per the original note, backed
    by the hbase problem2 table).

    :param uid: user id used in the ask_problem query
    :param begin: window start (normalized with ensure_second_timestamp)
    :param end: window end (normalized with ensure_second_timestamp)
    :return: flat list of texts; empty when the user has no problems
    '''
    window_start = timestamp2datetime(ensure_second_timestamp(begin))
    window_end = timestamp2datetime(ensure_second_timestamp(end))

    problem_query = (
        'select id from ask_problem where user_id=%s and created_time>"%s" and created_time<"%s";'
        % (uid, window_start, window_end))
    problem_rows = get_medicaldb_handler().do_one(problem_query)

    texts = []
    if problem_rows is None or len(problem_rows) == 0:
        return texts

    for row in problem_rows:
        texts.extend(get_qa_texts_by_pid(row[0]))
    return texts
def get_all_yesterday_user_id(now=None, test=False):
    '''
    Return the ids of the users with a big_search/free_problem_create
    event in the window given by get_yesterday_timestamp(now).

    :param now: reference timestamp; defaults to time.time(), otherwise
        normalized with ensure_second_timestamp
    :param test: when true, only the first thirty minutes of the window
        are used
    :return: result of get_sp_duration_valid_user_id for that window
    '''
    now = ensure_second_timestamp(now) if now else time.time()

    begin, end = get_yesterday_timestamp(now)
    if test:
        # Test mode: shrink the window to its first 30 minutes.
        end = begin + 30 * 60

    return get_sp_duration_valid_user_id(begin, end)
def get_user_search_keys(uid, begin, end):
    '''
    Return the keys of a user's search events with an event_time strictly
    between begin and end, taken from the search_event solr core.

    Per the original note, each key can be looked up in the hbase
    cy_event table for the details of that search.

    :param uid: user id used as the solr filter
    :param begin: window start (normalized with ensure_second_timestamp)
    :param end: window end (normalized with ensure_second_timestamp)
    :return: list of the 'id' field of every matching document
    '''
    # event_time is compared against the bounds multiplied by 1000.
    lower_ms = int(1000 * ensure_second_timestamp(begin))
    upper_ms = int(1000 * ensure_second_timestamp(end))

    solr_query = SolrQuery()
    solr_query.set('q', '*:*')
    solr_query.set('fl', ['id', 'event_time'])
    solr_query.add('fq', 'uid:%s' % uid)
    solr_query.set('rows', 100000)

    # The time filter is applied here, not in the solr query itself.
    keys = []
    for doc in solr_se.search(**solr_query.get_query_dict()):
        doc_id, event_time = doc['id'], doc['event_time']
        if lower_ms < event_time < upper_ms:
            keys.append(doc_id)
    return keys
def get_qa_text(uid, begin, end, num):
    '''
    Return the first-ask texts of a user's most recent problems together
    with their creation timestamps. Meant to be fast (per the original
    note): only the newest `num` problems are read from the DB, then
    filtered to [begin, end].

    :param uid: user id used in the ask_problem query
    :param begin: window start (normalized with ensure_second_timestamp)
    :param end: window end, inclusive (normalized the same way)
    :param num: SQL limit on the number of problems fetched
    :return: (text_list, ts_list), two parallel lists; ([], []) when the
        query yields nothing
    '''
    begin = ensure_second_timestamp(begin)
    end = ensure_second_timestamp(end)

    sql = 'select id,created_time,ask from ask_problem  where user_id=%s order by id desc limit %s;' % (
        uid, num)
    rows = get_medicaldb_handler().do_one(sql)
    if rows is None or len(rows) == 0:
        return [], []

    text_list, ts_list = [], []
    for row in rows:
        created_ts = datetime_str2timestamp(str(row[1]))
        if begin <= created_ts <= end:
            text_list.append(unicode(row[2]))
            ts_list.append(created_ts)
    return text_list, ts_list
def get_all_yesterday_user_id(now=None, test=False):
    '''
    Return the ids of all users that appear in cy_real_time_event in the
    window given by get_yesterday_timestamp(now).

    :param now: reference timestamp; defaults to time.time(), otherwise
        normalized with ensure_second_timestamp
    :param test: when true, only the first thirty minutes of the window
        are used
    :return: result of get_sp_duration_active_userid for that window
    '''
    from general_utils.hbase_utils import get_sp_duration_active_userid
    from general_utils.time_utils import get_yesterday_timestamp

    now = ensure_second_timestamp(now) if now else time.time()

    begin, end = get_yesterday_timestamp(now)
    if test:
        # Test mode: shrink the window to its first 30 minutes.
        end = begin + 30 * 60

    return get_sp_duration_active_userid(begin, end)
def get_view_news_data(row_prefix):
    connection = happybase.Connection('hbase_server',
                                      compat='0.90',
                                      port=19090,
                                      timeout=SMALL_TIMEOUT)
    table = connection.table("cy_event")
    news_viwers = defaultdict(set)
    cnt = 0
    print time.time()
    last_ts = None
    now = time.time()
    start = now - 86400 * 180

    focused_type = ('view_news', 'view_topic')
    if row_prefix not in focused_type:
        return
    all_types = defaultdict(int)
    for key, data in table.scan(row_prefix=row_prefix):

        try:
            action_type, ts, uid = key.split('|')
        except:
            continue
        all_types[action_type] += 1

        if action_type not in focused_type:
            continue

        last_ts = ensure_second_timestamp(ts)
        if last_ts < start:
            continue
        news_id = data[CY_REAL_TIME_EVENT_ATTR_MAP[action_type]]
        news_viwers[news_id].add(uid)
        cnt += 1
        if cnt % 1000 == 0:
            print timestamp2datetime(time.time()), cnt, len(news_viwers)
    print time.time()
    print 'last_ts', last_ts

    print len(news_viwers)
    for x in all_types:
        print x, all_types[x]
    with open('cy_event_%s.json' % row_prefix, 'w') as f:
        for news_id in news_viwers:
            str = json.dumps({
                'id': news_id,
                'uids': list(news_viwers[news_id]),
                'len': len(news_viwers[news_id])
            }) + '\n'
            f.write(str)
Пример #25
0
def Recommend(uid, lookback, end=None, pid=None):
    '''
    Recommend the single best material for a user.

    :param uid: user id; must be convertible to int and different from -1
    :param lookback: window length in seconds before `end`
    :param end: end of the trigger window; defaults to time.time()
    :param pid: optional problem id; when given the user info comes from
        one_user_last_qa_info(pid) instead of the real-time event table
    :return: [material_id, title, material_type]; [-1, "", "nothing"] on
        a bad uid or when nothing can be recommended
    '''
    bad_return = [-1, "", "nothing"]  # material_id, title, material_type
    log_mark = "recommend_one"
    info_logger.info(
        "%s===============start=========uid=%s==============pid=%s===============",
        log_mark, uid, str(pid))

    # Validate the uid. Only conversion errors are caught; the original
    # bare `except:` also swallowed KeyboardInterrupt/SystemExit.
    try:
        uid = int(uid)
        uid_ok = uid != -1
    except (TypeError, ValueError, OverflowError):
        uid_ok = False
    if not uid_ok:
        info_logger.info("%s=====failed in recommend==bad uid=%s=========",
                         log_mark, uid)
        return bad_return

    if not end:
        end = time.time()
    else:
        end = ensure_second_timestamp(end)

    begin = end - lookback

    # Extend the end by 5s in case the real-time row is not in hbase yet.
    end += 5.0

    if pid:  # qa trigger: look the info up by the given problem_id
        user_info0 = one_user_last_qa_info(pid)
    else:
        user_info0 = cy_time_event_one_user_kernel(uid, begin, end)

    res_dict = Recommend_by_user_info(user_info0, uid, log_mark=log_mark)
    res = res_dict['res']
    status = res_dict['status']
    if not res:
        info_logger.info("%s==failed in recommend==%s===uid=%s===========",
                         log_mark, status, uid)
        return bad_return

    # Only the top result is returned.
    best_id, title, mtype = res[0]
    info_logger.info(
        "%s==succeed in recommend===id=%s==title=%s====type=%s===uid=%s===========",
        log_mark, best_id, title, mtype, uid)
    return [int(best_id), title, mtype]
def cy_time_event_one_user_viewnews(uid, begin, end):
    '''
    Return the news a user viewed inside [begin, end] according to
    cy_real_time_event.

    :param uid: user id whose rows ("<uid>|..." prefix) are scanned
    :param begin: window start; compared as-is with normalized row
        timestamps, so the caller must pass a normalized value
    :param end: window end (inclusive), same requirement as begin
    :return: {news_id (int): timestamp}; when a news id occurs several
        times the timestamp of the row scanned last is kept
    '''
    connection = happybase.Connection('hbase_server',
                                      compat='0.90',
                                      port=19090,
                                      timeout=SMALL_TIMEOUT)
    res = {}
    try:
        table = connection.table("cy_real_time_event")
        for key, data in table.scan(row_prefix=str(uid) + '|'):
            # The uid part of the key is ignored so the `uid` parameter is
            # not overwritten (the original reassigned it in the loop).
            _, timestamp, event_type = key.split('|')

            if event_type != "view_news":
                continue
            timestamp = ensure_second_timestamp(timestamp)
            info_logger.info("real timestamp=%s", timestamp)
            if begin <= timestamp <= end:
                res[int(data["info:news_id"])] = timestamp
    finally:
        # The original never closed this connection.
        connection.close()
    return res
Пример #27
0
def g1():
    '''
    查看不使用热卖tag扩充的覆盖率,和使用热卖tag扩充的覆盖率

    分子,能匹配上热卖tag的,分母,一天内有活动用户(cy_event
    '''

    from general_utils.hbase_utils import get_user_query, get_user_query2
    from general_utils.solr_utils import get_last_login_uids
    from recommend.manager.recommend_tags_data_helper import get_relation_plan3
    from general_utils.db_utils import get_db_data_local_handler
    from general_utils.hbase_utils import get_sp_duration_active_userid

    from general_utils.time_utils import timestamp2datetime, ensure_second_timestamp
    # 用户采样时间窗
    # 用户采样命中率

    end_ds0 = '2018-01-21 23:59:40'
    end0 = datetime_str2timestamp(end_ds0)
    begin0 = end0 - 86400 * 1

    # 每个选中用户的数据采集时间窗
    end_ds = '2018-01-22 23:59:40'
    end = datetime_str2timestamp(end_ds)
    begin = end - 86400 * 180.0  # 半年

    # 最后登录时间在2018-01-21 23:59:40前一周的用户
    # test_uids = get_last_login_uids(begin0, end0)
    # test_uids = get_sp_duration_active_userid(begin0,end0)
    test_uids = get_one_day_uid_from_file('log_event_20180122')
    print "test_uids num", len(test_uids)

    # 打乱顺序,取1000个样本
    random.shuffle(test_uids)
    selected_uids = test_uids[:3000]

    all_good_cnt = 0
    all_cnt = 0
    app_cnt = 0
    good_app_cnt = 0

    text_empty_cnt = 0
    fo = open('180129_rp_1.csv', 'w')
    csvwriter = csv.writer(fo)
    first_line = [
        'uid', 'username', 'is_app', 'last_info_time', 'use_tags',
        'systag_ids', 'tag_names', 't', 'is_tangsai'
    ]
    csvwriter.writerow(first_line)
    # status_dict = {
    #     1: "qa and query",
    #     2: "view actions",
    #     3: "search_doctor clinic_no",
    #     0: ""
    # }

    total_time = {}
    for uid in selected_uids:
        print '==============uid=%s=======================' % uid
        username = get_username(uid)
        is_app = is_app_user(uid)

        all_cnt += 1
        if is_app:
            app_cnt += 1

        t1 = time.time()
        res = get_relation_plan3(uid, test=True)
        t2 = time.time()
        t = t2 - t1
        total_time[uid] = t
        status = res['status']
        is_tangsai = False
        if status:
            all_good_cnt += 1
            if is_app:
                good_app_cnt += 1
            systag_ids = res['ids']
            if 96 in systag_ids:
                is_tangsai = True
            tagnames = [
                get_db_data_local_handler().get_systagid_name(id)
                for id in systag_ids
            ]
            if status in (1, 2, 4):
                info0 = res['systag_id_dict']
                record_info = '~'.join(info0.keys())
            elif status == 3:
                info0 = res['clinic_no']
                record_info = '~'.join(info0)
            last_ts = res['last_ts']
            last_info_time = timestamp2datetime(
                ensure_second_timestamp(last_ts))

        else:
            systag_ids = []
            tagnames = []
            record_info = ''
            last_info_time = ''

        systag_ids_str = '~'.join([str(x) for x in systag_ids])
        tagnames_str = '~'.join(tagnames)

        line = convert2gbk([
            str(uid), username,
            str(is_app), last_info_time, record_info, systag_ids_str,
            tagnames_str,
            str(t),
            str(is_tangsai)
        ])
        csvwriter.writerow(line)

    line = [str(all_cnt), str(all_good_cnt), str(app_cnt), str(good_app_cnt)]
    csvwriter.writerow(line)
    s_total_time = sorted(total_time.iteritems(),
                          key=lambda x: x[1],
                          reverse=True)
    times = total_time.values()
    line = [str(min(times)), str(max(times)), str(sum(times) / len(times))]
    csvwriter.writerow(line)
    for uid, t in s_total_time[:10]:
        line = [str(uid), str(t)]
        csvwriter.writerow(line)

    fo.close()

    print str(max(times))
    print all_good_cnt
Пример #28
0
def main5(test_uid=None, now=None):
    if test_uid == "n":
        test_uid = None
    now = time.time()
    if not now:
        now = 1512379920.1
    else:
        now = float(ensure_second_timestamp(now))
    t10 = time.time()
    data_dict = cy_time_event_kernel_test(now - 12000.0, now, test_uid)
    t20 = time.time()

    print "len(data_dict)", len(data_dict)

    if not test_uid:
        fo = open("20171220_1_res.csv", "w")
    else:
        fo = open('test.csv', 'w')
    csvwriter = csv.writer(fo, dialect="excel")
    first_line = [
        u"uid", u"u_tags", u"special_population", u"trigger", u"trigger_info",
        u"trigger_time", u"material_id", u"material_type", u"score", u"title",
        u"m_tags", u"only_topic", u"best_id", u"best_score", u"time"
    ]
    csvwriter.writerow(first_line)
    all_call_cnt = 0
    all_valid_res_cnt = 0
    exception_cnt = 0
    status_dict = defaultdict(int)

    total_time = []
    slow_case = []
    for uid in data_dict:

        all_call_cnt += 1

        user_info0 = data_dict[uid]
        try:
            # if True:
            t1 = time.time()
            res = Recommend_by_user_info(user_info0,
                                         uid,
                                         log_mark='testmain5',
                                         test=True)

            # return = {"user_info": None, "res": None, "topn_ids_scores": None, "only_topic": None,"status":"succeed"}

            t2 = time.time()
            print t2 - t1
            if t2 - t1 >= 3:
                break

            user_info = res['user_info']
            res1 = res['res']
            topn_ids_scores = res['topn_ids_scores']
            only_topic = res['only_topic']
            status = res['status']
            v_score_dict = res['v_score_dict']

            best_id, best_title, mtype = res1[0]

            this_time = t2 - t1
            if this_time >= 1.0:
                slow_case.append([uid, this_time])
            total_time.append(t2 - t1)
        except Exception, e:
            print e

            exception_cnt += 0
            continue
        status_dict[status] += 1

        ####################
        # if not only_topic:
        #     continue
        ####################

        if best_id == -1 or user_info is None:
            continue

        print '================='
        print uid

        texts = user_info["texts"]
        tags = user_info["tags"]
        special_population = user_info["special_population"]
        trigger = user_info["trigger"]
        timestamp = user_info['timestamp']
        best_score = v_score_dict[mtype + '_' + str(best_id)]
        # if trigger == "big_search":
        #     continue

        if trigger == 'big_search':
            trigger_info = "-".join(texts)
        elif trigger == "free_problem_create":
            problem_id, ask = get_medicaldb_handler().get_ask_by_timestamp(
                uid, timestamp)
            if not ask:
                ask = texts[0]
            trigger_info = '-'.join([str(problem_id), str(ask)])
        print "u tags", "-".join(tags), special_population
        print trigger_info, best_id, best_score, best_title

        for unique_id, score in topn_ids_scores:
            material_type, id = unique_id.split('_')
            if material_type == "news":
                title, _ = get_newsdb_handler().get_title_digest_by_nid(id)
                m_tags = get_news_tags_from_solr("news_" + str(id))
            elif material_type == "topic":
                title = get_medicaldb_handler().get_topic_title(id)
                m_tags = get_news_tags_from_solr("r_topic_" + str(id))

            rows = [
                str(uid), "-".join(tags),
                str(special_population), trigger, trigger_info,
                str(timestamp),
                str(id), material_type,
                str(score), title, "-".join(m_tags),
                str(only_topic),
                str(best_id),
                str(best_score),
                str(this_time)
            ]
            rows = convert2gbk(rows)
            csvwriter.writerow(rows)
        all_valid_res_cnt += 1
def time_factor(ts, t0):
    '''
    Time-decay factor following Newton's law of cooling:
    exp(-k * |t0 - ts|), with both timestamps normalized by
    ensure_second_timestamp and `k` read from module scope.

    :param ts: timestamp of the event
    :param t0: reference timestamp
    :return: float decay factor
    '''
    elapsed = abs(ensure_second_timestamp(t0) - ensure_second_timestamp(ts))
    return math.exp(-k * elapsed)