예제 #1
0
def test9():
    from general_utils.db_utils import get_medicaldb_handler
    from add_data_to_solr.manager.add_utils import topic_info, doctor_info
    # Export per-topic quality scores together with the authoring doctor's
    # metadata into a GBK-encoded, Excel-dialect CSV.
    out = open("topic_score.csv", "w")
    writer = csv.writer(out, dialect='excel')
    header = [u'topic id', u'score', u'topic title', u'content len', u'image num', u'is original',
              u'doctor id', u'职称', u'医院级别', u'1科室', u"2科室", u'城市', u'hospital_name']
    writer.writerow(convert2gbk(header))
    data_dir = "data_dir/topic_data/"
    topics = pickle_from_file(data_dir + 'all_doc_big_2')
    for topic in topics:
        # Document ids look like "<prefix>_<numeric id>"; keep the numeric tail.
        topic_id = int(topic['id'].split('_')[-1])
        score = topic['tid'] / 10.0
        title = topic['title']
        content_len = topic['content_len']
        sql = 'select doctor_id from api_doctortopic where id=%s;' % topic_id
        result = get_medicaldb_handler().do_one(sql)
        doctor_id = result[0][0]
        t_info = topic_info(topic_id)
        d_info = doctor_info(doctor_id)
        row = [str(topic_id), str(score), title, str(content_len),
               str(t_info['image_num']), str(t_info['is_original']),
               doctor_id, d_info['title'], d_info['hospital_level'],
               d_info['first_class_clinic_no'], d_info['second_class_clinic_no'],
               d_info['city'], d_info['hospital_name']]
        writer.writerow(convert2gbk(row))
    out.close()
예제 #2
0
def test17():
    from general_utils.solr_utils import nat_get_title
    # For each record in the input JSON-lines file, resolve the titles of its
    # 10 nearest neighbours and write (id, title, top_id, top_title, score)
    # rows to a CSV. Processing stops after roughly 1000 input lines.
    src_path = sys.argv[2]
    mtype = 'topic' if 'topic' in src_path else 'news'
    out_path = mtype + '_nearest_top10.csv'
    out = open(out_path, 'w')
    writer = csv.writer(out)
    writer.writerow(['id', 'title', 'top_id', 'top_title', 'score'])
    processed = 0
    with open(src_path, 'r') as src:
        for raw in src:
            if processed > 1000:
                break
            processed += 1
            record = json.loads(raw.strip('\n'))
            main_id = record['id']
            main_title = nat_get_title(mtype + '_' + str(main_id))
            # Skip records whose main document is no longer in solr.
            if not main_title:
                continue
            for neighbour_id, score in record['top'][:10]:
                neighbour_title = nat_get_title(mtype + '_' +
                                                str(neighbour_id))
                writer.writerow(convert2gbk([
                    str(main_id), main_title,
                    str(neighbour_id), neighbour_title,
                    str(score)
                ]))
    out.close()
예제 #3
0
def test11():
    # Dump every medical word (id, word, category) from the pickled word
    # detail dict into a GBK-encoded CSV. No header row is written.
    filename = "/home/classify/workspace/medical_data/data_dir/medical_word_detail.pickle"
    word_detail = pickle_from_file(filename)
    out = open("all_medical_words.csv", "w")
    writer = csv.writer(out, dialect='excel')

    for word in word_detail:
        detail = word_detail[word]
        writer.writerow(convert2gbk([str(detail['id']), word, detail['cate']]))
    out.close()
예제 #4
0
def test18():
    # Measure the share of online health-news articles that have no cover
    # image, and dump the image-less ones (id, title, content length,
    # created time) followed by summary counts.
    from general_utils.db_utils import get_newsdb_handler
    from general_utils.text_utils import filterHTML

    sql = 'select id,title,content,created_time,mini_img from news_healthnews where is_online=1;'
    records = get_newsdb_handler().do_one(sql)

    with_image = 0
    without_image = 0

    out = open('news_with_no_image_info.csv', 'w')
    writer = csv.writer(out)
    writer.writerow(['id', 'title', 'content_len', 'created_time'])

    for record in records:
        news_id = record[0]
        title = record[1]
        # Untitled rows are ignored entirely (counted in neither bucket).
        if not title:
            continue

        content_len = len(filterHTML(record[2]))
        created_time = record[3]
        mini_img = record[4]

        # A non-trivial mini_img value means the article has a cover image.
        if mini_img and len(mini_img) > 5:
            with_image += 1
            continue
        without_image += 1

        writer.writerow(convert2gbk(
            [str(news_id), title, str(content_len), str(created_time)]))

    # Trailing summary rows (not GBK-converted, ASCII only).
    writer.writerow(['no_image_cnt', 'has_image_cnt', 'all'])
    writer.writerow([
        str(without_image),
        str(with_image),
        str(without_image + with_image)
    ])
    out.close()
예제 #5
0
def test4():
    from general_utils.db_utils import get_medical_entity_handler
    # Export disease id/name/frequency from the medical-entity DB to a CSV.
    sql = 'select id,name,frequency from medicaldb_newdiseases;'
    records = get_medical_entity_handler(False).do_one(sql)
    out = open("diseases_frequency.csv", "w")
    writer = csv.writer(out, dialect='excel')
    writer.writerow(["id", "name", "freq"])
    for record in records:
        disease_id, name, freq = record[0], record[1], record[2]
        writer.writerow(convert2gbk([disease_id, name, freq]))
    out.close()
예제 #6
0
def main6(test_uid=None):
    # test recommend_topics
    from recommend.manager.recommend_resource import Recommend_topics
    from recommend.manager.recommend_topic_data_helper import parse_user_info as parse_user_info2
    now = time.time()
    if test_uid == "n":
        test_uid = None
    data_dict = cy_time_event_kernel_test(now - 12000.0, now, test_uid)
    fo = open("20180102_rt.csv", "w")
    csvwriter = csv.writer(fo)
    first_line = ['uid', 'tags', 'sp', 'topicid', 'score', 't_title', 't_tags']
    csvwriter.writerow(first_line)
    times = {}
    for uid in data_dict.keys():
        t1 = time.time()
        topic_ids, user_info, score_dict = Recommend_topics(uid, 5, now, True)
        t2 = time.time()
        times[uid] = t2 - t1
        if not user_info:
            continue
        tags = user_info['tags']
        sp = user_info['special_population']
        for x in topic_ids:
            title = nat_get_title('topic_' + str(x))
            score = score_dict['topic_' + str(x)]
            t_tags = get_news_tags_from_solr("r_topic_" + str(x))
            row = [
                str(uid), '-'.join(tags), sp,
                str(x),
                str(score), title, '-'.join(t_tags)
            ]
            row = convert2gbk(row)
            csvwriter.writerow(row)
    fo.close()

    s_times = sorted(times.iteritems(), key=lambda x: x[1], reverse=True)[:10]
    for x, y in s_times:
        print x, y
예제 #7
0
def main9(test_uid=None):
    # Smoke-test Recommend_tags over recently active users: write one CSV row
    # per user (uid, last query, recommended tags, plan names), then append
    # the 10 slowest calls. Output goes to 20180102_1_rtr.csv, or test.csv
    # when a single test uid is given ("n" means no specific uid).
    #
    # Fix: the output file was never closed, so buffered rows (including the
    # timing summary appended after the loop) could be lost.
    from recommend.manager.recommend_resource import Recommend_tags
    from recommend.manager.recommend_tags_data_helper import get_user_last_query
    if test_uid == "n":
        test_uid = None
    now = time.time()
    # now = 1513780888
    data_dict = cy_time_event_kernel_test(now - 2000, now, test_uid)
    if not test_uid:
        fo = open("20180102_1_rtr.csv", "w")
    else:
        fo = open('test.csv', 'w')

    first_line = ['uid', 'last_query', 'r_tags', 'r_plan']
    csvwriter = csv.writer(fo)
    csvwriter.writerow(first_line)
    total_t = {}
    for uid in data_dict.keys():
        t1 = time.time()
        res = Recommend_tags(uid)
        t2 = time.time()
        total_t[uid] = t2 - t1
        # res = {'words': tags, 'plan': plans}
        words = res['words']
        plan = res['plan']
        last_query = get_user_last_query(uid)
        row = [
            str(uid), last_query, '-'.join(words),
            '-'.join([item['name'] for item in plan])
        ]
        row = convert2gbk(row)
        csvwriter.writerow(row)

    # Append the 10 slowest recommendation calls for profiling.
    s_total_t = sorted(total_t.iteritems(), key=lambda x: x[1], reverse=True)
    for uid, t in s_total_t[:10]:
        csvwriter.writerow([str(uid), str(t)])
    fo.close()
예제 #8
0
def main8(test_uid=None):
    # test cf
    from recommend.manager.recommend_resource import Recommend_by_user_info
    if test_uid == "n":
        test_uid = None
    now = time.time()
    # now = 1513780888
    data_dict = cy_time_event_kernel_test(now - 6000, now, test_uid)
    if not test_uid:
        fo = open("20171229_1_cfr.csv", "w")
    else:
        fo = open('test.csv', 'w')
    csvwriter = csv.writer(fo, dialect="excel")
    first_line = [
        u"uid", u"u_tags", u"special_population", u"trigger", u"trigger_info",
        u"trigger_time", u"material_id", u"material_type", u"score", u"title",
        u"m_tags", u"only_topic"
    ]
    csvwriter.writerow(first_line)
    fail_cases = {
        'big_search': defaultdict(int),
        'free_problem_create': defaultdict(int)
    }

    all_uid_cnt = 0
    all_valid_res_cnt = 0
    qa_score = [[1.0 - i / 10.0, 0] for i in range(11)]
    bs_score = [[1.0 - i / 10.0, 0] for i in range(11)]
    trigger_cnt = {'qa': 0, 'bs': 0}
    cal_time = {}
    for uid in data_dict.keys():
        time.sleep(0.5)
        print '=' * 10, uid, '=' * 10

        user_info0 = data_dict[uid]
        t1 = time.time()
        res = Recommend_by_user_info(user_info0=user_info0,
                                     uid=uid,
                                     log_mark="test8",
                                     num=6,
                                     test=True)
        t2 = time.time()
        cal_time[uid] = t2 - t1

        user_info = res['user_info']
        res1 = res['res']
        topn_ids_scores = res['topn_ids_scores']
        only_topic = res['only_topic']
        status = res['status']
        v_score_dict = res['v_score_dict']
        if not user_info:
            continue
        all_uid_cnt += 1
        trigger = user_info["trigger"]
        if trigger == "big_search":
            trigger_cnt['bs'] += 1
        else:
            trigger_cnt['qa'] += 1

        if status != 'succeed':
            fail_cases[trigger][status] += 1
            continue

        texts = user_info["texts"]
        tags = user_info["tags"]
        special_population = user_info["special_population"]

        timestamp = user_info['timestamp']

        best_id, best_title, mtype = res1[0]
        best_score = v_score_dict[mtype + '_' + str(best_id)]

        if trigger == 'big_search':
            for i, item in enumerate(bs_score):
                if best_score >= item[0]:
                    bs_score[i][1] += 1
                    break
        else:
            for i, item in enumerate(qa_score):
                if best_score >= item[0]:
                    qa_score[i][1] += 1
                    break

        if trigger == 'big_search':
            trigger_info = "-".join(texts)
        elif trigger == "free_problem_create":
            problem_id, ask = get_medicaldb_handler().get_ask_by_timestamp(
                uid, timestamp)
            if not ask:
                ask = texts[0]
            trigger_info = '-'.join([str(problem_id), str(ask)])

            # [u"uid", u"u_tags", u"special_population", u"trigger", u"trigger_info", u"trigger_time",
            # u"material_id", u"material_type", u"score", u"title", u"m_tags", u"only_topic",

        for id, title, mtype in res1:
            prefix = 'news_' if mtype == 'news' else 'r_topic_'
            mtags = get_news_tags_from_solr(prefix + str(uid))
            rows = [
                str(uid), '-'.join(tags), special_population, trigger,
                trigger_info,
                str(timestamp),
                str(id), mtype, v_score_dict[mtype + '_' + str(id)], title,
                '-'.join(mtags),
                str(only_topic)
            ]

            rows = convert2gbk(rows)
            csvwriter.writerow(rows)
        if res1:
            all_valid_res_cnt += 1

    # fail_cases
    for trigger in fail_cases:
        for reason in fail_cases[trigger]:
            rows = [trigger, reason, str(fail_cases[trigger][reason])]
            rows = convert2gbk(rows)
            csvwriter.writerow(rows)
    # ana
    rows = ['all', str(all_uid_cnt), 'res_cnt', str(all_valid_res_cnt)]
    rows = convert2gbk(rows)
    csvwriter.writerow(rows)

    # score cut
    rows = ['bs score cut']
    csvwriter.writerow(rows)

    cum_cnt = 0
    for score, cnt in bs_score:
        cum_cnt += cnt
        true_recall = cum_cnt / float(trigger_cnt['bs'])
        rows = [str(score), str(cnt), str(true_recall)]
        csvwriter.writerow(rows)

    rows = ['qa score cut']
    csvwriter.writerow(rows)

    cum_cnt = 0
    for score, cnt in qa_score:
        cum_cnt += cnt
        true_recall = cum_cnt / float(trigger_cnt['bs'])
        rows = [str(score), str(cnt), str(true_recall)]
        csvwriter.writerow(rows)

    # cal time

    s_cal_time = sorted(cal_time.iteritems(), key=lambda x: x[1], reverse=True)
    for u, t in s_cal_time[:20]:
        csvwriter.writerow([str(u), str(t)])

    fo.close()
예제 #9
0
def main5(test_uid=None, now=None):
    if test_uid == "n":
        test_uid = None
    now = time.time()
    if not now:
        now = 1512379920.1
    else:
        now = float(ensure_second_timestamp(now))
    t10 = time.time()
    data_dict = cy_time_event_kernel_test(now - 12000.0, now, test_uid)
    t20 = time.time()

    print "len(data_dict)", len(data_dict)

    if not test_uid:
        fo = open("20171220_1_res.csv", "w")
    else:
        fo = open('test.csv', 'w')
    csvwriter = csv.writer(fo, dialect="excel")
    first_line = [
        u"uid", u"u_tags", u"special_population", u"trigger", u"trigger_info",
        u"trigger_time", u"material_id", u"material_type", u"score", u"title",
        u"m_tags", u"only_topic", u"best_id", u"best_score", u"time"
    ]
    csvwriter.writerow(first_line)
    all_call_cnt = 0
    all_valid_res_cnt = 0
    exception_cnt = 0
    status_dict = defaultdict(int)

    total_time = []
    slow_case = []
    for uid in data_dict:

        all_call_cnt += 1

        user_info0 = data_dict[uid]
        try:
            # if True:
            t1 = time.time()
            res = Recommend_by_user_info(user_info0,
                                         uid,
                                         log_mark='testmain5',
                                         test=True)

            # return = {"user_info": None, "res": None, "topn_ids_scores": None, "only_topic": None,"status":"succeed"}

            t2 = time.time()
            print t2 - t1
            if t2 - t1 >= 3:
                break

            user_info = res['user_info']
            res1 = res['res']
            topn_ids_scores = res['topn_ids_scores']
            only_topic = res['only_topic']
            status = res['status']
            v_score_dict = res['v_score_dict']

            best_id, best_title, mtype = res1[0]

            this_time = t2 - t1
            if this_time >= 1.0:
                slow_case.append([uid, this_time])
            total_time.append(t2 - t1)
        except Exception, e:
            print e

            exception_cnt += 0
            continue
        status_dict[status] += 1

        ####################
        # if not only_topic:
        #     continue
        ####################

        if best_id == -1 or user_info is None:
            continue

        print '================='
        print uid

        texts = user_info["texts"]
        tags = user_info["tags"]
        special_population = user_info["special_population"]
        trigger = user_info["trigger"]
        timestamp = user_info['timestamp']
        best_score = v_score_dict[mtype + '_' + str(best_id)]
        # if trigger == "big_search":
        #     continue

        if trigger == 'big_search':
            trigger_info = "-".join(texts)
        elif trigger == "free_problem_create":
            problem_id, ask = get_medicaldb_handler().get_ask_by_timestamp(
                uid, timestamp)
            if not ask:
                ask = texts[0]
            trigger_info = '-'.join([str(problem_id), str(ask)])
        print "u tags", "-".join(tags), special_population
        print trigger_info, best_id, best_score, best_title

        for unique_id, score in topn_ids_scores:
            material_type, id = unique_id.split('_')
            if material_type == "news":
                title, _ = get_newsdb_handler().get_title_digest_by_nid(id)
                m_tags = get_news_tags_from_solr("news_" + str(id))
            elif material_type == "topic":
                title = get_medicaldb_handler().get_topic_title(id)
                m_tags = get_news_tags_from_solr("r_topic_" + str(id))

            rows = [
                str(uid), "-".join(tags),
                str(special_population), trigger, trigger_info,
                str(timestamp),
                str(id), material_type,
                str(score), title, "-".join(m_tags),
                str(only_topic),
                str(best_id),
                str(best_score),
                str(this_time)
            ]
            rows = convert2gbk(rows)
            csvwriter.writerow(rows)
        all_valid_res_cnt += 1
def a1(log_file_name, ana_file_name, bdp_file_name):
    # Parse one day's recommendation log: for every successful push, write a
    # CSV row (user, trigger time/type, user texts/tags/population, article
    # id/title/tags/type, return time, click time) to ana_file_name, and
    # write per-reason funnel counters ("name|||count" lines) to
    # bdp_file_name.
    START = "==========start======="
    # For 2017-11-08: successfully recommended pushes -- user uid, trigger
    # time, pushed article id, push time, whether the user viewed the article
    # after the push that day, and the view time.

    today_zero, today_end = get_today_timestamp(time.time() - 86400.0)

    def get_uid(l):
        # uid is embedded in the log line as "...=uid=<uid>=...".
        return l.split("=uid=")[1].split('=')[0]

    fi = open(log_file_name, 'r')
    fo = open(ana_file_name, "w")
    csvwriter = csv.writer(fo, dialect='excel')
    first_line = [u"uid", u"触发时间", u"触发类型", u"用户全文", u"用户tag", u"用户人群",
                  u"文章id", u"文章标题", u"文章tag", u"文章分类", u"返回时间", u"点击时间"]
    csvwriter.writerow(convert2gbk(first_line))
    uid = None
    uni_key0 = None      # "<uid>|<trigger time>" key of the request being parsed
    trigger_time = None
    trigger_type = None  # "bs" (big search) or "qa" (free problem create)
    caled = set()        # uid+return_time keys already written (dedup)
    all = set()  # all triggered requests
    # NOTE(review): `all` is never populated -- the only all.add() call below
    # is commented out, so the "所有uid" summary column is always 0. It also
    # shadows the builtin all().
    reason = None        # outcome of the current request; None until logged

    all_qa = defaultdict(set)  # reason -> set of uni_keys, qa trigger
    all_bs = defaultdict(set)  # reason -> set of uni_keys, bs trigger
    cnt = 0
    for l in fi:
        # if not l.startswith("2017-11-08"):
        #     continue
        cnt += 1
        # if cnt > 10000:
        #     continue

        if START in l:
            # Close out bookkeeping for the previous request first.
            if reason and uni_key0 and trigger_type:
                if trigger_type == "bs":
                    all_bs["all"].add(uni_key0)
                elif trigger_type == "qa":
                    all_qa["all"].add(uni_key0)

            # A request that reached START but never logged an outcome failed.
            if uni_key0 and trigger_type and not reason:
                if trigger_type == "bs":
                    all_bs["failed"].add(uni_key0)
                elif trigger_type == "qa":
                    all_qa["failed"].add(uni_key0)

            uid = get_uid(l)
            trigger_time = l.split(',')[0]
            uni_key0 = uid + '|' + trigger_time

            # "pid=None" means there is no problem id, i.e. a big-search trigger.
            if "pid=None" in l:
                trigger_type = "bs"

            else:
                trigger_type = "qa"
                all_qa["all"].add(uni_key0)

            reason = None

            # all.add(uni_key0)

            trigger_ts = datetime_str2timestamp(trigger_time)
            print "uni_key", uni_key0
            print "ts", trigger_ts

        # if "=trigger=" in l:
        #     trigger_type0 = l.split("=trigger=")[1].split('=')[0]

        # NOTE(review): the three fields below are only bound once their log
        # lines have been seen; a "succeed" line appearing before them would
        # raise NameError in the rows=[...] below -- confirm log ordering.
        if "=special_population=" in l:
            special_population0 = l.split("=special_population=")[1].split("=")[0]

        if "=texts=" in l:
            texts0 = l.split("=texts=")[1].split("=")[0]

        if "=tags=" in l:
            tags0 = l.split("=tags=")[1].split("=")[0]

        if "failed in recommend==" in l:
            # Record the failure reason for the current request.
            reason = l.split("failed in recommend==")[1].split("=")[0]
            if trigger_type == "qa":

                if reason not in all_qa:
                    all_qa[reason] = set([uni_key0])
                else:
                    all_qa[reason].add(uni_key0)
            elif trigger_type == "bs":

                if reason not in all_bs:
                    all_bs[reason] = set([uni_key0])
                else:
                    all_bs[reason].add(uni_key0)

        if "succeed in recommend==========" in l:
            reason = "succeed"
            if trigger_type == "qa":
                if reason not in all_qa:
                    all_qa[reason] = set([uni_key0])
                else:
                    all_qa[reason].add(uni_key0)
            elif trigger_type == "bs":
                if reason not in all_bs:
                    all_bs[reason] = set([uni_key0])
                else:
                    all_bs[reason].add(uni_key0)

            return_time = l.split(',')[0]
            uni_key = uid + return_time
            # Skip pushes we have already written a row for.
            if uni_key in caled:
                continue
            print 'WWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWW'

            caled.add(uni_key)
            return_ts = datetime_str2timestamp(return_time)
            nid = l.split("=====id=")[1].split("=")[0]
            ntitle = l.split("===title=")[1].split("=")[0]

            # news_title = get_db_data_local_handler().get_news_title(nid)
            news_type = get_db_data_local_handler().get_news_type(nid)
            news_tags = u'|||'.join(get_news_tags_from_solr("news_" + str(nid)))

            print uid
            print trigger_type

            print nid
            print return_ts
            print type(ntitle)

            # first_line = [u"uid", u"触发时间", u"触发类型",u"用户全文",u"用户tag",u"用户人群",
            #    u"文章id", u"文章标题",u"文章tag",u"文章分类",u"返回时间", u"点击时间"]
            # Views of this article between push time and yesterday's midnight.
            views = cy_time_event_one_user_viewnews(uid, begin=return_ts, end=today_end)
            print views

            # views.get(nid, -1): -1 (i.e. no click time) when never viewed.
            rows = [str(uid), trigger_time, trigger_type, texts0, tags0, special_population0,
                    str(nid), ntitle, news_tags, news_type, return_time,
                    str(timestamp2datetime(views.get(nid, -1)))]
            rows = convert2gbk(rows)

            csvwriter.writerow(rows)

    # Summary row: total triggered uids vs. uids that got a push.
    csvwriter.writerow([u"所有uid".encode("gbk"), u"推了的uid".encode("gbk")])
    rows = [str(len(all)), str(len(caled))]
    rows = convert2gbk(rows)
    csvwriter.writerow(rows)

    fi.close()
    fo.close()

    for x in all_qa:
        print x + "|||" + str(len(all_qa[x]))

    for x in all_bs:
        print x + "|||" + str(len(all_bs[x]))

    # Funnel counters for the BDP dashboard; the reason strings must match
    # the recommender's log messages exactly (including trailing spaces).
    with open(bdp_file_name, "w") as f:
        f.write("news_all_input_qa|||" + str(len(all_qa["all"])) + "\n")
        f.write("news_all_output_qa|||" + str(len(all_qa["succeed"])) + "\n")
        f.write("news_no_info_qa|||" + str(len(all_qa["user_info is None "])) + "\n")
        f.write("news_filtered_by_preprocessing_qa|||" + str(len(all_qa["filter_user_info bad "])) + "\n")
        f.write("news_empty_res_qa|||" + str(len(all_qa["topn_ids_scores empty"])) + "\n")
        f.write("news_bad_res_qa|||" + str(len(all_qa["best_score so low"])) + "\n")
        f.write("qa_failed|||" + str(len(all_qa["failed"])) + "\n")

        f.write("news_all_input_bs|||" + str(len(all_bs["all"])) + "\n")
        f.write("news_all_output_bs|||" + str(len(all_bs["succeed"])) + "\n")
        f.write("news_no_info_bs|||" + str(len(all_bs["user_info is None "])) + "\n")
        f.write("news_filtered_by_preprocessing_bs|||" + str(len(all_bs["filter_user_info bad "])) + "\n")
        f.write("news_empty_res_bs|||" + str(len(all_bs["topn_ids_scores empty"])) + "\n")
        f.write("news_bad_res_bs|||" + str(len(all_bs["best_score so low"])) + "\n")
        f.write("bs_failed|||" + str(len(all_bs["failed"])) + "\n")
예제 #11
0
def g1():
    '''
    Compare plan-tag coverage with and without hot-sale tag expansion.

    Numerator: users that can be matched to a hot-sale tag; denominator:
    users active within one day (cy_event).
    '''

    from general_utils.hbase_utils import get_user_query, get_user_query2
    from general_utils.solr_utils import get_last_login_uids
    from recommend.manager.recommend_tags_data_helper import get_relation_plan3
    from general_utils.db_utils import get_db_data_local_handler
    from general_utils.hbase_utils import get_sp_duration_active_userid

    from general_utils.time_utils import timestamp2datetime, ensure_second_timestamp
    # user-sampling time window
    # user-sampling hit rate

    end_ds0 = '2018-01-21 23:59:40'
    end0 = datetime_str2timestamp(end_ds0)
    begin0 = end0 - 86400 * 1

    # data-collection time window for each selected user
    end_ds = '2018-01-22 23:59:40'
    end = datetime_str2timestamp(end_ds)
    begin = end - 86400 * 180.0  # half a year

    # users whose last login falls in the week before 2018-01-21 23:59:40
    # test_uids = get_last_login_uids(begin0, end0)
    # test_uids = get_sp_duration_active_userid(begin0,end0)
    test_uids = get_one_day_uid_from_file('log_event_20180122')
    print "test_uids num", len(test_uids)

    # shuffle and take a sample
    # NOTE(review): original comment says 1000 samples but the slice takes 3000.
    random.shuffle(test_uids)
    selected_uids = test_uids[:3000]

    all_good_cnt = 0   # users for whom a plan was produced
    all_cnt = 0        # all sampled users processed
    app_cnt = 0        # sampled users that are app users
    good_app_cnt = 0   # app users for whom a plan was produced

    text_empty_cnt = 0
    fo = open('180129_rp_1.csv', 'w')
    csvwriter = csv.writer(fo)
    first_line = [
        'uid', 'username', 'is_app', 'last_info_time', 'use_tags',
        'systag_ids', 'tag_names', 't', 'is_tangsai'
    ]
    csvwriter.writerow(first_line)
    # status_dict = {
    #     1: "qa and query",
    #     2: "view actions",
    #     3: "search_doctor clinic_no",
    #     0: ""
    # }

    total_time = {}  # uid -> get_relation_plan3 wall-clock duration
    for uid in selected_uids:
        print '==============uid=%s=======================' % uid
        username = get_username(uid)
        is_app = is_app_user(uid)

        all_cnt += 1
        if is_app:
            app_cnt += 1

        t1 = time.time()
        res = get_relation_plan3(uid, test=True)
        t2 = time.time()
        t = t2 - t1
        total_time[uid] = t
        status = res['status']
        is_tangsai = False  # True when systag id 96 is among the matched ids
        if status:
            all_good_cnt += 1
            if is_app:
                good_app_cnt += 1
            systag_ids = res['ids']
            if 96 in systag_ids:
                is_tangsai = True
            tagnames = [
                get_db_data_local_handler().get_systagid_name(id)
                for id in systag_ids
            ]
            # NOTE(review): record_info is only assigned for status in
            # (1, 2, 3, 4); any other truthy status would leave it unbound
            # (or stale from the previous iteration) -- confirm the status
            # domain of get_relation_plan3.
            if status in (1, 2, 4):
                info0 = res['systag_id_dict']
                record_info = '~'.join(info0.keys())
            elif status == 3:
                info0 = res['clinic_no']
                record_info = '~'.join(info0)
            last_ts = res['last_ts']
            last_info_time = timestamp2datetime(
                ensure_second_timestamp(last_ts))

        else:
            systag_ids = []
            tagnames = []
            record_info = ''
            last_info_time = ''

    # fields are '~'-joined so they stay single CSV cells
        systag_ids_str = '~'.join([str(x) for x in systag_ids])
        tagnames_str = '~'.join(tagnames)

        line = convert2gbk([
            str(uid), username,
            str(is_app), last_info_time, record_info, systag_ids_str,
            tagnames_str,
            str(t),
            str(is_tangsai)
        ])
        csvwriter.writerow(line)

    # Summary: counts, then min/max/mean timing, then the 10 slowest users.
    line = [str(all_cnt), str(all_good_cnt), str(app_cnt), str(good_app_cnt)]
    csvwriter.writerow(line)
    s_total_time = sorted(total_time.iteritems(),
                          key=lambda x: x[1],
                          reverse=True)
    times = total_time.values()
    line = [str(min(times)), str(max(times)), str(sum(times) / len(times))]
    csvwriter.writerow(line)
    for uid, t in s_total_time[:10]:
        line = [str(uid), str(t)]
        csvwriter.writerow(line)

    fo.close()

    print str(max(times))
    print all_good_cnt
예제 #12
0
def g2():
    # test recommend_news
    '''
    Batch-test recommend_news_kernel on a sample of active app users and
    dump each user's context (tags, searches, questions, viewed titles)
    plus the recommended news (id, title, score) to a CSV, followed by
    timing statistics and success counts.
    '''
    from recommend.manager.feed_data_helper import recommend_news_kernel
    from general_utils.solr_utils import nat_get_title

    test_uids = get_one_day_uid_from_file('log_event_20180222')
    print "test_uids num", len(test_uids)

    # shuffle and take a 1000-user sample
    random.shuffle(test_uids)
    selected_uids = test_uids[:1000]

    fo = open('20180321_rn_1.csv', 'w')
    csvwriter = csv.writer(fo)
    first_line = [
        'uid', 'username', 'utags', 'user_bs', 'user_qa', 'user_look_title',
        'news_id', 'title', 'score'
    ]
    csvwriter.writerow(first_line)
    total_time = {}  # uid -> recommend_news_kernel wall-clock duration

    cnt_all = 0   # app users processed
    cnt_good = 0  # app users that received at least one recommendation

    for uid in selected_uids:
        print '==============uid=%s=======================' % uid
        username = get_username(uid)
        is_app = is_app_user(uid)
        # Only app users are evaluated.
        if not is_app:
            continue
        cnt_all += 1
        t1 = time.time()
        recommend_res = recommend_news_kernel(uid, True)
        t2 = time.time()
        total_time[uid] = t2 - t1
        parsed_user_info = recommend_res['parsed_user_info']
        utags = parsed_user_info['weight_dict'].keys()
        user_info_list = recommend_res['user_info_list']

        # Split the user's recent actions by type:
        # bs/sd = search text, qa = question text, vt/vn = viewed topic/news id.
        bs_text_list = []
        qa_text_list = []
        view_news_title_list = []
        view_topic_title_list = []

        for ts, obj, action_type in user_info_list:
            if action_type in ('bs', 'sd'):
                bs_text_list.append(obj)
            elif action_type == 'qa':
                qa_text_list.append(obj)
            elif action_type == 'vt':
                title = nat_get_title('topic_' + str(obj))
                view_topic_title_list.append(title)
            elif action_type == 'vn':
                title = nat_get_title('news_' + str(obj))
                view_news_title_list.append(title)

        user_bs = '~'.join([str(item) for item in bs_text_list])
        user_qa = '~'.join([str(item) for item in qa_text_list])
        user_look_title = '~'.join([
            str(item) for item in view_news_title_list + view_topic_title_list
        ])

        title_dict = recommend_res['title_dict']
        ids_list = recommend_res['ids']
        score_dict = recommend_res['v_score_dict']
        # Flatten the grouped id lists into "<group index>-news_<id>" strings.
        ids = [['%s-news_' % i + str(x) for x in ids]
               for [i, ids] in enumerate(ids_list)]
        ids1 = []
        for x in ids:
            ids1.extend(x)
        ids = ids1
        tcnt = 0

        if ids:
            cnt_good += 1
        for id in ids:
            id0 = id.split('-')[1]
            title = title_dict[id0]
            score = score_dict[id0]
            # Only the first row of a user repeats the uid/username/context;
            # subsequent rows leave those cells blank for readability.
            if tcnt == 0:
                line = convert2gbk([
                    str(uid), username, '~'.join(utags), user_bs, user_qa,
                    user_look_title,
                    str(id), title, score
                ])
            else:
                line = convert2gbk([
                    ' ', ' ', '~'.join(utags), user_bs, user_qa,
                    user_look_title,
                    str(id), title, score
                ])
            csvwriter.writerow(line)
            tcnt += 1

    # NOTE(review): if no sampled uid is an app user, total_time is empty and
    # min()/max() below raise ValueError (and mean divides by zero).
    min_t = min(total_time.values())
    max_t = max(total_time.values())
    mean_t = sum(total_time.values()) / len(total_time)

    line = ['min', 'max', 'mean']
    csvwriter.writerow(line)
    line = [str(min_t), str(max_t), str(mean_t)]
    csvwriter.writerow(line)

    # The 10 slowest recommendation calls.
    sorted_total_time = sorted(total_time.iteritems(),
                               key=lambda x: x[1],
                               reverse=True)
    for uid, t in sorted_total_time[:10]:
        line = [str(uid), str(t)]
        csvwriter.writerow(line)

    line = ['all_app_user_num', 'good_add_user_num']
    csvwriter.writerow(line)
    line = [str(cnt_all), str(cnt_good)]
    csvwriter.writerow(line)

    fo.close()
예제 #13
0
def test10():
    from rpc_services.word2vec_api import get_similar
    from rpc_services.medical_service_api import tokenizer_default

    # 寻找相似词
    # id  query  分词结果  实体词分类  疾病词1 疾病词2 疾病词3  症状词1 症状词2 症状词3 药品词1 药品词2  药品词3
    # input_file = "/Users/satoshi/Documents/work file/query_result_o1.csv"
    input_file = sys.argv[2]
    endict = pickle_from_file(
        "/home/classify/workspace/medical_data/data_dir/medical_word_detail.pickle"
    )
    first_line = [
        u"id",
        u"query",
        u"words",
        u"cates",
        u"disease",
        u"symptom",
        u"drug",
    ]
    fo = open("query_similar_words.csv", "w")
    csvwriter = csv.writer(fo, dialect='excel')
    csvwriter.writerow(first_line)

    with open(input_file, 'r') as f:
        for l in f:
            ll = l.strip('\n').split(',')
            print l
            print ll
            id, text = ll[0], ll[1]
            text = text.decode('gbk', 'ignore')
            similar_word_score_dict = {}
            seged = []
            cates = []
            tokens = tokenizer_default([text])["tokens"][0]

            for item in tokens:
                if u"neg_ne" in item:
                    continue
                if "cate" not in item:
                    continue
                word = item['token']
                if word in seged:
                    continue
                seged.append(word)
                cates.append(item['cate'])

            for x in seged:
                x_s = get_similar(x, 100)
                if not x_s:
                    continue
                for w, s in x_s:
                    if w not in similar_word_score_dict:
                        similar_word_score_dict[w] = s
                    elif s > similar_word_score_dict[w]:
                        similar_word_score_dict[w] = s

            dis = []
            sym = []
            drug = []
            s_similar_word_score = sorted(similar_word_score_dict.iteritems(),
                                          key=lambda x: x[1],
                                          reverse=True)
            for w, s in s_similar_word_score:
                if w not in endict:
                    continue
                cate = endict[w]['cate']
                if cate == "SYMPTOM_DESC" and len(sym) < 3:
                    sym.append(w)
                if cate == "DISEASE_DESC" and len(dis) < 3:
                    dis.append(w)
                if cate == "DRUG_DESC" and len(drug) < 3:
                    drug.append(w)
            row = [
                id, text, u"|||".join(seged), u"|||".join(cates),
                u"|||".join(dis), u"|||".join(sym), u"|||".join(drug)
            ]
            row = convert2gbk(row)
            csvwriter.writerow(row)
    fo.close()
예제 #14
0
def main1():
    uids = []

    # 获取所有uid
    for i in (0, 1, 2, 3):
        uid_filename = get_parti_uid_filename(part=i, mode='news')
        with open(uid_filename, 'r') as f:
            ls = f.readlines()
            t_uids = [int(item.strip('\n')) for item in ls]
            uids.extend(t_uids)

    #
    output_filename = '20180312_user_event_and_recommend_news.csv'
    yesterday_begin, yesterday_end = get_yesterday_timestamp()
    yesterday_begin = int(yesterday_begin * 1000)
    yesterday_end = int(yesterday_end * 1000)

    #

    fo = open(output_filename, 'w')
    csvwriter = csv.writer(fo)
    first_line = [
        'uid', 'is_app_user', 'event_datetime', 'event_type', 'event_obj',
        'recommended_news'
    ]
    csvwriter.writerow(first_line)

    all_cnt = 0
    good_cnt = 0

    shuffle(uids)
    for uid in uids[:1000]:
        all_cnt += 1
        is_app = is_app_user(uid)
        print '+' * 10, uid, '+' * 10
        user_action_list = cy_time_event_one_user_kernel2(
            uid, yesterday_begin, yesterday_end)
        recommended_news_ids = get_caled_user_topn_news(uid)
        recommended_news_ids = new_newsids_check(recommended_news_ids, 2)
        if recommended_news_ids:
            good_cnt += 1

        cnt = 0
        for i in range(max([len(user_action_list),
                            len(recommended_news_ids)])):
            if cnt == 0:
                user_id = str(uid)

            else:
                user_id = ''
                is_app = ''
            try:
                event_datetime = timestamp2datetime(user_action_list[i][0] /
                                                    1000.0)
                event_type = user_action_list[i][2]
                event_obj = user_action_list[i][1]
                if event_type == 'vn':
                    title = nat_get_title('news_' + str(event_obj))
                    event_obj_str = str(event_obj) + '|' + title
                elif event_type == 'vt':
                    title = nat_get_title('topic_' + str(event_obj))
                    event_obj_str = str(event_obj) + '|' + title
                else:
                    event_obj_str = event_obj

            except:
                event_datetime = ''
                event_obj_str = ''
                event_type = ''

            try:
                recommended_news_id = recommended_news_ids[i]
                title = nat_get_title('news_' + str(recommended_news_id))
                recommend_str = str(recommended_news_id) + '|' + title

            except:
                recommend_str = ''

            line = convert2gbk([
                user_id,
                str(is_app), event_datetime, event_type, event_obj_str,
                recommend_str
            ])
            csvwriter.writerow(line)

            cnt += 1

    line = ['all', 'good']
    csvwriter.writerow(line)
    csvwriter.writerow([str(all_cnt), str(good_cnt)])
    fo.close()