Example #1
def get_user_qa_content(uid, begin, end):
    # Deprecated: pull this data from HBase instead
    # Fetch the full text of all QA records for a user within a time window
    begin_ds = timestamp2datetime(ensure_second_timestamp(begin))
    end_ds = timestamp2datetime(ensure_second_timestamp(end))

    all_qa_text = []

    sql1 = 'select id from ask_problem where user_id=%s and created_time>"%s" and created_time<"%s";' \
           % (
               uid, begin_ds, end_ds
           )

    o1 = get_medicaldb_handler().do_one(sql1)
    if o1 is None or len(o1) == 0:
        return all_qa_text

    for item in o1:
        problem_id = item[0]
        sql = 'select content from ask_problemcontent where problem_id=%s;' % problem_id
        o = get_medicaldb_handler().do_one(sql)
        if o is None or len(o) == 0:
            continue

        content = o[0][0]
        content_dict = json.loads(content)[0]
        if content_dict['type'] != 'text':
            continue
        text = content_dict['text']
        all_qa_text.append(text)

    return all_qa_text
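The queries above build SQL with % string interpolation, which breaks on quotes and is injection-prone. Below is a minimal sketch of the same id lookup with driver-side parameter binding; the cursor-style execute is the standard DB-API 2.0 signature, not a documented method of get_medicaldb_handler, so treat the cursor access as an assumption.

def get_problem_ids_safe(cursor, uid, begin_ds, end_ds):
    # Sketch only: assumes a DB-API 2.0 cursor (e.g. MySQLdb); how to obtain
    # one from get_medicaldb_handler is not shown in this excerpt.
    sql = ("select id from ask_problem "
           "where user_id=%s and created_time>%s and created_time<%s")
    cursor.execute(sql, (uid, begin_ds, end_ds))  # driver escapes the params
    return [row[0] for row in cursor.fetchall()]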
Example #2
def test5():
    import json
    from rpc_services.medical_service_utils import get_entities
    from rpc_services.search_api import more_topic
    from general_utils.db_utils import get_medicaldb_handler

    sql = "select ask from ask_problem order by id desc limit 1000;"
    rows = get_medicaldb_handler().do_one(sql)
    for row in rows:
        print "=========================="
        text = row[0]
        print "text", text
        if not text:
            continue

        result = json.loads(more_topic(text))["result"]
        for topic in result:
            print topic['title']
        print len(result)

        print "=================="
        tags = " ".join(get_entities(text))
        print "tags", tags
        if not tags:
            continue
        result = json.loads(more_topic(tags))["result"]
        for topic in result:
            print topic['title']
        print len(result)
Example #3
def test9():
    import csv
    from general_utils.db_utils import get_medicaldb_handler
    from add_data_to_solr.manager.add_utils import topic_info, doctor_info
    fo = open("topic_score.csv", "w")
    csvwriter = csv.writer(fo, dialect='excel')
    first_line = [u'topic id', u'score', u'topic title', u'content len', u'image num', u'is original',
                  u'doctor id', u'职称', u'医院级别', u'1科室', u"2科室", u'城市', u'hospital_name']
    first_line = convert2gbk(first_line)
    csvwriter.writerow(first_line)
    d = "data_dir/topic_data/"
    b2 = pickle_from_file(d + 'all_doc_big_2')
    for item in b2:
        id = int(item['id'].split('_')[-1])
        score = item['tid'] / 10.0
        title = item['title']
        content_len = item['content_len']
        sql = 'select doctor_id from api_doctortopic where id=%s;' % id
        o = get_medicaldb_handler().do_one(sql)
        if o is None or len(o) == 0:
            continue
        doctor_id = o[0][0]
        ti = topic_info(id)
        di = doctor_info(doctor_id)
        if ti is None or di is None:
            # topic or doctor no longer resolvable; skip this row
            continue
        image_num = ti['image_num']
        is_original = ti['is_original']
        d_title = di['title']
        h_level = di['hospital_level']
        h_name = di['hospital_name']
        clinic_no = di['first_class_clinic_no']
        s_clinic_no = di['second_class_clinic_no']
        city = di['city']
        rows = [str(id), str(score), title, str(content_len), str(image_num),
                str(is_original), doctor_id, d_title, h_level, clinic_no, s_clinic_no, city, h_name]
        rows = convert2gbk(rows)
        csvwriter.writerow(rows)
    fo.close()
Example #4
def get_problem_contents():
    import sys
    from general_utils.db_utils import get_medicaldb_handler
    pid = sys.argv[2]
    sql = 'select content from ask_problemcontent where problem_id=%s;' % pid
    o = get_medicaldb_handler().dbhandler.do_one(sql)
    for item in o:
        print item
Example #5
def get_topic_data():
    # score
    old_score = pickle_from_file(TOPIC_SCORE_FILE)
    biggest_id = max(old_score.keys())  # largest topic_id already scored
    sql1 = "select id,doctor_id from api_doctortopic where is_deleted=0 and title <> '' and id>%s;" % biggest_id
    o = get_medicaldb_handler().do_one(sql1)
    cnt = 0
    for item in o:
        id = item[0]
        doctor_id = item[1]
        info_of_topic = topic_info(id)
        info_of_doc = doctor_info(doctor_id)
        title_tags = get_entities_cyseg(info_of_topic["title"])

        content_tags = get_entities_cyseg(info_of_topic["text"])
        # print "content",info_of_topic["text"]

        if len(content_tags) == 0 or len(info_of_topic['title']) == 0:
            print "no content tag", id
            continue

        score = grade_topic(info_of_topic, info_of_doc, title_tags,
                            content_tags)
        old_score[int(id)] = score
        cnt += 1
    print "new topic id num", cnt
    pickle_to_file(old_score, TOPIC_SCORE_FILE)
Example #6
def get_unique_clinic_no_1(topic_id):
    # Get the first-class clinic (department) for a doctor topic
    handler = get_medicaldb_handler()
    clinic_no, second_class_clinic_no = handler.get_topic_clinic_no(topic_id)
    if clinic_no and clinic_no in LONELY_FIRST_CLINIC_NO:
        return str(clinic_no)
    if second_class_clinic_no:
        return map_second_clinic_2_first(second_class_clinic_no)
    return None
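map_second_clinic_2_first is not defined in this excerpt; below is a minimal sketch of the shape it plausibly has, with a purely hypothetical mapping table.

# Hypothetical mapping; the real second-to-first clinic table lives elsewhere.
SECOND_TO_FIRST_CLINIC = {
    u'21': u'2',  # placeholder entries only
    u'22': u'2',
}

def map_second_clinic_2_first(second_class_clinic_no):
    # Return the first-class clinic number as a string, or None if unmapped.
    return SECOND_TO_FIRST_CLINIC.get(unicode(second_class_clinic_no))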
Example #7
def one_user_last_qa_info(pid):
    # Read ask from the ask_problem table and shape it like the output of
    # cy_time_event_one_user_kernel in hbase_utils
    info = {"last_event": None, "last_event_time": 0}

    text = get_medicaldb_handler().get_ask_by_pid(pid)
    sex = ''
    age = ''
    # info_logger.info("qa text %s", text)
    # text = u"感冒发烧了吃什么药好,二甲双胍可以吃吗 肺气肿 怀孕 糖尿病(男,1岁)"#############
    info["last_event"] = ["free_problem_create", [text, sex, age]]
    return info
def get_qa_uids(begin, end):
    # Get the user_id of every QA created between begin and end
    begin_dt = timestamp2datetime(ensure_second_timestamp(begin))
    end_dt = timestamp2datetime(ensure_second_timestamp(end))
    sql = 'select distinct user_id from ask_problem where created_time>"%s" and created_time<"%s";' % (
        begin_dt, end_dt)
    o = get_medicaldb_handler().dbhandler.do_one(sql)
    uids = set()
    for item in o:
        uid = item[0]
        uids.add(int(uid))
    return uids
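A usage sketch for get_qa_uids, assuming time is imported at module level:

import time

now = time.time()
uids = get_qa_uids(now - 86400, now)  # users who created a QA in the last 24h
print len(uids)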
def doctor_info(doctor_id):
    sql = "select title,level_title,second_class_clinic_no,first_class_clinic_no,hospital_name from symptomchecker_doctor where id='%s';" % doctor_id
    o = get_medicaldb_handler().do_one(sql)
    if o is None or len(o) == 0:
        return None
    title = o[0][0]  # unicode
    level_title = o[0][1]  # unicode
    second_class_clinic_no = o[0][2]
    first_class_clinic_no = o[0][3]
    hospital_name = o[0][4]
    sql = "select province from clinic_clinicdoctorinfo where doctor_id='%s';" % doctor_id
    o = get_medicaldb_handler().do_one(sql)
    if o is None or len(o) == 0:
        return None
    city = o[0][0]
    return {
        "title": title,
        "hospital_level": level_title,
        "second_class_clinic_no": second_class_clinic_no,
        "first_class_clinic_no": first_class_clinic_no,
        "city": city,
        "hospital_name": hospital_name,
    }
Example #10
def test19():
    import sys
    import time
    from general_utils.db_utils import get_medicaldb_handler
    from general_utils.time_utils import timestamp2datetime
    uid = sys.argv[2]
    print 'uid', uid
    t1 = time.time()
    sql = 'select id from ask_problem where user_id=%s and created_time>"%s";' % (
        uid, timestamp2datetime(time.time() - 180 * 86400))

    o = get_medicaldb_handler().do_one(sql)
    if o is None or len(o) == 0:
        print 'nothing'
        return
    all_content = []
    for item in o:
        id = item[0]
        print id
        sql1 = 'select content from ask_problemcontent where problem_id=%s;' % id
        o1 = get_medicaldb_handler().do_one(sql1)
        all_content.append(o1)

    t2 = time.time()
    print 'time', t2 - t1
Example #11
def test14():
    import csv
    from add_data_to_solr.manager.add_utils import topic_info
    from general_utils.db_utils import get_medicaldb_handler
    sql = 'select id from api_doctortopic where is_deleted=0 and title <> "";'
    o = get_medicaldb_handler().do_one(sql)
    fo = open('topic_content_len.csv', 'w')
    csvwriter = csv.writer(fo)
    first_line = ['topic id', 'doctor id', 'content length']
    csvwriter.writerow(first_line)

    for item in o:
        id = int(item[0])
        info_of_topic = topic_info(id)
        if info_of_topic is None:
            continue
        doctor_id = info_of_topic['doctor_id']
        content_len = info_of_topic['content_len']
        csvwriter.writerow([str(id), str(doctor_id), str(content_len)])
    fo.close()
Example #12
def get_user_qa_content_smart(uid, num=5):
    all_qa_text = []
    sql1 = 'select id from ask_problem where user_id=%s order by created_time limit %s;' % (
        uid, num)
    t1 = time.time()
    o1 = get_medicaldb_handler().do_one(sql1)
    t2 = time.time()
    print "get_user_qa_content_smart mysql time", t2 - t1
    if o1 is None or len(o1) == 0:
        return all_qa_text
    for item in o1:
        problem_id = item[0]
        print '-' * 10
        t1 = time.time()
        qa_texts = get_qa_texts_by_pid(problem_id)
        t2 = time.time()
        print "get_qa_texts_by_pid time", problem_id, t2 - t1
        all_qa_text.extend(qa_texts)
    return all_qa_text
Example #13
def get_user_qa_content2(uid, begin, end):
    # Fetch the full text of all QA records for a user within a time window,
    # from the HBase problem2 table
    begin_ds = timestamp2datetime(ensure_second_timestamp(begin))
    end_ds = timestamp2datetime(ensure_second_timestamp(end))

    all_qa_text = []

    sql1 = 'select id from ask_problem where user_id=%s and created_time>"%s" and created_time<"%s";' \
           % (
               uid, begin_ds, end_ds
           )

    o1 = get_medicaldb_handler().do_one(sql1)
    if o1 is None or len(o1) == 0:
        return all_qa_text

    for item in o1:
        problem_id = item[0]
        qa_texts = get_qa_texts_by_pid(problem_id)
        all_qa_text.extend(qa_texts)
    return all_qa_text
def topic_info(topic_id):
    sql = "select doctor_id,title,content,html,image,is_original from api_doctortopic where id=%s;" % topic_id
    o = get_medicaldb_handler().do_one(sql)
    if o is None or len(o) == 0:
        return None
    doctor_id = o[0][0]  # unicode
    if not doctor_id:
        return None
    title = o[0][1]  # unicode
    content = o[0][2]  # unicode
    text = ""
    if content and len(content) > 0:
        content = content.replace(u"\r", u"\\r")
        content = content.replace(u"\n", u"\\n")
        content = re.sub(ur"\\+[nrt]", u" ", content)

        content = json.loads(content)
        text = get_text(content)
    else:  # text is still ""; fall back to the html field
        html_text = o[0][3]  # unicode
        if html_text:  # default None
            text = filterHTML(html_text)
    content_len = len(text)
    image = o[0][4]
    if image:
        image_num = len(json.loads(image))
    else:
        image_num = 0
    is_original = o[0][5]

    return {
        "doctor_id": doctor_id,
        "title": title,
        "text": text,
        "content_len": content_len,
        "image_num": image_num,
        "is_original": is_original,
    }
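The replace-then-sub dance in topic_info exists because unescaped \r/\n control characters inside the stored JSON make json.loads raise; here is a small self-contained demonstration of the cleanup:

import json
import re

# Unescaped control characters: json.loads(raw) would raise ValueError.
raw = u'[{"type": "text", "text": "line1\r\nline2"}]'
cleaned = raw.replace(u"\r", u"\\r").replace(u"\n", u"\\n")
cleaned = re.sub(ur"\\+[nrt]", u" ", cleaned)  # collapse escapes into spaces
print json.loads(cleaned)[0]["text"]  # prints: line1  line2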
def get_qa_text(uid, begin, end, num):
    # Needs to be fast while still preserving each event's timestamp
    bad_return = [], []
    begin = ensure_second_timestamp(begin)
    end = ensure_second_timestamp(end)
    sql = 'select id,created_time,ask from ask_problem  where user_id=%s order by id desc limit %s;' % (
        uid, num)
    # print 'sql', sql
    o = get_medicaldb_handler().do_one(sql)
    if o is None or len(o) == 0:
        return bad_return

    text_list = []
    ts_list = []
    for item in o:
        dt = str(item[1])
        ts = datetime_str2timestamp(dt)
        if ts < begin or ts > end:
            continue
        first_ask = unicode(item[2])
        text_list.append(first_ask)
        ts_list.append(ts)
    return text_list, ts_list
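A usage sketch: take a user's 50 most recent asks and keep only those from the past week (uid 123 is a placeholder):

import time

now = time.time()
texts, timestamps = get_qa_text(123, now - 7 * 86400, now, 50)
for text, ts in zip(texts, timestamps):
    print ts, text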
Example #16
def main8(test_uid=None):
    # test cf
    from recommend.manager.recommend_resource import Recommend_by_user_info
    if test_uid == "n":
        test_uid = None
    now = time.time()
    # now = 1513780888
    data_dict = cy_time_event_kernel_test(now - 6000, now, test_uid)
    if not test_uid:
        fo = open("20171229_1_cfr.csv", "w")
    else:
        fo = open('test.csv', 'w')
    csvwriter = csv.writer(fo, dialect="excel")
    first_line = [
        u"uid", u"u_tags", u"special_population", u"trigger", u"trigger_info",
        u"trigger_time", u"material_id", u"material_type", u"score", u"title",
        u"m_tags", u"only_topic"
    ]
    csvwriter.writerow(first_line)
    fail_cases = {
        'big_search': defaultdict(int),
        'free_problem_create': defaultdict(int)
    }

    all_uid_cnt = 0
    all_valid_res_cnt = 0
    qa_score = [[1.0 - i / 10.0, 0] for i in range(11)]
    bs_score = [[1.0 - i / 10.0, 0] for i in range(11)]
    trigger_cnt = {'qa': 0, 'bs': 0}
    cal_time = {}
    for uid in data_dict.keys():
        time.sleep(0.5)
        print '=' * 10, uid, '=' * 10

        user_info0 = data_dict[uid]
        t1 = time.time()
        res = Recommend_by_user_info(user_info0=user_info0,
                                     uid=uid,
                                     log_mark="test8",
                                     num=6,
                                     test=True)
        t2 = time.time()
        cal_time[uid] = t2 - t1

        user_info = res['user_info']
        res1 = res['res']
        topn_ids_scores = res['topn_ids_scores']
        only_topic = res['only_topic']
        status = res['status']
        v_score_dict = res['v_score_dict']
        if not user_info:
            continue
        all_uid_cnt += 1
        trigger = user_info["trigger"]
        if trigger == "big_search":
            trigger_cnt['bs'] += 1
        else:
            trigger_cnt['qa'] += 1

        if status != 'succeed':
            fail_cases[trigger][status] += 1
            continue

        texts = user_info["texts"]
        tags = user_info["tags"]
        special_population = user_info["special_population"]

        timestamp = user_info['timestamp']

        best_id, best_title, mtype = res1[0]
        best_score = v_score_dict[mtype + '_' + str(best_id)]

        if trigger == 'big_search':
            for i, item in enumerate(bs_score):
                if best_score >= item[0]:
                    bs_score[i][1] += 1
                    break
        else:
            for i, item in enumerate(qa_score):
                if best_score >= item[0]:
                    qa_score[i][1] += 1
                    break

        if trigger == 'big_search':
            trigger_info = "-".join(texts)
        elif trigger == "free_problem_create":
            problem_id, ask = get_medicaldb_handler().get_ask_by_timestamp(
                uid, timestamp)
            if not ask:
                ask = texts[0]
            trigger_info = '-'.join([str(problem_id), str(ask)])

            # [u"uid", u"u_tags", u"special_population", u"trigger", u"trigger_info", u"trigger_time",
            # u"material_id", u"material_type", u"score", u"title", u"m_tags", u"only_topic",

        for id, title, mtype in res1:
            prefix = 'news_' if mtype == 'news' else 'r_topic_'
            mtags = get_news_tags_from_solr(prefix + str(id))  # tags are keyed by material id
            rows = [
                str(uid), '-'.join(tags), special_population, trigger,
                trigger_info,
                str(timestamp),
                str(id), mtype, v_score_dict[mtype + '_' + str(id)], title,
                '-'.join(mtags),
                str(only_topic)
            ]

            rows = convert2gbk(rows)
            csvwriter.writerow(rows)
        if res1:
            all_valid_res_cnt += 1

    # fail_cases
    for trigger in fail_cases:
        for reason in fail_cases[trigger]:
            rows = [trigger, reason, str(fail_cases[trigger][reason])]
            rows = convert2gbk(rows)
            csvwriter.writerow(rows)
    # ana
    rows = ['all', str(all_uid_cnt), 'res_cnt', str(all_valid_res_cnt)]
    rows = convert2gbk(rows)
    csvwriter.writerow(rows)

    # score cut
    rows = ['bs score cut']
    csvwriter.writerow(rows)

    cum_cnt = 0
    for score, cnt in bs_score:
        cum_cnt += cnt
        true_recall = cum_cnt / float(trigger_cnt['bs'])
        rows = [str(score), str(cnt), str(true_recall)]
        csvwriter.writerow(rows)

    rows = ['qa score cut']
    csvwriter.writerow(rows)

    cum_cnt = 0
    for score, cnt in qa_score:
        cum_cnt += cnt
        true_recall = cum_cnt / float(trigger_cnt['qa'])
        rows = [str(score), str(cnt), str(true_recall)]
        csvwriter.writerow(rows)

    # cal time

    s_cal_time = sorted(cal_time.iteritems(), key=lambda x: x[1], reverse=True)
    for u, t in s_cal_time[:20]:
        csvwriter.writerow([str(u), str(t)])

    fo.close()
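The score-cut bookkeeping in main8 is easy to misread, so here is the same histogram/cumulative-recall computation distilled onto synthetic numbers:

# Thresholds 1.0, 0.9, ..., 0.0; each score lands in the first bucket it meets.
buckets = [[1.0 - i / 10.0, 0] for i in range(11)]
scores = [0.95, 0.72, 0.72, 0.40]  # synthetic best_score values
for s in scores:
    for b in buckets:
        if s >= b[0]:
            b[1] += 1
            break

cum_cnt = 0
for threshold, cnt in buckets:
    cum_cnt += cnt
    # recall if everything at or above this threshold were returned
    print threshold, cnt, cum_cnt / float(len(scores))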
Example #17
def main5(test_uid=None, now=None):
    if test_uid == "n":
        test_uid = None
    if not now:
        now = time.time()
    else:
        now = float(ensure_second_timestamp(now))
    t10 = time.time()
    data_dict = cy_time_event_kernel_test(now - 12000.0, now, test_uid)
    t20 = time.time()

    print "len(data_dict)", len(data_dict)

    if not test_uid:
        fo = open("20171220_1_res.csv", "w")
    else:
        fo = open('test.csv', 'w')
    csvwriter = csv.writer(fo, dialect="excel")
    first_line = [
        u"uid", u"u_tags", u"special_population", u"trigger", u"trigger_info",
        u"trigger_time", u"material_id", u"material_type", u"score", u"title",
        u"m_tags", u"only_topic", u"best_id", u"best_score", u"time"
    ]
    csvwriter.writerow(first_line)
    all_call_cnt = 0
    all_valid_res_cnt = 0
    exception_cnt = 0
    status_dict = defaultdict(int)

    total_time = []
    slow_case = []
    for uid in data_dict:

        all_call_cnt += 1

        user_info0 = data_dict[uid]
        try:
            # if True:
            t1 = time.time()
            res = Recommend_by_user_info(user_info0,
                                         uid,
                                         log_mark='testmain5',
                                         test=True)

            # return = {"user_info": None, "res": None, "topn_ids_scores": None, "only_topic": None,"status":"succeed"}

            t2 = time.time()
            print t2 - t1
            if t2 - t1 >= 3:
                break

            user_info = res['user_info']
            res1 = res['res']
            topn_ids_scores = res['topn_ids_scores']
            only_topic = res['only_topic']
            status = res['status']
            v_score_dict = res['v_score_dict']

            best_id, best_title, mtype = res1[0]

            this_time = t2 - t1
            if this_time >= 1.0:
                slow_case.append([uid, this_time])
            total_time.append(t2 - t1)
        except Exception, e:
            print e

            exception_cnt += 1
            continue
        status_dict[status] += 1

        ####################
        # if not only_topic:
        #     continue
        ####################

        if best_id == -1 or user_info is None:
            continue

        print '================='
        print uid

        texts = user_info["texts"]
        tags = user_info["tags"]
        special_population = user_info["special_population"]
        trigger = user_info["trigger"]
        timestamp = user_info['timestamp']
        best_score = v_score_dict[mtype + '_' + str(best_id)]
        # if trigger == "big_search":
        #     continue

        if trigger == 'big_search':
            trigger_info = "-".join(texts)
        elif trigger == "free_problem_create":
            problem_id, ask = get_medicaldb_handler().get_ask_by_timestamp(
                uid, timestamp)
            if not ask:
                ask = texts[0]
            trigger_info = '-'.join([str(problem_id), str(ask)])
        print "u tags", "-".join(tags), special_population
        print trigger_info, best_id, best_score, best_title

        for unique_id, score in topn_ids_scores:
            material_type, id = unique_id.split('_')
            if material_type == "news":
                title, _ = get_newsdb_handler().get_title_digest_by_nid(id)
                m_tags = get_news_tags_from_solr("news_" + str(id))
            elif material_type == "topic":
                title = get_medicaldb_handler().get_topic_title(id)
                m_tags = get_news_tags_from_solr("r_topic_" + str(id))

            rows = [
                str(uid), "-".join(tags),
                str(special_population), trigger, trigger_info,
                str(timestamp),
                str(id), material_type,
                str(score), title, "-".join(m_tags),
                str(only_topic),
                str(best_id),
                str(best_score),
                str(this_time)
            ]
            rows = convert2gbk(rows)
            csvwriter.writerow(rows)
        all_valid_res_cnt += 1
Example #18
def get_systag_data():
    # Fetch hot-sale tag data: keywords, target_param, name, etc.
    sql = "select sysTag_id, keywords ,clinic_no,second_clinic_no from ner_systagsolrgenerateconf;"
    data = dict()
    data["systag"] = {}
    # 9:{'tag_name':'gastroscope_colonoscope','plan':[{'url':url1,'name':name1},{'url':url2,'name':name2}]}
    data['keyword'] = defaultdict(list)  # e.g. u'感冒' (common cold): [systag_id1, systag_id2, ...]
    data['keyword_extend'] = {}
    data['clinic_no'] = defaultdict(list)  # u'1':[systag_id1]
    all_plan_name = []
    o = get_diagnose_handler().dbhandler.do_one(sql)

    for item in o:
        systag_id = item[0]
        keywords = item[1].strip()
        clinic_no = item[2].strip()
        second_clinic_no = item[3].strip()

        # Map clinic numbers to systag_id without distinguishing first/second-class clinics
        if clinic_no:
            clinic_nos = clinic_no.split()
            for x in clinic_nos:
                x = ensure_unicode(x)
                data['clinic_no'][x].append(systag_id)
        if second_clinic_no:
            second_clinic_nos = second_clinic_no.split()
            for x in second_clinic_nos:
                x = ensure_unicode(x)
                data['clinic_no'][x].append(systag_id)

        # data['systag']
        tag_name = get_diagnose_handler().get_systag_en_name(systag_id)
        sql1 = 'select id,name,target_param from api_userhomehotsalegallery where tag="%s" and is_online=1;' % tag_name
        o1 = get_medicaldb_handler().do_one(sql1)

        data['systag'][systag_id] = {'tag_name': tag_name, 'plan': []}

        if not o1:
            continue

        for item1 in o1:
            plan_id = item1[0]
            name = item1[1]
            url = item1[2].replace('\r\n', '')
            print systag_id, tag_name, name, url
            data['systag'][systag_id]['plan'].append({
                'url': url,
                'name': name,
                'plan_id': plan_id
            })

            all_plan_name.append([systag_id, name])

        if keywords == u"*":
            continue
            # data['keyword']
        keywords = keywords.lower().split()
        for k in keywords:
            if systag_id not in data['keyword'][k]:
                data['keyword'][k].append(systag_id)

    # Expand each keyword with similar words
    num = 20
    master_slave = {}
    high_freq_words = get_high_freq_words()

    for k in data['keyword']:
        systag_id_list = data['keyword'][k]
        # data['keyword_extend'][k] = [systag_id_list, 1.0]
        master_slave[k] = [systag_id_list, []]
        for w, s in get_similar_redis(k, num):
            w = ensure_unicode(w)
            if len(w) < 2:
                # drop similar words of length 1
                continue
            if s < 0.41:
                # score too low; results are sorted by score, so stop here
                break
            if w in high_freq_words:
                # drop well-known high-frequency words
                continue

            data['keyword_extend'][w] = [systag_id_list, s]
            master_slave[k][1].append([w, s])

    for k in data['keyword']:
        systag_id_list = data['keyword'][k]
        data['keyword_extend'][k] = [systag_id_list, 1.0]

    # Write keyword_extend info to a file for easy inspection
    with open(SYSTAG_DATA_CHECK_FILE, 'w') as fc:
        for k in master_slave:
            systag_id_list, ws_list = master_slave[k]
            fc.write('###' + k + '|||' + json.dumps(systag_id_list) +
                     '=' * 10 + '\n')
            for w, s in ws_list:
                fc.write(w + '|||' + str(s) + '\n')
        for systag_id, plan_name in all_plan_name:
            fc.write(str(systag_id) + '---' + plan_name + '\n')

    pickle_to_file(data, SYSTAG_DATA_FILE)
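Nothing in this excerpt shows how the pickled structure is consumed; below is a hypothetical serving-time lookup, assuming only the dict shape built above.

def match_systags(word, data):
    # Hypothetical consumer: map a query word to (systag_id_list, weight),
    # where weight is 1.0 for exact keywords and the word2vec similarity
    # for expanded ones; ([], 0.0) when nothing matches.
    entry = data['keyword_extend'].get(word.lower())
    if entry is None:
        return [], 0.0
    systag_id_list, weight = entry
    return systag_id_list, weight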
Example #19
def add_topic():
    batch_size = 1000
    all_doc_small = []
    all_doc_big = []
    docs_small = []
    docs_big = []
    sql = 'select id from api_doctortopic where is_deleted=0 and title <> "" and id > 154517 limit 20000;'
    o = get_medicaldb_handler().do_one(sql)
    id_prefix_small = "r_topic_"
    id_prefix_big = "r_topicbig_"
    content_type_small = "r_topic"
    content_type_big = "r_topicbig"
    # fo = open("topic_score.csv", "w")
    # csvwriter = csv.writer(fo, dialect='excel')
    # first_line = [u'topic id', u'score', u'topic title', u'content len', u'image num', u'is original',
    #               u'doctor id', u'职称', u'医院级别', u'科室', u'城市']

    # first_line = convert2gbk(first_line)
    # csvwriter.writerow(first_line)
    # index = range(len(o))
    # shuffle(index)
    ff = open("failed_id", "a")
    solr = SolrHelper("online").get_solr("topic_tpl")
    is_end = False
    for item in o:
        if item == o[-1]:
            is_end = True
        #print "is_end",is_end
        topic_id = item[0]
        print "topic_id",topic_id
        info_of_topic = topic_info(topic_id)
        topic_title = info_of_topic['title']
        if len(topic_title) == 0:
            #print "empty title",topic_id
            continue
        doctor_id = info_of_topic["doctor_id"]
        info_of_doctor = doctor_info(doctor_id)
        title_tags = get_entities_cyseg(info_of_topic["title"])
        content_tags = get_entities_cyseg(info_of_topic["text"])

        if len(content_tags) == 0:
            print "no content tag",topic_id
            continue

        title_vecs = get_vecs2(title_tags)
        content_vecs = get_vecs2(content_tags)
        print "content_vecs len",len(content_vecs)


        score = int(grade_topic(info_of_topic, info_of_doctor, title_tags, content_tags) * 10)
        if title_vecs and len(title_vecs) > 0:
            # the title has vectors; store it
            try:
                add_topic_kernel(topic_id=topic_id,
                                docs=docs_small,
                                 tags=title_tags,
                                 score=score,
                                 info_of_topic=info_of_topic,
                                 info_of_doctor=info_of_doctor,
                                 vecs=title_vecs,
                                 id_prefix=id_prefix_small,
                                 content_type=content_type_small
                                 )
            except Exception:
                ff.write("small|||" + str(topic_id) + "\n")
        if content_vecs and len(content_vecs) > 0:
            # the content has vectors; store it
            try:
                add_topic_kernel(topic_id=topic_id,
                                 docs=docs_big,
                                 tags=content_tags,
                                 score=score,
                                 info_of_topic=info_of_topic,
                                 info_of_doctor=info_of_doctor,
                                 vecs=content_vecs,
                                 id_prefix=id_prefix_big,
                                 content_type=content_type_big)
            except Exception:
                ff.write("big|||" + str(topic_id) + "\n")

        print "eln docs_small",len(docs_small)
        print "len docs_big",len(docs_big)
        if len(docs_small) == batch_size or is_end:
            print "topic_id",topic_id
            print "is end",is_end
            print "add small", len(docs_small)

            #print json.dumps(docs_small)
            #add(docs_small,solr)
            all_doc_small.extend(docs_small)
            docs_small = []
        if len(docs_big) == batch_size or is_end:
            print "topic_id", topic_id
            print "is end", is_end
            print "add big", len(docs_big)
            #print json.dumps(docs_big)
            #add(docs_big, solr)
            all_doc_big.extend(docs_big)
            docs_big = []



    ff.close()
    pickle_to_file(all_doc_small,"all_doc_small_3")
    pickle_to_file(all_doc_big,"all_doc_big_3")
Example #20
def recall_together(text,
                    tags,
                    weights,
                    cates,
                    special_population,
                    trigger_type=None,
                    only_topic=False,
                    yxjt=False):
    # Recall news and topics in a single pass
    news_cons = population_cons2(special_population)  # categories news must NOT be in
    news_limits = population_limits(special_population)  # categories news MUST be in
    if trigger_type == "big_search":
        drug_words = [
            x for x in tags if (x in cates and cates[x] == 'DRUG_DESC')
        ]
    else:
        drug_words = None
    rows = 25  # too large -> more articles -> more word2vec lookups -> higher timeout risk
    if yxjt:  # medical lecture hall
        rows = 25  # yxjt only processes titles, so this could be taken larger
    res_ids, title_dict, score_dict = more_news_and_topic_from_solr(
        # text=text,
        text='',
        tags=tags,
        weights=weights,
        rows=rows,
        drug_words=drug_words,
        news_cons=news_cons,
        news_limits=news_limits,
        topic_only=only_topic)

    # Filter the recalled candidates
    res_ids1 = []
    all_titles = set()
    all_doctor_ids = set()
    for id in res_ids:
        print '--------======-----'
        print id
        # deduplicate titles across news and topics
        title = title_dict.get(id, '')
        if title in all_titles:
            # drop any repeated title
            continue
        all_titles.add(title)

        # split the material type from the real id
        type, true_id = id.split('_')

        # yxjt titles must not be too short
        if type == 'topic' and yxjt:
            if len(title) < 8:
                continue

        # deduplicate topics by doctor id
        if type == 'topic':
            doctor_id = get_medicaldb_handler().get_topic_doctor_id(true_id)
            if doctor_id and doctor_id in all_doctor_ids:
                continue
            all_doctor_ids.add(doctor_id)

        # rule-based filtering
        if not child_match(special_population, text, title_dict[id]):
            continue

        res_ids1.append(id)

    print "recall ids"
    for id in res_ids1:
        print id, score_dict[id], title_dict[id]

    return res_ids1, title_dict, score_dict
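A call sketch for recall_together; every argument value below is a placeholder, and the tag/weight/category shapes are inferred from how the function reads them.

ids, titles, scores = recall_together(
    text=u'',
    tags=[u'tag_a', u'tag_b'],      # entity tags from the trigger text
    weights=[1.0, 0.8],             # per-tag weights
    cates={u'tag_a': 'DRUG_DESC'},  # tag -> category, used to pick drug_words
    special_population=None,
    trigger_type='big_search',
)
for id in ids:
    print id, scores[id], titles[id]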