def test9():
    from general_utils.db_utils import get_medicaldb_handler
    from add_data_to_solr.manager.add_utils import topic_info, doctor_info
    fo = open("topic_score.csv", "w")
    csvwriter = csv.writer(fo, dialect='excel')
    first_line = [u'topic id', u'score', u'topic title', u'content len',
                  u'image num', u'is original', u'doctor id', u'职称',
                  u'医院级别', u'1科室', u"2科室", u'城市', u'hospital_name']
    first_line = convert2gbk(first_line)
    csvwriter.writerow(first_line)
    d = "data_dir/topic_data/"
    b2 = pickle_from_file(d + 'all_doc_big_2')
    for item in b2:
        id = int(item['id'].split('_')[-1])
        score = item['tid'] / 10.0
        title = item['title']
        content_len = item['content_len']
        sql = 'select doctor_id from api_doctortopic where id=%s;' % id
        o = get_medicaldb_handler().do_one(sql)
        doctor_id = o[0][0]
        ti = topic_info(id)
        di = doctor_info(doctor_id)
        image_num = ti['image_num']
        is_original = ti['is_original']
        d_title = di['title']
        h_level = di['hospital_level']
        h_name = di['hospital_name']
        clinic_no = di['first_class_clinic_no']
        s_clinic_no = di['second_class_clinic_no']
        city = di['city']
        rows = [str(id), str(score), title, str(content_len), str(image_num),
                str(is_original), doctor_id, d_title, h_level, clinic_no,
                s_clinic_no, city, h_name]
        rows = convert2gbk(rows)
        csvwriter.writerow(rows)
    fo.close()
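# Nearly every function in this module passes its csv rows through convert2gbk
# before writing. The helper is imported/defined elsewhere in the project; the
# sketch below only illustrates the assumed behaviour (encode unicode fields to
# GBK so the csv opens cleanly in Excel) and is not the project's implementation.
def _convert2gbk_sketch(row):
    out = []
    for field in row:
        if isinstance(field, unicode):
            out.append(field.encode('gbk', 'ignore'))
        else:
            out.append(field)
    return out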
def test17():
    from general_utils.solr_utils import nat_get_title
    fin = sys.argv[2]
    mtype = 'topic' if 'topic' in fin else 'news'
    fon = mtype + '_nearest_top10.csv'
    fo = open(fon, 'w')
    csvwriter = csv.writer(fo)
    first_line = ['id', 'title', 'top_id', 'top_title', 'score']
    csvwriter.writerow(first_line)
    cnt = 0
    with open(fin, 'r') as f:
        for l in f:
            if cnt > 1000:
                break
            cnt += 1
            l = l.strip('\n')
            this_dict = json.loads(l)
            main_id = this_dict['id']
            main_title = nat_get_title(mtype + '_' + str(main_id))
            if not main_title:
                continue
            top = this_dict['top'][:10]
            for subordinate_id, score in top:
                subordinate_title = nat_get_title(mtype + '_' + str(subordinate_id))
                row = [str(main_id), main_title, str(subordinate_id),
                       subordinate_title, str(score)]
                row = convert2gbk(row)
                csvwriter.writerow(row)
    fo.close()
def test11():
    filename = "/home/classify/workspace/medical_data/data_dir/medical_word_detail.pickle"
    endict = pickle_from_file(filename)
    fo = open("all_medical_words.csv", "w")
    csvwriter = csv.writer(fo, dialect='excel')
    for w in endict:
        id = endict[w]['id']
        cate = endict[w]['cate']
        row = [str(id), w, cate]
        row = convert2gbk(row)
        csvwriter.writerow(row)
    fo.close()
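# pickle_from_file (used by test9, test11 and test10) also comes from the
# project's utilities. A minimal sketch of the assumed behaviour, not the
# project's actual code:
def _pickle_from_file_sketch(filename):
    import pickle
    with open(filename, 'rb') as f:
        return pickle.load(f)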
def test18():
    # Check the share of health-news articles that have no image.
    from general_utils.db_utils import get_newsdb_handler
    from general_utils.text_utils import filterHTML
    sql = 'select id,title,content,created_time,mini_img from news_healthnews where is_online=1;'
    o = get_newsdb_handler().do_one(sql)
    has_image_cnt = 0
    has_no_image_cnt = 0
    fo = open('news_with_no_image_info.csv', 'w')
    csvwriter = csv.writer(fo)
    first_line = ['id', 'title', 'content_len', 'created_time']
    csvwriter.writerow(first_line)
    for item in o:
        id = item[0]
        title = item[1]
        if not title:
            continue
        content = item[2]
        content = filterHTML(content)
        content_len = len(content)
        created_time = item[3]
        mini_img = item[4]
        if mini_img and len(mini_img) > 5:
            has_image_cnt += 1
            continue
        has_no_image_cnt += 1
        line = [str(id), title, str(content_len), str(created_time)]
        line = convert2gbk(line)
        csvwriter.writerow(line)
    line = ['no_image_cnt', 'has_image_cnt', 'all']
    csvwriter.writerow(line)
    line = [str(has_no_image_cnt), str(has_image_cnt),
            str(has_no_image_cnt + has_image_cnt)]
    csvwriter.writerow(line)
    fo.close()
def test4():
    from general_utils.db_utils import get_medical_entity_handler
    sql = 'select id,name,frequency from medicaldb_newdiseases;'
    o = get_medical_entity_handler(False).do_one(sql)
    fo = open("diseases_frequency.csv", "w")
    csvwriter = csv.writer(fo, dialect='excel')
    first = ["id", "name", "freq"]
    csvwriter.writerow(first)
    for item in o:
        id = item[0]
        name = item[1]
        freq = item[2]
        rows = [id, name, freq]
        rows = convert2gbk(rows)
        csvwriter.writerow(rows)
    fo.close()
def main6(test_uid=None):
    # test recommend_topics
    from recommend.manager.recommend_resource import Recommend_topics
    from recommend.manager.recommend_topic_data_helper import parse_user_info as parse_user_info2
    now = time.time()
    if test_uid == "n":
        test_uid = None
    data_dict = cy_time_event_kernel_test(now - 12000.0, now, test_uid)
    fo = open("20180102_rt.csv", "w")
    csvwriter = csv.writer(fo)
    first_line = ['uid', 'tags', 'sp', 'topicid', 'score', 't_title', 't_tags']
    csvwriter.writerow(first_line)
    times = {}
    for uid in data_dict.keys():
        t1 = time.time()
        topic_ids, user_info, score_dict = Recommend_topics(uid, 5, now, True)
        t2 = time.time()
        times[uid] = t2 - t1
        if not user_info:
            continue
        tags = user_info['tags']
        sp = user_info['special_population']
        for x in topic_ids:
            title = nat_get_title('topic_' + str(x))
            score = score_dict['topic_' + str(x)]
            t_tags = get_news_tags_from_solr("r_topic_" + str(x))
            row = [str(uid), '-'.join(tags), sp, str(x), str(score), title,
                   '-'.join(t_tags)]
            row = convert2gbk(row)
            csvwriter.writerow(row)
    fo.close()
    s_times = sorted(times.iteritems(), key=lambda x: x[1], reverse=True)[:10]
    for x, y in s_times:
        print x, y
def main9(test_uid=None):
    # test recommend tags
    from recommend.manager.recommend_resource import Recommend_tags
    from recommend.manager.recommend_tags_data_helper import get_user_last_query
    if test_uid == "n":
        test_uid = None
    now = time.time()
    # now = 1513780888
    data_dict = cy_time_event_kernel_test(now - 2000, now, test_uid)
    if not test_uid:
        fo = open("20180102_1_rtr.csv", "w")
    else:
        fo = open('test.csv', 'w')
    first_line = ['uid', 'last_query', 'r_tags', 'r_plan']
    csvwriter = csv.writer(fo)
    csvwriter.writerow(first_line)
    total_t = {}
    for uid in data_dict.keys():
        t1 = time.time()
        res = Recommend_tags(uid)
        t2 = time.time()
        total_t[uid] = t2 - t1
        # res = {'words': tags, 'plan': plans}
        words = res['words']
        plan = res['plan']
        last_query = get_user_last_query(uid)
        row = [str(uid), last_query, '-'.join(words),
               '-'.join([item['name'] for item in plan])]
        row = convert2gbk(row)
        csvwriter.writerow(row)
    s_total_t = sorted(total_t.iteritems(), key=lambda x: x[1], reverse=True)
    for uid, t in s_total_t[:10]:
        csvwriter.writerow([str(uid), str(t)])
    fo.close()
def main8(test_uid=None):
    # test cf
    from recommend.manager.recommend_resource import Recommend_by_user_info
    if test_uid == "n":
        test_uid = None
    now = time.time()
    # now = 1513780888
    data_dict = cy_time_event_kernel_test(now - 6000, now, test_uid)
    if not test_uid:
        fo = open("20171229_1_cfr.csv", "w")
    else:
        fo = open('test.csv', 'w')
    csvwriter = csv.writer(fo, dialect="excel")
    first_line = [u"uid", u"u_tags", u"special_population", u"trigger",
                  u"trigger_info", u"trigger_time", u"material_id",
                  u"material_type", u"score", u"title", u"m_tags", u"only_topic"]
    csvwriter.writerow(first_line)
    fail_cases = {'big_search': defaultdict(int),
                  'free_problem_create': defaultdict(int)}
    all_uid_cnt = 0
    all_valid_res_cnt = 0
    qa_score = [[1.0 - i / 10.0, 0] for i in range(11)]
    bs_score = [[1.0 - i / 10.0, 0] for i in range(11)]
    trigger_cnt = {'qa': 0, 'bs': 0}
    cal_time = {}
    for uid in data_dict.keys():
        time.sleep(0.5)
        print '=' * 10, uid, '=' * 10
        user_info0 = data_dict[uid]
        t1 = time.time()
        res = Recommend_by_user_info(user_info0=user_info0, uid=uid,
                                     log_mark="test8", num=6, test=True)
        t2 = time.time()
        cal_time[uid] = t2 - t1
        user_info = res['user_info']
        res1 = res['res']
        topn_ids_scores = res['topn_ids_scores']
        only_topic = res['only_topic']
        status = res['status']
        v_score_dict = res['v_score_dict']
        if not user_info:
            continue
        all_uid_cnt += 1
        trigger = user_info["trigger"]
        if trigger == "big_search":
            trigger_cnt['bs'] += 1
        else:
            trigger_cnt['qa'] += 1
        if status != 'succeed':
            fail_cases[trigger][status] += 1
            continue
        texts = user_info["texts"]
        tags = user_info["tags"]
        special_population = user_info["special_population"]
        timestamp = user_info['timestamp']
        best_id, best_title, mtype = res1[0]
        best_score = v_score_dict[mtype + '_' + str(best_id)]
        if trigger == 'big_search':
            for i, item in enumerate(bs_score):
                if best_score >= item[0]:
                    bs_score[i][1] += 1
                    break
        else:
            for i, item in enumerate(qa_score):
                if best_score >= item[0]:
                    qa_score[i][1] += 1
                    break
        if trigger == 'big_search':
            trigger_info = "-".join(texts)
        elif trigger == "free_problem_create":
            problem_id, ask = get_medicaldb_handler().get_ask_by_timestamp(uid, timestamp)
            if not ask:
                ask = texts[0]
            trigger_info = '-'.join([str(problem_id), str(ask)])
        # columns: uid, u_tags, special_population, trigger, trigger_info,
        # trigger_time, material_id, material_type, score, title, m_tags, only_topic
        for id, title, mtype in res1:
            prefix = 'news_' if mtype == 'news' else 'r_topic_'
            mtags = get_news_tags_from_solr(prefix + str(id))
            rows = [str(uid), '-'.join(tags), special_population, trigger,
                    trigger_info, str(timestamp), str(id), mtype,
                    v_score_dict[mtype + '_' + str(id)], title,
                    '-'.join(mtags), str(only_topic)]
            rows = convert2gbk(rows)
            csvwriter.writerow(rows)
        if res1:
            all_valid_res_cnt += 1
    # fail_cases
    for trigger in fail_cases:
        for reason in fail_cases[trigger]:
            rows = [trigger, reason, str(fail_cases[trigger][reason])]
            rows = convert2gbk(rows)
            csvwriter.writerow(rows)
    # ana
    rows = ['all', str(all_uid_cnt), 'res_cnt', str(all_valid_res_cnt)]
    rows = convert2gbk(rows)
    csvwriter.writerow(rows)
    # score cut
    rows = ['bs score cut']
    csvwriter.writerow(rows)
    cum_cnt = 0
    for score, cnt in bs_score:
        cum_cnt += cnt
        true_recall = cum_cnt / float(trigger_cnt['bs'])
        rows = [str(score), str(cnt), str(true_recall)]
        csvwriter.writerow(rows)
    rows = ['qa score cut']
    csvwriter.writerow(rows)
    cum_cnt = 0
    for score, cnt in qa_score:
        cum_cnt += cnt
        true_recall = cum_cnt / float(trigger_cnt['qa'])
        rows = [str(score), str(cnt), str(true_recall)]
        csvwriter.writerow(rows)
    # cal time
    s_cal_time = sorted(cal_time.iteritems(), key=lambda x: x[1], reverse=True)
    for u, t in s_cal_time[:20]:
        csvwriter.writerow([str(u), str(t)])
    fo.close()
def main5(test_uid=None, now=None):
    if test_uid == "n":
        test_uid = None
        now = time.time()
    if not now:
        now = 1512379920.1
    else:
        now = float(ensure_second_timestamp(now))
    t10 = time.time()
    data_dict = cy_time_event_kernel_test(now - 12000.0, now, test_uid)
    t20 = time.time()
    print "len(data_dict)", len(data_dict)
    if not test_uid:
        fo = open("20171220_1_res.csv", "w")
    else:
        fo = open('test.csv', 'w')
    csvwriter = csv.writer(fo, dialect="excel")
    first_line = [u"uid", u"u_tags", u"special_population", u"trigger",
                  u"trigger_info", u"trigger_time", u"material_id",
                  u"material_type", u"score", u"title", u"m_tags",
                  u"only_topic", u"best_id", u"best_score", u"time"]
    csvwriter.writerow(first_line)
    all_call_cnt = 0
    all_valid_res_cnt = 0
    exception_cnt = 0
    status_dict = defaultdict(int)
    total_time = []
    slow_case = []
    for uid in data_dict:
        all_call_cnt += 1
        user_info0 = data_dict[uid]
        try:
            # if True:
            t1 = time.time()
            res = Recommend_by_user_info(user_info0, uid, log_mark='testmain5', test=True)
            # return = {"user_info": None, "res": None, "topn_ids_scores": None,
            #           "only_topic": None, "status": "succeed"}
            t2 = time.time()
            print t2 - t1
            if t2 - t1 >= 3:
                break
            user_info = res['user_info']
            res1 = res['res']
            topn_ids_scores = res['topn_ids_scores']
            only_topic = res['only_topic']
            status = res['status']
            v_score_dict = res['v_score_dict']
            best_id, best_title, mtype = res1[0]
            this_time = t2 - t1
            if this_time >= 1.0:
                slow_case.append([uid, this_time])
            total_time.append(t2 - t1)
        except Exception, e:
            print e
            exception_cnt += 1
            continue
        status_dict[status] += 1
        ####################
        # if not only_topic:
        #     continue
        ####################
        if best_id == -1 or user_info is None:
            continue
        print '================='
        print uid
        texts = user_info["texts"]
        tags = user_info["tags"]
        special_population = user_info["special_population"]
        trigger = user_info["trigger"]
        timestamp = user_info['timestamp']
        best_score = v_score_dict[mtype + '_' + str(best_id)]
        # if trigger == "big_search":
        #     continue
        if trigger == 'big_search':
            trigger_info = "-".join(texts)
        elif trigger == "free_problem_create":
            problem_id, ask = get_medicaldb_handler().get_ask_by_timestamp(uid, timestamp)
            if not ask:
                ask = texts[0]
            trigger_info = '-'.join([str(problem_id), str(ask)])
        print "u tags", "-".join(tags), special_population
        print trigger_info, best_id, best_score, best_title
        for unique_id, score in topn_ids_scores:
            material_type, id = unique_id.split('_')
            if material_type == "news":
                title, _ = get_newsdb_handler().get_title_digest_by_nid(id)
                m_tags = get_news_tags_from_solr("news_" + str(id))
            elif material_type == "topic":
                title = get_medicaldb_handler().get_topic_title(id)
                m_tags = get_news_tags_from_solr("r_topic_" + str(id))
            rows = [str(uid), "-".join(tags), str(special_population), trigger,
                    trigger_info, str(timestamp), str(id), material_type,
                    str(score), title, "-".join(m_tags), str(only_topic),
                    str(best_id), str(best_score), str(this_time)]
            rows = convert2gbk(rows)
            csvwriter.writerow(rows)
        all_valid_res_cnt += 1
    fo.close()
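# ensure_second_timestamp (used by main5 and g1) is imported from
# general_utils.time_utils. A hedged sketch of the assumed behaviour
# (normalize a possibly millisecond timestamp to seconds), not the real
# implementation:
def _ensure_second_timestamp_sketch(ts):
    ts = float(ts)
    if ts > 1e12:  # magnitude suggests milliseconds
        ts = ts / 1000.0
    return ts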
def a1(log_file_name, ana_file_name, bdp_file_name):
    START = "==========start======="
    # For 2017-11-08: for users who received a recommendation, record the uid,
    # trigger time, pushed article id, push time, whether the article was
    # viewed after the push that day, and the view time.
    today_zero, today_end = get_today_timestamp(time.time() - 86400.0)

    def get_uid(l):
        return l.split("=uid=")[1].split('=')[0]

    fi = open(log_file_name, 'r')
    fo = open(ana_file_name, "w")
    csvwriter = csv.writer(fo, dialect='excel')
    first_line = [u"uid", u"触发时间", u"触发类型", u"用户全文", u"用户tag", u"用户人群",
                  u"文章id", u"文章标题", u"文章tag", u"文章分类", u"返回时间", u"点击时间"]
    csvwriter.writerow(convert2gbk(first_line))
    uid = None
    uni_key0 = None
    trigger_time = None
    trigger_type = None
    caled = set()
    all = set()  # all triggered requests
    reason = None
    all_qa = defaultdict(set)
    all_bs = defaultdict(set)
    cnt = 0
    for l in fi:
        # if not l.startswith("2017-11-08"):
        #     continue
        cnt += 1
        # if cnt > 10000:
        #     continue
        if START in l:
            # close out the previous request
            if reason and uni_key0 and trigger_type:
                if trigger_type == "bs":
                    all_bs["all"].add(uni_key0)
                elif trigger_type == "qa":
                    all_qa["all"].add(uni_key0)
            if uni_key0 and trigger_type and not reason:
                if trigger_type == "bs":
                    all_bs["failed"].add(uni_key0)
                elif trigger_type == "qa":
                    all_qa["failed"].add(uni_key0)
            uid = get_uid(l)
            trigger_time = l.split(',')[0]
            uni_key0 = uid + '|' + trigger_time
            if "pid=None" in l:
                trigger_type = "bs"
            else:
                trigger_type = "qa"
                all_qa["all"].add(uni_key0)
            reason = None
            # all.add(uni_key0)
            trigger_ts = datetime_str2timestamp(trigger_time)
            print "uni_key", uni_key0
            print "ts", trigger_ts
        # if "=trigger=" in l:
        #     trigger_type0 = l.split("=trigger=")[1].split('=')[0]
        if "=special_population=" in l:
            special_population0 = l.split("=special_population=")[1].split("=")[0]
        if "=texts=" in l:
            texts0 = l.split("=texts=")[1].split("=")[0]
        if "=tags=" in l:
            tags0 = l.split("=tags=")[1].split("=")[0]
        if "failed in recommend==" in l:
            reason = l.split("failed in recommend==")[1].split("=")[0]
            if trigger_type == "qa":
                if reason not in all_qa:
                    all_qa[reason] = set([uni_key0])
                else:
                    all_qa[reason].add(uni_key0)
            elif trigger_type == "bs":
                if reason not in all_bs:
                    all_bs[reason] = set([uni_key0])
                else:
                    all_bs[reason].add(uni_key0)
        if "succeed in recommend==========" in l:
            reason = "succeed"
            if trigger_type == "qa":
                if reason not in all_qa:
                    all_qa[reason] = set([uni_key0])
                else:
                    all_qa[reason].add(uni_key0)
            elif trigger_type == "bs":
                if reason not in all_bs:
                    all_bs[reason] = set([uni_key0])
                else:
                    all_bs[reason].add(uni_key0)
            return_time = l.split(',')[0]
            uni_key = uid + return_time
            if uni_key in caled:
                continue
            print 'WWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWW'
            caled.add(uni_key)
            return_ts = datetime_str2timestamp(return_time)
            nid = l.split("=====id=")[1].split("=")[0]
            ntitle = l.split("===title=")[1].split("=")[0]
            # news_title = get_db_data_local_handler().get_news_title(nid)
            news_type = get_db_data_local_handler().get_news_type(nid)
            news_tags = u'|||'.join(get_news_tags_from_solr("news_" + str(nid)))
            print uid
            print trigger_type
            print nid
            print return_ts
            print type(ntitle)
            # columns: uid, trigger time, trigger type, user full text, user tags,
            # user population, article id, article title, article tags,
            # article category, return time, click time
            views = cy_time_event_one_user_viewnews(uid, begin=return_ts, end=today_end)
            print views
            rows = [str(uid), trigger_time, trigger_type, texts0, tags0,
                    special_population0, str(nid), ntitle, news_tags, news_type,
                    return_time, str(timestamp2datetime(views.get(nid, -1)))]
            rows = convert2gbk(rows)
            csvwriter.writerow(rows)
    csvwriter.writerow([u"所有uid".encode("gbk"), u"推了的uid".encode("gbk")])
    rows = [str(len(all)), str(len(caled))]
    rows = convert2gbk(rows)
    csvwriter.writerow(rows)
    fi.close()
    fo.close()
    for x in all_qa:
        print x + "|||" + str(len(all_qa[x]))
    for x in all_bs:
        print x + "|||" + str(len(all_bs[x]))
    with open(bdp_file_name, "w") as f:
        f.write("news_all_input_qa|||" + str(len(all_qa["all"])) + "\n")
        f.write("news_all_output_qa|||" + str(len(all_qa["succeed"])) + "\n")
        f.write("news_no_info_qa|||" + str(len(all_qa["user_info is None "])) + "\n")
        f.write("news_filtered_by_preprocessing_qa|||" + str(len(all_qa["filter_user_info bad "])) + "\n")
        f.write("news_empty_res_qa|||" + str(len(all_qa["topn_ids_scores empty"])) + "\n")
        f.write("news_bad_res_qa|||" + str(len(all_qa["best_score so low"])) + "\n")
        f.write("qa_failed|||" + str(len(all_qa["failed"])) + "\n")
        f.write("news_all_input_bs|||" + str(len(all_bs["all"])) + "\n")
        f.write("news_all_output_bs|||" + str(len(all_bs["succeed"])) + "\n")
        f.write("news_no_info_bs|||" + str(len(all_bs["user_info is None "])) + "\n")
        f.write("news_filtered_by_preprocessing_bs|||" + str(len(all_bs["filter_user_info bad "])) + "\n")
        f.write("news_empty_res_bs|||" + str(len(all_bs["topn_ids_scores empty"])) + "\n")
        f.write("news_bad_res_bs|||" + str(len(all_bs["best_score so low"])) + "\n")
        f.write("bs_failed|||" + str(len(all_bs["failed"])) + "\n")
def g1():
    '''
    Compare coverage with and without hot-sale tag expansion.
    Numerator: users that can be matched to a hot-sale tag.
    Denominator: users active within one day (cy_event).
    '''
    from general_utils.hbase_utils import get_user_query, get_user_query2
    from general_utils.solr_utils import get_last_login_uids
    from recommend.manager.recommend_tags_data_helper import get_relation_plan3
    from general_utils.db_utils import get_db_data_local_handler
    from general_utils.hbase_utils import get_sp_duration_active_userid
    from general_utils.time_utils import timestamp2datetime, ensure_second_timestamp
    # user sampling time window
    # user sampling hit rate
    end_ds0 = '2018-01-21 23:59:40'
    end0 = datetime_str2timestamp(end_ds0)
    begin0 = end0 - 86400 * 1
    # data collection window for each selected user
    end_ds = '2018-01-22 23:59:40'
    end = datetime_str2timestamp(end_ds)
    begin = end - 86400 * 180.0  # half a year
    # users whose last login falls in the week before 2018-01-21 23:59:40
    # test_uids = get_last_login_uids(begin0, end0)
    # test_uids = get_sp_duration_active_userid(begin0, end0)
    test_uids = get_one_day_uid_from_file('log_event_20180122')
    print "test_uids num", len(test_uids)
    # shuffle and take a sample
    random.shuffle(test_uids)
    selected_uids = test_uids[:3000]
    all_good_cnt = 0
    all_cnt = 0
    app_cnt = 0
    good_app_cnt = 0
    text_empty_cnt = 0
    fo = open('180129_rp_1.csv', 'w')
    csvwriter = csv.writer(fo)
    first_line = ['uid', 'username', 'is_app', 'last_info_time', 'use_tags',
                  'systag_ids', 'tag_names', 't', 'is_tangsai']
    csvwriter.writerow(first_line)
    # status_dict = {
    #     1: "qa and query",
    #     2: "view actions",
    #     3: "search_doctor clinic_no",
    #     0: ""
    # }
    total_time = {}
    for uid in selected_uids:
        print '==============uid=%s=======================' % uid
        username = get_username(uid)
        is_app = is_app_user(uid)
        all_cnt += 1
        if is_app:
            app_cnt += 1
        t1 = time.time()
        res = get_relation_plan3(uid, test=True)
        t2 = time.time()
        t = t2 - t1
        total_time[uid] = t
        status = res['status']
        is_tangsai = False
        if status:
            all_good_cnt += 1
            if is_app:
                good_app_cnt += 1
            systag_ids = res['ids']
            if 96 in systag_ids:
                is_tangsai = True
            tagnames = [get_db_data_local_handler().get_systagid_name(id)
                        for id in systag_ids]
            if status in (1, 2, 4):
                info0 = res['systag_id_dict']
                record_info = '~'.join(info0.keys())
            elif status == 3:
                info0 = res['clinic_no']
                record_info = '~'.join(info0)
            last_ts = res['last_ts']
            last_info_time = timestamp2datetime(ensure_second_timestamp(last_ts))
        else:
            systag_ids = []
            tagnames = []
            record_info = ''
            last_info_time = ''
        systag_ids_str = '~'.join([str(x) for x in systag_ids])
        tagnames_str = '~'.join(tagnames)
        line = convert2gbk([str(uid), username, str(is_app), last_info_time,
                            record_info, systag_ids_str, tagnames_str, str(t),
                            str(is_tangsai)])
        csvwriter.writerow(line)
    line = [str(all_cnt), str(all_good_cnt), str(app_cnt), str(good_app_cnt)]
    csvwriter.writerow(line)
    s_total_time = sorted(total_time.iteritems(), key=lambda x: x[1], reverse=True)
    times = total_time.values()
    line = [str(min(times)), str(max(times)), str(sum(times) / len(times))]
    csvwriter.writerow(line)
    for uid, t in s_total_time[:10]:
        line = [str(uid), str(t)]
        csvwriter.writerow(line)
    fo.close()
    print str(max(times))
    print all_good_cnt
def g2():
    # test recommend_news
    '''
    :return:
    '''
    from recommend.manager.feed_data_helper import recommend_news_kernel
    from general_utils.solr_utils import nat_get_title
    test_uids = get_one_day_uid_from_file('log_event_20180222')
    print "test_uids num", len(test_uids)
    # shuffle and take a sample
    random.shuffle(test_uids)
    selected_uids = test_uids[:1000]
    fo = open('20180321_rn_1.csv', 'w')
    csvwriter = csv.writer(fo)
    first_line = ['uid', 'username', 'utags', 'user_bs', 'user_qa',
                  'user_look_title', 'news_id', 'title', 'score']
    csvwriter.writerow(first_line)
    total_time = {}
    cnt_all = 0
    cnt_good = 0
    for uid in selected_uids:
        print '==============uid=%s=======================' % uid
        username = get_username(uid)
        is_app = is_app_user(uid)
        if not is_app:
            continue
        cnt_all += 1
        t1 = time.time()
        recommend_res = recommend_news_kernel(uid, True)
        t2 = time.time()
        total_time[uid] = t2 - t1
        parsed_user_info = recommend_res['parsed_user_info']
        utags = parsed_user_info['weight_dict'].keys()
        user_info_list = recommend_res['user_info_list']
        bs_text_list = []
        qa_text_list = []
        view_news_title_list = []
        view_topic_title_list = []
        for ts, obj, action_type in user_info_list:
            if action_type in ('bs', 'sd'):
                bs_text_list.append(obj)
            elif action_type == 'qa':
                qa_text_list.append(obj)
            elif action_type == 'vt':
                title = nat_get_title('topic_' + str(obj))
                view_topic_title_list.append(title)
            elif action_type == 'vn':
                title = nat_get_title('news_' + str(obj))
                view_news_title_list.append(title)
        user_bs = '~'.join([str(item) for item in bs_text_list])
        user_qa = '~'.join([str(item) for item in qa_text_list])
        user_look_title = '~'.join([str(item) for item in
                                    view_news_title_list + view_topic_title_list])
        title_dict = recommend_res['title_dict']
        ids_list = recommend_res['ids']
        score_dict = recommend_res['v_score_dict']
        ids = [['%s-news_' % i + str(x) for x in ids]
               for [i, ids] in enumerate(ids_list)]
        ids1 = []
        for x in ids:
            ids1.extend(x)
        ids = ids1
        tcnt = 0
        if ids:
            cnt_good += 1
        for id in ids:
            id0 = id.split('-')[1]
            title = title_dict[id0]
            score = score_dict[id0]
            if tcnt == 0:
                line = convert2gbk([str(uid), username, '~'.join(utags),
                                    user_bs, user_qa, user_look_title,
                                    str(id), title, score])
            else:
                line = convert2gbk([' ', ' ', '~'.join(utags), user_bs,
                                    user_qa, user_look_title, str(id), title,
                                    score])
            csvwriter.writerow(line)
            tcnt += 1
    min_t = min(total_time.values())
    max_t = max(total_time.values())
    mean_t = sum(total_time.values()) / len(total_time)
    line = ['min', 'max', 'mean']
    csvwriter.writerow(line)
    line = [str(min_t), str(max_t), str(mean_t)]
    csvwriter.writerow(line)
    sorted_total_time = sorted(total_time.iteritems(), key=lambda x: x[1], reverse=True)
    for uid, t in sorted_total_time[:10]:
        line = [str(uid), str(t)]
        csvwriter.writerow(line)
    line = ['all_app_user_num', 'good_app_user_num']
    csvwriter.writerow(line)
    line = [str(cnt_all), str(cnt_good)]
    csvwriter.writerow(line)
    fo.close()
def test10():
    from rpc_services.word2vec_api import get_similar
    from rpc_services.medical_service_api import tokenizer_default
    # Find similar words. Output columns: id, query, segmentation result,
    # entity word categories, disease words 1-3, symptom words 1-3, drug words 1-3.
    # input_file = "/Users/satoshi/Documents/work file/query_result_o1.csv"
    input_file = sys.argv[2]
    endict = pickle_from_file(
        "/home/classify/workspace/medical_data/data_dir/medical_word_detail.pickle")
    first_line = [u"id", u"query", u"words", u"cates", u"disease", u"symptom", u"drug"]
    fo = open("query_similar_words.csv", "w")
    csvwriter = csv.writer(fo, dialect='excel')
    csvwriter.writerow(first_line)
    with open(input_file, 'r') as f:
        for l in f:
            ll = l.strip('\n').split(',')
            print l
            print ll
            id, text = ll[0], ll[1]
            text = text.decode('gbk', 'ignore')
            similar_word_score_dict = {}
            seged = []
            cates = []
            tokens = tokenizer_default([text])["tokens"][0]
            for item in tokens:
                if u"neg_ne" in item:
                    continue
                if "cate" not in item:
                    continue
                word = item['token']
                if word in seged:
                    continue
                seged.append(word)
                cates.append(item['cate'])
            for x in seged:
                x_s = get_similar(x, 100)
                if not x_s:
                    continue
                for w, s in x_s:
                    if w not in similar_word_score_dict:
                        similar_word_score_dict[w] = s
                    elif s > similar_word_score_dict[w]:
                        similar_word_score_dict[w] = s
            dis = []
            sym = []
            drug = []
            s_similar_word_score = sorted(similar_word_score_dict.iteritems(),
                                          key=lambda x: x[1], reverse=True)
            for w, s in s_similar_word_score:
                if w not in endict:
                    continue
                cate = endict[w]['cate']
                if cate == "SYMPTOM_DESC" and len(sym) < 3:
                    sym.append(w)
                if cate == "DISEASE_DESC" and len(dis) < 3:
                    dis.append(w)
                if cate == "DRUG_DESC" and len(drug) < 3:
                    drug.append(w)
            row = [id, text, u"|||".join(seged), u"|||".join(cates),
                   u"|||".join(dis), u"|||".join(sym), u"|||".join(drug)]
            row = convert2gbk(row)
            csvwriter.writerow(row)
    fo.close()
def main1():
    uids = []
    # collect all uids
    for i in (0, 1, 2, 3):
        uid_filename = get_parti_uid_filename(part=i, mode='news')
        with open(uid_filename, 'r') as f:
            ls = f.readlines()
            t_uids = [int(item.strip('\n')) for item in ls]
            uids.extend(t_uids)
    output_filename = '20180312_user_event_and_recommend_news.csv'
    yesterday_begin, yesterday_end = get_yesterday_timestamp()
    yesterday_begin = int(yesterday_begin * 1000)
    yesterday_end = int(yesterday_end * 1000)
    fo = open(output_filename, 'w')
    csvwriter = csv.writer(fo)
    first_line = ['uid', 'is_app_user', 'event_datetime', 'event_type',
                  'event_obj', 'recommended_news']
    csvwriter.writerow(first_line)
    all_cnt = 0
    good_cnt = 0
    shuffle(uids)
    for uid in uids[:1000]:
        all_cnt += 1
        is_app = is_app_user(uid)
        print '+' * 10, uid, '+' * 10
        user_action_list = cy_time_event_one_user_kernel2(uid, yesterday_begin, yesterday_end)
        recommended_news_ids = get_caled_user_topn_news(uid)
        recommended_news_ids = new_newsids_check(recommended_news_ids, 2)
        if recommended_news_ids:
            good_cnt += 1
        cnt = 0
        for i in range(max([len(user_action_list), len(recommended_news_ids)])):
            if cnt == 0:
                user_id = str(uid)
            else:
                user_id = ''
                is_app = ''
            try:
                event_datetime = timestamp2datetime(user_action_list[i][0] / 1000.0)
                event_type = user_action_list[i][2]
                event_obj = user_action_list[i][1]
                if event_type == 'vn':
                    title = nat_get_title('news_' + str(event_obj))
                    event_obj_str = str(event_obj) + '|' + title
                elif event_type == 'vt':
                    title = nat_get_title('topic_' + str(event_obj))
                    event_obj_str = str(event_obj) + '|' + title
                else:
                    event_obj_str = event_obj
            except:
                event_datetime = ''
                event_obj_str = ''
                event_type = ''
            try:
                recommended_news_id = recommended_news_ids[i]
                title = nat_get_title('news_' + str(recommended_news_id))
                recommend_str = str(recommended_news_id) + '|' + title
            except:
                recommend_str = ''
            line = convert2gbk([user_id, str(is_app), event_datetime,
                                event_type, event_obj_str, recommend_str])
            csvwriter.writerow(line)
            cnt += 1
    line = ['all', 'good']
    csvwriter.writerow(line)
    csvwriter.writerow([str(all_cnt), str(good_cnt)])
    fo.close()