def combine_bdpinput(bdp_file_name1, bdp_file_name2, bdp_file_name):
    '''
    Merge two "key|||count" stats files into one pickled counter dict.
    Sample input lines:
        news_all_input_qa|||9017
        news_all_output_qa|||810
        news_no_info_qa|||0
        news_filtered_by_preprocessing_qa|||3659
        news_empty_res_qa|||1301
        news_bad_res_qa|||3203
        qa_failed|||86
        news_all_input_bs|||3988
        news_all_output_bs|||1515
        news_no_info_bs|||110
        news_filtered_by_preprocessing_bs|||1589
        news_empty_res_bs|||138
        news_bad_res_bs|||666
        bs_failed|||32
    '''
    KEY_EXCLUDE = ("qa_failed", "bs_failed")
    cnt_dict = defaultdict(int)
    for x in (bdp_file_name1, bdp_file_name2):
        with open(x, 'r') as f:
            for l in f:
                key, cnt = l.strip('\n').split('|||')
                if key in KEY_EXCLUDE:
                    continue
                cnt = int(cnt)
                cnt_dict[key] += cnt
    pickle_to_file(cnt_dict, bdp_file_name)
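# A minimal usage sketch for combine_bdpinput, with hypothetical file names;
# it assumes pickle_from_file is the inverse of pickle_to_file:
#
#   with open('stats_a.txt', 'w') as f:
#       f.write('news_all_input_qa|||9017\nqa_failed|||86\n')
#   with open('stats_b.txt', 'w') as f:
#       f.write('news_all_input_qa|||100\nbs_failed|||32\n')
#   combine_bdpinput('stats_a.txt', 'stats_b.txt', 'stats_merged.pickle')
#   # pickle_from_file('stats_merged.pickle')
#   # -> {'news_all_input_qa': 9117}   (the *_failed keys are excluded)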
def get_topic_data():
    # Incrementally score topics newer than the last scored id.
    old_score = pickle_from_file(TOPIC_SCORE_FILE)
    biggest_id = max(old_score.keys())  # largest topic_id already scored
    sql1 = "select id,doctor_id from api_doctortopic where is_deleted=0 and title <> '' and id>%s;" % biggest_id
    o = get_medicaldb_handler().do_one(sql1)
    cnt = 0
    for item in o:
        id = item[0]
        doctor_id = item[1]
        info_of_topic = topic_info(id)
        info_of_doc = doctor_info(doctor_id)
        title_tags = get_entities_cyseg(info_of_topic["title"])
        content_tags = get_entities_cyseg(info_of_topic["text"])
        # print "content", info_of_topic["text"]
        if len(content_tags) == 0 or len(info_of_topic['title']) == 0:
            print "no content tag", id
            continue
        score = grade_topic(info_of_topic, info_of_doc, title_tags, content_tags)
        old_score[int(id)] = score
        cnt += 1
    print "new topic id num", cnt
    pickle_to_file(old_score, TOPIC_SCORE_FILE)
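# get_topic_data above is an incremental update: it queries only rows with
# id > max(already-scored ids), so re-running it scores just the new topics.
# The same pattern reduced to a standalone sketch (hypothetical helper names):
def _incremental_update(score_file, fetch_rows_after, grade):
    scores = pickle_from_file(score_file)  # {topic_id: score}
    for row_id, payload in fetch_rows_after(max(scores.keys())):
        scores[int(row_id)] = grade(payload)
    pickle_to_file(scores, score_file)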
def get_bodypart_data():
    all_bodypart = set()
    sql = 'select name from medicaldb_bodypart;'
    o = get_medical_entity_handler(False).do_one(sql)
    for item in o:
        name = item[0]
        all_bodypart.add(name)
    print "all_bodypart len is", len(all_bodypart)
    pickle_to_file(all_bodypart, BODYPART_FILE)
def get_feed_showlist_data():
    '''
    Use one year of data (2017.01.01 - 2018.01.01).
    :return:
    '''
    begin = datetime_str2timestamp('2017-01-01 0:0:0')
    end = datetime_str2timestamp('2018-01-01 0:0:0')
    date_newsid_dict = get_feed_showlist_dict(begin, end)
    pickle_to_file(date_newsid_dict, showlist_filename)
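# datetime_str2timestamp is not defined or imported in this file; a plausible
# minimal equivalent is sketched below (an assumption, not the project's
# actual helper):
#
#   import time
#   from datetime import datetime
#
#   def datetime_str2timestamp(s):
#       # Parse 'YYYY-MM-DD H:M:S' in local time, return a Unix timestamp.
#       return int(time.mktime(datetime.strptime(s, '%Y-%m-%d %H:%M:%S').timetuple()))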
def test8():
    from general_utils.file_utils import pickle_from_file, pickle_to_file
    from general_utils.db_utils import get_medicaldb_handler
    d = "data_dir/topic_data/"
    s1 = pickle_from_file(d + 'all_doc_small')
    s2 = pickle_from_file(d + 'all_doc_small_2')
    s3 = pickle_from_file(d + 'all_doc_small_3')
    b1 = pickle_from_file(d + 'all_doc_big')
    b2 = pickle_from_file(d + 'all_doc_big_2')
    b3 = pickle_from_file(d + 'all_doc_big_3')
    s = s1 + s2 + s3  # unused
    b = b1 + b2 + b3
    fo_name = "data_dir/topic_score.pickle"
    res = {}
    open(fo_name, "w").close()  # truncate any previous output first
    for item in b:
        id = int(item['id'].split('_')[-1])
        # add_topic stored scores as int(score * 10), so dividing 'tid'
        # by 10.0 recovers the original float score
        score = item['tid'] / 10.0
        res[id] = score
    pickle_to_file(res, fo_name)
def add_topic():
    batch_size = 1000
    all_doc_small = []
    all_doc_big = []
    docs_small = []
    docs_big = []
    sql = 'select id from api_doctortopic where is_deleted=0 and title <> "" and id > 154517 limit 20000;'
    o = get_medicaldb_handler().do_one(sql)
    id_prefix_small = "r_topic_"
    id_prefix_big = "r_topicbig_"
    content_type_small = "r_topic"
    content_type_big = "r_topicbig"
    # fo = open("topic_score.csv", "w")
    # csvwriter = csv.writer(fo, dialect='excel')
    # first_line = [u'topic id', u'score', u'topic title', u'content len', u'image num', u'is original',
    #               u'doctor id', u'职称', u'医院级别', u'科室', u'城市']
    # first_line = convert2gbk(first_line)
    # csvwriter.writerow(first_line)
    # index = range(len(o))
    # shuffle(index)
    ff = open("failed_id", "a")
    solr = SolrHelper("online").get_solr("topic_tpl")
    is_end = False
    for item in o:
        if item == o[-1]:
            is_end = True
        # print "is_end", is_end
        topic_id = item[0]
        print "topic_id", topic_id
        info_of_topic = topic_info(topic_id)
        topic_title = info_of_topic['title']
        if len(topic_title) == 0:
            # print "empty title", topic_id
            continue
        doctor_id = info_of_topic["doctor_id"]
        info_of_doctor = doctor_info(doctor_id)
        title_tags = get_entities_cyseg(info_of_topic["title"])
        content_tags = get_entities_cyseg(info_of_topic["text"])
        if len(content_tags) == 0:
            print "no content tag", topic_id
            continue
        title_vecs = get_vecs2(title_tags)
        content_vecs = get_vecs2(content_tags)
        print "content_vecs len", len(content_vecs)
        score = int(grade_topic(info_of_topic, info_of_doctor, title_tags, content_tags) * 10)
        if title_vecs and len(title_vecs) > 0:
            # the title has vectors, so store the "small" doc
            try:
                add_topic_kernel(topic_id=topic_id, docs=docs_small, tags=title_tags, score=score,
                                 info_of_topic=info_of_topic, info_of_doctor=info_of_doctor,
                                 vecs=title_vecs, id_prefix=id_prefix_small,
                                 content_type=content_type_small)
            except:
                ff.write("small|||" + str(topic_id) + "\n")
        if content_vecs and len(content_vecs) > 0:
            # the content has vectors, so store the "big" doc
            try:
                add_topic_kernel(topic_id=topic_id, docs=docs_big, tags=content_tags, score=score,
                                 info_of_topic=info_of_topic, info_of_doctor=info_of_doctor,
                                 vecs=content_vecs, id_prefix=id_prefix_big,
                                 content_type=content_type_big)
            except:
                ff.write("big|||" + str(topic_id) + "\n")
        print "len docs_small", len(docs_small)
        print "len docs_big", len(docs_big)
        if len(docs_small) == batch_size or is_end:
            print "topic_id", topic_id
            print "is end", is_end
            print "add small", len(docs_small)
            # print json.dumps(docs_small)
            # add(docs_small, solr)
            all_doc_small.extend(docs_small)
            docs_small = []
        if len(docs_big) == batch_size or is_end:
            print "topic_id", topic_id
            print "is end", is_end
            print "add big", len(docs_big)
            # print json.dumps(docs_big)
            # add(docs_big, solr)
            all_doc_big.extend(docs_big)
            docs_big = []
    ff.close()
    pickle_to_file(all_doc_small, "all_doc_small_3")
    pickle_to_file(all_doc_big, "all_doc_big_3")
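# The flush logic in add_topic drains docs_small/docs_big whenever a buffer
# reaches batch_size or the last row is seen. The same pattern as a standalone
# sketch (hypothetical helper, not part of the codebase):
def _flush_in_batches(items, batch_size, sink):
    buf = []
    for x in items:
        buf.append(x)
        if len(buf) == batch_size:
            sink(buf)  # hand a full batch to the consumer (e.g. a Solr add)
            buf = []
    if buf:
        sink(buf)  # final partial batch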
def get_systag_data():
    # Fetch hot-sale tag data: keywords, target_param, name, etc.
    sql = "select sysTag_id, keywords ,clinic_no,second_clinic_no from ner_systagsolrgenerateconf;"
    data = dict()
    data["systag"] = {}  # 9: {'tag_name': 'gastroscope_colonoscope', 'plan': [{'url': url1, 'name': name1}, {'url': url2, 'name': name2}]}
    data['keyword'] = defaultdict(list)  # u'感冒': [systag_id1, systag_id2, ...]
    data['keyword_extend'] = {}
    data['clinic_no'] = defaultdict(list)  # u'1': [systag_id1]
    all_plan_name = []
    o = get_diagnose_handler().dbhandler.do_one(sql)
    for item in o:
        systag_id = item[0]
        keywords = item[1].strip()
        clinic_no = item[2].strip()
        second_clinic_no = item[3].strip()
        # Map clinic numbers to systag_id; first- and second-level clinics
        # are not distinguished here.
        if clinic_no:
            clinic_nos = clinic_no.split()
            for x in clinic_nos:
                x = ensure_unicode(x)
                data['clinic_no'][x].append(systag_id)
        if second_clinic_no:
            second_clinic_nos = second_clinic_no.split()
            for x in second_clinic_nos:
                x = ensure_unicode(x)
                data['clinic_no'][x].append(systag_id)
        # data['systag']
        tag_name = get_diagnose_handler().get_systag_en_name(systag_id)
        sql1 = 'select id,name,target_param from api_userhomehotsalegallery where tag="%s" and is_online=1;' % tag_name
        o1 = get_medicaldb_handler().do_one(sql1)
        data['systag'][systag_id] = {'tag_name': tag_name, 'plan': []}
        if not o1:
            continue
        for item1 in o1:
            plan_id = item1[0]
            name = item1[1]
            url = item1[2].replace('\r\n', '')
            print systag_id, tag_name, name, url
            data['systag'][systag_id]['plan'].append({
                'url': url,
                'name': name,
                'plan_id': plan_id
            })
            all_plan_name.append([systag_id, name])
        if keywords == u"*":
            continue
        # data['keyword']
        keywords = keywords.lower().split()
        for k in keywords:
            if systag_id not in data['keyword'][k]:
                data['keyword'][k].append(systag_id)
    # Expand each keyword with its similar words.
    num = 20
    master_slave = {}
    high_freq_words = get_high_freq_words()
    for k in data['keyword']:
        systag_id_list = data['keyword'][k]
        # data['keyword_extend'][k] = [systag_id_list, 1.0]
        master_slave[k] = [systag_id_list, []]
        for w, s in get_similar_redis(k, num):
            w = ensure_unicode(w)
            if len(w) < 2:  # drop similar words of length 1
                continue
            if s < 0.41:  # results are score-ordered, so stop once too weak
                break
            if w in high_freq_words:  # drop well-known high-frequency words
                continue
            data['keyword_extend'][w] = [systag_id_list, s]
            master_slave[k][1].append([w, s])
    for k in data['keyword']:
        systag_id_list = data['keyword'][k]
        data['keyword_extend'][k] = [systag_id_list, 1.0]
    # Dump the keyword_extend info to a file for manual inspection.
    with open(SYSTAG_DATA_CHECK_FILE, 'w') as fc:
        for k in master_slave:
            systag_id_list, ws_list = master_slave[k]
            fc.write('###' + k + '|||' + json.dumps(systag_id_list) + '=' * 10 + '\n')
            for w, s in ws_list:
                fc.write(w + '|||' + str(s) + '\n')
        for systag_id, plan_name in all_plan_name:
            fc.write(str(systag_id) + '---' + plan_name + '\n')
    pickle_to_file(data, SYSTAG_DATA_FILE)
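# The keyword-expansion filters above (length >= 2, similarity >= 0.41, not a
# high-frequency word), isolated for clarity. get_similar_redis is assumed to
# return (word, score) pairs sorted by descending score, which is what the
# early `break` above relies on. A hypothetical helper:
def _filter_similar_words(similar_pairs, high_freq_words):
    kept = []
    for w, s in similar_pairs:
        if len(w) < 2:  # drop single-character neighbors
            continue
        if s < 0.41:  # sorted input: everything after this is weaker
            break
        if w in high_freq_words:
            continue
        kept.append((w, s))
    return kept

# _filter_similar_words([(u'咳嗽', 0.9), (u'咳', 0.85), (u'流感', 0.3)], set())
# -> [(u'咳嗽', 0.9)]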
def get_simple_medical_entity_data():
    # Fetch the mapping from entity words to their categories.
    entity_cate, entity_relation_drug = get_entity_cate()
    pickle_to_file(entity_cate, MEDICAL_ENTITY_FILE)
    pickle_to_file(entity_relation_drug, MEDICAL_RELATION_DRUG_FILE)
def get_and_save_resource():
    resource_data = {}
    resource_data["news"] = get_newsdb_handler().get_all_articles()
    for x in resource_data:
        print "resource_data", x, len(resource_data[x])
    pickle_to_file(resource_data, RESOURCE_DATA_FILE)
# encoding=utf8
from collections import defaultdict
import json

from django.core.management.base import BaseCommand

from chunyu.utils.general.encoding_utils import ensure_unicode
from recommend.app_config import RESOURCE_DATA_FILE, TOPIC_DATA_FILE, TOPIC_SCORE_FILE, BODYPART_FILE, \
    MEDICAL_ENTITY_FILE, SYSTAG_DATA_FILE, SYSTAG_DATA_CHECK_FILE, MEDICAL_RELATION_DRUG_FILE, get_high_freq_words
from general_utils.file_utils import pickle_to_file, pickle_from_file
from rpc_services.medical_service_utils import get_entities, get_entities_cyseg
from rpc_services.word2vec_api import get_similar_redis

# Empty the resource file before db_utils is imported (presumably so that the
# import below does not load stale resource data).
pickle_to_file({}, RESOURCE_DATA_FILE)

from general_utils.db_utils import get_newsdb_handler, get_medicaldb_handler, get_medical_entity_handler, \
    get_entity_cate, get_diagnose_handler
from add_data_to_solr.manager.add_utils import topic_info, doctor_info, grade_topic


class Command(BaseCommand):
    def handle(self, *args, **options):
        """
        ./PYTHON.sh manage.py get_local_data
        """
        # get_and_save_resource()
        # get_topic_data()
        get_bodypart_data()
        get_simple_medical_entity_data()
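# For reference, a minimal sketch of the two pickle helpers imported above,
# assuming they are thin wrappers over the standard pickle module (the real
# general_utils.file_utils implementation may differ):
#
#   import cPickle as pickle
#
#   def pickle_to_file(obj, file_name):
#       with open(file_name, 'wb') as f:
#           pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
#
#   def pickle_from_file(file_name):
#       with open(file_name, 'rb') as f:
#           return pickle.load(f)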
def handle(self, *args, **options):
    """
    ./PYTHON.sh manage.py gen_empty_pickle_file <file_name>
    """
    file_name = args[0]
    pickle_to_file({}, file_name)