def get_special_population(text): text = ensure_unicode(text) try: return qa_pro_special_population(text) except Exception, e: print 'get_special_population Exception', e return 'common_population'
def get_sex(text):
    """Map gendered characters in *text* to a numeric code.

    Returns 2 if the text mentions male (男), 1 if it mentions female (女),
    0 otherwise. Male takes precedence when both appear.
    """
    text = ensure_unicode(text)
    if u"男" in text:
        return 2
    return 1 if u"女" in text else 0
def load_simple_lines(filename): res = set() with open(filename, 'r') as f: for l in f: l = l.strip('\n').strip() res.add(ensure_unicode(l)) print "load from file %s" % filename return res
def get_vecs_weighted(words, weights):
    """Look up a vector for each word, dropping words without one. (Legacy: no longer used.)

    :param words: sequence of words
    :param weights: parallel sequence of weights (assumed same length)
    :return: (vecs, kept_weights) -- words with no vector are dropped
             from both lists, keeping them aligned.
    """
    vecs = []
    kept_weights = []
    # FIX: iterate the parallel sequences with zip instead of range(len()).
    for word, weight in zip(words, weights):
        vec = get_vec(ensure_unicode(word))
        if vec is None:
            continue
        vecs.append(vec)
        kept_weights.append(weight)
    return vecs, kept_weights
def population_pros2(population_type):
    """Return the first-recommended ("首推") category codes for a population type."""
    population_type = ensure_unicode(population_type)
    if population_type in (u"children", u"pregnant_woman", u"lactating_women"):
        return [u"MYBJ", u"MR", u"YY"]
    if population_type == u"elder":
        return [u"PY", u"YY", u"AZ", u"TNB"]
    # NOTE(review): the original had an additional branch here
    # `if population_type == u"children": return [u"MYBJ", u"YY"]`,
    # but it was unreachable -- "children" is already handled by the first
    # check above. Removed as dead code; confirm which list was actually
    # intended for children.
    if population_type == u"for_pregnant":
        return [u"MYBJ", u"YY"]
    return []
def load_from_file(filename, limit=None, to_unicode=True):
    """Read non-empty, stripped lines from *filename*.

    :param filename: path to a text file
    :param limit: maximum number of lines to consume (None/0 = no limit);
                  blank lines count toward the limit but are not returned
    :param to_unicode: convert each line via ensure_unicode
    :return: list of lines
    """
    res = []
    with open(filename, 'r') as f:
        for i, line in enumerate(f):
            # BUG FIX: was `i >= limit - 1`, which stopped one line early
            # (limit=N returned at most N-1 lines).
            if limit and i >= limit:
                break
            line = line.strip()
            if to_unicode:
                line = ensure_unicode(line)
            if line:
                res.append(line)
    return res
def get_vecs_weighted3(words):
    """Batch-fetch word vectors via the redis-backed word2vec interface.

    Same contract as get_vecs_weighted2, but uses get_vec_list_redis.
    The dict returned by get_vec_list_redis has unicode keys and
    normalized list-valued vectors, with vector-less words already removed.

    :return: (vecs, keep_indices) -- vectors converted to ndarrays, plus
             the indices of the input words that had a vector.
    """
    vec_dict = get_vec_list_redis(list(words))
    keep_indices = []
    vecs = []
    for idx, raw_word in enumerate(words):
        key = ensure_unicode(raw_word)
        if key not in vec_dict:
            continue
        keep_indices.append(idx)
        vecs.append(np.array(vec_dict[key]))
    return vecs, keep_indices
def get_vecs_weighted2(words):
    """Return vectors plus the kept input indices. (Legacy: no longer used.)

    :return: (vecs, keep_indices) -- words with no vector are skipped.
    """
    keep_indices = []
    vecs = []
    unicode_words = [ensure_unicode(w) for w in words]
    for idx, word in enumerate(unicode_words):
        vec = get_vec(word)
        if vec is not None:
            vecs.append(vec)
            keep_indices.append(idx)
    return vecs, keep_indices
def find_bdpart(tags, max_len=4):
    """Collect every substring of the given tags that is a known body part.

    Scans all substrings starting at each position of each tag.
    NOTE(review): the original cut-off is `w_size > max_len`, so substrings
    of length max_len + 1 are still checked -- possibly an off-by-one;
    behavior preserved here, confirm the intended maximum length.
    """
    found = set()
    for tag in (ensure_unicode(t) for t in tags):
        n = len(tag)
        for start in range(n):
            # substring lengths 1 .. min(n - start, max_len + 1), as in the original
            for length in range(1, min(n - start, max_len + 1) + 1):
                piece = tag[start:start + length]
                if get_db_data_local_handler().is_in_bodypart(piece):
                    found.add(piece)
    return found
def is_baby_text(text):
    """Rule-based check: is *text* about a baby?

    True when any baby keyword occurs in the text or any baby regex matches.

    :param text: input text (converted to unicode here)
    :return: bool
    """
    text = ensure_unicode(text)
    if any(word in text for word in BABY_WORDS):
        return True
    return any(p.search(text) for p in BABY_SEARCH)
def get_bodypart_word(text):
    """Query the simple_medical_entity solr core (md4) for body-part names
    contained in *text*.

    Returns the names (from at most 3 solr rows, filtered to type:bodypart)
    that literally occur in the input text.
    """
    text = ensure_unicode(text)
    sq = SolrQuery()
    # NOTE(review): text is interpolated raw into the solr query string --
    # special query characters in `text` are not escaped.
    sq.set('q', 'name:%s' % text)
    sq.set('fl', ['name'])
    sq.set('rows', 3)
    sq.set('fq', 'type:bodypart')
    matches = []
    for doc in solr_sme.search(**sq.get_query_dict()):
        for name in doc['name']:
            if name in text:
                matches.append(name)
    return matches
def qa_ask_info(text):
    """Split a question into (text, sex, age) using its trailing characters.

    Inspects the last 1-8 characters of the text; if they contain 男/女,
    that character becomes the sex and the rest of the tail (minus
    parentheses and commas) becomes the age. The matched tail is always
    stripped from the returned text.

    :return: (text_without_tail, sex, age) -- sex/age are '' when absent.
    """
    sex = ''
    age = ''
    text = ensure_unicode(text).strip()
    ask_tail_pattern = re.compile(u"(.{1,8})$")
    # BUG FIX: the class was written `[男|女]`, which also matched a literal
    # '|' character; `[男女]` matches only the two gender characters.
    sex_pattern = re.compile(u"[男女]")
    tail = ask_tail_pattern.search(text)
    if tail:
        text = ask_tail_pattern.sub("", text)
        tail = tail.group(0)
        sex0 = sex_pattern.search(tail)
        if sex0:
            sex = sex0.group(0)
            age = sex_pattern.sub(u"", tail.replace(u"(", u"").replace(u")", "")).replace(u",", u"").strip()
    return text, sex, age
def population_cons2(population_type):
    """Return the contraindicated category codes for a population type.

    :param population_type: population label or None
    :return: list of category code strings ([] for None / pregnant women)
    """
    if population_type is None:
        return []
    population_type = ensure_unicode(population_type)
    explicit = {
        u"children": [u"MALE", u"NX", u"LXXZ"],
        u"lactating_women": [u"MALE"],
        u"common_population_men": [u"NX", u"MYBJ", u"LXXZ"],
        u"common_population_women": [u"MALE", u"MYBJ", u"LXXZ"],
        u"common_population": [u"MYBJ", u"LXXZ"],
    }
    if population_type in explicit:
        return explicit[population_type]
    # Anything else except pregnant women gets the generic exclusion
    # (children/lactating are already covered by the table above).
    if population_type not in (u"pregnant_woman", u"lactating_women", u"children"):
        return [u"MYBJ"]
    return []
def find_systag_keywords_extend(text, max_len=7):
    """Find substrings of *text* that are extended hot-sale-tag keywords.

    For every start position, at most *max_len* matching substrings are
    collected; each hit accumulates its weight and records its systag ids.

    :return: (systag_id_dict, cnt_dict) -- substring -> systag id list,
             and substring -> accumulated weight.
    """
    cnt_dict = defaultdict(int)
    systag_id_dict = defaultdict(list)
    text = ensure_unicode(text)
    l = len(text)
    for begin_index in range(l):
        cnt = 0
        for end_index in range(begin_index + 1, l + 1):
            # PERF FIX: was `continue`, which kept spinning through the
            # remaining end indices after the per-start quota was reached;
            # `break` is behaviorally identical and stops immediately.
            if cnt >= max_len:
                break
            w = text[begin_index:end_index]
            relation_systag_id, weight = get_db_data_local_handler(
            ).get_extend_keyword_relation_systag_id(w)
            if relation_systag_id:
                cnt_dict[w] += weight
                systag_id_dict[w] = relation_systag_id
                cnt += 1
    return systag_id_dict, cnt_dict
def get_vec_list_redis(word_list):
    """Redis-backed version of get_vec_list.

    Vectors are normalized and kept as plain lists (not ndarrays).
    Words with no known vector are omitted from the result.

    :param word_list: iterable of words (converted to unicode here)
    :return: dict mapping unicode word -> normalized vector (list of floats)
    """
    # Step 1: try redis first; NOT_IN_REDIS_SIGN marks a cache miss.
    word_list = [ensure_unicode(x) for x in word_list]
    vec_dict = {}
    not_in_redis_indices = []
    redis_res = Word2VecCache.get_vec_list(word_list)
    # NOTE: while the underlying word2vec data is being replaced, force a
    # full cache miss (re-fetch everything from the api) by uncommenting:
    # redis_res = [NOT_IN_REDIS_SIGN]*len(word_list)
    for i, vec in enumerate(redis_res):
        # vectors coming back from redis are already normalized
        if vec == NOT_IN_REDIS_SIGN:
            not_in_redis_indices.append(i)
        else:
            if vec not in BAD_VEC_SIGN:
                # keep only words that actually have a vector
                vec_dict[word_list[i]] = vec
    # Step 2: fetch the cache misses from the word2vec api and write them
    # back into redis for next time.
    not_in_redis_word_list = [word_list[i] for i in not_in_redis_indices]
    vec_dict_from_api = get_vec_list(not_in_redis_word_list)
    for word in vec_dict_from_api:
        vec = vec_dict_from_api[word]
        vec = norm_list(vec)  # normalize before caching/returning
        if vec != NOT_IN_WORD2VEC_SIGN_API:
            # keep only words the word2vec api returned a vector for
            vec_dict[word] = vec
            Word2VecCache.set_vec(word, vec)  # store in redis
    return vec_dict
def is_for_pregnant(text):
    """Rule-based check: is *text* from someone trying to conceive ("备孕")? (刘慧珠)

    :return: True when any trying-to-conceive phrase occurs in the text.
    """
    text = ensure_unicode(text)
    # FIX: u"如何怀上宝宝" was listed twice in the original tuple; the
    # duplicate entry was removed (no behavior change).
    patterns = (
        u"想怀孕", u"尝试怀孕", u"怎么才能怀孕", u"能不能怀孕", u"能怀孕",
        u"可不可以怀孕", u"可以怀孕", u"备孕", u"影响怀孕", u"想要宝宝",
        u"要宝宝", u"要孩子", u"要小孩", u"想生宝宝", u"想生小孩",
        u"想生孩子", u"如何怀孩子", u"如何生孩子", u"怎样怀孩子",
        u"怎样生孩子", u"如何怀宝宝", u"如何生宝宝", u"怎样怀宝宝",
        u"怎样生宝宝", u"如何怀上孩子", u"怎样怀上孩子", u"如何怀上宝宝",
    )
    return any(p in text for p in patterns)
def get_keyword_relation_systag_id(cls, word):
    """Return the systag id list directly keyed by *word* ([] if unknown)."""
    key = ensure_unicode(word)
    return cls.systag_data['keyword'].get(key, [])
def get_vecs2(words):
    """Return one vector per word, ignoring weights; words without a
    vector are dropped.

    PERF FIX: the original called get_vec twice per word (once in the
    filter, once for the value); this version looks each word up once.
    """
    unicode_words = (ensure_unicode(w) for w in words)
    vecs = (get_vec(w) for w in unicode_words)
    return [v for v in vecs if v is not None]
def get_news_from_bigsearch(query):
    """Run *query* through big-search's more_news and return the news ids as ints."""
    query = ensure_unicode(query)
    payload = json.loads(more_news(query))
    return [int(news_id) for news_id in payload[0]["ids"]]
def get_systag_data():
    """Build and persist the hot-sale-tag dataset: keywords, target_param,
    names, clinic mappings, and a similar-word keyword expansion.

    Reads ner_systagsolrgenerateconf and api_userhomehotsalegallery,
    writes a human-readable check file (SYSTAG_DATA_CHECK_FILE) and
    pickles the final dict to SYSTAG_DATA_FILE.
    """
    sql = "select sysTag_id, keywords ,clinic_no,second_clinic_no from ner_systagsolrgenerateconf;"
    data = dict()
    # systag_id -> {'tag_name': ..., 'plan': [{'url':..,'name':..,'plan_id':..}, ...]}
    data["systag"] = {}
    # keyword -> [systag_id1, systag_id2, ...], e.g. u'感冒'
    data['keyword'] = defaultdict(list)
    data['keyword_extend'] = {}
    # clinic number (unicode) -> [systag_id, ...], e.g. u'1'
    data['clinic_no'] = defaultdict(list)
    all_plan_name = []
    o = get_diagnose_handler().dbhandler.do_one(sql)
    for item in o:
        systag_id = item[0]
        keywords = item[1].strip()
        clinic_no = item[2].strip()
        second_clinic_no = item[3].strip()
        # Map clinic numbers to systag ids; first- and second-level clinics
        # are merged into the same mapping without distinction.
        if clinic_no:
            clinic_nos = clinic_no.split()
            for x in clinic_nos:
                x = ensure_unicode(x)
                data['clinic_no'][x].append(systag_id)
        if second_clinic_no:
            second_clinic_nos = second_clinic_no.split()
            for x in second_clinic_nos:
                x = ensure_unicode(x)
                data['clinic_no'][x].append(systag_id)
        # Fill data['systag'] from the gallery table for this tag.
        tag_name = get_diagnose_handler().get_systag_en_name(systag_id)
        # NOTE(review): tag_name is interpolated raw into the SQL string;
        # fine as long as tag names are internal/trusted -- confirm.
        sql1 = 'select id,name,target_param from api_userhomehotsalegallery where tag="%s" and is_online=1;' % tag_name
        o1 = get_medicaldb_handler().do_one(sql1)
        data['systag'][systag_id] = {'tag_name': tag_name, 'plan': []}
        if not o1:
            continue
        for item1 in o1:
            plan_id = item1[0]
            name = item1[1]
            url = item1[2].replace('\r\n', '')
            print systag_id, tag_name, name, url
            data['systag'][systag_id]['plan'].append({
                'url': url,
                'name': name,
                'plan_id': plan_id
            })
            all_plan_name.append([systag_id, name])
        # "*" means this tag matches everything -- no keyword entries.
        if keywords == u"*":
            continue
        # Fill data['keyword'] (deduplicated per keyword).
        keywords = keywords.lower().split()
        for k in keywords:
            if systag_id not in data['keyword'][k]:
                data['keyword'][k].append(systag_id)
    # Expand each keyword with up to `num` similar words.
    num = 20
    master_slave = {}
    high_freq_words = get_high_freq_words()
    for k in data['keyword']:
        systag_id_list = data['keyword'][k]
        master_slave[k] = [systag_id_list, []]
        # get_similar_redis presumably yields (word, score) sorted by
        # descending score -- the `break` below relies on that ordering.
        for w, s in get_similar_redis(k, num):
            w = ensure_unicode(w)
            if len(w) < 2:  # drop single-character similar words
                continue
            if s < 0.41:  # scores are ordered, so everything after is too low
                break
            if w in high_freq_words:  # drop well-known high-frequency words
                continue
            data['keyword_extend'][w] = [systag_id_list, s]
            master_slave[k][1].append([w, s])
    # The original keywords themselves go in with similarity 1.0
    # (done after expansion so they can't be overwritten by a similar word).
    for k in data['keyword']:
        systag_id_list = data['keyword'][k]
        data['keyword_extend'][k] = [systag_id_list, 1.0]
    # Dump the keyword_extend info to a file for manual inspection.
    with open(SYSTAG_DATA_CHECK_FILE, 'w') as fc:
        for k in master_slave:
            systag_id_list, ws_list = master_slave[k]
            fc.write('###' + k + '|||' + json.dumps(systag_id_list) + '=' * 10 + '\n')
            for w, s in ws_list:
                fc.write(w + '|||' + str(s) + '\n')
        for systag_id, plan_name in all_plan_name:
            fc.write(str(systag_id) + '---' + plan_name + '\n')
    pickle_to_file(data, SYSTAG_DATA_FILE)
def is_in_bodypart(cls, word):
    """Whether *word* is a known body part; u"血" (blood) is explicitly excluded."""
    word = ensure_unicode(word)
    return word != u"血" and word in cls.bodypart_data
def get_entity_cate(cls, word):
    """Return the category of medical entity *word* ('' if unknown); lookup is lowercased."""
    key = ensure_unicode(word.lower())
    return cls.medical_entity_cate.get(key, '')
def get_extend_keyword_relation_systag_id(cls, word):
    """Lookup in the similar-word-expanded hot-sale-tag keyword table.

    :return: [systag_id_list, similarity]; [[], 0.0] when *word* is unknown.
    """
    return cls.systag_data['keyword_extend'].get(ensure_unicode(word), [[], 0.0])
def is_e_stop_word(cls, word):
    """Whether *word* is in the e-stop-word set."""
    return ensure_unicode(word) in cls.e_data
def get_relation_drug(cls, word, num=100):
    """Return at most *num* drugs related to *word* ([] when none are known)."""
    related = cls.medical_relation_drug.get(ensure_unicode(word), [])
    return related[:num]
def is_entity(cls, word):
    """Whether *word* is a known medical entity."""
    return ensure_unicode(word) in cls.medical_entity_cate