def _prepare_data(self, files):
    print('prepare data...')
    embeddings = list()
    queries = list()
    queries_ = dict()
    labels = list()
    mlb = MultiLabelBinarizer()
    # gather questions per label; one input file per label
    for index in xrange(len(files)):
        path = files[index]
        label = self.named_labels[index]
        queries_[label] = list()
        with open(path, 'r') as f:
            for line in f:
                question = line.replace('\t', '').replace(
                    ' ', '').strip('\n').decode('utf-8')
                question = QueryUtils.static_remove_cn_punct(str(question))
                tokens = QueryUtils.static_jieba_cut(question)
                if len(tokens) == 0:
                    continue
                queries_[label].append(question)
    # merge duplicate questions into multi-label entries
    for label, questions in queries_.iteritems():
        for question in questions:
            if question in queries:
                index = queries.index(question)
                if label not in labels[index]:
                    labels[index].append(label)
            else:
                queries.append(question)
                labels.append([label])
                tokens = self.cut(question).split(' ')
                embedding = self.get_w2v_emb(tokens)
                embeddings.append(embedding)
    embeddings = np.squeeze(np.array(embeddings))
    self.mlb = mlb.fit(labels)
    labels = self.mlb.transform(labels)
    return embeddings, labels, queries
class QAKernel:
    qa_url = 'http://localhost:11403/solr/qa/select?wt=json&q=question:(%s)'
    # null_answer = ['这个我不知道,您可以谷歌或百度', '我知识有限,这个我不知道怎么回答...[晕][晕][晕]']
    null_answer = ['null']

    def __init__(self):
        print('initializing qa kernel...')
        self.qu = QueryUtils()

    def kernel(self, q):
        r = self._request_solr(q)
        answer = self._extract_answer(r)
        return answer

    def _extract_answer(self, r, random_range=1):
        try:
            num = self._num_answer(r)
            if num > 0:
                x = random.randint(0, min(random_range - 1, num - 1))
                response = self._get_response(r, x)
                return response
            else:
                return np.random.choice(self.null_answer, 1)[0]
        except:
            return np.random.choice(self.null_answer, 1)[0]

    def _request_solr(self, q):
        tokenized, exact_q = self.purify_q(q)
        url = self.qa_url % tokenized.decode('utf-8')
        print('qa_debug:', url)
        r = requests.get(url)
        return r

    def _num_answer(self, r):
        return int(r.json()["response"]["numFound"])

    def _get_response(self, r, i=0):
        try:
            a = r.json()["response"]["docs"]
            return a[i]["answer"][0].encode('utf8')
        except:
            return None

    def purify_q(self, q):
        q = self.qu.remove_cn_punct(q)
        pos_q = self.qu.corenlp_cut(q, remove_tags=["CD", "VA", "AD", "VC"])
        return ''.join(pos_q), q
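# Usage sketch for QAKernel (illustrative, not part of the original source;
# assumes the Solr core `qa` from qa_url is reachable at localhost:11403):
#
#   kernel = QAKernel()
#   answer = kernel.kernel(u'营业时间是几点')
#   print(answer)   # a Solr-matched answer, or 'null' when nothing is found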
def _build_feature_extraction(self, data_path):
    print('Build feature extraction...')
    corpus = list()
    with open(data_path, 'r') as f:
        reader = csv.reader(f, delimiter='#')
        for line in reader:
            b = line[1].decode('utf-8')
            tokens = QueryUtils.static_jieba_cut(b)
            corpus.append(tokens)
    if self.mode == 'ngram':
        print_cn('Use {0}'.format(self.mode))
        bigram_vectorizer = CountVectorizer(
            ngram_range=(1, 2), min_df=0.0, max_df=1.0, analyzer='char',
            stop_words=[',', '?', '我', '我要'], binary=True)
        self.feature_extractor = bigram_vectorizer.fit(corpus)
    if self.mode == 'tfidf':
        print_cn('Use {0}'.format(self.mode))
        tfidf_vectorizer = TfidfVectorizer(analyzer='char',
                                           ngram_range=(1, 2),
                                           max_df=1.0, min_df=1,
                                           sublinear_tf=True)
        self.feature_extractor = tfidf_vectorizer.fit(corpus)
    if self.mode == 'fasttext':
        # fasttext mode computes sentence vectors on the fly; nothing to fit
        pass
def where(self, q, last_r):
    current_entity, current_type, current_solr_r = self.retrieve_entity(
        q, 'store%20entertainment%20facility')
    if last_r and not current_entity:
        last_entity, last_type, last_solr_r = self.retrieve_entity(
            last_r, 'store%20entertainment%20facility')
    else:
        last_entity = None
    if current_entity:
        location = SolrUtils.get_dynamic_response(current_solr_r,
                                                  'rich_location',
                                                  random_field=True)
        if not location:
            return self.simple.kernel(q)
        return None, current_entity + "," + location
    if not last_entity:
        return None, '您在问什么?'  # "What are you asking about?"
    q = QueryUtils.static_remove_pu(q).decode('utf-8')
    # strict "where is it / take me there" patterns
    strict = re.compile(ur'在哪|在什么地方|带我去|在哪里')
    if re.match(strict, q):
        location = SolrUtils.get_dynamic_response(last_solr_r,
                                                  'rich_location',
                                                  random_field=True)
        if not location:
            location = '数据库中不存在'  # "not in the database"
        return None, location
    return 'base', '数据库中不存在'
def _build_feature_extractor(self, mode, files):
    print('Build feature extraction...')
    corpus = list()
    for path in files:
        with open(path, 'r') as f:
            for line in f:
                question = line.replace('\t', '').replace(
                    ' ', '').strip('\n').decode('utf-8')
                question = QueryUtils.static_remove_cn_punct(str(question))
                tokens = self.cut(question)
                corpus.append(tokens)
    if mode == 'ngram':
        bigram_vectorizer = CountVectorizer(
            ngram_range=(1, 2), min_df=0.0, max_df=1.0, analyzer='char',
            stop_words=[',', '?', '我', '我要'], binary=True)
        self.feature_extractor = bigram_vectorizer.fit(corpus)
    if mode == 'tfidf':
        print_cn('use {0}'.format(mode))
        tfidf_vectorizer = TfidfVectorizer(analyzer='char',
                                           ngram_range=(1, 2),
                                           max_df=1.0, min_df=1,
                                           sublinear_tf=True)
        self.feature_extractor = tfidf_vectorizer.fit(corpus)
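# Minimal sketch of the vectorizer API used above, on a hypothetical toy
# corpus; the char-level (1, 2)-gram settings mirror the 'ngram' branch:
#
#   from sklearn.feature_extraction.text import CountVectorizer
#   corpus = [u'我 要 吃饭', u'哪里 有 厕所']
#   vec = CountVectorizer(ngram_range=(1, 2), analyzer='char', binary=True)
#   extractor = vec.fit(corpus)
#   X = extractor.transform([u'厕所 在 哪里']).toarray()  # 1 x |vocab| binary row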
def __init__(self, graph_path, clf_path):
    self.graph = None
    self.gbdt = None
    self.state_cleared = True
    self._load_graph(graph_path)
    self._load_clf(clf_path)
    self.qu = QueryUtils()
def _request_solr(self, q):
    ## cut q into tokens and OR them together as a question query
    tokens = ['question:' + s
              for s in QueryUtils.static_jieba_cut(q, smart=False,
                                                   remove_single=True)]
    q = ' OR '.join(tokens)
    url = self.qa_url.format(q)
    cn_util.print_cn(url)
    r = requests.get(url)
    return r
def __init__(self):
    print('attaching qa kernel...')
    ## example: http://localhost:11403/solr/sc_qa/select?fq=entity:%E5%8E%95%E6%89%80&indent=on&q=*:*&wt=json
    self.qa_url = 'http://localhost:11403/solr/sc_qa/select?q.op=OR&wt=json&q={0}'
    self.qa_exact_match_url = 'http://localhost:11403/solr/sc_qa/select?wt=json&q=question:{0}'
    self.qu = QueryUtils()
    self.bt = None  # fall back to None so callers can test `if not self.bt`
    try:
        self.bt = BenebotSim.Instance()
    except:
        traceback.print_exc()
def __init__(self, data_path):
    print('initializing classifier...')
    self.data_path = data_path
    self.num_vol = 0
    self.vol = {}
    self.classes = {}
    self.index_classes = {}
    self.classes_num_sub = {}
    self.classifiers = {}
    self.qu = QueryUtils()
def _build(self, data_path):
    self._build_feature_extraction(data_path)
    mlb = MultiLabelBinarizer()
    embeddings = list()
    labels = list()
    if self.mode == 'fasttext':
        with open(data_path, 'r') as f:
            reader = csv.reader(f, delimiter='#')
            for line in reader:
                key = line[0].decode('utf-8')
                input_ = line[1].decode('utf-8')
                intention_list = key.split(",")
                tokens = QueryUtils.static_jieba_cut(input_)
                vector = self._fasttext_vector(tokens)
                if not vector:
                    continue
                embeddings.append(vector)
                labels.append(intention_list)
        self.mlb = mlb.fit(labels)
        labels_ = self.mlb.transform(labels)
        return embeddings, labels_
    else:
        with open(data_path, 'r') as f:
            reader = csv.reader(f, delimiter='#')
            for line in reader:
                key = line[0].decode('utf-8')
                input_ = line[1].decode('utf-8')
                intention_list = key.split(",")
                tokens = QueryUtils.static_jieba_cut(input_)
                embeddings.append(tokens)
                labels.append(intention_list)
        embeddings = self.feature_extractor.transform(embeddings).toarray()
        self.mlb = mlb.fit(labels)
        labels_ = self.mlb.transform(labels)
        return embeddings, labels_
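# Sketch of how the (embeddings, labels_) pair from _build is typically fed
# to a multi-label classifier; the classifier choice and the data path here
# are assumptions, not taken from this source:
#
#   from sklearn.multiclass import OneVsRestClassifier
#   from sklearn.linear_model import LogisticRegression
#   embeddings, labels_ = self._build('data/intents.csv')  # hypothetical path
#   self.clf = OneVsRestClassifier(LogisticRegression())
#   self.clf.fit(embeddings, labels_)  # labels_ is the binarized label matrix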
def select_max_match_with_sim(self, q, r):
    if not self.bt:
        return None
    matched_questions = SolrUtils.get_dynamic_response(r=r, key='question',
                                                       random_hit=False,
                                                       random_field=True,
                                                       keep_array=False,
                                                       facet=True)
    q_tokens = ' '.join(QueryUtils.static_jieba_cut(q))
    matched_questions_tokens = [' '.join(QueryUtils.static_jieba_cut(mqt))
                                for mqt in matched_questions]
    max_sim = self.bt.getMaxSim(q_tokens, matched_questions_tokens)
    best_sentence = ''.join(max_sim['sentence'].split(' '))
    sim = max_sim['sim']
    cn_util.print_cn(best_sentence, str(sim),
                     '[' + ','.join(matched_questions) + ']')
    if sim > 0.3:
        index = matched_questions.index(best_sentence)
        answer = SolrUtils.get_dynamic_response(r, key='answer',
                                                force_hit=index,
                                                random_field=True,
                                                random_hit=False)
        return answer
    return None
def kernel(self, q, last_r):
    try:
        exact = self.exact_match(QueryUtils.static_remove_pu(q))
        if exact:
            return None, exact
        cls, probs = self.clf.predict(q)
        if cls == 'where':
            direction, answer = self.where(q=q, last_r=last_r)
            return direction, answer
        if cls == 'exist':
            direction, answer = self.exist(q=q, last_r=last_r)
            return direction, answer
        if cls == 'ask_price':
            direction, answer = self.ask_price(q=q, last_r=last_r)
            return direction, answer
        if cls == 'ask_discount':
            direction, answer = self.ask_discount(q=q, last_r=last_r)
            return direction, answer
        if cls == 'ask_queue':
            direction, answer = self.ask_queue(q=q, last_r=last_r)
            return direction, answer
        # disabled intents: permit, whether, when, how, which, what
        if cls == 'list':
            direction, answer = self.list(q=q, last_r=last_r)
            return direction, answer
        if cls == 'ask_taste':
            direction, answer = self.taste(q=q, last_r=last_r)
            return direction, answer
        return self.simple.kernel(q)
    except Exception:
        traceback.print_exc()
        return self.simple.kernel(q)
def _request_solr(self, q, key, base_url):
    ## cut q into tokens and build a field query: key:(t1%20t2%20...)
    key = '%s:' % key
    tokens = [s for s in QueryUtils.static_jieba_cut(q, smart=False,
                                                     remove_single=True)]
    if len(tokens) == 0:
        return None
    q = key + "(" + '%20'.join(tokens) + ")"
    url = base_url % q
    cn_util.print_cn(url)
    r = requests.get(url)
    return r
def _request_solr(self, q, key, base_url):
    try:
        ## cut q into tokens and build a filter query: fq=key:t1%20t2...
        key = 'fq=%s:' % key
        tokens = [s for s in QueryUtils.static_jieba_cut(q, smart=False,
                                                         remove_single=True)]
        q = key + '%20'.join(tokens)
        url = base_url.format(q)
        cn_util.print_cn(url)
        r = requests.get(url)
        return r
    except:
        traceback.print_exc()
        return None
def regex_plugin(self, q):
    if re.match(self.request_more_pattern, q):
        return 'reqmore', None, q
    if re.match(self.base_pattern, q):
        return 'base', None, q
    type_ = self.entity_recog(q)
    if type_ == 'store':
        _, neg = self.neg.predict(q)
        if neg:
            return 'base', None, q
        else:
            return 'qa', None, q
    try:
        if re.match(self.qa_pattern, q):
            return 'qa', None, q
        if re.match(self.sing_pattern, q):
            if re.match(self.sing_diff_pattern, q):
                return 'qa', None, q
            return 'sing', None, q
        if re.match(self.sale_pattern, q):
            return 'sale', None, q
        if re.match(self.greeting_pattern, q):
            if len(q) > 1:
                q = re.sub(self.greeting_clean_pattern, '', q)
                try:
                    q = QueryUtils.static_corenlp_cut(
                        q, remove_tags=QueryUtils.remove_tags)
                    q = ''.join(q).decode('utf-8')
                    if not q:
                        q = u'你好'
                except:
                    pass
                if isinstance(q, str):
                    q = q.decode('unicode-escape').encode('utf-8')
            return 'greeting', 'base', q
        return None, None, q
    except:
        return None, None, q
def predict(self, input_):
    tokens = QueryUtils.static_jieba_cut(input_)
    try:
        if self.mode == 'fasttext':
            embedding = self._fasttext_vector(tokens)
            embeddings = np.array([embedding])
        else:
            embeddings = np.reshape(
                self.feature_extractor.transform(tokens).toarray()[0],
                [1, -1])
    except:
        traceback.print_exc()
        exit(-1)
    prediction = self.clf.predict(embeddings)
    prediction_index_first_sample = np.where(prediction[0] == 1)
    labels = self.mlb.inverse_transform(prediction)
    probs = self.clf.predict_proba(embeddings)
    # note that in prediction stage, n_samples == 1
    return labels[0], probs[0][prediction_index_first_sample]
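# Example call (the label name and probability shown are hypothetical):
# predict returns the intent labels inverse-transformed by the
# MultiLabelBinarizer, plus the probabilities of the predicted classes:
#
#   labels, probs = clf.predict(u'这家店在哪里')
#   # e.g. labels == ('where',), probs == array([0.83])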
def kernel(self, q):
    ## first try regex_plugin:
    scene, sugg_scene, q = self.regex_plugin(q)
    if scene:
        return scene, sugg_scene, q
    try:
        if not self.web:
            if not self.clf:
                return 'sale', None, q
            q = QueryUtils.static_remove_pu(q)
            labels, _ = self.clf.predict(question=q)
            select = self.select_label(labels)
            ## qa plugin:
            if select == 'qa':
                return select, 'sale', q
            if select == 'greeting':
                return select, 'base', q
            return select, None, q
        else:
            text = requests.get('http://localhost:11305/sc/scene?q=' + q)
            return text.text, None, q
    except:
        return None, None, q
def _prepare_data(self, files):
    print('prepare data...')
    embeddings = list()
    queries = list()
    labels = list()
    for index in xrange(len(files)):
        path = files[index]
        with open(path, 'r') as f:
            for line in f:
                line = line.replace('\t', '').replace(
                    ' ', '').strip('\n').decode('utf-8').split('#')
                question = QueryUtils.static_simple_remove_punct(
                    str(line[0]))
                label = self.named_labels.index(
                    str(line[1].encode('utf-8')))
                queries.append(question)
                labels.append(label)
                tokens = [self.cut(question)]
                embedding = self.feature_extractor.transform(
                    tokens).toarray()
                embeddings.append(embedding)
    embeddings = np.squeeze(np.array(embeddings))
    return embeddings, labels, queries
def __init__(self):
    print('initializing interactive kernel...')
    self.last_g = None
    self.qu = QueryUtils()
class IKernel:
    i_url = 'http://localhost:11403/solr/interactive/select?wt=json&q=g:(%s) OR exact_g:(%s)^4'
    simple_context_i_url = 'http://localhost:11403/solr/interactive/select?wt=json&q=g:(%s)^10 OR exact_g:(%s)^20 OR last_g:(%s)^2 OR exact_last_g:(%s)^8'
    # null_answer = ['我没听懂您的意思', '我好像不明白...[晕][晕][晕]', '[晕][晕][晕]您能再说一遍吗?我刚刚没听清']
    null_answer = ['null']

    def __init__(self):
        print('initializing interactive kernel...')
        self.last_g = None
        self.qu = QueryUtils()

    def kernel(self, q):
        r = self._request_solr(q)
        answer = self._extract_answer(r)
        return answer

    def _extract_answer(self, r, random_range=1):
        try:
            num = self._num_answer(r)
            if num > 0:
                x = random.randint(0, min(random_range - 1, num - 1))
                response = self._get_response(r, x)
                return response
            else:
                return np.random.choice(self.null_answer, 1)[0]
        except:
            return np.random.choice(self.null_answer, 1)[0]

    def _request_solr(self, q):
        tokenized, exact_q = self.purify_q(q)
        if not self.last_g:
            url = self.i_url % (tokenized, exact_q)
        else:
            last_tkz, last_exact_q = self.purify_q(self.last_g)
            url = self.simple_context_i_url % (tokenized, exact_q,
                                               last_tkz, last_exact_q)
        self.last_g = q
        cn_util.print_cn('debug:interactive_url:' + url)
        r = requests.get(url)
        return r

    def clear_state(self):
        self.last_g = None

    def _num_answer(self, r):
        return int(r.json()["response"]["numFound"])

    def _get_response(self, r, i=0):
        try:
            a = r.json()["response"]["docs"][i]['b']
            x = random.randint(0, len(a) - 1)
            return a[x].encode('utf8')
        except:
            return None

    def purify_q(self, q):
        q = self.qu.remove_cn_punct(q)
        pos_q = self.qu.corenlp_cut(
            q, remove_tags=["CD", "PN", "VA", "AD", "VC", "SP"])
        return ''.join(pos_q), q
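# Usage sketch for IKernel (illustrative; assumes the Solr core `interactive`
# from i_url is running). The second call is scored with last_g context:
#
#   ik = IKernel()
#   print(ik.kernel(u'你好'))   # first turn: i_url, no context
#   print(ik.kernel(u'再见'))   # second turn: simple_context_i_url with last_g
#   ik.clear_state()            # forget the stored last_g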
def __init__(self):
    print('attaching sing kernel...')
    ## example: http://localhost:11403/solr/sc_qa/select?fq=entity:%E5%8E%95%E6%89%80&indent=on&q=*:*&wt=json
    self.qa_url = 'http://localhost:11403/solr/sc_music_kb/select?q.op=OR&wt=json&q=%s'
    self.qu = QueryUtils()
def __init__(self):
    print('attaching greeting kernel...')
    self.last_g = None
    self.qu = QueryUtils()
    self.greeting_url = 'http://localhost:11403/solr/sc_greeting/select?q.op=OR&wt=json&q=question:(%s)'
    self.exact_greeting_url = 'http://localhost:11403/solr/sc_greeting/select?wt=json&q=exact_question:(%s)'
def __init__(self):
    print('initializing qa kernel...')
    self.qu = QueryUtils()
def cut(self, input_):
    input_ = QueryUtils.static_remove_cn_punct(input_)
    tokens = jieba.cut(input_, cut_all=True)
    seg = " ".join(tokens)
    tokens = _uniout.unescape(str(seg), 'utf8')
    return tokens
def cut(input_):
    input_ = QueryUtils.static_remove_cn_punct(input_)
    tokens = list(jieba.cut(input_, cut_all=False))
    return tokens
def cut(self, input_):
    input_ = QueryUtils.static_simple_remove_punct(input_)
    seg = " ".join(jieba.cut(input_, cut_all=False))
    tokens = _uniout.unescape(str(seg), 'utf8')
    return tokens
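# The three cut() variants above differ mainly in cut_all and punctuation
# handling. A quick jieba comparison (exact segmentation depends on the
# dictionary version):
#
#   import jieba
#   list(jieba.cut(u'南京市长江大桥', cut_all=False))  # precise mode
#   list(jieba.cut(u'南京市长江大桥', cut_all=True))   # full mode: all fragments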