# -*- coding: utf-8 -*-
import csv
import traceback

import numpy as np
import requests
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

# Project-local helpers; the exact module paths here are assumptions.
import cn_util
from cn_util import print_cn
from query_util import QueryUtils
from solr_util import SolrUtils


def _build_feature_extraction(self, data_path):
    print('Build feature extraction...')
    corpus = list()
    with open(data_path, 'r') as f:
        reader = csv.reader(f, delimiter='#')
        for line in reader:
            b = line[1].decode('utf-8')
            # b = QueryUtils.static_remove_stop_words(b)
            tokens = QueryUtils.static_jieba_cut(b)
            # Vectorizers expect one string per document, so join the tokens.
            corpus.append(' '.join(tokens))
    if self.mode == 'ngram':
        print_cn('Use {0}'.format(self.mode))
        # Note: sklearn only applies stop_words with analyzer='word';
        # they have no effect with analyzer='char'.
        bigram_vectorizer = CountVectorizer(
            ngram_range=(1, 2), min_df=0.0, max_df=1.0, analyzer='char',
            stop_words=[',', '?', u'我', u'我要'], binary=True)
        self.feature_extractor = bigram_vectorizer.fit(corpus)
    if self.mode == 'tfidf':
        print_cn('Use {0}'.format(self.mode))
        tfidf_vectorizer = TfidfVectorizer(
            analyzer='char', ngram_range=(1, 2), max_df=1.0, min_df=1,
            sublinear_tf=True)
        self.feature_extractor = tfidf_vectorizer.fit(corpus)
    if self.mode == 'fasttext':
        # fastText vectors come straight from _fasttext_vector();
        # no vectorizer needs to be fitted here.
        pass
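# A minimal, self-contained sketch of the 'ngram' branch above: fit a
# char-level (1,2)-gram CountVectorizer on whitespace-joined tokens. The
# sample strings are illustrative only, not taken from the training data.
def _demo_char_ngram_fit():
    from sklearn.feature_extraction.text import CountVectorizer
    corpus = [u'我 要 订 餐', u'我 想 退 款']  # pre-tokenized, space-joined docs
    vectorizer = CountVectorizer(ngram_range=(1, 2), analyzer='char', binary=True)
    extractor = vectorizer.fit(corpus)
    # Each row is a binary presence vector over char uni-/bi-grams.
    print(extractor.transform([u'我 要 退 款']).toarray())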
def _request_solr(self, q):
    # Cut q into tokens and OR them together into a Solr field query.
    tokens = ['question:' + s for s in
              QueryUtils.static_jieba_cut(q, smart=False, remove_single=True)]
    q = ' OR '.join(tokens)
    url = self.qa_url.format(q)
    # print('qa_debug:', url)
    cn_util.print_cn(url)
    r = requests.get(url)
    return r
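# Illustration of the query string _request_solr() assembles. The URL
# template is a hypothetical stand-in; the real self.qa_url is configured
# elsewhere in the project.
def _demo_qa_query():
    tokens = [u'天气', u'如何']  # stand-ins for jieba output
    q = ' OR '.join('question:' + s for s in tokens)
    qa_url = 'http://localhost:11403/solr/qa/select?q={0}&wt=json'
    print(qa_url.format(q))  # ...select?q=question:天气 OR question:如何&wt=json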
def _build(self, data_path):
    self._build_feature_extraction(data_path)
    mlb = MultiLabelBinarizer()
    embeddings = list()
    labels = list()
    with open(data_path, 'r') as f:
        reader = csv.reader(f, delimiter='#')
        for line in reader:
            # csv.reader yields byte strings under Python 2; decode both fields.
            key = line[0].decode('utf-8')
            input_ = line[1].decode('utf-8')
            intention_list = key.split(',')
            tokens = QueryUtils.static_jieba_cut(input_)
            if self.mode == 'fasttext':
                vector = self._fasttext_vector(tokens)
                if vector is None:
                    continue
                embeddings.append(vector)
            else:
                # Join tokens so the fitted vectorizer sees one document.
                embeddings.append(' '.join(tokens))
            labels.append(intention_list)
    if self.mode != 'fasttext':
        embeddings = self.feature_extractor.transform(embeddings).toarray()
    self.mlb = mlb.fit(labels)
    labels_ = self.mlb.transform(labels)
    return embeddings, labels_
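# A sketch of how _build()'s outputs are typically consumed (an assumption,
# not this project's actual training code): a one-vs-rest wrapper fits on
# the embedding matrix and the binarized multi-label matrix.
def _demo_train(embeddings, labels_):
    from sklearn.linear_model import LogisticRegression
    from sklearn.multiclass import OneVsRestClassifier
    clf = OneVsRestClassifier(LogisticRegression())
    clf.fit(embeddings, labels_)  # labels_ is the MultiLabelBinarizer output
    return clf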
def select_max_match_with_sim(self, q, r):
    if not self.bt:
        return None
    matched_questions = SolrUtils.get_dynamic_response(
        r=r, key='question', random_hit=False, random_field=True,
        keep_array=False, facet=True)
    q_tokens = ' '.join(QueryUtils.static_jieba_cut(q))
    matched_questions_tokens = [
        ' '.join(QueryUtils.static_jieba_cut(mqt)) for mqt in matched_questions]
    max_sim = self.bt.getMaxSim(q_tokens, matched_questions_tokens)
    best_sentence = ''.join(max_sim['sentence'].split(' '))
    sim = max_sim['sim']
    cn_util.print_cn(best_sentence, str(sim),
                     '[' + ','.join(matched_questions) + ']')
    # Only accept the best match when it clears a fixed similarity threshold.
    if sim > 0.3:
        index = matched_questions.index(best_sentence)
        answer = SolrUtils.get_dynamic_response(
            r, key='answer', force_hit=index, random_field=True,
            random_hit=False)
        return answer
    return None
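# self.bt is defined outside this file; this rough Jaccard-overlap stand-in
# only mirrors the return contract of bt.getMaxSim() (a dict with the best
# 'sentence' and its 'sim' score) and is not the project's implementation.
def _demo_max_sim(q_tokens, candidates_tokens):
    q_set = set(q_tokens.split(' '))
    best = {'sentence': '', 'sim': 0.0}
    for cand in candidates_tokens:
        c_set = set(cand.split(' '))
        sim = len(q_set & c_set) / float(len(q_set | c_set) or 1)
        if sim > best['sim']:
            best = {'sentence': cand, 'sim': sim}
    return best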
def _prepare_data(self, files):
    print('prepare data...')
    embeddings = list()
    queries = list()
    queries_ = dict()
    labels = list()
    mlb = MultiLabelBinarizer()
    # Pass 1: collect the questions that belong to each named label.
    for index in xrange(len(files)):
        path = files[index]
        label = self.named_labels[index]
        queries_[label] = list()
        with open(path, 'r') as f:
            for line in f:
                question = line.replace('\t', '').replace(' ', '').strip('\n').decode('utf-8')
                # Keep the unicode object as-is: wrapping it in str() would
                # implicitly ascii-encode and crash on Chinese text.
                question = QueryUtils.static_remove_cn_punct(question)
                tokens = QueryUtils.static_jieba_cut(question)
                if len(tokens) == 0:
                    continue
                queries_[label].append(question)
    # Pass 2: merge the labels of duplicated questions and embed each unique
    # question exactly once.
    for label, questions in queries_.iteritems():
        for question in questions:
            if question in queries:
                index = queries.index(question)
                if label not in labels[index]:
                    labels[index].append(label)
            else:
                queries.append(question)
                labels.append([label])
                tokens = self.cut(question).split(' ')
                embedding = self.get_w2v_emb(tokens)
                embeddings.append(embedding)
    embeddings = np.squeeze(np.array(embeddings))
    self.mlb = mlb.fit(labels)
    labels = self.mlb.transform(labels)
    return embeddings, labels, queries
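# get_w2v_emb() is defined elsewhere; a common implementation (an assumption,
# not taken from this file) averages the per-token word2vec vectors and
# returns a single (1, dim) row per question.
def _demo_avg_w2v(tokens, w2v, dim=300):
    import numpy as np
    vectors = [w2v[t] for t in tokens if t in w2v]
    if not vectors:
        return np.zeros((1, dim))
    return np.mean(vectors, axis=0).reshape(1, -1)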
def _request_solr(self, q, key, base_url):
    # Cut q into tokens and group them into a field query: key:(t1 t2 ...).
    key = '%s:' % key
    tokens = QueryUtils.static_jieba_cut(q, smart=False, remove_single=True)
    if len(tokens) == 0:
        return None
    q = key + '(' + '%20'.join(tokens) + ')'
    url = base_url % q
    cn_util.print_cn(url)
    r = requests.get(url)
    return r
def _request_solr(self, q, key, base_url):
    try:
        # Cut q into tokens and build a filter query: fq=key:t1%20t2...
        key = 'fq=%s:' % key
        tokens = QueryUtils.static_jieba_cut(q, smart=False, remove_single=True)
        q = key + '%20'.join(tokens)
        url = base_url.format(q)
        cn_util.print_cn(url)
        r = requests.get(url)
        return r
    except Exception:
        traceback.print_exc()
        return None
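# Both _request_solr() variants above encode spaces by hand with '%20'. A
# safer alternative (an assumption, not project code) is to let urllib quote
# the whole query value:
def _demo_encoded_solr_query(base_url, key, tokens):
    import urllib
    q = '%s:(%s)' % (key, ' '.join(tokens))
    return base_url % urllib.quote(q)  # urllib.parse.quote on Python 3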
def predict(self, input_):
    # input_ = QueryUtils.static_remove_stop_words(input_)
    tokens = QueryUtils.static_jieba_cut(input_)
    try:
        if self.mode == 'fasttext':
            embedding = self._fasttext_vector(tokens)
            embeddings = np.array([embedding])
        else:
            # Vectorize the whole query as one document so the feature space
            # matches what _build() produced at training time.
            embeddings = self.feature_extractor.transform([' '.join(tokens)]).toarray()
    except Exception:
        traceback.print_exc()
        exit(-1)
    prediction = self.clf.predict(embeddings)
    prediction_index_first_sample = np.where(prediction[0] == 1)
    labels = self.mlb.inverse_transform(prediction)
    # print_cn(labels)
    probs = self.clf.predict_proba(embeddings)
    # Note that in the prediction stage, n_samples == 1.
    return labels[0], probs[0][prediction_index_first_sample]
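# Example call (hypothetical input and labels; the actual label set depends
# on the training data):
#   labels, probs = classifier.predict(u'我要订餐')
#   print_cn(labels)  # e.g. ('order',)
#   print(probs)      # per-label confidence for the predicted labels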