def evaluate(encoder, decoder, source_vocab, target_vocab, sentence, max_length=20):
    """Greedy-decode `sentence` with an attentional seq2seq model.

    Returns the decoded tokens and the attention weights for each step.
    """
    with torch.no_grad():
        input_tensor = tensorFromSentence(cut(sentence), source_vocab)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        # Run the encoder token by token, keeping every output for attention.
        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS
        decoder_hidden = encoder_hidden
        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)

        # Greedy decoding: feed the top-1 prediction back in until <eos>.
        for di in range(max_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            decoder_attentions[di] = decoder_attention.data
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                decoded_words.append('<eos>')
                break
            else:
                decoded_words.append(target_vocab.idx2word[topi.item()])
            decoder_input = topi.squeeze().detach()

        return decoded_words, decoder_attentions[:di + 1]
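# Minimal usage sketch for `evaluate` (all names here are assumptions:
# a trained encoder/decoder pair plus the vocabularies built by `load` below):
#
#   words, attentions = evaluate(encoder, decoder, s_vocab, t_vocab, "今天天气怎么样")
#   print("".join(w for w in words if w != "<eos>"))
#   # `attentions` is a (steps, max_length) tensor, suitable for a heat map.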
def load(path, MAX_LEN):
    """Load a dialogue corpus laid out as 3 lines per sample (header line,
    source utterance, target utterance), skip empty or over-length pairs,
    and build the source/target vocabularies."""
    # NB: `init_vocablary` (sic) is the keyword the Vocabulary class exposes.
    s_vocab = Vocabulary(init_vocablary=["<sos>", "<eos>"])
    t_vocab = Vocabulary(init_vocablary=["<sos>", "<eos>"])
    with open(path, 'r') as f:
        data = f.readlines()
    size = int(len(data) / 3)
    source_seq = []
    target_seq = []
    for i in range(size):
        # Strip spaces and the two-character line prefix.
        s_i = re.sub(" ", "", data[i * 3 + 1].strip("\n").lower()[2:])
        t_i = re.sub(" ", "", data[i * 3 + 2].strip("\n").lower()[2:])
        if len(s_i) == 0 or len(t_i) == 0 or len(t_i) > MAX_LEN or len(s_i) > MAX_LEN:
            continue
        source_seq.append(s_i)
        target_seq.append(t_i)
    source_seq = cut(source_seq, 4)  # tokenize, presumably with 4 workers
    target_seq = cut(target_seq, 4)
    for i in range(len(target_seq)):
        target_seq[i].append("<eos>")
    s_vocab.fit(source_seq)
    t_vocab.fit(target_seq)
    return source_seq, s_vocab, target_seq, t_vocab
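# Usage sketch for `load` (the corpus path is hypothetical; the file must
# follow the 3-lines-per-sample layout parsed above):
#
#   source_seq, s_vocab, target_seq, t_vocab = load("corpus/dialogue.txt", MAX_LEN=20)
#   print(len(source_seq), len(s_vocab), len(t_vocab))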
def _update_context(self, context_id, query):
    text = query["text"]
    text_cut = cut(text)
    logger.info("update context")
    self.contexts[context_id]["last_query_time"] = dt.datetime.now()
    self.contexts[context_id]["query_cut"] = text_cut
    self.contexts[context_id]["query_idx"] = self.vocab.transform_sentence(
        text_cut)
    self.contexts[context_id]["query"] = text
    self.contexts[context_id]["history_query"].append(text)
    self.contexts[context_id]["entities"] = self.ner.extract(
        self.contexts[context_id])
    intent, confidence = self._intent_recognition(context_id)
    logger.info("intent: {}, confidence: {}".format(intent, confidence))
    self.contexts[context_id]["intent"] = intent
    self._update_intent_slots(context_id, intent)
    self.contexts[context_id]["history_intent"].append(intent)
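# For reference, the shape of the per-context state that `_update_context`
# maintains, reconstructed from the keys used above (the value types are
# assumptions, not a documented schema):
#
#   self.contexts[context_id] = {
#       "last_query_time": dt.datetime.now(),  # refreshed on every query
#       "query": "...",                # raw text of the current query
#       "query_cut": ["...", "..."],   # tokens from `cut`
#       "query_idx": [1, 2],           # ids from vocab.transform_sentence
#       "history_query": [],           # every raw query in this session
#       "entities": {},                # NER output for the current query
#       "intent": None,                # latest recognized intent
#       "history_intent": [],          # every recognized intent so far
#   }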
def main(input_path, output_path, n_job):
    """Tokenize a text file.

    The input must contain one sentence per line (separated by "\n") with no
    punctuation or special characters; tokens in the output are joined with
    a single space.

    :param input_path: path to the raw text file
    :param output_path: path for the tokenized output
    :param n_job: number of workers passed to `cut`
    :return:
    """
    assert os.path.exists(input_path)
    out_path = Path(output_path).resolve().parent
    out_path.mkdir(exist_ok=True)
    with open(input_path, 'r', encoding='utf8') as in_file, \
            open(output_path, 'w', encoding='utf8') as out_file:
        text = in_file.readlines()
        text_cut = cut(text, n_job=n_job)
        for i in text_cut:
            # i[:-1] drops the trailing "\n" token kept by readlines().
            out_file.write(" ".join(i[:-1]) + "\n")
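# Example invocation (paths and worker count are hypothetical):
if __name__ == "__main__":
    main("corpus/raw_sentences.txt", "corpus/raw_sentences.cut.txt", n_job=4)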
from chatbot.utils.data import read_fasttext_file
from chatbot.cparse.vocabulary import Vocabulary
from chatbot.cparse.label import IntentLabel
from chatbot.preprocessing.text import cut

# p = ROOT_PATH.parent / "corpus" / "intent" / "fastText"
# x, y = read_fasttext_file(str(p / "amazon.txt"))
# train_x, train_y = x[:7000], y[:7000]
# test_x, test_y = x[7000:], y[7000:]

import numpy as np  # assumed import: np is used below but missing here
import pandas as pd
from chatbot.utils.path import ROOT_PATH  # assumed import, as in the sibling script

p = ROOT_PATH.parent / "corpus" / "intent"
c1 = pd.read_excel(str(p / "intent_1.xlsx"))[["text", "intent"]]
c2 = pd.read_excel(str(p / "intent_2.xlsx"))
corpus_train = pd.concat([c1, c2]).reset_index(drop=True)
corpus_test = pd.read_excel(str(p / "intent_valid.xlsx"))

train_x = cut(corpus_train.text.tolist())
train_y = corpus_train.intent.tolist()
test_x = cut(corpus_test.text.tolist())
test_y = corpus_test.intent.tolist()

vocab = Vocabulary()
vocab.fit(train_x)
label = IntentLabel()
label.fit(train_y)

train_x = np.array(vocab.transform(train_x, max_length=10))
test_x = np.array(vocab.transform(test_x, max_length=10))
train_y = np.array(label.transform(train_y))
test_y = np.array(label.transform(test_y))

p = {
    "vocab_size": len(vocab),
l = "不限" if location is None else location s = "不限" if enddate is None else startdate e = "不限" if enddate is None else enddate return "您查询的地区“{}”在{}至{}相关文件如下: \n".format(l, s, e) @property def _not_find(self): return "抱歉,您所查找的政策文件不存在,小益已经上报,可能明天就有了哦~" if __name__ == "__main__": from chatbot.utils.path import MODEL_PATH, ROOT_PATH skill = FileRetrievalExt( str(ROOT_PATH.parent / "corpus" / "test"), str(ROOT_PATH.parent / "corpus" / "policy_filev3.utf8.csv")) from chatbot.preprocessing.text import cut context = { "query_cut": " ".join(cut("月度竞价")), "slots": { "文件检索": skill.init_slots() }, "intent": "文件检索" } context["slots"]["文件检索"]["TimeInterval"]["end"] = "2018-01-01" context["slots"]["文件检索"]["TimeInterval"]["start"] = "2018-07-02" context["slots"]["文件检索"]["Location"]["province"] = "四川" print(skill(context))
def test(s):
    s = vocab.transform_sentence(cut(s), max_length=10)
    return label.reverse_one(model.infer(s)[0])
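# Smoke test for `test` (assumes the `vocab`, `label`, and `model` objects
# from the training script above are in scope):
for q in ["我想买手机", "明天会下雨吗"]:
    print(q, "->", test(q))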
from chatbot.utils.path import ROOT_PATH, MODEL_PATH
from chatbot.utils.data import read_fasttext_file
from chatbot.cparse.vocabulary import Vocabulary
from chatbot.cparse.label import IntentLabel

# p = ROOT_PATH.parent / "corpus" / "intent" / "fastText"
# train_x, train_y = read_fasttext_file(str(p / "demo.train.txt"))
# test_x, test_y = read_fasttext_file(str(p / "demo.train.txt"))
# x, y = read_fasttext_file(str(p / "amazon.txt"))
# train_x, train_y = x[:7000], y[:7000]
# test_x, test_y = x[7000:], y[7000:]

import numpy as np  # assumed import: np is used below but missing here
import pandas as pd
from chatbot.preprocessing.text import cut

corpus = pd.read_excel(ROOT_PATH.parent / "corpus" / "intent" / "intent_corpus.xlsx")
x = cut(corpus.text.tolist())
y = corpus.intent.tolist()

vocab = Vocabulary()
vocab.fit(x)
label = IntentLabel()
label.init_from_config("intent.v0.2.cfg")
# label.fit(y)

# Note: train and test are built from the same data, so the "test" score
# below measures training fit, not generalization.
train_x = np.array(vocab.transform(x, max_length=10))
test_x = np.array(vocab.transform(x, max_length=10))
train_y = np.array(label.transform(y))
test_y = np.array(label.transform(y))

fasttext_param = {
    "vocab_size": len(vocab),
    "embed_dim": 60,
    "class_num": len(label),
"lr": 0.01, "hidden_dim": 10, # "dropout": 0.5, } model = AttRCNN(param) model.fit(train_x, train_y, test_x, test_y, 2, 32, save_best=False) model.param["lr"] = 0.003 model.fit(train_x, train_y, test_x, test_y, 4, 64, save_best=False) # model.save("test") # x = FastText.load(str(MODEL_PATH / "intent" / "test.FastText")) s = [ "你真是可爱阿", "你很喜欢学习哦", "我再也不想理你了", "吃饭没", "明天会下雨马", "你哥哥是谁", "你有哥哥么", "弟弟是谁", "我想买手机", "我是你主人", "我可以给你打分吗,评价" ] from chatbot.preprocessing.text import cut for i in s: print( i, label.reverse_one( model.infer( np.array(vocab.transform_one(cut(i), max_length=10)))[0])) from chatbot.evaluate.plot import plot_attention_1d idx = 1200 att = model.get_attention( torch.tensor( np.array(vocab.transform_one(train_x[idx], max_length=10)).reshape(-1, 10))) print(label.reverse_one(model.infer(train_x[idx])[0])) plot_attention_1d([vocab.reverse_one(train_x[idx]).split(" ")], att.detach().numpy())
    @staticmethod
    def _response_head(location=None, startdate=None, enddate=None):
        l = "不限" if location is None else location
        s = "不限" if startdate is None else startdate  # bug fix: tested `enddate` before
        e = "不限" if enddate is None else enddate
        return "您查询的地区“{}”在{}至{}相关文件如下: \n".format(l, s, e)

    @property
    def _not_find(self):
        return "抱歉,您所查找的政策文件不存在,小益已经上报,可能明天就有了哦~"


if __name__ == "__main__":
    from chatbot.utils.path import MODEL_PATH

    skill = FileRetrieval(
        str(MODEL_PATH / "v0.2" / "file_retrieval" / "tfidf"),
        str(MODEL_PATH / "v0.2" / "file_retrieval" / "cluster_index"),
        str(MODEL_PATH / "v0.2" / "file_retrieval" / "policy_file.utf8.csv"))

    from chatbot.preprocessing.text import cut
    context = {
        "text_cut": " ".join(cut("售电公司")),
        # init_slots is called here (as in the FileRetrievalExt demo above);
        # a bare method reference would break the subscripting below.
        "slots": {"FileRetrieval": skill.init_slots()},
    }
    context["slots"]["FileRetrieval"]["TimeInterval"]["end"] = "2018-06-02"
    context["slots"]["FileRetrieval"]["TimeInterval"]["start"] = "2014-01-02"
    print(skill(context))