Example #1
import torch  # note: SOS_token, EOS_token and device are assumed to be module-level globals


def evaluate(encoder, decoder, source_vocab, target_vocab, sentence, max_length=20):
    with torch.no_grad():
        input_tensor = tensorFromSentence(cut(sentence), source_vocab)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS
        decoder_hidden = encoder_hidden
        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)

        for di in range(max_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            decoder_attentions[di] = decoder_attention.data
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                decoded_words.append('<eos>')
                break
            else:
                decoded_words.append(target_vocab.idx2word[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words, decoder_attentions[:di + 1]
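The tensorFromSentence helper called above is not shown on this page. Below is a minimal sketch consistent with the call site and with the PyTorch seq2seq tutorial this function follows; the word2idx attribute, the EOS_token value, and the device global are assumptions, not confirmed API.

import torch

EOS_token = 1                 # assumption: index of "<eos>" (matches the ["<sos>", "<eos>"] init in Example #2)
device = torch.device("cpu")  # assumption: module-level device, as in the tutorial

def tensorFromSentence(tokens, vocab):
    # Map tokens to vocabulary indices, append EOS, and return a
    # (seq_len, 1) long tensor, one index per encoder step.
    indexes = [vocab.word2idx[w] for w in tokens]  # assumption: word2idx mirrors idx2word
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)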
Example #2
import re

from chatbot.cparse.vocabulary import Vocabulary
from chatbot.preprocessing.text import cut


def load(path, MAX_LEN):
    s_vocab = Vocabulary(init_vocablary=["<sos>", "<eos>"])
    t_vocab = Vocabulary(init_vocablary=["<sos>", "<eos>"])
    with open(path, 'r') as f:
        data = f.readlines()
    size = int(len(data) / 3)
    source_seq = []
    target_seq = []
    for i in range(size):
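        # Each record spans three lines; lines 2 and 3 hold the source and
        # target strings, from which a two-character prefix and all inner
        # spaces are removed.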
        s_i = re.sub(" ", "", data[i * 3 + 1].strip("\n").lower()[2:])
        t_i = re.sub(" ", "", data[i * 3 + 2].strip("\n").lower()[2:])
        if len(s_i) == 0 or len(t_i) == 0 or len(t_i) > MAX_LEN or len(s_i) > MAX_LEN:
            continue
        source_seq.append(s_i)
        target_seq.append(t_i)
    source_seq = cut(source_seq, 4)
    target_seq = cut(target_seq, 4)
    for i in range(len(target_seq)):
        target_seq[i].append("<eos>")
    s_vocab.fit(source_seq)
    t_vocab.fit(target_seq)
    return source_seq, s_vocab, target_seq, t_vocab
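The Vocabulary class from chatbot.cparse.vocabulary is used throughout these examples but never shown. Here is a minimal sketch supporting only the calls visible on this page (fit, transform_sentence, idx2word, len); the real class certainly does more, and the padding/unknown-token handling below is an assumption:

class Vocabulary:
    def __init__(self, init_vocablary=()):  # keyword spelled as on this page
        self.idx2word = list(init_vocablary)
        self.word2idx = {w: i for i, w in enumerate(self.idx2word)}

    def fit(self, sequences):
        # Assign an index to every previously unseen token.
        for seq in sequences:
            for w in seq:
                if w not in self.word2idx:
                    self.word2idx[w] = len(self.idx2word)
                    self.idx2word.append(w)

    def transform_sentence(self, tokens, max_length=None):
        # Unknown tokens map to 0 (assumption); sequences are padded or
        # truncated to max_length when it is given.
        idx = [self.word2idx.get(w, 0) for w in tokens]
        if max_length is not None:
            idx = (idx + [0] * max_length)[:max_length]
        return idx

    def transform(self, sequences, max_length=None):
        return [self.transform_sentence(s, max_length) for s in sequences]

    def __len__(self):
        return len(self.idx2word)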
Example #3
    def _update_context(self, context_id, query):
        text = query["text"]
        text_cut = cut(text)
        logger.info("update context")
        self.contexts[context_id]["last_query_time"] = dt.datetime.now()
        self.contexts[context_id]["query_cut"] = text_cut
        self.contexts[context_id]["query_idx"] = self.vocab.transform_sentence(
            text_cut)
        self.contexts[context_id]["query"] = text
        self.contexts[context_id]["history_query"].append(text)
        self.contexts[context_id]["entities"] = self.ner.extract(
            self.contexts[context_id])
        intent, confidence = self._intent_recognition(context_id)
        logger.info("intent: {}, confidence: {}".format(intent, confidence))
        self.contexts[context_id]["intent"] = intent
        self._update_intent_slots(context_id, intent)
        self.contexts[context_id]["history_intent"].append(intent)
Example #4
import os
from pathlib import Path

from chatbot.preprocessing.text import cut


def main(input_path, output_path, n_job):
    """Tokenize a text file.

    The input must contain one sentence per line (split on \n), with no
    punctuation or special characters. Tokens in the output are joined
    with a single space.

    :param input_path: path to the raw input file
    :param output_path: path of the tokenized output file
    :param n_job: number of worker processes passed to cut
    :return: None
    """
    assert os.path.exists(input_path)
    out_path = Path(output_path).resolve().parent
    out_path.mkdir(parents=True, exist_ok=True)
    with open(input_path, 'r', encoding='utf8') as in_file, \
            open(output_path, 'w', encoding='utf8') as out_file:
        text = in_file.readlines()
        text_cut = cut(text, n_job=n_job)
        for i in text_cut:
            # i[:-1] drops the last token (presumably the tokenized trailing "\n")
            out_file.write(" ".join(i[:-1]) + "\n")
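The cut function from chatbot.preprocessing.text appears in every example here but is never defined. A minimal sketch of a compatible tokenizer, assuming a jieba backend; the real implementation, including the n_job multiprocessing variant, may differ:

import jieba

def cut(text, n_job=1):
    # Tokenize a single string into a list of tokens, or a list of
    # strings into a list of token lists. n_job is accepted only for
    # API compatibility; this sketch tokenizes serially.
    if isinstance(text, str):
        return list(jieba.cut(text))
    return [list(jieba.cut(t)) for t in text]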
Example #5
    import numpy as np

    from chatbot.utils.path import ROOT_PATH
    from chatbot.utils.data import read_fasttext_file
    from chatbot.cparse.vocabulary import Vocabulary
    from chatbot.cparse.label import IntentLabel
    from chatbot.preprocessing.text import cut

    # p = ROOT_PATH.parent / "corpus" / "intent" / "fastText"
    # x, y = read_fasttext_file(str(p / "amazon.txt"))
    # train_x, train_y = x[:7000], y[:7000]
    # test_x, test_y = x[7000:], y[7000:]
    import pandas as pd
    p = ROOT_PATH.parent / "corpus" / "intent"
    c1 = pd.read_excel(str(p / "intent_1.xlsx"))[["text", "intent"]]
    c2 = pd.read_excel(str(p / "intent_2.xlsx"))
    corpus_train = pd.concat([c1, c2]).reset_index(drop=True)
    corpus_test = pd.read_excel(str(p / "intent_valid.xlsx"))
    train_x = cut(corpus_train.text.tolist())
    train_y = corpus_train.intent.tolist()
    test_x = cut(corpus_test.text.tolist())
    test_y = corpus_test.intent.tolist()

    vocab = Vocabulary()
    vocab.fit(train_x)
    label = IntentLabel()
    label.fit(train_y)
    train_x = np.array(vocab.transform(train_x, max_length=10))
    test_x = np.array(vocab.transform(test_x, max_length=10))
    train_y = np.array(label.transform(train_y))
    test_y = np.array(label.transform(test_y))

    p = {
        "vocab_size": len(vocab),
Example #6
        l = "不限" if location is None else location
        s = "不限" if enddate is None else startdate
        e = "不限" if enddate is None else enddate
        return "您查询的地区“{}”在{}至{}相关文件如下: \n".format(l, s, e)

    @property
    def _not_find(self):
        return "抱歉,您所查找的政策文件不存在,小益已经上报,可能明天就有了哦~"


if __name__ == "__main__":
    from chatbot.utils.path import MODEL_PATH, ROOT_PATH
    skill = FileRetrievalExt(
        str(ROOT_PATH.parent / "corpus" / "test"),
        str(ROOT_PATH.parent / "corpus" / "policy_filev3.utf8.csv"))

    from chatbot.preprocessing.text import cut

    context = {
        "query_cut": " ".join(cut("月度竞价")),
        "slots": {
            "文件检索": skill.init_slots()
        },
        "intent": "文件检索"
    }

    context["slots"]["文件检索"]["TimeInterval"]["end"] = "2018-01-01"
    context["slots"]["文件检索"]["TimeInterval"]["start"] = "2018-07-02"
    context["slots"]["文件检索"]["Location"]["province"] = "四川"
    print(skill(context))
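The slot structure returned by skill.init_slots() is only visible through the keys assigned above. A hypothetical sketch consistent with those accesses; the real method may define more slots and fields:

def init_slots():
    # Inferred from the assignments in the example above; any further
    # slots or fields are unknown.
    return {
        "TimeInterval": {"start": None, "end": None},
        "Location": {"province": None},
    }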
Example #7
def test(s):
    s = vocab.transform_sentence(cut(s), max_length=10)
    return label.reverse_one(model.infer(s)[0])
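A usage sketch, assuming vocab, label and model are the fitted objects from the surrounding training script (as in Example #9):

print(test("我想买手机"))  # prints the predicted intent label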
Example #8
    import numpy as np

    from chatbot.utils.path import ROOT_PATH, MODEL_PATH
    from chatbot.utils.data import read_fasttext_file
    from chatbot.cparse.vocabulary import Vocabulary
    from chatbot.cparse.label import IntentLabel

    # p = ROOT_PATH.parent / "corpus" / "intent" / "fastText"
    # train_x, train_y = read_fasttext_file(str(p / "demo.train.txt"))
    # test_x, test_y = read_fasttext_file(str(p / "demo.train.txt"))
    # x, y = read_fasttext_file(str(p / "amazon.txt"))
    # train_x, train_y = x[:7000], y[:7000]
    # test_x, test_y = x[7000:], y[7000:]
    import pandas as pd
    from chatbot.preprocessing.text import cut
    corpus = pd.read_excel(ROOT_PATH.parent / "corpus" / "intent" /
                           "intent_corpus.xlsx")
    x = cut(corpus.text.tolist())
    y = corpus.intent.tolist()
    vocab = Vocabulary()
    vocab.fit(x)
    label = IntentLabel()
    label.init_from_config("intent.v0.2.cfg")
    # label.fit(y)
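    # NOTE: train and test are built from the same corpus x/y below, so the
    # test metrics measure fit rather than generalization.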
    train_x = np.array(vocab.transform(x, max_length=10))
    test_x = np.array(vocab.transform(x, max_length=10))
    train_y = np.array(label.transform(y))
    test_y = np.array(label.transform(y))

    fasttext_param = {
        "vocab_size": len(vocab),
        "embed_dim": 60,
        "class_num": len(label),
Example #9
        "lr": 0.01,
        "hidden_dim": 10,
        # "dropout": 0.5,
    }
    model = AttRCNN(param)
    model.fit(train_x, train_y, test_x, test_y, 2, 32, save_best=False)
    model.param["lr"] = 0.003
    model.fit(train_x, train_y, test_x, test_y, 4, 64, save_best=False)
    # model.save("test")
    # x = FastText.load(str(MODEL_PATH / "intent" / "test.FastText"))
    s = [
        "你真是可爱阿", "你很喜欢学习哦", "我再也不想理你了", "吃饭没", "明天会下雨马", "你哥哥是谁", "你有哥哥么",
        "弟弟是谁", "我想买手机", "我是你主人", "我可以给你打分吗,评价"
    ]
    from chatbot.preprocessing.text import cut

    for i in s:
        print(
            i,
            label.reverse_one(
                model.infer(
                    np.array(vocab.transform_one(cut(i), max_length=10)))[0]))
    from chatbot.evaluate.plot import plot_attention_1d
    idx = 1200
    att = model.get_attention(
        torch.tensor(
            np.array(vocab.transform_one(train_x[idx],
                                         max_length=10)).reshape(-1, 10)))
    print(label.reverse_one(model.infer(train_x[idx])[0]))
    plot_attention_1d([vocab.reverse_one(train_x[idx]).split(" ")],
                      att.detach().numpy())
    @staticmethod
    def _response_head(location=None, startdate=None, enddate=None):
        l = "不限" if location is None else location
        s = "不限" if enddate is None else startdate
        e = "不限" if enddate is None else enddate
        return "您查询的地区“{}”在{}至{}相关文件如下: \n".format(l, s, e)

    @property
    def _not_find(self):
        return "抱歉,您所查找的政策文件不存在,小益已经上报,可能明天就有了哦~"


if __name__ == "__main__":
    from chatbot.utils.path import MODEL_PATH
    skill = FileRetrieval(
        str(MODEL_PATH / "v0.2" / "file_retrieval" / "tfidf"),
        str(MODEL_PATH / "v0.2" / "file_retrieval" / "cluster_index"),
        str(MODEL_PATH / "v0.2" / "file_retrieval" / "policy_file.utf8.csv"))

    from chatbot.preprocessing.text import cut

    context = {
        "text_cut": " ".join(cut("售电公司")),
        "slots": {
            "FileRetrieval": skill.init_slots
        }
    }
    context["slots"]["FileRetrieval"]["TimeInterval"]["end"] = "2018-06-02"
    context["slots"]["FileRetrieval"]["TimeInterval"]["start"] = "2014-01-02"
    print(skill(context))