Example No. 1
def load_data():
    # special token ids: 0 = 'sos' (start of sequence), 1 = 'eos' (end of sequence)
    word2id = {'sos': 0, 'eos': 1}
    x, y = LoadCorpus.load_chatbot100_train()
    max_len = 20
    for i, j in zip(x, y):
        for _i in i.split():
            if _i not in word2id:
                word2id[_i] = len(word2id)
        for _j in j.split():
            if _j not in word2id:
                word2id[_j] = len(word2id)
    id2word = dict((v, k) for k, v in word2id.items())
    vocab_size = len(word2id)

    train_x, train_y = [], []
    for i, j in zip(x, y):
        t_x = [word2id[_i] for _i in i.split()]
        t_y = [word2id[_j] for _j in j.split()]
        # encoder input: wrap with sos/eos, then truncate or pad with eos up to max_len
        if len(t_x) > max_len:
            t_x = [0] + t_x[:max_len - 2] + [1]
        else:
            t_x = [0] + t_x + [1] * (max_len - len(t_x) - 1)

        # decoder target: truncate or right-pad with 0 up to max_len
        if len(t_y) > max_len:
            t_y = t_y[:max_len]
        else:
            t_y = t_y + [0] * (max_len - len(t_y))
        train_x.append(t_x)
        train_y.append(t_y)
    train_x = numpy2tensor(np.array(train_x))
    train_y = numpy2tensor(np.array(train_y))
    return train_x, train_y, vocab_size
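A minimal usage sketch, not from the original listing: assuming `numpy2tensor` returns PyTorch tensors, the returned pair can be batched with a `TensorDataset`/`DataLoader`; the batch size here is arbitrary.

from torch.utils.data import TensorDataset, DataLoader

train_x, train_y, vocab_size = load_data()
loader = DataLoader(TensorDataset(train_x, train_y), batch_size=32, shuffle=True)
for batch_x, batch_y in loader:
    # batch_x: (batch, 20) encoder token ids, batch_y: (batch, 20) decoder target ids
    break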
Example No. 2
    def create_matrix(self, vocab):
        # wv_model = LoadCorpus.load_wv_model()
        wv_model = LoadCorpus.load_wv60_model()
        # one extra row (vocab_size + 1), typically leaving index 0 free for padding/UNK
        embedding_matrix = np.zeros(shape=(self.nn_param.vocab_size + 1,
                                           self.nn_param.embedding_dim))
        print(embedding_matrix.shape)
        for word, index in vocab.items():
            if word in wv_model:
                embedding_matrix[index] = wv_model[word]
        print("Embedding matrix shape", embedding_matrix.shape)
        return embedding_matrix
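A hedged sketch of how a matrix like this is commonly wired into a Keras `Embedding` layer; the layer arguments below are illustrative and not taken from the original class.

from keras.layers import Embedding

embedding_layer = Embedding(input_dim=embedding_matrix.shape[0],
                            output_dim=embedding_matrix.shape[1],
                            weights=[embedding_matrix],  # initialize with the pre-trained vectors
                            trainable=False)             # keep the word vectors frozen while training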
Example No. 3
def test():
    x, y = LoadCorpus.load_chatbot100_train()
    # x, y = LoadCorpus.load_xiaohuangji_train()
    wv = LoadCorpus.load_wv60_model()

    def trans_seq(d):
        vector = [[wv[c] for c in s.split(' ') if c in wv.wv.vocab] for s in d]
        t = pad_sequences(vector,
                          maxlen=max_words,
                          padding='post',
                          value=1.,
                          dtype='float32')
        return t

    def generate_decoder_input(decoder_output):
        # teacher-forcing input: prepend an all-zero "start" vector and drop the
        # last timestep, so the decoder sees the target shifted right by one step
        word_dim = len(decoder_output[0][0])
        word_start = np.zeros(shape=(word_dim, ))
        decoder_input = []
        for example in decoder_output:
            t = list(example[:14])
            t.insert(0, word_start)
            decoder_input.append(np.array(t))
        decoder_input = pad_sequences(decoder_input,
                                      maxlen=15,
                                      dtype='float32')
        return decoder_input

    train_x = trans_seq(x)
    train_y = trans_seq(y)
    y_input = generate_decoder_input(train_y)
    # model = SimpleSeq2Seq(input_dim=5, hidden_dim=10, output_length=8, output_dim=8, depth=3)
    model = Seq2Seq(batch_input_shape=(None, 15, 60),
                    hidden_dim=256,
                    output_length=15,
                    output_dim=60,
                    depth=3)
    model.compile(loss='mse', optimizer='rmsprop', metrics=['accuracy'])
    model.summary()

    model.fit(train_x, y_input, epochs=epochs, batch_size=8)
Example No. 4
    def pre_data(self):
        words = dict()
        nb_sentence, total = 0, 0

        x, y = LoadCorpus.load_news_train(c=15)
        sentences = [i.split() for i in x]

        # sentences = LoadCorpus.load_paper_to_word2vec()

        for sentence in sentences:
            nb_sentence += 1
            for w in sentence:
                if w not in words:
                    words[w] = 0
                words[w] += 1
                total += 1
        print('Total word count', total)
        print('Number of sentences', nb_sentence)

        # drop words whose frequency is below min_count
        self.words = dict((i, j) for i, j in words.items() if j >= min_count)
        # vocabulary mapping; id 0 is reserved for UNK
        self.id2word = dict((i + 1, k) for i, k in enumerate(self.words))
        self.word2id = dict((v, k) for k, v in self.id2word.items())
        self.nb_word = len(self.words) + 1

        print('Vocabulary size', self.nb_word)

        subsamples = dict((i, j / total) for i, j in self.words.items()
                          if j / total > subsample_t)  # word frequency ratio
        # this subsampling formula follows the original word2vec source code
        subsamples = dict((i, subsample_t / j + (subsample_t / j)**0.5)
                          for i, j in subsamples.items())
        # subsampling table: approximate keep-probability per word id
        subsamples = dict(
            (self.word2id[i], j) for i, j in subsamples.items() if j < 1.0)

        for sentence in sentences:
            # build the id sequence for the current sentence, padded with `window` zeros on each side
            sentence = [0] * window + [
                self.word2id[w] for w in sentence if w in self.word2id
            ] + [0] * window
            r = np.random.random(len(sentence))
            for i in range(window, len(sentence) - window):
                # randomly skip frequent center words according to the subsampling table
                if sentence[i] in subsamples and r[i] > subsamples[sentence[i]]:
                    continue
                self.train_x.append(sentence[i - window:i] +
                                    sentence[i + 1:i + 1 + window])
                self.train_y.append([sentence[i]])
        x, y = np.array(self.train_x), np.array(self.train_y)
        print('Training samples x shape', x.shape)
        print('Training samples y shape', y.shape)
        return x, y
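A toy illustration, not from the original code, of the context/center pairs built above, assuming window = 2; the word ids are made up.

# the sentence ids are padded with `window` zeros on each side
sentence = [0, 0, 5, 9, 7, 3, 0, 0]
i = 3                                                  # center position, word id 9
context = sentence[i - 2:i] + sentence[i + 1:i + 3]    # -> [0, 5, 7, 3]
center = [sentence[i]]                                 # -> [9]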
Example No. 5
def train():
    x, y = LoadCorpus.load_chatbot100_train()
    # x, y = LoadCorpus.load_xiaohuangji_train()
    wv = LoadCorpus.load_wv60_model()

    def trans_seq(d):
        vector = [[wv[c] for c in s.split(' ') if c in wv.wv.vocab] for s in d]
        t = pad_sequences(vector,
                          maxlen=max_words,
                          padding='post',
                          value=1.,
                          dtype='float32')
        return t

    def generate_decoder_input(decoder_output):
        # same teacher-forcing shift as in the previous example: prepend a zero
        # "start" vector and keep only the first 14 target timesteps
        word_dim = len(decoder_output[0][0])
        word_start = np.zeros(shape=(word_dim, ))
        decoder_input = []
        for example in decoder_output:
            t = list(example[:14])
            t.insert(0, word_start)
            decoder_input.append(np.array(t))
        decoder_input = pad_sequences(decoder_input,
                                      maxlen=15,
                                      dtype='float32')
        return decoder_input

    train_x = trans_seq(x)
    train_y = trans_seq(y)
    y_input = generate_decoder_input(train_y)

    model = build_nn()
    model.compile(optimizer='adam', loss='mse', metrics=['accuracy'])
    model.fit([train_x, y_input],
              train_y,
              epochs=epochs,
              batch_size=batch_size)
    model.save(seq2seq_model_path)
Example No. 6
def load_cha_cha(maxlen):
    c = 0
    x, y = [], []
    for _x, _y in zip(*LoadCorpus.load_xiaohuangji_train()):
        if len(''.join(_x.split())) > 50 or len(''.join(_y.split())) > 50:
            continue
        x.append(''.join(_x.split()))
        # '\t' and '\n' act as the start and end markers of the target sequence
        y.append('\t' + ''.join(_y.split()) + '\n')
        c += 1
        if c == maxlen:
            break
    print(x)
    return x, y
Example No. 7
def predict():
    model = models.load_model(seq2seq_model_path)
    encoder_lstm = model.get_layer('encoder_lstm')
    decoder_lstm = model.get_layer('decoder_lstm')
    densor = model.get_layer('densor')
    decoder_model = predict_model(encoder_lstm, decoder_lstm, densor, word_dim,
                                  ty)
    wv = LoadCorpus.load_wv60_model()
    while True:
        sentence = input('Q: ')
        sentence_vec = input_sentence_vector(sentence, wv)
        print(sentence_vec.shape)
        x = np.zeros(shape=(1, 1, word_dim))
        answer_sequence = decoder_model.predict([sentence_vec, x])
        print('A:', vector_sentence(answer_sequence, word2vec_model=wv))
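`input_sentence_vector` and `vector_sentence` are defined elsewhere in the repository; the sketches below are hypothetical reconstructions inferred from how they are called here (word2vec lookup on the way in, nearest-vector lookup on the way out), so the names, segmentation and padding length are assumptions.

import jieba
from keras.preprocessing.sequence import pad_sequences

def input_sentence_vector(sentence, wv, max_len=15):
    # hypothetical: segment the question, look up word vectors, pad to the encoder length
    vectors = [wv[w] for w in jieba.cut(sentence) if w in wv.wv.vocab]
    return pad_sequences([vectors], maxlen=max_len, padding='post', dtype='float32')

def vector_sentence(answer_sequence, word2vec_model):
    # hypothetical: map each predicted vector back to its nearest word in the embedding space
    words = [word2vec_model.wv.similar_by_vector(v, topn=1)[0][0]
             for v in answer_sequence[0]]
    return ''.join(words)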
Example No. 8
def load_cha_cha(maxlen):
    c = 0
    x, y = [], []
    for _x, _y in zip(*LoadCorpus.load_xiaohuangji_train()):
        if len(''.join(_x)) > input_max_len or len(
                ''.join(_y)) > input_max_len:
            continue
        # x.append(['START'] + jieba.lcut(''.join(_x.split())) + ['END'])
        # y.append(['START'] + jieba.lcut(''.join(_y.split())) + ['END'])
        # `start` and `end` are presumably boundary marker strings defined elsewhere in the module
        x.append(start + ''.join(jieba.cut(''.join(_x.split()))) + end)
        y.append(start + ''.join(jieba.cut(''.join(_y.split()))) + end)
        c += 1
        if c == maxlen:
            break
    return x, y
Example No. 9
def data_generator():
    # reuse cached id sequences when available; otherwise build them from the corpus and cache them
    if os.path.exists(seq2seq_data_path):
        with open(seq2seq_data_path, mode='rb') as f:
            x = pickle.load(f)
            y = pickle.load(f)
    else:
        x, y = LoadCorpus.load_xiaohuangji_train()
        x = [i.replace(' ', '') for i in x]
        y = [i.replace(' ', '') for i in y]
        x = np.array(padding([str2id(i) for i in x]))
        y = np.array(padding([str2id(i, start_end=True) for i in y]))
        with open(seq2seq_data_path, mode='wb') as f:
            pickle.dump(x, f)
            pickle.dump(y, f)
    # x, y = x[:1000], y[:1000]
    return [x, y], None
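`str2id` and `padding` are helpers from elsewhere in the module (a truncated `str2id` appears at the end of this listing); the `padding` sketch below is a hypothetical reconstruction that right-pads with the mask id 0 used throughout.

def padding(seqs, pad_id=0):
    # hypothetical helper: right-pad every id sequence to the length of the longest one
    longest = max(len(s) for s in seqs)
    return [s + [pad_id] * (longest - len(s)) for s in seqs]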
Example No. 10
def pre_data():
    word_count = {}
    # sentences = [''.join(sentence) for sentence in LoadCorpus.load_paper_to_word2vec()]
    sentences, _ = LoadCorpus.load_news_train(c=1)
    sentences = [sentence.replace(' ', '') for sentence in sentences]

    for sentence in sentences:
        for char in sentence:
            if char not in word_count:
                word_count[char] = 0
            word_count[char] += 1

    # initial probabilities: log of each character's overall frequency
    word_total = sum(word_count.values())
    for word, value in word_count.items():
        start_dict[word] = math.log(value / word_total)

    # map each character to its pinyin, then split every sentence into character bigrams
    han_pinyin_dict = dict((han, py) for han, py in zip(
        word_count.keys(), lazy_pinyin(list(word_count.keys()))))
    sentences = [[sentence[i:i + 2] for i in range(len(sentence) - 1)]
                 for sentence in sentences]

    # transition table: raw bigram counts, first character -> {next character: count}
    for sentence in sentences:
        for tup in sentence:
            if tup[0] not in trans_dict:
                trans_dict[tup[0]] = {}
            if tup[1] not in trans_dict[tup[0]]:
                trans_dict[tup[0]][tup[1]] = 0
            trans_dict[tup[0]][tup[1]] += 1

    # invert the character-to-pinyin mapping: pinyin -> list of candidate characters
    for han, pinyin in han_pinyin_dict.items():
        if pinyin not in pinyin_han_dict:
            pinyin_han_dict[pinyin] = []
        pinyin_han_dict[pinyin].append(han)

    # emission probabilities: log P(character | pinyin), normalized over each pinyin's candidates
    for key, value in pinyin_han_dict.items():
        total = sum(word_count[v] for v in value)
        for v in value:
            if v not in emit_dict:
                emit_dict[v] = dict()
            emit_dict[v][key] = math.log(word_count[v] / total)
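A minimal Viterbi-style decoder sketch, not the repository's own code, showing how `start_dict`, `trans_dict`, `emit_dict` and `pinyin_han_dict` could be combined to turn a pinyin sequence into characters; the add-one smoothing on the raw bigram counts is an assumption.

import math

def viterbi(pinyins):
    # best (score, decoded text) for every possible last character
    paths = dict((h, (start_dict[h] + emit_dict[h][pinyins[0]], h))
                 for h in pinyin_han_dict.get(pinyins[0], []))
    for py in pinyins[1:]:
        new_paths = {}
        for h in pinyin_han_dict.get(py, []):
            # pick the best predecessor; raw bigram counts with add-one smoothing
            new_paths[h] = max(
                (score + math.log(trans_dict.get(prev, {}).get(h, 0) + 1) +
                 emit_dict[h][py], text + h)
                for prev, (score, text) in paths.items())
        paths = new_paths
    return max(paths.values())[1]

# e.g. viterbi(['jin', 'tian']) returns the highest-scoring two-character sequence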
Example No. 11
import os
import json

from tqdm import tqdm

from corpus.load_corpus import LoadCorpus
from corpus import seq2seq_config_path, seq2seq_data_path, corpus_root_path

min_count = 32
maxlen = 50
batch_size = 32
char_size = 128
z_dim = 128
epochs = 100

if os.path.exists(seq2seq_config_path):
    chars, id2char, char2id = json.load(open(seq2seq_config_path, mode='r'))
    id2char = dict((int(id), char) for id, char in id2char.items())
else:
    chars = {}
    x, y = LoadCorpus.load_xiaohuangji_train()
    for i, j in tqdm(zip(x, y)):
        for ic in i:
            chars[ic] = chars.get(ic, 0) + 1
        for jc in j:
            chars[jc] = chars.get(jc, 0) + 1
    chars = dict((i, j) for i, j in chars.items() if j >= min_count)
    # 0 mask  1 unk  2 start  3 end
    id2char = dict((i + 4, j) for i, j in enumerate(chars))
    char2id = dict((j, i) for i, j in id2char.items())
    json.dump([chars, id2char, char2id], open(seq2seq_config_path, mode='w'))


def str2id(s, start_end=False):
    if start_end:
        ids = [2] + [char2id.get(i, 1) for i in s[:maxlen - 2]] + [3]