Example #1
import os

from gensim import corpora


# CUR_PATH, load_data_text and tokenizer are helpers defined elsewhere in this project.
def generate_dic_and_corpus(filepath, stop_words):  # stop_words is a list of words, not a file path

    raw_questions, raw_answers, raw_questions_passages = load_data_text(
        filepath)
    q_length = len(raw_questions)
    knowledge_texts = []  # knowledge = question + passages
    questions_str = tokenizer(raw_questions, stop_words)
    #answers_str = tokenizer(raw_answers, stop_words)

    for idx in range(len(raw_questions_passages)):
        # Put the words of every passage of this question together,
        # then append the question's own tokens.
        temp = []
        q_kb = tokenizer(raw_questions_passages[idx], stop_words,
                         remove_stopwords=True, remove_single_word=False)
        for kp in q_kb:
            temp += kp
        temp += questions_str[idx]
        knowledge_texts.append(temp)

    dictionary = corpora.Dictionary(
        knowledge_texts)  # dictionary over the knowledge texts (questions + passages)
    dict_path = os.path.join(CUR_PATH, 'tmp', 'dictionary.dict')
    dictionary.save(dict_path)

    corpus = [dictionary.doc2bow(text)
              for text in knowledge_texts]  # corpus of knowledge
    corpora.MmCorpus.serialize(os.path.join(CUR_PATH, 'tmp', 'knowledge_corpus.mm'),
                               corpus)
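
A hedged usage sketch for this variant: the stop words are loaded the same way the module test at the bottom of this page does it, while the training-data path and the CUR_PATH/tmp layout are illustrative assumptions (load_data_text is not shown here):

import codecs

# Illustrative invocation; the real paths depend on the project layout.
stop_words = [w.strip() for w in codecs.open("data/stop_words.txt", 'r',
                                             encoding='utf8').readlines()]
generate_dic_and_corpus("data/train.txt", stop_words)
# Afterwards CUR_PATH/tmp/dictionary.dict and CUR_PATH/tmp/knowledge_corpus.mm
# hold the dictionary and the bag-of-words corpus.
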
Example #2
from gensim import corpora


def generate_dic_and_corpus(knowledge_file, file_name, stop_words):
    knowledge_texts = tokenizer(knowledge_file, stop_words)
    train_texts = tokenizer(file_name, stop_words)

    dictionary = corpora.Dictionary(
        knowledge_texts +
        train_texts)  # dictionary of knowledge and train data
    dictionary.save('tmp/dictionary.dict')

    corpus = [dictionary.doc2bow(text)
              for text in knowledge_texts]  # corpus of knowledge
    corpora.MmCorpus.serialize('tmp/knowledge_corpus.mm', corpus)
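
Every variant on this page leans on a tokenizer(file_name, stop_words) helper that is not part of the listing. A hypothetical sketch of such a helper for Chinese text, assuming jieba for word segmentation; the real project's version may differ (the Example #1 variant also accepts remove_stopwords and remove_single_word flags, omitted here):

import codecs

import jieba  # assumed segmenter; the original helper is not shown


def tokenizer(file_name, stop_words):
    """Return one list of tokens per line of file_name, with stop words removed."""
    texts = []
    with codecs.open(file_name, 'r', encoding='utf8') as f:
        for line in f:
            words = [w for w in jieba.cut(line.strip())
                     if w.strip() and w not in stop_words]
            texts.append(words)
    return texts
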
Example #3
import os

from gensim import corpora


def generate_dic_and_corpus(knowledge_file, file_name, stop_words):
    knowledge_texts = tokenizer(knowledge_file, stop_words)
    train_texts = tokenizer(file_name, stop_words)

    # Save the dictionary.
    if not os.path.exists('./tmp'):
        os.makedirs('./tmp')
    dictionary = corpora.Dictionary(knowledge_texts + train_texts)
    dictionary.save(os.path.join('./tmp/dictionary.dict'))

    corpus = [dictionary.doc2bow(text)
              for text in knowledge_texts]  # corpus of knowledge
    corpora.MmCorpus.serialize('./tmp/knowledge_corpus.mm',
                               corpus)  # save the BoW corpus in Matrix Market format
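
corpora.MmCorpus.serialize writes the bag-of-words corpus to disk in Matrix Market format, so later runs can stream it back instead of rebuilding it. A minimal load-back check, using the paths from the function above:

from gensim import corpora

dictionary = corpora.Dictionary.load('./tmp/dictionary.dict')
corpus = corpora.MmCorpus('./tmp/knowledge_corpus.mm')  # streamed from disk on demand

print(len(dictionary))     # vocabulary size
print(corpus)              # summary: number of documents, features, non-zero entries
print(next(iter(corpus)))  # first document as a list of (token_id, count) pairs
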
Example #4
import sys
import numpy as np
import tensorflow as tf

import get_config
import data_util
import gru

gConfig = get_config.get_config()
train_data = gConfig['train_data']
test_data = gConfig['test_data']
epochs = gConfig['epochs']
batch_size = gConfig['batch_size']

x_array, y_array = data_util.create_data(train_data)
a_array, b_array = data_util.create_data(test_data)
x_array, lang_tokenizer = data_util.tokenizer(x_array, 'UNK', 0)
y_array = data_util.padding_target(y_array, gConfig['max_inp'])
y_array = np.expand_dims(y_array, 2)
print(x_array.shape)
print(y_array.shape)


def train():
    print('Training data in %s' % gConfig['train_data'])
    checkpoint_dir = gConfig['model_data']
    steps_per_epoch = len(x_array) // gConfig['batch_size']
    ckpt = tf.io.gfile.exists(checkpoint_dir)
    if ckpt:
        gru.checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))
    BUFFER_SIZE = len(x_array)
    dataset = tf.data.Dataset.from_tensor_slices(
        (x_array, y_array))  # pair padded inputs with their targets
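
The snippet stops right after the dataset is created. A hedged sketch of how a train() like this typically continues, under two assumptions the listing does not confirm: gru.checkpoint is a tf.train.Checkpoint (its restore() call above suggests so), and the gru module exposes a per-batch training helper, called gru.train_step below:

    # Continuation sketch (see the assumptions above).
    dataset = dataset.shuffle(BUFFER_SIZE).batch(batch_size, drop_remainder=True)
    checkpoint_prefix = checkpoint_dir + '/ckpt'

    for epoch in range(epochs):
        total_loss = 0.0
        for inp, targ in dataset.take(steps_per_epoch):
            total_loss += gru.train_step(inp, targ)  # assumed to run one step and return its loss
        gru.checkpoint.save(file_prefix=checkpoint_prefix)
        print('Epoch %d, mean loss %.4f' % (epoch + 1, total_loss / steps_per_epoch))
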
Example #5
                                         key=lambda item: -item[1])[:k]
                ]  # topk index
                sim_ixs.append(sim_ix)
                tmp.clear()
    with open(sim_path, "wb") as f:
        pickle.dump(sim_ixs, f)
    return sim_ixs
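
Only the tail of topk_sim_ix survives in this listing. A hedged reconstruction sketch of what such a top-k retrieval routine looks like when assembled from the artifacts above: the gensim TF-IDF and similarity calls are standard, but the sim_path default, the reuse of the module's tokenizer helper, and the omission of the original's tmp-grouping logic are assumptions:

import pickle

from gensim import corpora, models, similarities


def topk_sim_ix(file_name, stop_words, k, sim_path="tmp/sim_ixs.pkl"):
    """For each tokenized line of file_name, collect the indices of the k
    knowledge documents most similar to it (reconstruction sketch)."""
    dictionary = corpora.Dictionary.load("tmp/dictionary.dict")
    corpus = corpora.MmCorpus("tmp/knowledge_corpus.mm")

    # TF-IDF weighting over the knowledge corpus plus an in-memory index.
    tfidf = models.TfidfModel(corpus)
    index = similarities.SparseMatrixSimilarity(tfidf[corpus],
                                                num_features=len(dictionary))

    sim_ixs = []
    for tokens in tokenizer(file_name, stop_words):
        sims = index[tfidf[dictionary.doc2bow(tokens)]]
        sim_ix = [i for i, _ in sorted(enumerate(sims),
                                       key=lambda item: -item[1])[:k]]  # topk index
        sim_ixs.append(sim_ix)

    with open(sim_path, "wb") as f:
        pickle.dump(sim_ixs, f)
    return sim_ixs
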


# module test
if __name__ == '__main__':
    stop_words_ = codecs.open("data/stop_words.txt", 'r',
                              encoding='utf8').readlines()
    stop_words_ = [w.strip() for w in stop_words_]
    generate_dic_and_corpus("data/knowledge.txt", "data/train.txt",
                            stop_words_)
    res = topk_sim_ix("data/train.txt", stop_words_, 5)
    print(len(res))

    knowledge_file = "data/knowledge.txt"
    file_name = "data/train.txt",
    knowledge_texts = tokenizer(knowledge_file, stop_words_)
    knowledge_texts[0]
    #['地球', '宇宙', '中', '一颗', '行星', '运动', '规律']
    dictionary.doc2bow(knowledge_texts[0])
    #"""Convert `document` into the bag-of-words (BoW) format = list of `(token_id, token_count)` tuples.
    #[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)]
    dictionary.doc2bow(knowledge_texts[10])
    #['蟹', '状', '星云', '金牛座', '一团', '膨胀', '气体']
    #[(27, 1), (51, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1)]