Example #1
def prepare(args):
    """
    checks data, creates the directories, prepares the vocabulary and embeddings
    """
    logger = logging.getLogger("brc")
    logger.info('Checking the data files...')
    for data_path in args.train_files + args.dev_files + args.test_files:
        assert os.path.exists(data_path), '{} file does not exist.'.format(
            data_path)
    logger.info('Preparing the directories...')
    for dir_path in [
            args.vocab_dir, args.model_dir, args.result_dir, args.summary_dir
    ]:
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

    logger.info('Building vocabulary...')
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.max_word_len, args.train_files, args.dev_files,
                          args.test_files)
    vocab = Vocab(lower=True)
    for word in brc_data.word_iter('train'):
        vocab.add(word)
    for word in brc_data.word_iter('test'):
        vocab.add(word)
    for word in brc_data.word_iter('dev'):
        vocab.add(word)

    unfiltered_vocab_size = vocab.size()
    vocab.filter_tokens_by_cnt(min_cnt=5)
    filtered_num = unfiltered_vocab_size - vocab.size()
    logger.info('After filter {} tokens, the final vocab size is {}'.format(
        filtered_num, vocab.size()))

    logger.info('Assigning embeddings...')
    vocab.randomly_init_embeddings(args.embed_size)
    #vocab.load_pretrained_embeddings(args.embedding_path)

    if args.use_char_level:
        for char in brc_data.char_iter('train'):
            vocab.add(char)
        for char in brc_data.char_iter('test'):
            vocab.add(char)
        for char in brc_data.char_iter('dev'):
            vocab.add(char)
        unfiltered_char_vocab_size = vocab.char_size()
        vocab.filter_chars_by_cnt(min_cnt=5)
        filtered_num = unfiltered_char_vocab_size - vocab.char_size()
        logger.info('After filter {} chars, the final vocab size is {}'.format(
            filtered_num, vocab.char_size()))
        logger.info('Assigning char embeddings...')
        vocab.randomly_init_char_embeddings(args.char_embed_size)
        #vocab.load_pretrained_char_embeddings(args.char_embedding_path)

    logger.info('Saving vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'wb') as fout:
        pickle.dump(vocab, fout)
    logger.info('Done with preparing!')
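The pickled vocab.data written above is consumed by the later train/evaluate stages. A minimal sketch of how it might be loaded back and used to map tokens to ids; the get_id / pad_token API is assumed from other examples in this listing, and the usage lines are hypothetical:

import os
import pickle

def load_vocab(vocab_dir):
    # Load the Vocab object that prepare() pickled into vocab_dir.
    with open(os.path.join(vocab_dir, 'vocab.data'), 'rb') as fin:
        return pickle.load(fin)

# Hypothetical usage:
# vocab = load_vocab(args.vocab_dir)
# pad_id = vocab.get_id(vocab.pad_token)
# question_ids = [vocab.get_id(token) for token in segmented_question]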
Example #2
def calculate_unk(train_files, target_files):
    """Counts how many distinct dev-set word types are covered by the filtered train vocabulary."""
    brc_data = BRCDataset(5, 500, 60, train_files, target_files)
    vocab = Vocab(lower=True)
    for word in brc_data.word_iter('train'):
        vocab.add(word)
    vocab.filter_tokens_by_cnt(min_cnt=2)
    overlap_num = 0
    dev_vocab = set()
    for word in brc_data.word_iter('dev'):
        dev_vocab.add(word)
    for word in dev_vocab:
        if word in vocab.token2id:
            overlap_num += 1
    print("over lap word is {} in {}".format(overlap_num, len(dev_vocab)))
Example #3
def prepare(args):
    """
    checks data, creates the directories, prepares the vocabulary and embeddings
    """
    logger = logging.getLogger("brc")
    logger.info('Checking the data files...')
    for data_path in args.train_files + args.dev_files + args.test_files:
        assert os.path.exists(data_path), '{} file does not exist.'.format(
            data_path)
    logger.info('Preparing the directories...')
    for dir_path in [
            args.vocab_dir, args.model_dir, args.result_dir, args.summary_dir
    ]:
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

    logger.info('Building vocabulary...')
    # if args.train_as:
    #     brc_data = BRCsysDataset(args, args.max_p_num, args.max_p_len, args.max_q_len, args.max_a_len,
    #                       args.train_files, args.dev_files, args.test_files, prepare=True)
    # else:
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.max_a_len, args.train_files, args.dev_files,
                          args.test_files)
    vocab = Vocab(lower=True)
    vocab1 = Vocab(lower=True)
    for word in brc_data.word_iter('train'):
        vocab.add(word)
    for word in brc_data.word_iter('train'):
        vocab1.add(word)

    unfiltered_vocab_size = vocab.size()
    vocab.filter_tokens_by_cnt(min_cnt=2)
    vocab1.filter_tokens_by_cnt(min_cnt=28)
    filtered_num = unfiltered_vocab_size - vocab.size()
    #logger.info('After filter {} tokens, the final vocab size is {}'.format(filtered_num,vocab1.size()))

    logger.info('Assigning embeddings...')
    # vocab.load_pretrained_embeddings('../data/jwe_word2vec_size300.txt')
    # vocab1.load_pretrained_embeddings('../data/jwe_word2vec_size300.txt')
    vocab.randomly_init_embeddings(args.embed_size)

    logger.info('Saving vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'wb') as fout:
        pickle.dump(vocab, fout)

    with open(os.path.join(args.vocab_dir, 'tar_vocab.data'), 'wb') as fout1:
        pickle.dump(vocab1, fout1)

    logger.info('Done with preparing!')
Example #4
def prepare(args):
    """
    checks data, creates the directories, prepares the vocabulary and embeddings
    """
    logger = logging.getLogger("brc")
    logger.info('Checking the data files...')
    for data_path in args.train_files + args.dev_files + args.test_files:
        assert os.path.exists(data_path), '{} file does not exist.'.format(data_path)
    logger.info('Preparing the directories...')
    for dir_path in [args.vocab_dir, args.model_dir, args.result_dir, args.summary_dir]:
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

    logger.info('Building vocabulary...')
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.train_files, args.dev_files, args.test_files)
    vocab = Vocab(lower=True)
    for word in brc_data.word_iter('train'):
        vocab.add(word)

    unfiltered_vocab_size = vocab.size()
    vocab.filter_tokens_by_cnt(min_cnt=2)
    filtered_num = unfiltered_vocab_size - vocab.size()
    logger.info('After filter {} tokens, the final vocab size is {}'.format(filtered_num,
                                                                            vocab.size()))

    logger.info('Assigning embeddings...')
    vocab.randomly_init_embeddings(args.embed_size)

    logger.info('Saving vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'wb') as fout:
        pickle.dump(vocab, fout)

    logger.info('Done with preparing!')
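The args namespace these prepare() variants read is normally built with argparse in the project's run.py. A sketch of the flags implied by the attribute accesses above; the flag names come from the snippets, while the default values and paths are illustrative assumptions only:

import argparse

def parse_prepare_args():
    parser = argparse.ArgumentParser('Prepare vocab and embeddings for the BRC models')
    parser.add_argument('--train_files', nargs='+', default=['data/trainset/search.train.json'],
                        help='list of preprocessed train files')
    parser.add_argument('--dev_files', nargs='+', default=['data/devset/search.dev.json'],
                        help='list of preprocessed dev files')
    parser.add_argument('--test_files', nargs='+', default=['data/testset/search.test.json'],
                        help='list of preprocessed test files')
    parser.add_argument('--vocab_dir', default='data/vocab/', help='directory for the saved vocab')
    parser.add_argument('--model_dir', default='data/models/', help='directory for model checkpoints')
    parser.add_argument('--result_dir', default='data/results/', help='directory for prediction results')
    parser.add_argument('--summary_dir', default='data/summary/', help='directory for training summaries')
    parser.add_argument('--max_p_num', type=int, default=5, help='max passages per question')
    parser.add_argument('--max_p_len', type=int, default=500, help='max passage length in tokens')
    parser.add_argument('--max_q_len', type=int, default=60, help='max question length in tokens')
    parser.add_argument('--embed_size', type=int, default=300, help='word embedding dimension')
    return parser.parse_args()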
Example #5
def prepare(args):
    """
    checks data, creates the directories, prepares the vocabulary and embeddings
    """
    logger = logging.getLogger("brc")
    logger.info('Checking the data files...')
    for data_path in args.train_files + args.dev_files + args.test_files:
        assert os.path.exists(data_path), '{} file does not exist.'.format(
            data_path)

    logger.info('Preparing the directories...')

    logger.info('Building vocabulary...')
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.train_files, args.dev_files, args.test_files)
    vocab = Vocab(lower=True)
    for word in brc_data.word_iter('train'):  # the vocabulary is built from the training set only
        vocab.add(word)

    unfiltered_vocab_size = vocab.size()
    vocab.filter_tokens_by_cnt(min_cnt=2)
    filtered_num = unfiltered_vocab_size - vocab.size()
    logger.info('After filter {} tokens, the final vocab size is {}'.format(
        filtered_num, vocab.size()))

    logger.info('Assigning embeddings...')
    # vocab.randomly_init_embeddings(args.embed_size)#TODO-load_pretrained_embeddings
    vocab.load_pretrained_embeddings(args.embedding_path)  #glove pre-trained

    logger.info('Saving vocab...')
    # with open(os.path.join(args.vocab_dir, 'vocab.data'), 'wb') as fout:
    with open(args.vocab_path, 'wb') as fout:  # no distinction between search & zhidao
        pickle.dump(vocab, fout)

    logger.info('Done with preparing!')
Example #6
def prepare(logger, args):
    """
    checks data, creates the directories, prepares the vocabulary and embeddings
    """
    logger.info('Checking the data files...')
    for data_path in args.trainset + args.devset + args.testset:
        assert os.path.exists(data_path), '{} file does not exist.'.format(
            data_path)
    logger.info('Preparing the directories...')
    for dir_path in [args.vocab_dir, args.save_dir, args.result_dir]:
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

    logger.info('Building vocabulary...')
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.trainset, args.devset, args.testset)
    vocab = Vocab(lower=True)
    for word in brc_data.word_iter('train'):
        vocab.add(word)

    unfiltered_vocab_size = vocab.size()
    vocab.filter_tokens_by_cnt(min_cnt=2)
    filtered_num = unfiltered_vocab_size - vocab.size()
    logger.info('After filter {} tokens, the final vocab size is {}'.format(
        filtered_num, vocab.size()))

    logger.info('Assigning embeddings...')
    vocab.randomly_init_embeddings(args.embed_size)

    logger.info('Saving vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'wb') as fout:
        pickle.dump(vocab, fout)

    logger.info('Done with preparing!')
Example #7
File: run.py, Project: Yaozeng/MRC
def prepro(args):
    logger = logging.getLogger("QAPointNet")
    logger.info("====== preprocessing ======")
    logger.info('Checking the data files...')
    for data_path in args.train_files + args.dev_files + args.test_files:
        assert os.path.exists(data_path), '{} file does not exist.'.format(data_path)

    logger.info('Preparing the directories...')
    for dir_path in [args.vocab_dir, args.model_dir, args.result_dir, args.summary_dir]:
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

    logger.info('Building vocabulary...')
    dataloader = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len, args.save_dir,
                            args.train_files, args.dev_files, args.test_files, prepare=True)

    vocab = Vocab(lower=True)
    for word in dataloader.word_iter('train'):
        vocab.add_word(word)
    del dataloader
    unfiltered_vocab_size = vocab.word_size()
    vocab.filter_words_by_cnt(min_cnt=2)
    filtered_num = unfiltered_vocab_size - vocab.word_size()
    logger.info('After filter {} tokens, the final vocab size is {}'.format(filtered_num,vocab.word_size()))

    logger.info('Assigning embeddings...')
    if args.pretrained_word_path is not None:
        vocab.load_pretrained_word_embeddings(args.pretrained_word_path)
    else:
        vocab.randomly_init_word_embeddings(args.embed_size)
    logger.info('Saving vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'wb') as fout:
        pickle.dump(vocab, fout)

    logger.info('====== Done with preparing! ======')
Example #8
def prepare(args):
    """
    checks data, creates the directories, prepares the vocabulary and embeddings
    """
    logger = logging.getLogger("Cail")
    logger.info('Checking the data files...')
    print("Checking the data files...")
    for data_path in args.train_files + args.dev_files + args.test_files:
        assert os.path.exists(data_path), '{} file does not exist.'.format(data_path)

    logger.info('Preparing the directories...')

    logger.info('Building vocabulary...')
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.train_files, args.dev_files, args.test_files)  # instantiate the dataset class

    vocab = Vocab(lower=True)
    for word in brc_data.word_iter('train'):  # the vocabulary is built from the training set only
        vocab.add(word)
    logger.info("Tokens num {}".format(vocab.size()))
    unfiltered_vocab_size = vocab.size()
    vocab.filter_tokens_by_cnt(min_cnt=5)  # filter low-frequency tokens
    filtered_num = unfiltered_vocab_size - vocab.size()
    logger.info('After filter {} tokens, the final vocab size is {}'.format(filtered_num,
                                                                            vocab.size()))

    logger.info('Assigning embeddings...')
    vocab.randomly_init_embeddings(args.embed_size)  # step 1: randomly initialized embeddings
    # vocab.load_pretrained_embeddings(args.embedding_path)  # step 2: glove pre-trained, extract the vocab
    # logger.info("Vocab size is {} from embedding".format(vocab.size()))
    logger.info('Saving vocab...')
    with open(args.vocab_path, 'wb') as fout:  # save the vocab
        pickle.dump(vocab, fout)

    logger.info('Done with preparing!')
Example #9
def prepare(args):
    """splits the data, builds the word/POS/NER vocabularies, and loads pretrained embeddings"""
    logger = logging.getLogger("rc")
    logger.info('train test split...')
    train_test_split(args.all_file, args.train_file, args.test_file,
                     args.train_rate)
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.train_file, args.dev_file, args.test_file)
    data_vocabs = DataVocabs()
    for word, pos, ner in brc_data.word_iter('train'):
        data_vocabs.word_vocab.add(word)
        data_vocabs.pos_vocab.add(pos)
        data_vocabs.ner_vocab.add(ner)
    unfiltered_vocab_size = data_vocabs.word_vocab.size()
    data_vocabs.word_vocab.filter_tokens_by_cnt(min_cnt=2)
    filtered_num = unfiltered_vocab_size - data_vocabs.word_vocab.size()
    logger.info('After filter {} tokens, the final vocab size is {}'.format(
        filtered_num, data_vocabs.word_vocab.size()))
    logger.info('Assigning embeddings...')
    # vocab.randomly_init_embeddings(args.embed_size)
    data_vocabs.word_vocab.load_pretrained_embeddings(args.embedding_path)
    logger.info('embedding size: {}, {}'.format(
        len(data_vocabs.word_vocab.embeddings),
        len(data_vocabs.word_vocab.embeddings[0])))
    logger.info('Saving vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'wb') as fout:
        pickle.dump(data_vocabs, fout)
    logger.info('Done with preparing!')
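DataVocabs itself is not shown in this listing; a minimal container consistent with how it is used above (three independent instances of the same Vocab class used throughout these examples, for words, POS tags and NER tags) could look like the following sketch. Whether the tag vocabularies should lowercase their entries is an assumption:

class DataVocabs(object):
    # Bundles the separate vocabularies built in Example #9 (assumed structure).
    def __init__(self):
        self.word_vocab = Vocab(lower=True)
        self.pos_vocab = Vocab(lower=False)  # tag sets are usually kept case-sensitive
        self.ner_vocab = Vocab(lower=False)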
Example #10
def prepare(args):
    """
    checks data, creates the directories, prepares the vocabulary and embeddings
    """
    for data_path in args.train_files + args.dev_files + args.test_files:
        assert os.path.exists(data_path), '{} file does not exist.'.format(
            data_path)

    for dir_path in [args.vocab_dir, args.model_dir]:
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.train_files)
    vocab = Vocab(lower=True)
    for word in brc_data.word_iter('train'):
        vocab.add(word)

    # unfiltered_vocab_size = vocab.size()
    print("vocab size is ", vocab.size())
    vocab.filter_tokens_by_cnt(min_cnt=2)
    print("after filtered vocab size is ", vocab.size())
    # filtered_num = unfiltered_vocab_size - vocab.size()

    vocab.randomly_init_embeddings(args.embed_size)
    if args.use_pre_train:
        vocab.load_pretrained_embeddings(args.pre_train_file)

    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'wb') as fout:
        pickle.dump(vocab, fout)
Example #11
def prepare(args):
    """
    checks data, creates the directories, prepares the vocabulary and embeddings
    """
    logger = logging.getLogger("brc")
    logger.info('Checking the data files...')
    for data_path in args.train_files + args.dev_files + args.test_files:
        assert os.path.exists(data_path), '{} file does not exist.'.format(data_path)
    logger.info('Preparing the directories...')
    for dir_path in [args.vocab_dir, args.model_dir, args.result_dir, args.summary_dir]:
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

    logger.info('创建词汇表...')
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.train_files, args.dev_files, args.test_files)
    vocab = Vocab(lower=True)
    for word in brc_data.word_iter('train'):
        vocab.add(word)

    unfiltered_vocab_size = vocab.size()
    vocab.filter_tokens_by_cnt(min_cnt=2)
    filtered_num = unfiltered_vocab_size - vocab.size()
    logger.info('Filtered out {} tokens, the final vocab size is {}'.format(filtered_num,
                                                                            vocab.size()))

    logger.info('Assigning embeddings...')
    vocab.randomly_init_embeddings(args.embed_size)

    logger.info('Saving vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'wb') as fout:
        pickle.dump(vocab, fout)

    logger.info('Done with preparing!')
Example #12
def prepare(args):
    """
    checks data, creates the directories, prepares the vocabulary and embeddings
    """
    logger = logging.getLogger("brc")
    logger.info('Checking the data files...')
    for data_path in args.train_files + args.dev_files + args.test_files:
        assert os.path.exists(data_path), '{} file does not exist.'.format(data_path)
    logger.info('Preparing the directories...')
    for dir_path in [args.vocab_dir, args.model_dir, args.result_dir, args.summary_dir]:
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

    logger.info('Building vocabulary...')
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.train_files, args.dev_files, args.test_files)
    vocab = Vocab(lower=True)
    if args.use_char_embed:
        char_vocab = Vocab(lower=True)
    fout = open(os.path.join(args.vocab_dir, 'word.txt'), 'w')
    for word in brc_data.word_iter('train'):
        if word == 'mosi':
            print('xxxxxxxxxx:%s\n' % word)
        vocab.add(word)
        if args.use_char_embed:
            for char in list(word):
                char_vocab.add(char)
        fout.write(word + ', ' + ' '.join(list(word)) + '\n')
    fout.close()
    idx = vocab.get_id('mosi')
    print('yyyyyyyy:mosi_idx:%d\n' % idx)


    unfiltered_vocab_size = vocab.size()
    vocab.filter_tokens_by_cnt(min_cnt=args.min_cnt)
    logger.info('min_cnt = %d ' % args.min_cnt)
    filtered_num = unfiltered_vocab_size - vocab.size()
    logger.info('After filter {} tokens, the final vocab size is {}'.format(filtered_num,
                                                                            vocab.size()))
    
    #logger.info('The final char vocab size is {}'.format(char_vocab.size()))

    logger.info('Assigning embeddings...')
    vocab.randomly_init_embeddings(args.embed_size)
    if args.use_char_embed:
        char_vocab.randomly_init_embeddings(args.char_embed_size)

    logger.info('Saving vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'wb') as fout:
        pickle.dump(vocab, fout)
    
    if args.use_char_embed:
        logger.info('Saving char vocab...')
        with open(os.path.join(args.vocab_dir, 'char_vocab.data'), 'wb') as fout:
            pickle.dump(char_vocab, fout)

    logger.info('Done with preparing!')
Example #13
def prepare(args):
    """
    checks data, creates the directories, prepares the vocabulary and embeddings
    """
    logger = logging.getLogger("brc")
    logger.info('Building vocabulary...')
    # load the data
    brc_data = BRCDataset(args.max_p_num,
                          args.max_p_len,
                          args.max_q_len,
                          args.train_files,
                          args.dev_files,
                          args.test_files,
                          prepare=True)
    # build the vocabulary
    vocab = Vocab(lower=True)
    for word in brc_data.word_iter('train'):
        vocab.add(word)

    unfiltered_vocab_size = vocab.size()
    logger.info('Assigning embeddings...')
    # 1. randomly initialize embeddings, or 2. load pretrained embeddings
    if not args.pretrain:
        # keep only tokens that occur at least twice
        vocab.filter_tokens_by_cnt(min_cnt=2)
        vocab.randomly_init_embeddings(args.embed_size)
    else:
        pre_train(brc_data, args.segmented_dir)
        vocab.load_pretrained_embeddings(
            os.path.join(args.segmented_dir, 'w2v_dic.data'))
    filtered_num = unfiltered_vocab_size - vocab.size()
    logger.info('After filter {} tokens, the final vocab size is {}'.format(
        filtered_num, vocab.size()))

    logger.info('Saving vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.pkl'), 'wb') as fout:
        pickle.dump(vocab, fout)
    fout.close()

    logger.info('Saving sets...')
    with open(os.path.join(args.prepared_dir, 'train_set.pkl'),
              'wb') as f_train_out:
        pickle.dump(brc_data.train_set, f_train_out)
    f_train_out.close()
    with open(os.path.join(args.prepared_dir, 'dev_set.pkl'),
              'wb') as f_dev_out:
        pickle.dump(brc_data.dev_set, f_dev_out)
    f_dev_out.close()
    with open(os.path.join(args.prepared_dir, 'test_set.pkl'),
              'wb') as f_test_out:
        pickle.dump(brc_data.test_set, f_test_out)
    f_test_out.close()

    logger.info('Done with preparing!')
Example #14
def prepare(args):
    """
    checks data, creates the directories, prepares the vocabulary and embeddings
    """
    logger = logging.getLogger("brc")
    logger.info('Checking the data files...')
    # make sure each data path exists
    # train_files  dev_files  test_files
    for data_path in args.train_files + args.dev_files + args.test_files:
        assert os.path.exists(data_path), '{} file does not exist.'.format(
            data_path)

    # prepare the output directories, creating them if they do not exist
    # vocab_dir model_dir result_dir summary_dir
    logger.info('Preparing the directories...')
    for dir_path in [
            args.vocab_dir, args.model_dir, args.result_dir, args.summary_dir
    ]:
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

    # prepare the data: pass the max passage number, max passage length, max question
    # length, and the train/dev/test file paths
    logger.info('Building vocabulary...')
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.train_files, args.dev_files, args.test_files)
    # create the vocabulary
    vocab = Vocab(lower=True)
    # collect all tokens, building token2id / id2token and the token counts
    for word in brc_data.word_iter('train'):
        vocab.add(word)
    # vocab size before filtering
    unfiltered_vocab_size = vocab.size()
    # filter out words that occur fewer than 2 times
    vocab.filter_tokens_by_cnt(min_cnt=2)
    # number filtered = size before filtering - size after filtering
    filtered_num = unfiltered_vocab_size - vocab.size()
    logger.info('After filter {} tokens, the final vocab size is {}'.format(
        filtered_num, vocab.size()))
    logger.info('Assigning embeddings...')
    # randomly initialize embeddings of shape vocab_size * embed_size
    # (5006 * 300 on the demo data)
    vocab.randomly_init_embeddings(args.embed_size)

    logger.info('Saving vocab...')
    # save the vocabulary
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'wb') as fout:
        pickle.dump(vocab, fout)

    logger.info('Done with preparing!')
Example #15
def prepare(args):
    """
    checks data, creates the directories, prepares the vocabulary and embeddings
    """
    logger = logging.getLogger("brc")
    logger.info('Checking the data files...')
    for data_path in args.train_files + args.dev_files + args.test_files:
        assert os.path.exists(data_path), '{} file does not exist.'.format(data_path)
    logger.info('Preparing the directories...')
    for dir_path in [args.vocab_dir, args.model_dir, args.result_dir, args.summary_dir]:
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

    logger.info('Building vocabulary...')
    # build the dataset
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len, args.word_size,
                          args.train_files, args.dev_files, args.test_files)
    vocab = Vocab(lower=True)
    for word in brc_data.word_iter('train'):    # use every word from the questions and the selected answers as the vocabulary
        vocab.add(word)

    unfiltered_vocab_size = vocab.size()
    vocab.filter_tokens_by_cnt(min_cnt=1)   # 5
    filtered_num = unfiltered_vocab_size - vocab.size()  # how many words were filtered out
    logger.info('After filter {} tokens, the final vocab size is {}'.format(filtered_num,
                                                                            vocab.size()))

    logger.info('Assigning embeddings...')
    #vocab.randomly_init_embeddings(args.embed_size)
    vocab.word2vec_init_embeddings(args.embed_size, sentences=brc_data.generate_word2vec_trainset(), min_cnt=1)
    #vocab.word2vec_init_embeddings(args.embed_size,sentences=brc_data.generate_word2vec_testset(),min_cnt=1)
    #vocab.glove_init_embeddings(args.embed_size)
    char_vocab = Vocab(lower=True)
    for char in brc_data.char_iter('train'):
        char_vocab.add(char)

    unfiltered_vocab_size = char_vocab.size()
    char_vocab.filter_tokens_by_cnt(min_cnt=1)
    filtered_num = unfiltered_vocab_size - char_vocab.size()
    logger.info('After filter {} tokens, the char vocab size is {}'.format(filtered_num, char_vocab.size()))

    char_vocab.char2vec_init_embeddings(args.char_embed_size, sentences=brc_data.generate_char2vec_trainset(), min_cnt=1)

    logger.info('Saving vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'wb') as fout:
        pickle.dump(vocab, fout)
    with open(os.path.join(args.vocab_dir, 'char_vocab.data'), 'wb') as fout:
        pickle.dump(char_vocab, fout)
    logger.info('Done with preparing!')
Example #16
def prepare(args):
    """
    checks data, creates the directories, prepares the vocabulary and embeddings
    """

    logger = logging.getLogger("brc")
    logger.info('Checking the data files...')
    for data_path in args.train_files + args.dev_files + args.test_files:
        assert os.path.exists(data_path), '{} file does not exist.'.format(
            data_path)
    logger.info('Preparing the directories...')
    for dir_path in [
            args.vocab_dir, args.model_dir, args.result_dir, args.summary_dir
    ]:
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

    logger.info('Building vocabulary...')
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.train_files, args.dev_files, args.test_files)
    vocab = Vocab(lower=True)
    for word in brc_data.word_iter('train'):
        vocab.add(word)

    unfiltered_vocab_size = vocab.size()
    vocab.filter_tokens_by_cnt(min_cnt=2)
    filtered_num = unfiltered_vocab_size - vocab.size()
    logger.info('After filter {} tokens, the final vocab size is {}'.format(
        filtered_num, vocab.size()))

    # logger.info('Assigning embeddings...')
    # vocab.randomly_init_embeddings(args.embed_size)          #random init in prepare!!

    #save the datasets to records files.
    logger.info('Saving the datasets.')
    brc_data.convert_to_ids(vocab)
    pad_id = vocab.get_id(vocab.pad_token)
    brc_data.save_records(pad_id)

    logger.info('Saving vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'wb') as fout:
        pickle.dump(vocab, fout)

    logger.info('Done with preparing!')
Example #17
def prepare(args):
    """
    checks data, creates the directories, prepares the vocabulary and embeddings
    """
    logger = logging.getLogger("brc")
    # verify that the input file paths exist
    logger.info('Checking the data files...')
    for data_path in args.train_files + args.dev_files + args.test_files:
        assert os.path.exists(data_path), '{} file does not exist.'.format(data_path)
    # prepare the vocab, model, result and summary directories
    logger.info('Preparing the directories...')
    for dir_path in [args.vocab_dir, args.model_dir, args.result_dir, args.summary_dir]:
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
    # initialize brc_data (defaults: 5, 500, 60, with the files under /data/demo)
    logger.info('Building vocabulary...')
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.train_files, args.dev_files, args.test_files)
    # the vocab's key pieces are the id2token and token2id lists, which map every word in the questions and passages to an id, plus token_cnt with the word frequencies
    vocab = Vocab(lower=True)
    for word in brc_data.word_iter('train'):
        vocab.add(word)

    unfiltered_vocab_size = vocab.size()  # vocab size = length of the id2token list
    vocab.filter_tokens_by_cnt(min_cnt=2)  # drop tokens occurring fewer than 2 times, then rebuild id2token and token2id
    # on the demo data this filters 5225 tokens, leaving 5006
    filtered_num = unfiltered_vocab_size - vocab.size()
    logger.info('After filter {} tokens, the final vocab size is {}'.format(filtered_num,
                                                                            vocab.size()))
    # randomly assign the embeddings
    logger.info('Assigning embeddings...')
    vocab.randomly_init_embeddings(args.embed_size)  # embed_size is the embedding dimension (defaults to 300); note that the <blank> and <unk> embeddings are all zeros

    # pickle the in-memory vocab object into data/vocab
    logger.info('Saving vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'wb') as fout:
        pickle.dump(vocab, fout)

    logger.info('Done with preparing!')
Example #18
    parser.add_argument(
        '--train_files',
        nargs='+',
        # default=['./data/demo/trainset/search.train.json'],
        # default=['./data/preprocessed/trainset/search.train.json',
        #          './data/preprocessed/trainset/zhidao.train.json'],
        default=['./data/preprocessed/trainset/search.train.json'],
        help='list of files that contain the preprocessed train data')
    parser.add_argument('--pre_train_file',
                        type=str,
                        default='./data/wiki.zh.new.vec',
                        help='pre_train files')
    args = parser.parse_args()
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.train_files)
    vocab_set = set()
    for word in brc_data.word_iter('train'):
        vocab_set.add(word.lower())

    f = load_model(args.model_name)

    write_file = codecs.open(args.pre_train_file, 'w', 'utf-8')
    for vocab in vocab_set:
        value_str = [str(i) for i in f.get_word_vector(vocab)]
        write_file.write(vocab + " " + " ".join(value_str) + '\n')
    write_file.close()
    print("over!")
    # vocab_set = Vocab(lower=True)
    # for word in brc_data.word_iter('train'):
    #     vocab_set.add(word)
    # print("vocab size is ", vocab_set.size())
    # vocab_set.filter_tokens_by_cnt(min_cnt=2)
Example #19
def train(args):
    """
    checks data, creates the directories, prepares the vocabulary and embeddings,
    then trains and evaluates the model
    """
    logger = logging.getLogger("brc")
    logger.info('Checking the data files...')
    for data_path in args.train_files + args.dev_files + args.test_files:
        assert os.path.exists(data_path), '{} file does not exist.'.format(
            data_path)
    logger.info('Preparing the directories...')
    for dir_path in [args.vocab_dir, args.model_dir, args.result_dir]:
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

    logger.info('Building vocabulary...')

    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.max_train_sample_num, args.train_files)

    vocab = Vocab(lower=True)

    for word in brc_data.word_iter('train'):
        vocab.add(word)

    unfiltered_vocab_size = vocab.size()
    vocab.filter_tokens_by_cnt(min_cnt=2)

    filtered_num = unfiltered_vocab_size - vocab.size()
    logger.info('After filter {} tokens, the final vocab size is {}'.format(
        filtered_num, vocab.size()))

    logger.info('Assigning embeddings...')
    vocab.load_pretrained_embeddings(args.word_embedding_path)

    #vocab.randomly_init_embeddings(300)
    #vocab1.randomly_init_embeddings(300)
    logger.info('Saving vocab...')
    brc_data.convert_to_ids(vocab)
    logger.info('Initialize the model...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'wb') as fout:
        pickle.dump(vocab, fout)

    rc_model = S_netModel(vocab, args)
    logger.info('Training the model...')
    #rc_model.restore(model_dir=args.model_dir, model_prefix=args.algo +'sys')
    #if args.train_as:
    #    rc_model.restore(model_dir=args.model_dir, model_prefix=args.algo + 'syst')
    rc_model.train(brc_data,
                   args.epochs,
                   args.batch_size,
                   save_dir=args.model_dir,
                   save_prefix=args.algo,
                   dropout_keep_prob=args.dropout_keep_prob)
    logger.info('Done with model training!')

    logger.info('Evaluating the trained model...')
    test_batches = brc_data.gen_mini_batches('test',
                                             args.batch_size,
                                             pad_id=vocab.get_id(
                                                 vocab.pad_token),
                                             shuffle=False)
    rc_model.evaluate(test_batches,
                      result_dir=args.result_dir,
                      result_prefix='test.predicted')
    logger.info('Done with model evaluation!')
Example #20
def gen_vocab_pretrain_w2v():
    """
    During pretraining, the result is also written into the vocab.
    :return:
    """
    max_p_num = 5
    max_p_len = 500
    max_q_len = 60
    embed_size = 300
    pretrain = True
    pretrained_word_path = ""  # 是否采用第三方预训练的词向量
    train_files = [
        '../data/DuReader2.0/trainset/search.train.json',
        '../data/DuReader2.0/trainset/zhidao.train.json'
    ]
    dev_files = [
        '../data/DuReader2.0/devset/search.dev.json',
        '../data/DuReader2.0/devset/zhidao.dev.json'
    ]
    test_files = [
        '../data/DuReader2.0/testset/search.test.json',
        '../data/DuReader2.0/testset/zhidao.test.json'
    ]
    prepared_dir = ""
    vocab_dir = "../data/test_write_read_tf/vocab"
    segmented_dir = "../data/DuReader2.0/whole_segmented"

    for dir_path in [segmented_dir, vocab_dir]:
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
    brc_data = BRCDataset(max_p_num,
                          max_p_len,
                          max_q_len,
                          train_files,
                          dev_files,
                          test_files,
                          prepared_dir,
                          prepare=True)

    # at this point memory usage is about 27 GB
    vocab = Vocab(lower=True)
    for word in brc_data.word_iter('train'):
        vocab.add(word)

    unfiltered_vocab_size = vocab.size()
    vocab.filter_tokens_by_cnt(min_cnt=2)
    filtered_num = unfiltered_vocab_size - vocab.size()
    logger.info('After filter {} tokens, the final vocab size is {}'.format(
        filtered_num, vocab.size()))

    print('Assigning embeddings...')
    if pretrain:
        # train word vectors on the existing segmented corpus
        if pretrained_word_path:
            # if a pretrained path is given there is no need to retrain; otherwise retrain on the given corpus
            vocab.load_pretrained_embeddings(pretrained_word_path)
        else:
            vocab.load_pretrained_embeddings(
                os.path.join(segmented_dir, 'w2v_dic.data'))
    else:
        vocab.randomly_init_embeddings(embed_size)

    print('Saving vocab, only train data...')
    with open(os.path.join(vocab_dir, 'vocab_traindata_pretrainW2V.data'),
              'wb') as fout:
        pickle.dump(vocab, fout)
Example #21
def gen_vocab():
    """
    Traverse the dataset once to build the vocab; it is used later to convert tokens to ids.
    :return:
    """
    max_p_num = 5
    max_p_len = 500
    max_q_len = 60
    embed_size = 300
    pretrain = True
    pretrained_word_path = ""
    train_files = [
        '../data/DuReader2.0/trainset/search.train.json',
        '../data/DuReader2.0/trainset/zhidao.train.json'
    ]
    dev_files = [
        '../data/DuReader2.0/devset/search.dev.json',
        '../data/DuReader2.0/devset/zhidao.dev.json'
    ]
    test_files = [
        '../data/DuReader2.0/testset/search.test.json',
        '../data/DuReader2.0/testset/zhidao.test.json'
    ]
    prepared_dir = "../data/test_write_read_tf/prepared"
    vocab_dir = "../data/test_write_read_tf/vocab"
    segmented_dir = "../data/test_write_read_tf/segmented"

    for dir_path in [prepared_dir, segmented_dir, vocab_dir]:
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

    # pdb.set_trace()
    brc_data = BRCDataset(max_p_num,
                          max_p_len,
                          max_q_len,
                          train_files,
                          dev_files,
                          test_files,
                          prepared_dir,
                          prepare=True)

    # at this point memory usage is about 27 GB
    vocab = Vocab(lower=True)
    for word in brc_data.word_iter('train'):
        vocab.add(word)

    unfiltered_vocab_size = vocab.size()
    vocab.filter_tokens_by_cnt(min_cnt=2)
    filtered_num = unfiltered_vocab_size - vocab.size()
    logger.info('After filter {} tokens, the final vocab size is {}'.format(
        filtered_num, vocab.size()))

    print('Assigning embeddings...')
    if pretrain:
        # train word vectors on the existing segmented corpus
        if pretrained_word_path:
            # if a pretrained path is given there is no need to retrain; otherwise retrain on the given corpus
            vocab.load_pretrained_embeddings(pretrained_word_path)
        else:
            # training takes roughly 20 minutes
            # pre_train(brc_data, segmented_dir)  # pretraining and vocab generation are separate steps; strictly speaking, pretraining includes building the vocab
            vocab.load_pretrained_embeddings(
                os.path.join(segmented_dir, 'w2v_dic.data'))
    else:
        vocab.randomly_init_embeddings(embed_size)

    print('Saving vocab, only train data...')
    with open(os.path.join(vocab_dir, 'vocab_traindata_pretrainW2V.data'),
              'wb') as fout:
        pickle.dump(vocab, fout)

    del brc_data.train_set

    # add the test data
    print('Saving vocab, train + test data...')
    for word in brc_data.word_iter('test'):
        vocab.add(word)
    unfiltered_vocab_size = vocab.size()
    vocab.filter_tokens_by_cnt(min_cnt=2)
    filtered_num = unfiltered_vocab_size - vocab.size()
    print('After filter {} tokens, the final vocab size is {}'.format(
        filtered_num, vocab.size()))
    with open(os.path.join(vocab_dir, 'vocab_train_test_data.data'),
              'wb') as fout1:
        pickle.dump(vocab, fout1)

    # add the dev data
    print('Saving vocab, train + test + dev data...')
    for word in brc_data.word_iter('dev'):
        vocab.add(word)
    unfiltered_vocab_size = vocab.size()
    vocab.filter_tokens_by_cnt(min_cnt=2)
    filtered_num = unfiltered_vocab_size - vocab.size()
    print('After filter {} tokens, the final vocab size is {}'.format(
        filtered_num, vocab.size()))
    with open(os.path.join(vocab_dir, 'vocab_train_test_dev_data.data'),
              'wb') as fout2:
        pickle.dump(vocab, fout2)

    logger.info('Done with preparing!')