Example No. 1
    def __init__(self, args, model_name=None):
        self.args = args
        vocab = args.vocab_path if args.vocab_path is not None else config.vocab_path
        self.vocab = Vocab(vocab, config.vocab_size, config.embeddings_file,
                           args)
        self.train_batcher = Batcher(args.train_data_path,
                                     self.vocab,
                                     mode='train',
                                     batch_size=args.batch_size,
                                     single_pass=False,
                                     args=args)
        self.eval_batcher = Batcher(args.eval_data_path,
                                    self.vocab,
                                    mode='eval',
                                    batch_size=args.batch_size,
                                    single_pass=True,
                                    args=args)
        # Give the batcher's background threads time to fill their queues.
        time.sleep(30)

        if model_name is None:
            self.train_dir = os.path.join(config.log_root,
                                          'train_%d' % (int(time.time())))
        else:
            self.train_dir = os.path.join(config.log_root, model_name)

        if not os.path.exists(self.train_dir):
            os.mkdir(self.train_dir)
        self.model_dir = os.path.join(self.train_dir, 'model')
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)
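The two batchers behave differently: with single_pass=False the train batcher cycles through the data indefinitely, while the single-pass eval batcher yields None once its one epoch is exhausted. A minimal sketch of how a trainer like this might drain the eval batcher (the eval_one_batch helper is an assumption, not part of the snippet above):

    def run_eval(self):
        # Hypothetical method: drain the single-pass eval batcher once;
        # next_batch() returns None when the epoch is exhausted.
        total_loss, n_batches = 0.0, 0
        batch = self.eval_batcher.next_batch()
        while batch is not None:
            total_loss += self.eval_one_batch(batch)  # assumed helper
            n_batches += 1
            batch = self.eval_batcher.next_batch()
        return total_loss / max(n_batches, 1)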
Example No. 2
def main():

    vocab = VocabBert(config.vocab_path, config.vocab_size)
    if args.mode == 'train':
        batcher = Batcher(config.train_data_path,
                          vocab,
                          mode='train',
                          batch_size=config.batch_size,
                          single_pass=False)
    elif args.mode == 'eval':
        batcher = Batcher(config.eval_data_path,
                          vocab,
                          mode='decode',
                          batch_size=config.batch_size,
                          single_pass=True)
    model = build_model_bert(config)
    criterion = LabelSmoothing(config.vocab_size, batcher.pad_id, smoothing=.1)

    if args.mode == 'train':
        if not os.path.exists(args.save_path):
            os.makedirs(args.save_path)
        train(config.max_iters, batcher, model, criterion, config,
              args.save_path)
    elif args.mode == 'eval':
        eval(config)
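The LabelSmoothing criterion takes the vocabulary size, the pad id, and a smoothing factor. A minimal sketch of such a module in the usual KL-divergence formulation (the constructor matches the call above; the internals follow the Annotated-Transformer pattern and are an assumption about this codebase):

import torch
import torch.nn as nn

class LabelSmoothing(nn.Module):
    """KL divergence against a smoothed target distribution (sketch)."""

    def __init__(self, size, padding_idx, smoothing=0.0):
        super().__init__()
        self.criterion = nn.KLDivLoss(reduction='sum')
        self.padding_idx = padding_idx
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.size = size

    def forward(self, x, target):
        # x: (batch, vocab_size) log-probabilities; target: (batch,) gold ids
        true_dist = x.detach().clone()
        true_dist.fill_(self.smoothing / (self.size - 2))
        true_dist.scatter_(1, target.unsqueeze(1), self.confidence)
        true_dist[:, self.padding_idx] = 0
        mask = torch.nonzero(target == self.padding_idx)
        if mask.numel() > 0:
            true_dist.index_fill_(0, mask.squeeze(1), 0.0)
        return self.criterion(x, true_dist)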
Example No. 3
def main():

    vocab = Vocab(config.vocab_path, config.vocab_size)
    train_batcher = Batcher(config.train_data_path, vocab, mode='train', batch_size=config.batch_size, single_pass=False)
    eval_batcher = Batcher(config.eval_data_path, vocab, mode='train', batch_size=config.batch_size, single_pass=False)
    model = build_model(config)
    criterion = LabelSmoothing(config.vocab_size, train_batcher.pad_id, smoothing=.1)
    
    if args.mode == 'train':
        train(config.max_iters, train_batcher, eval_batcher, model, criterion, config, args.save_path)
    elif args.mode == 'eval':
        eval(config, args.model)
Example No. 4
def train(model):
    train_batcher = Batcher(conf.train_data_path, vocab, mode='train', batch_size=conf.batch_size, single_pass=False)
    eval_batcher = Batcher(conf.eval_data_path, vocab, mode='train', batch_size=conf.batch_size, single_pass=False)
    set_optimizer(model)
    # alternate between training and evaluation, checkpointing every 10000 iterations
    index = 0
    while index < config.iters:
        model.train()
        iterate(index, 10000, train_batcher, model, 'train')
        model.eval()
        iterate(0, 100, eval_batcher, model, 'eval')
        index += 10000
        torch.save(model.state_dict(), conf.save_path + "/model-" + str(index) + ".pt")
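iterate is not shown in this snippet; a plausible sketch matching the calls above, assuming the model returns a loss per batch and that set_optimizer attaches an optimizer to it (both assumptions):

def iterate(start, steps, batcher, model, phase):
    # Hypothetical helper: run `steps` batches starting at iteration
    # `start`, updating weights in the 'train' phase and printing the loss.
    total_loss = 0.0
    for step in range(start, start + steps):
        batch = batcher.next_batch()
        loss = model(batch)                  # assumed: forward pass returns the loss
        if phase == 'train':
            loss.backward()
            model.optimizer.step()           # assumed: attached by set_optimizer
            model.optimizer.zero_grad()
        total_loss += loss.item()
    print('%s loss at iteration %d: %.4f' % (phase, start + steps, total_loss / steps))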
Example No. 5
def decoder():
    # Configure the GPU
    use_gpu = True
    config_gpu(use_gpu=use_gpu)

    # Load the vocabulary and the word-embedding matrix
    vocab = Vocab(config.path_vocab, config.vocab_size)

    wvtool = WVTool(ndim=config.emb_dim)
    embedding_matrix = wvtool.load_embedding_matrix(path_embedding_matrix=config.path_embedding_matrixt)

    # Build the model
    logger.info('Building the Seq2Seq model ...')
    model = Seq2Seq(config.beam_size, embedding_matrix=embedding_matrix)


    # Checkpoint management
    ckpt = tf.train.Checkpoint(Seq2Seq=model)
    ckpt_manager = tf.train.CheckpointManager(checkpoint=ckpt, directory=config.dir_ckpt, max_to_keep=10)
    if ckpt_manager.latest_checkpoint:
        ckpt.restore(ckpt_manager.latest_checkpoint)
        logger.info('Decoder checkpoint restored from: {}'.format(ckpt_manager.latest_checkpoint))
    else:
        logger.info('No checkpoint available to restore')

    # Load the test data to decode
    batcher = Batcher(config.path_seg_test, vocab, mode='decode',
                      batch_size=config.beam_size, single_pass=True)

    # Give the batcher's background threads time to fill their queues.
    time.sleep(20)
    # Decode: run batch beam-search decoding with the model and vocabulary.
    batch_decode(batcher, model=model, vocab=vocab)
Example No. 6
def test(model, path, vocab):
    model.load_state_dict(torch.load(path))
    model.eval()

    beam_search = LSTMBeamSearch(conf.beam_size, conf.vocab_size, conf.max_decode_len, model)
    batcher = Batcher(config.decode_data_path, vocab, mode='decode', batch_size=1, single_pass=True)

    counter = 0
    batch = batcher.next_batch()

    while batch is not None:
        input_ids, input_mask, input_lens, extended_input_ids, extra_zeros = prepare_src_batch(batch)
        best_summary = beam_search.generate(input_ids, extended_input_ids, extra_zeros)
        output_ids = [int(t) for t in best_summary.tokens[1:]]
        decoded_words = outputids2words(output_ids, vocab, batch.art_oovs[0])

        try:
            # Truncate at the first STOP_DECODING token, if one was produced.
            fst_stop_idx = decoded_words.index(STOP_DECODING)
            decoded_words = decoded_words[:fst_stop_idx]
        except ValueError:
            pass  # no STOP token generated; keep the full sequence

        write_for_rouge(batch.original_abstracts_sents[0], decoded_words, counter,
                        conf.rouge_ref_dir, conf.rouge_dec_dir)
        batch = batcher.next_batch()
        counter += 1

    results_dict = rouge_eval(conf.rouge_ref_dir, conf.rouge_dec_dir)
    rouge_log(results_dict, conf.decode_dir)
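The decode loop above relies on outputids2words to turn ids back into words, including the per-article OOV ids a pointer network can emit. A sketch of that mapping under the usual pointer-generator convention (the assumption here is that vocab.id2word raises ValueError for ids beyond the fixed vocabulary):

def outputids2words(id_list, vocab, article_oovs):
    # Map output ids to words; ids past the fixed vocabulary index into the
    # article's own OOV list (pointer-generator extended-vocabulary convention).
    words = []
    for i in id_list:
        try:
            w = vocab.id2word(i)              # in-vocabulary id
        except ValueError:                    # assumed: raised for extended ids
            w = article_oovs[i - vocab.size()]
        words.append(w)
    return words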
Example No. 7
    def fit(self, trainData, trainLabels, session, keep_prob=0.8, batchSize=100):
        batcher = Batcher(trainData, trainLabels)
        batches = 0
        while batches < trainData.shape[0]:
            batchTrain, batchLabels = batcher.nextBatch(batchSize)
            session.run(self.trainStep, feed_dict={self.net.inputs: batchTrain,
                                                   self.targetOutput: batchLabels,
                                                   self.net.keep_prob: keep_prob})
            batches += batchSize
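This variant of Batcher just slices feature/label arrays. A minimal sketch of what it could look like, assuming NumPy inputs (the shuffling and wrap-around behavior are assumptions):

import numpy as np

class Batcher:
    # Hypothetical array batcher: shuffle once, then serve fixed-size slices,
    # reshuffling whenever the data is exhausted.
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels
        self.cursor = 0
        self.order = np.random.permutation(len(data))

    def nextBatch(self, batchSize):
        if self.cursor + batchSize > len(self.data):
            self.order = np.random.permutation(len(self.data))
            self.cursor = 0
        idx = self.order[self.cursor:self.cursor + batchSize]
        self.cursor += batchSize
        return self.data[idx], self.labels[idx]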
Example No. 8
def main():
    # Prior to training, please adapt the hyperparameters in config_parser.py and run that script to generate
    # the training config file used to train your own VOCA model.

    pkg_path, _ = os.path.split(os.path.realpath(__file__))
    init_config_fname = os.path.join(pkg_path, 'training_config.cfg')
    if not os.path.exists(init_config_fname):
        print('Config not found %s' % init_config_fname)
        create_default_config(init_config_fname)

    config = configparser.ConfigParser()
    config.read(init_config_fname)

    # Path to cache the processed audio
    config.set(
        'Input Output', 'processed_audio_path',
        './training_data/processed_audio_%s.pkl' %
        config.get('Audio Parameters', 'audio_feature_type'))

    checkpoint_dir = config.get('Input Output', 'checkpoint_dir')
    if os.path.exists(checkpoint_dir):
        print('Checkpoint dir already exists %s' % checkpoint_dir)
        key = input(
            'Press "q" to quit, "x" to erase existing folder, and any other key to continue training: '
        )
        if key.lower() == 'q':
            return
        elif key.lower() == 'x':
            try:
                shutil.rmtree(checkpoint_dir, ignore_errors=True)
            except OSError:
                print('Failed deleting checkpoint directory')

    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    config_fname = os.path.join(checkpoint_dir, 'config.pkl')
    if os.path.exists(config_fname):
        print('Use existing config %s' % config_fname)
    else:
        with open(config_fname, 'w') as fp:
            config.write(fp)

    config = read_config(config_fname)
    data_handler = DataHandler(config)
    batcher = Batcher(data_handler)

    with tf.Session() as session:
        model = Model(session=session, config=config, batcher=batcher)
        model.build_graph()
        model.load()
        model.train()
Example No. 9
    def __init__(self, model_file_path):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.eval_data_path,
                               self.vocab,
                               mode='eval',
                               batch_size=config.batch_size,
                               single_pass=True)
        # Give the batcher's background threads time to fill their queues.
        time.sleep(15)
        model_name = os.path.basename(model_file_path)

        eval_dir = os.path.join(config.log_root, 'eval_%s' % (model_name))
        if not os.path.exists(eval_dir):
            os.mkdir(eval_dir)
        self.summary_writer = SummaryWriter(eval_dir)
        self.model = Model(model_file_path, is_eval=True)
Example No. 10
    def __init__(self, model):

        self._decode_dir = os.path.join(config.log_root,
                                        'decode_%s' % ("model_bert_coverage"))
        self._rouge_ref_dir = os.path.join(self._decode_dir, 'rouge_ref')
        self._rouge_dec_dir = os.path.join(self._decode_dir, 'rouge_dec_dir')

        for p in [self._decode_dir, self._rouge_ref_dir, self._rouge_dec_dir]:
            if not os.path.exists(p):
                os.mkdir(p)

        self.vocab = VocabBert(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.decode_data_path,
                               self.vocab,
                               mode='decode',
                               batch_size=config.beam_size,
                               single_pass=True)
        self.model = model
Example No. 11
    def __init__(self, args, model_file_path, save_path):
        model_name = os.path.basename(model_file_path)
        self.args = args
        self._decode_dir = os.path.join(config.log_root, save_path,
                                        'decode_%s' % (model_name))
        self._structures_dir = os.path.join(self._decode_dir, 'structures')
        self._sent_single_heads_dir = os.path.join(self._decode_dir,
                                                   'sent_heads_preds')
        self._sent_single_heads_ref_dir = os.path.join(self._decode_dir,
                                                       'sent_heads_ref')
        self._contsel_dir = os.path.join(self._decode_dir, 'content_sel_preds')
        self._contsel_ref_dir = os.path.join(self._decode_dir,
                                             'content_sel_ref')
        self._rouge_ref_dir = os.path.join(self._decode_dir, 'rouge_ref')
        self._rouge_dec_dir = os.path.join(self._decode_dir, 'rouge_dec_dir')

        self._rouge_ref_file = os.path.join(self._decode_dir, 'rouge_ref.json')
        self._rouge_pred_file = os.path.join(self._decode_dir,
                                             'rouge_pred.json')
        self.stat_res_file = os.path.join(self._decode_dir, 'stats.txt')
        self.sent_count_file = os.path.join(self._decode_dir,
                                            'sent_used_counts.txt')
        for p in [
                self._decode_dir, self._structures_dir,
                self._sent_single_heads_ref_dir, self._sent_single_heads_dir,
                self._contsel_ref_dir, self._contsel_dir, self._rouge_ref_dir,
                self._rouge_dec_dir
        ]:
            if not os.path.exists(p):
                os.mkdir(p)
        vocab = args.vocab_path if args.vocab_path is not None else config.vocab_path
        self.vocab = Vocab(vocab, config.vocab_size, config.embeddings_file,
                           args)
        self.batcher = Batcher(args.decode_data_path,
                               self.vocab,
                               mode='decode',
                               batch_size=args.beam_size,
                               single_pass=True,
                               args=args)
        self.batcher.setup_queues()
        # Give the batcher's background threads time to fill their queues.
        time.sleep(30)

        self.model = Model(args, self.vocab).to(device)
        self.model.eval()
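With the model in eval mode, decoding typically means draining the single-pass batcher until it returns None. A sketch of that driver loop (decode_one_batch is an assumed helper, and torch is assumed to be imported as elsewhere in this file):

    def decode(self):
        # Hypothetical driver: consume the single-pass batcher to exhaustion.
        batch = self.batcher.next_batch()
        with torch.no_grad():
            while batch is not None:
                self.decode_one_batch(batch)  # assumed: beam search + write outputs
                batch = self.batcher.next_batch()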
Example No. 12
def mode_predict(config, input_path, output_path=output_path):
    """
    执行predict模式。对给定数据 进行预测。

    :param config: 配置文件
    :param input_path: 待预测 数据集路径
    :param output_path: 预测结果 保存路径
    :return: 无。预测值已写入指定文件
    """
    # 读入数据
    x_test = load_predict_data(os.path.join(input_path, "data_predict.txt"),
                               sample_ratio=config.predict_data_sample_ratio)
    print("成功载入待预测文件")
    # 读取已有字典
    my_vocab = load_vocabulary(max_vocab_size=config.max_vocab_size)
    config.vocab_size = my_vocab.vocab_size
    print("载入已有字典, 字典实际大小:{} , 字典设置大小: {}".format(
        len(my_vocab.word_index) + 1, config.vocab_size))

    # Preprocess the data (convert to id representation and pad)
    x_test = data_preprocessing(x_test, my_vocab, max_len=config.max_len)
    print("Data Set size: %d" % len(x_test))

    config.keep_prob = 1.0
    # Build the classifier
    classifier = choose_model_by_name(config)
    classifier.build_graph()

    # Create a batcher for the data set
    data_batcher = Batcher(x_test, batch_size=config.batch_size)
    # Run prediction
    print('Running prediction')
    predict_list = predict(classifier, config, data_batcher)

    # Save the predictions to the output file
    with open(output_path, "w", encoding="utf8") as fout:
        for (pred, pred_prob) in predict_list:
            fout.write("%d\t%f\n" % (pred, pred_prob))
    print('Prediction finished; predictions written to the output file:', output_path)
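predict itself is not shown; a heavily hedged sketch of what it might look like in this TF1-style setup, where classifier.prediction, classifier.x, classifier.keep_prob, and config.model_path are all assumptions, as is the batcher being iterable:

import numpy as np
import tensorflow as tf

def predict(classifier, config, batcher):
    # Hypothetical: restore trained weights, then collect
    # (predicted label, predicted probability) pairs batch by batch.
    results = []
    with tf.Session() as sess:
        tf.train.Saver().restore(sess, config.model_path)  # assumed checkpoint path
        for x_batch in batcher:                            # assumed: batcher is iterable
            probs = sess.run(classifier.prediction,        # assumed output node
                             feed_dict={classifier.x: x_batch,
                                        classifier.keep_prob: 1.0})
            for p in probs:
                results.append((int(np.argmax(p)), float(np.max(p))))
    return results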
Example No. 13
def mode_evaluate(config, input_path):
    """
    执行eval模式。评估模型。

    :param config: 配置文件
    :param input_path: 数据集路径
    :return: 无
    """
    # 读入数据
    x_test, y_test = load_data(
        os.path.join(input_path, "data_test.txt"),
        sample_ratio=config.data_sample_ratio,
        n_class=config.n_class,
        one_hot=config.one_hot,
    )
    print("成功载入测试集文件")
    # 读取已有字典
    my_vocab = load_vocabulary(max_vocab_size=config.max_vocab_size)
    config.vocab_size = my_vocab.vocab_size
    print("载入已有字典, 字典实际大小:{} , 字典设置大小: {}".format(
        len(my_vocab.word_index) + 1, config.vocab_size
    ))

    # Preprocess the data (convert to id representation and pad)
    x_test = data_preprocessing(x_test, my_vocab, max_len=config.max_len)
    print("Test  Set size: %d" % len(x_test))

    config.keep_prob = 1.0
    # Build the classifier
    classifier = choose_model_by_name(config)
    classifier.build_graph()

    # Create a batcher for the test set
    test_batcher = Batcher(x_test, y_test, batch_size=config.batch_size)
    # Evaluate the model
    evaluate(classifier, config, test_batcher)
Example No. 14
def mode_train(config, input_path):
    """
    执行train模式。按照给定配置,训练模型。

    :param config: 配置文件
    :param input_path: 数据集路径
    :return: 无
    """
    # 读入训练集和测试集
    x_train, y_train = load_data(
        os.path.join(input_path, "data_train.txt"),
        sample_ratio=config.data_sample_ratio,
        n_class=config.n_class,
        one_hot=config.one_hot,
    )
    print("成功载入训练集文件")
    x_test, y_test = load_data(
        os.path.join(input_path, "data_test.txt"),
        sample_ratio=config.data_sample_ratio,
        n_class=config.n_class,
        one_hot=config.one_hot,
    )
    print("成功载入测试集文件")
    # 获取验证集
    if os.path.isfile(os.path.join(input_path, "data_valid.txt")):
        # 从验证集文件中获取
        x_valid, y_valid = load_data(
            os.path.join(input_path, "data_test.txt"),
            sample_ratio=config.data_sample_ratio,
            n_class=config.n_class,
            one_hot=config.one_hot,
        )
        print("成功载入验证集文件")
    else:
        # 将测试集的一部分分割出来,作为验证集
        split_radio = config.valid_test_split_radio  # 设置分割比例
        x_test, x_valid, y_test, y_valid = split_dataset(
            x_test, y_test, split_radio)
        print("没有发现验证集文件,已分割测试集的 {}% 来作为验证集".format(split_radio * 100))

    # Build the vocabulary
    my_vocab = make_vocabulary(x_train, max_vocab_size=config.max_vocab_size)
    config.vocab_size = my_vocab.vocab_size
    print("使用训练集数据 制作字典完成, 字典实际大小:{} , 字典设置大小: {}".format(
        len(my_vocab.word_index) + 1, config.vocab_size))

    # Preprocess the data (convert to id representation and pad)
    print('Preprocessing the data sets (word representation -> id representation)')
    x_train = data_preprocessing(x_train, my_vocab, max_len=config.max_len)
    x_valid = data_preprocessing(x_valid, my_vocab, max_len=config.max_len)
    x_test = data_preprocessing(x_test, my_vocab, max_len=config.max_len)
    print("Train Set size: %d" % len(x_train))
    print("Valid Set size: %d" % len(x_valid))
    print("Test  Set size: %d" % len(x_test))

    # Build the classifier
    classifier = choose_model_by_name(config)
    classifier.build_graph()

    # Create batchers for the training, validation, and test sets
    train_batcher = Batcher(x_train, y_train, batch_size=config.batch_size)
    valid_batcher = Batcher(x_valid, y_valid, batch_size=config.batch_size)
    test_batcher = Batcher(x_test, y_test, batch_size=config.batch_size)
    # Train the model
    train(classifier, config, train_batcher, valid_batcher, test_batcher)
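split_dataset is not shown either; a minimal sketch under the assumption that it shuffles NumPy arrays and carves off the given proportion as the validation portion (scikit-learn's train_test_split would be an equivalent off-the-shelf choice):

import numpy as np

def split_dataset(x, y, split_ratio):
    # Hypothetical: shuffle, then return (x_test, x_valid, y_test, y_valid)
    # with `split_ratio` of the examples in the validation split.
    order = np.random.permutation(len(x))
    x, y = x[order], y[order]
    n_valid = int(len(x) * split_ratio)
    return x[n_valid:], x[:n_valid], y[n_valid:], y[:n_valid]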