Example #1
def init():
    if args.seed:
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)
    else:
        seed = np.random.randint(2**31)
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
    torch.backends.cudnn.benchmark = True
    batch_size = args.batch_size
    use_chars = args.char_dim > 0
    data = Data_preprocessor()

    data = data.preprocess(data_dir=args.data_dir,
                           max_example=args.max_example,
                           no_training_set=False,
                           use_chars=use_chars)
    training_batch_loader = DataLoader(data.training, batch_size, shuffle=True)
    validation_batch_loader = DataLoader(data.validation,
                                         batch_size,
                                         shuffle=False)
    testing_batch_loader = DataLoader(data.testing, batch_size, shuffle=False)
    print("loading word2vec file")
    embed_path = os.path.join(args.data_dir, args.embed_file)
    embed_init, embed_dim = load_word2vec_embeddings(data.dictionary[0],
                                                     embed_path)
    print("Embedding dimension: {}".format(embed_dim))
    model = GAReader(args.n_layers, data.vocab_size, data.n_chars,
                     args.drop_out, args.gru_size, embed_init, embed_dim,
                     args.train_emb, args.char_dim, args.use_feat,
                     args.gating_fn)
    model.cuda()
    optimizer = torch.optim.Adam(params=filter(lambda p: p.requires_grad,
                                               model.parameters()),
                                 lr=args.init_learning_rate)
    criterion = nn.CrossEntropyLoss().cuda()
    return model, optimizer, criterion, training_batch_loader, validation_batch_loader, testing_batch_loader
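
The init() above reads a number of attributes from a module-level args object. A minimal argparse sketch covering exactly those attributes (the argument names are taken from the code above; the defaults are placeholders, not the repository's actual settings):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--seed', type=int, default=0)            # 0 falls through to the random-seed branch
parser.add_argument('--batch_size', type=int, default=32)
parser.add_argument('--char_dim', type=int, default=0)        # > 0 enables character embeddings
parser.add_argument('--data_dir', type=str, default='data')
parser.add_argument('--max_example', type=int, default=None)
parser.add_argument('--embed_file', type=str, default='word2vec.txt')   # placeholder file name
parser.add_argument('--n_layers', type=int, default=3)
parser.add_argument('--drop_out', type=float, default=0.1)
parser.add_argument('--gru_size', type=int, default=256)
parser.add_argument('--train_emb', action='store_true')
parser.add_argument('--use_feat', action='store_true')
parser.add_argument('--gating_fn', type=str, default='mul')   # placeholder
parser.add_argument('--init_learning_rate', type=float, default=5e-4)
args = parser.parse_args()

model, optimizer, criterion, train_loader, valid_loader, test_loader = init()
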
Example #2
def test(args):
    use_chars = args.char_dim > 0
    # load data
    dp = data_preprocessor()
    data = dp.preprocess(question_dir=args.data_dir,
                         no_training_set=True,
                         max_example=args.max_example,
                         use_chars=use_chars)
    #import ipdb; ipdb.set_trace()
    idx_to_word = dict([(v, k) for (k, v) in data.dictionary[0].items()])

    # build minibatch loader
    test_batch_loader = minibatch_loader(data.validation,
                                         args.batch_size,
                                         shuffle=False)

    with tf.device('/device:GPU:0'):
        model = GAReader(args.n_layers,
                         data.vocab_size,
                         data.n_chars,
                         args.gru_size,
                         100,
                         args.train_emb,
                         args.char_dim,
                         args.use_feat,
                         args.gating_fn,
                         save_attn=True)
        with tf.Session(
                config=tf.ConfigProto(log_device_placement=False,
                                      allow_soft_placement=True)) as sess:
            model.restore(sess, args.save_dir, args.ckpt)
            logging.info('-' * 50)
            logging.info("Start testing...")
            test_writer = tf.summary.FileWriter('logs/test', sess.graph)
            model.validate(sess, test_batch_loader, write_results=True)
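
Examples #2, #4 and #7 use the TensorFlow 1.x graph API (tf.Session, tf.ConfigProto, tf.summary.FileWriter). If only TensorFlow 2.x is installed, the same entry points are reachable through the compatibility layer; a minimal sketch:

import tensorflow.compat.v1 as tf

tf.disable_v2_behavior()  # restores graph mode, sessions and TF1-style summaries

config = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True)
with tf.Session(config=config) as sess:
    pass  # build or restore the GAReader graph here, as in the example above
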
Example #3
def main(save_path, params):
    nhidden = params['nhidden']
    dropout = params['dropout']
    word2vec = params['word2vec']
    dataset = params['dataset']
    nlayers = params['nlayers']
    train_emb = params['train_emb']
    char_dim = params['char_dim']
    use_feat = params['use_feat']
    gating_fn = params['gating_fn']
    ent_setup = params['ent_setup']  # ent, ent-anonym, no-ent
    data_path = params['data_path']
    # save settings
    shutil.copyfile('config.py', '%s/config.py' % save_path)

    use_chars = char_dim > 0

    if dataset == "clicr":
        dp = DataPreprocessor.DataPreprocessorClicr()
        data = dp.preprocess(
            #"/mnt/b5320167-5dbd-4498-bf34-173ac5338c8d/Datasets/bmj_case_reports_data/dataset_json_concept_annotated/",
            data_path,
            ent_setup=ent_setup,
            no_training_set=False,
            use_chars=use_chars)
    elif dataset == "clicr_novice":
        dp = DataPreprocessor.DataPreprocessorNovice()
        data = dp.preprocess(data_path,
                             ent_setup=ent_setup,
                             no_training_set=False,
                             use_chars=use_chars)
    else:
        dp = DataPreprocessor.DataPreprocessor()
        data = dp.preprocess(data_path,
                             no_training_set=False,
                             use_chars=use_chars)

    print("building minibatch loaders ...")
    batch_loader_train = MiniBatchLoader.MiniBatchLoader(data.training,
                                                         BATCH_SIZE,
                                                         sample=1.0)
    batch_loader_val = MiniBatchLoader.MiniBatchLoader(data.validation,
                                                       BATCH_SIZE)

    print("building network ...")
    W_init, embed_dim = Helpers.load_word2vec_embeddings(
        data.dictionary[0], word2vec)
    m = GAReader.Model(nlayers, data.vocab_size, data.num_chars, W_init,
                       nhidden, embed_dim, dropout, train_emb, char_dim,
                       use_feat, gating_fn)

    print("training ...")
    num_iter = 0
    max_acc = 0.
    deltas = []

    logger = open(save_path + '/log', 'a')

    if os.path.isfile('%s/best_model.p' % save_path):
        print('loading previously saved model')
        m.load_model('%s/best_model.p' % save_path)
    else:
        print('saving init model')
        m.save_model('%s/model_init.p' % save_path)
        print('loading init model')
        m.load_model('%s/model_init.p' % save_path)

    for epoch in range(NUM_EPOCHS):
        estart = time.time()
        new_max = False

        for dw, dt, qw, qt, a, m_dw, m_qw, tt, tm, c, m_c, cl, fnames in batch_loader_train:
            loss, tr_acc, probs = m.train(dw, dt, qw, qt, c, a, m_dw, m_qw, tt,
                                          tm, m_c, cl)

            message = "Epoch %d TRAIN loss=%.4e acc=%.4f elapsed=%.1f" % (
                epoch, loss, tr_acc, time.time() - estart)
            print(message)
            logger.write(message + '\n')

            num_iter += 1
            if num_iter % VALIDATION_FREQ == 0:
                total_loss, total_acc, n, n_cand = 0., 0., 0, 0.

                for dw, dt, qw, qt, a, m_dw, m_qw, tt, tm, c, m_c, cl, fnames in batch_loader_val:
                    outs = m.validate(dw, dt, qw, qt, c, a, m_dw, m_qw, tt, tm,
                                      m_c, cl)
                    loss, acc, probs = outs[:3]

                    bsize = dw.shape[0]
                    total_loss += bsize * loss
                    total_acc += bsize * acc
                    n += bsize
                val_acc = total_acc / n
                if val_acc > max_acc:
                    max_acc = val_acc
                    m.save_model('%s/best_model.p' % save_path)
                    new_max = True
                message = "Epoch %d VAL loss=%.4e acc=%.4f max_acc=%.4f" % (
                    epoch, total_loss / n, val_acc, max_acc)
                print(message)
                logger.write(message + '\n')

        # m.save_model('%s/model_%d.p'%(save_path,epoch))
        message = "After Epoch %d: Train acc=%.4f, Val acc=%.4f" % (
            epoch, tr_acc, val_acc)
        print(message)
        logger.write(message + '\n')

        # learning schedule
        if epoch >= 2:
            m.anneal()
        # stopping criterion
        if not new_max:
            break

    logger.close()
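
main() above expects a params dict plus the module-level constants BATCH_SIZE, NUM_EPOCHS and VALIDATION_FREQ (presumably defined in config.py, which the function copies into save_path). A hypothetical invocation, with placeholder values for every key the function reads:

BATCH_SIZE = 32          # placeholders; the real values live in config.py
NUM_EPOCHS = 10
VALIDATION_FREQ = 100

params = {
    'nhidden': 128,
    'dropout': 0.1,
    'word2vec': 'embeddings/word2vec.bin',   # path to pretrained vectors
    'dataset': 'clicr',                      # clicr, clicr_novice, or anything else
    'nlayers': 3,
    'train_emb': True,
    'char_dim': 0,                           # > 0 enables character embeddings
    'use_feat': False,
    'gating_fn': 'T.mul',                    # placeholder gating-function name
    'ent_setup': 'ent',                      # ent, ent-anonym, no-ent
    'data_path': 'data/',
}
main('experiments/run0', params)             # save_path directory must already exist
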
Example #4
def train(args):
    use_chars = args.char_dim > 0
    # load data
    dp = data_preprocessor()
    data = dp.preprocess(question_dir=args.data_dir,
                         no_training_set=False,
                         max_example=args.max_example,
                         use_chars=use_chars)

    # build minibatch loader
    train_batch_loader = minibatch_loader(data.training,
                                          args.batch_size,
                                          sample=1.0)
    valid_batch_loader = minibatch_loader(data.validation,
                                          args.batch_size,
                                          shuffle=False)
    test_batch_loader = minibatch_loader(data.test,
                                         args.batch_size,
                                         shuffle=False)
    if not args.resume:
        logging.info("loading word2vec file ...")
        embed_init, embed_dim = \
            load_word2vec_embeddings(data.dictionary[0], args.embed_file)
        logging.info("embedding dim: {}".format(embed_dim))
        logging.info("initialize model ...")
        model = GAReader(args.n_layers, data.vocab_size, data.n_chars,
                         args.gru_size, embed_dim, args.train_emb,
                         args.char_dim, args.use_feat, args.gating_fn)
        model.build_graph(args.grad_clip, embed_init)
        init = tf.global_variables_initializer()
        saver = tf.train.Saver(tf.global_variables())
    else:
        model = GAReader(args.n_layers, data.vocab_size, data.n_chars,
                         args.gru_size, 100, args.train_emb, args.char_dim,
                         args.use_feat, args.gating_fn)

    with tf.Session() as sess:
        # training phase
        if not args.resume:
            sess.run(init)
            if args.init_test:
                logging.info('-' * 50)
                logging.info("Initial test ...")
                best_loss, best_acc = model.validate(sess, valid_batch_loader)
            else:
                best_acc = 0.
        else:
            model.restore(sess, args.save_dir)
            saver = tf.train.Saver(tf.global_variables())
        logging.info('-' * 50)
        lr = args.init_learning_rate
        logging.info("Start training ...")
        for epoch in range(args.n_epoch):
            start = time.time()
            it = loss = acc = n_example = 0
            if epoch >= 2:
                lr /= 2
            for dw, dt, qw, qt, a, m_dw, m_qw, tt, \
                    tm, c, m_c, cl, fnames in train_batch_loader:
                loss_, acc_ = model.train(sess, dw, dt, qw, qt, a, m_dw, m_qw,
                                          tt, tm, c, m_c, cl, fnames,
                                          args.drop_out, lr)
                loss += loss_
                acc += acc_
                it += 1
                n_example += dw.shape[0]
                if it % args.print_every == 0 or \
                        it % len(train_batch_loader) == 0:
                    spend = (time.time() - start) / 60
                    statement = "Epoch: {}, it: {} (max: {}), "\
                        .format(epoch, it, len(train_batch_loader))
                    statement += "loss: {:.3f}, acc: {:.3f}, "\
                        .format(loss / args.print_every,
                                acc / n_example)
                    statement += "time: {:.1f}(m)"\
                        .format(spend)
                    logging.info(statement)
                    loss = acc = n_example = 0
                    start = time.time()
                # save model
                if it % args.eval_every == 0 or \
                        it % len(train_batch_loader) == 0:
                    valid_loss, valid_acc = model.validate(
                        sess, valid_batch_loader)
                    if valid_acc >= best_acc:
                        best_acc = valid_acc
                        logging.info("Best valid acc: {}".format(best_acc))
                        model.save(sess, saver, args.save_dir)
                    start = time.time()
        # test model
        logging.info("Final test ...")
        model.validate(sess, test_batch_loader)
Example #5
def main(save_path, params):

    nhidden = params['nhidden']
    dropout = params['dropout']
    word2vec = params['word2vec']
    dataset = params['dataset']
    nlayers = params['nlayers']
    train_emb = params['train_emb']
    char_dim = params['char_dim']
    use_feat = params['use_feat']
    gating_fn = params['gating_fn']
    out = 'out'

    # save settings
    shutil.copyfile('config.py', '%s/config.py' % save_path)

    use_chars = char_dim > 0
    dp = DataPreprocessor.DataPreprocessor()
    data = dp.preprocess(dataset, no_training_set=False, use_chars=use_chars)
    word_dictionary = data.dictionary[0]
    the_index = word_dictionary['the']
    #print('the index : {}'.format(word_dictionary['the']))

    idx_to_word = dict([(v, k) for (k, v) in word_dictionary.items()])
    words = [idx_to_word[i] for i in sorted(idx_to_word.keys())]

    print("building minibatch loaders ...")
    batch_loader_train = MiniBatchLoader.MiniBatchLoader(data.training,
                                                         BATCH_SIZE,
                                                         sample=1.0)
    batch_loader_val = MiniBatchLoader.MiniBatchLoader(data.validation,
                                                       BATCH_SIZE)

    print("building network ...")
    W_init, embed_dim = Helpers.load_word2vec_embeddings(
        data.dictionary[0], word2vec)
    #print('the embedding : {}'.format(W_init[the_index]))
    #print(W_init[0:5])

    print("running GAReader ...")

    m = GAReader.Model(nlayers, data.vocab_size, data.num_chars, W_init,
                       nhidden, embed_dim, dropout, train_emb, char_dim,
                       use_feat, gating_fn, words).build_network()
    m.compile(optimizer=tf.keras.optimizers.Adam(lr=LEARNING_RATE,
                                                 clipnorm=GRAD_CLIP),
              loss=tf.keras.losses.categorical_crossentropy,
              metrics=[tf.keras.metrics.categorical_accuracy])
    #tf.enable_eager_execution(config=tf.ConfigProto(allow_soft_placement = True))
    with tf.Graph().as_default():
        with tf.Session(config=tf.ConfigProto(
                allow_soft_placement=True)) as sess:
            K.set_session(sess)
            #with tf.device('/gpu:0:'):
            tensorboard = TensorBoardCustom(log_dir="logs", words=words)
            modelcheckpoint = tf.keras.callbacks.ModelCheckpoint(
                'output/weights.{epoch:02d}-{val_loss:.2f}.hdf5')
            writer = tf.summary.FileWriter("logs")

            def schedule(epoch, lr):

                if epoch >= 3:
                    return lr * 0.5
                else:
                    return lr

            lrate = LearningRateScheduler(schedule, verbose=1)

            for epoch in range(NUM_EPOCHS):
                for (inputs, a) in batch_loader_train:
                    [dw, qw, m_dw, m_qw, c, m_c, cl] = inputs
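                    # NOTE: a fresh GAReader.Model is built and compiled here on every
                    # minibatch, so its weights are re-initialised for each batch instead
                    # of being carried over from the previous train_on_batch call.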
                    m = GAReader.Model(nlayers, data.vocab_size,
                                       data.num_chars, W_init, nhidden,
                                       embed_dim, dropout, train_emb, char_dim,
                                       use_feat, gating_fn,
                                       words).build_network()
                    m.compile(optimizer=tf.keras.optimizers.Adam(
                        lr=LEARNING_RATE, clipnorm=GRAD_CLIP),
                              loss=tf.keras.losses.categorical_crossentropy,
                              metrics=[tf.keras.metrics.categorical_accuracy])
                    #print(dw.shape)
                    #print('dw : {}'.format(dw))
                    #print('qw : {}'.format(qw))
                    #print('m_dw : {}'.format(m_dw))
                    #print('m_qw : {}'.format(m_qw))
                    #print('c : {}'.format(c))
                    #print([idx_to_word[i] for i in dw[0, :, 0].tolist()])
                    train_summary = m.train_on_batch(
                        inputs,
                        to_categorical(a, batch_loader_train.max_num_cand))
                    print(m.get_weights()[0])
                    print('epoch: {}, train loss: {}, train acc: {}'.format(
                        epoch, train_summary[0], train_summary[1]))
                    lr = tf.summary.scalar('learning_rate', LEARNING_RATE)
                    summary = tf.summary.merge_all()
                    s = sess.run(summary)
                    writer.add_summary(s)
                writer.close()
Example #6
def main(load_path, params, mode='test'):
    nhidden = params['nhidden']
    dropout = params['dropout']
    word2vec = params['word2vec']
    dataset = params['dataset']
    nlayers = params['nlayers']
    train_emb = params['train_emb']
    char_dim = params['char_dim']
    use_feat = params['use_feat']
    gating_fn = params['gating_fn']
    ent_setup = params['ent_setup']
    data_path = params['data_path']
    # save settings
    shutil.copyfile('config.py', '%s/config_test.py' % load_path)
    use_chars = char_dim > 0

    if dataset == "clicr":
        dp = DataPreprocessor.DataPreprocessorClicr()
        #dataset_path = "/mnt/b5320167-5dbd-4498-bf34-173ac5338c8d/Datasets/bmj_case_reports_data/dataset_json_concept_annotated/"
        #dataset_path = "data/"
        data = dp.preprocess(data_path,
                             ent_setup=ent_setup,
                             no_training_set=True)
    elif dataset == "clicr_novice":
        dp = DataPreprocessor.DataPreprocessorNovice()
        data = dp.preprocess(data_path,
                             ent_setup=ent_setup,
                             no_training_set=True)
    else:
        dp = DataPreprocessor.DataPreprocessor()
        data = dp.preprocess(data_path, no_training_set=True)
    inv_vocab = data.inv_dictionary

    assert os.path.exists(params["test_file"] if mode ==
                          "test" else params["validation_file"])

    print("building minibatch loaders ...")
    if mode == 'test':
        batch_loader_test = MiniBatchLoader.MiniBatchLoader(
            data.test, BATCH_SIZE)
    else:
        batch_loader_test = MiniBatchLoader.MiniBatchLoader(
            data.validation, BATCH_SIZE)
    f_to_cand = {i[-1]: i[3] for i in batch_loader_test.questions}

    print("building network ...")
    W_init, embed_dim = Helpers.load_word2vec_embeddings(
        data.dictionary[0], word2vec)
    m = GAReader.Model(nlayers,
                       data.vocab_size,
                       data.num_chars,
                       W_init,
                       nhidden,
                       embed_dim,
                       dropout,
                       train_emb,
                       char_dim,
                       use_feat,
                       gating_fn,
                       save_attn=False)
    print("model load path")
    print('%s/best_model.p' % load_path)
    m.load_model('%s/best_model.p' % load_path)

    print("testing ...")
    pr = np.zeros((len(batch_loader_test.questions),
                   batch_loader_test.max_num_cand)).astype('float32')
    fids, attns = [], []
    pred_ans = {}
    total_loss, total_acc, n = 0., 0., 0
    for dw, dt, qw, qt, a, m_dw, m_qw, tt, tm, c, m_c, cl, fnames in batch_loader_test:
        outs = m.validate(dw, dt, qw, qt, c, a, m_dw, m_qw, tt, tm, m_c, cl)
        loss, acc, probs = outs[:3]
        attns += [[fnames[0], probs[0, :]] +
                  [o[0, :, :] for o in outs[3:]]]  # store one attention

        for f in range(len(fnames)):
            pred_cand = probs[f].argmax()
            pred_a_ids = f_to_cand[fnames[f]][pred_cand]
            pred_a = " ".join([inv_vocab[i] for i in pred_a_ids])
            if ent_setup == "ent-anonym" and (dataset == "clicr"
                                              or dataset == "clicr_novice"):
                relabeling_dicts = data.test_relabeling_dicts if mode == 'test' else data.val_relabeling_dicts
                pred_a = relabeling_dicts[fnames[f]][pred_a]
            pred_ans[fnames[f]] = pred_a

        bsize = dw.shape[0]
        total_loss += bsize * loss
        total_acc += bsize * acc

        pr[n:n + bsize, :] = probs
        fids += fnames
        n += bsize

    if (params["dataset"] == "clicr" or params["dataset"] == "clicr_plain" or params["dataset"] == "clicr_novice") \
            and (mode == 'test' or mode == 'validation'):
        print("writing predictions")
        preds_data = utils.to_output_preds(pred_ans)
        preds_filepath = load_path + '/{}.preds'.format(mode)
        utils.write_preds(preds_data, file_name=preds_filepath)
        utils.external_eval(preds_filepath,
                            preds_filepath + ".scores",
                            params["test_file"]
                            if mode == "test" else params["validation_file"],
                            extended=True)
    logger = open(load_path + '/log.test', 'a')
    message = '%s Loss %.4e acc=%.4f' % (mode.upper(), total_loss / n,
                                         total_acc / n)
    print(message)
    logger.write(message + '\n')
    logger.close()

    np.save('%s/%s.probs' % (load_path, mode), np.asarray(pr))
    pickle.dump(attns, open('%s/%s.attns' % (load_path, mode), 'wb'))
    f = open('%s/%s.ids' % (load_path, mode), 'w')
    for item in fids:
        f.write(item + '\n')
    f.close()
Example #7
def train(args):
    use_chars = args.char_dim > 0
    # load data
    dp = data_preprocessor()
    data = dp.preprocess(question_dir=args.data_dir,
                         no_training_set=False,
                         max_example=args.max_example,
                         use_chars=use_chars)
    #import ipdb; ipdb.set_trace()
    idx_to_word = dict([(v, k) for (k, v) in data.dictionary[0].items()])

    # build minibatch loader
    train_batch_loader = minibatch_loader(data.training,
                                          args.batch_size,
                                          sample=1.0)
    valid_batch_loader = minibatch_loader(data.validation,
                                          args.batch_size,
                                          shuffle=False)
    test_batch_loader = minibatch_loader(data.test,
                                         args.batch_size,
                                         shuffle=False)
    with tf.device('/device:GPU:0'):
        if not args.resume:
            logging.info("loading word2vec file ...")
            embed_init, embed_dim = \
                load_word2vec_embeddings(data.dictionary[0], args.embed_file)
            logging.info("embedding dim: {}".format(embed_dim))
            logging.info("initialize model ...")
            model = GAReader(args.n_layers, data.vocab_size, data.n_chars,
                             args.gru_size, embed_dim, args.train_emb,
                             args.char_dim, args.use_feat, args.gating_fn,
                             True)
            model.build_graph(args.grad_clip, embed_init)
            init = tf.global_variables_initializer()
            saver = tf.train.Saver(tf.global_variables())
        else:
            model = GAReader(args.n_layers, data.vocab_size, data.n_chars,
                             args.gru_size, 100, args.train_emb, args.char_dim,
                             args.use_feat, args.gating_fn, True)

        with tf.Session(
                config=tf.ConfigProto(log_device_placement=False,
                                      allow_soft_placement=True)) as sess:
            # training phase
            if not args.resume:
                step = 0
                sess.run(init)
            else:
                step = int(
                    re.search('step_([0-9]+?)-(.*?)', args.ckpt).group(1))
                model.restore(sess, args.save_dir, args.ckpt)
                saver = tf.train.Saver(tf.global_variables())
            if args.init_test:
                logging.info('-' * 50)
                logging.info("Initial test ...")
                best_loss, best_acc = model.validate(sess, valid_batch_loader)
            else:
                best_acc = 0.

            logging.info('-' * 50)
            logging.info("Start training ...")
            train_writer = tf.summary.FileWriter('logs/train', sess.graph)
            while step < args.n_epoch * len(train_batch_loader):
                epoch = int(math.floor(step / len(train_batch_loader)))
                start = time.time()
                it = loss = acc = n_example = 0
                lr = args.init_learning_rate
                if epoch >= 2:
                    lr = args.init_learning_rate / 2**(epoch - 1)
                for dw, dt, qw, qt, a, m_dw, m_qw, tt, \
                        tm, c, m_c, cl, fnames in train_batch_loader:
                    step += 1
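                    # NOTE: the tf.summary.text/tf.summary.scalar calls inside this loop
                    # create new summary ops on every step, so the graph keeps growing
                    # as training runs.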
                    tf.summary.text(
                        'doc',
                        tf.constant(get_text(idx_to_word, dw[0], m_dw[0])))

                    if step % 1000 == 0:
                        logging.info('running train step with summary..')
                        loss_, acc_, summary = model.train(
                            sess, dw, dt, qw, qt, a, m_dw, m_qw, tt, tm, c,
                            m_c, cl, fnames, args.drop_out, lr, True)
                        train_writer.add_summary(summary, step)
                    else:
                        loss_, acc_ = model.train(sess, dw, dt, qw, qt, a,
                                                  m_dw, m_qw, tt, tm, c, m_c,
                                                  cl, fnames, args.drop_out,
                                                  lr)
                    loss += loss_
                    acc += acc_
                    it += 1
                    n_example += dw.shape[0]
                    tf.summary.scalar('train_loss', tf.constant(loss_))
                    tf.summary.scalar('train_accuracy', tf.constant(acc_))
                    if step % args.print_every == 0 or \
                            it % len(train_batch_loader) == 0:
                        spend = (time.time() - start) / 60
                        statement = "Epoch: {}, it: {} (max: {}), "\
                            .format(epoch, it, len(train_batch_loader))
                        statement += "loss: {:.3f}, acc: {:.3f}, "\
                            .format(loss / args.print_every,
                                    acc / n_example)
                        statement += "time: {:.1f}(m)"\
                            .format(spend)
                        logging.info(statement)
                        loss = acc = n_example = 0
                        start = time.time()
                    # save model
                    if step % args.eval_every == 0 or \
                            it % len(train_batch_loader) == 0:
                        valid_loss, valid_acc = model.validate(
                            sess, valid_batch_loader)
                        tf.summary.scalar('val_loss', tf.constant(valid_loss))
                        tf.summary.scalar('val_accuracy',
                                          tf.constant(valid_acc))
                        if valid_acc >= best_acc:
                            best_loss = valid_loss
                            best_acc = valid_acc
                            logging.info("Best valid acc: {}".format(best_acc))
                            model.save(sess, saver, args.save_dir, step,
                                       valid_acc, valid_loss)
                        start = time.time()
            train_writer.close()
            # test model
            logging.info("Final test ...")
            model.validate(sess, test_batch_loader, write_results=True)
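
Example #7 calls a helper get_text(idx_to_word, dw[0], m_dw[0]) that is not shown in these excerpts. A plausible stand-in, assuming dw[0] is a 1-D array of word indices and m_dw[0] a 0/1 mask of the same length (the repository's actual helper may differ):

def get_text(idx_to_word, word_ids, mask):
    # keep only positions whose mask entry is non-zero and map indices back to words
    return ' '.join(idx_to_word[int(i)] for i, m in zip(word_ids, mask) if m)
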
Example #8
# NOTE: make sure vocab.txt is already there!
data = dp.preprocess(DATASET, no_training_set=True)
inv_vocab = data.inv_dictionary

print("building minibatch loaders ...")
if 'CANDIDATE_SUBSET' not in locals():
    CANDIDATE_SUBSET = False
if dataset == 'validation':
    batch_loader_test = MiniBatchLoader.MiniBatchLoader(
        data.validation, 128, shuffle=False, candidate_subset=CANDIDATE_SUBSET)
elif dataset == 'test':
    batch_loader_test = MiniBatchLoader.MiniBatchLoader(
        data.test, 128, shuffle=False, candidate_subset=CANDIDATE_SUBSET)

print("building network ...")
m = GAReader.Model(K, data.vocab_size)

print("loading model from file...")
m.load_model(model_path)

print("predicting ...")

fid = open(output_path, 'w', 0)

pr = []
gt = []
for d, q, a, m_d, m_q, c, m_c, fnames in batch_loader_test:
    loss, acc, probs = m.validate(d, q, a, m_d, m_q, m_c)

    probs_sorted = np.argpartition(-probs, top_K - 1)[:, :top_K]
    predicted = map(lambda x: ' '.join(map(lambda i: inv_vocab[i], x)),
Example #9
def main(load_path, params, mode='test'):

    nhidden = params['nhidden']
    dropout = params['dropout']
    word2vec = params['word2vec']
    dataset = params['dataset']
    nlayers = params['nlayers']
    train_emb = params['train_emb']
    char_dim = params['char_dim']
    use_feat = params['use_feat']
    gating_fn = params['gating_fn']

    dp = DataPreprocessor.DataPreprocessor()
    data = dp.preprocess(dataset, no_training_set=True)
    inv_vocab = data.inv_dictionary

    print("building minibatch loaders ...")
    if mode == 'test':
        batch_loader_test = MiniBatchLoader.MiniBatchLoader(
            data.test, BATCH_SIZE)
    else:
        batch_loader_test = MiniBatchLoader.MiniBatchLoader(
            data.validation, BATCH_SIZE)

    print("building network ...")
    W_init, embed_dim = Helpers.load_word2vec_embeddings(
        data.dictionary[0], word2vec)
    m = GAReader.Model(nlayers,
                       data.vocab_size,
                       data.num_chars,
                       W_init,
                       nhidden,
                       embed_dim,
                       dropout,
                       train_emb,
                       char_dim,
                       use_feat,
                       gating_fn,
                       save_attn=True)
    m.load_model('%s/best_model.p' % load_path)

    print("testing ...")
    pr = np.zeros((len(batch_loader_test.questions),
                   batch_loader_test.max_num_cand)).astype('float32')
    fids, attns = [], []
    total_loss, total_acc, n = 0., 0., 0
    for dw, dt, qw, qt, a, m_dw, m_qw, tt, tm, c, m_c, cl, fnames in batch_loader_test:
        outs = m.validate(dw, dt, qw, qt, c, a, m_dw, m_qw, tt, tm, m_c, cl)
        loss, acc, probs = outs[:3]
        attns += [[fnames[0], probs[0, :]] +
                  [o[0, :, :] for o in outs[3:]]]  # store one attention

        bsize = dw.shape[0]
        total_loss += bsize * loss
        total_acc += bsize * acc

        pr[n:n + bsize, :] = probs
        fids += fnames
        n += bsize

    logger = open(load_path + '/log', 'a', 0)
    message = '%s Loss %.4e acc=%.4f' % (mode.upper(), total_loss / n,
                                         total_acc / n)
    print(message)
    logger.write(message + '\n')
    logger.close()

    np.save('%s/%s.probs' % (load_path, mode), np.asarray(pr))
    pkl.dump(attns, open('%s/%s.attns' % (load_path, mode), 'w'))
    f = open('%s/%s.ids' % (load_path, mode), 'w')
    for item in fids:
        f.write(item + '\n')
    f.close()
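
A minimal sketch of reading back the artifacts that Examples #6 and #9 write, assuming load_path and mode match an earlier run and the files were written by the same Python version (np.save appends a .npy extension to the .probs file):

import pickle
import numpy as np

load_path, mode = 'out', 'test'   # assumed to match an earlier run

probs = np.load('%s/%s.probs.npy' % (load_path, mode))
with open('%s/%s.attns' % (load_path, mode), 'rb') as f:
    attns = pickle.load(f)
with open('%s/%s.ids' % (load_path, mode)) as f:
    fids = [line.strip() for line in f]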