Пример #1
0
def main(in_trainset_file, in_devset_file, in_testset_file, in_model_folder,
         in_config_file, in_custom_vocab):
    """Build the datasets and train a CompatibleRNNAutoencoder.

    The configuration is read from a JSON file; its 'max_vocabulary_size'
    and 'max_sequence_length' entries are used here, 'vocabulary_size' is
    filled in from the resulting vocabulary, and the whole dictionary is
    forwarded to ``train`` as keyword arguments.

    Args:
        in_trainset_file: path given to ``load_txt`` for the training split.
        in_devset_file: path given to ``load_txt`` for the dev split.
        in_testset_file: path given to ``load_txt`` for the test split.
        in_model_folder: forwarded to ``train`` unchanged (presumably the
            folder the model is written to -- confirm against ``train``).
        in_config_file: path of the JSON configuration file.
        in_custom_vocab: optional path of a vocabulary file with one token
            per line; when ``None`` the vocabulary is built from the
            training utterances instead.
    """
    with open(in_config_file) as config_stream:
        config = json.load(config_stream)
    train_utterances, dev_utterances, test_utterances = [
        load_txt(path)
        for path in (in_trainset_file, in_devset_file, in_testset_file)
    ]

    if in_custom_vocab is None:
        vocab, rev_vocab = make_vocabulary(
            train_utterances,
            config['max_vocabulary_size'],
            special_tokens=[PAD, START, UNK, EOS])
    else:
        # The line number of each token in the file becomes its index.
        with open(in_custom_vocab) as vocab_stream:
            rev_vocab = [entry.rstrip() for entry in vocab_stream]
        vocab = {token: position for position, token in enumerate(rev_vocab)}
    config['vocabulary_size'] = len(vocab)

    def encode(utterances):
        # All three splits share the vocabulary and the length limit.
        return make_variational_autoencoder_dataset(
            utterances, vocab, config['max_sequence_length'])

    # Only the first and third of the four returned items are used.
    train_enc_inp, _, train_dec_out, _ = encode(train_utterances)
    dev_enc_inp, _, dev_dec_out, _ = encode(dev_utterances)
    # NOTE(review): the test split is encoded but never passed to train().
    test_enc_inp, _, test_dec_out, _ = encode(test_utterances)

    train_pair = (train_enc_inp, train_dec_out)
    dev_pair = (dev_enc_inp, dev_dec_out)
    with tf.Session() as sess:
        ae = CompatibleRNNAutoencoder(config, rev_vocab)
        sess.run(tf.global_variables_initializer())
        train(sess, ae, train_pair, dev_pair, in_model_folder, **config)
Пример #2
0
def main(in_trainset_file, in_devset_file, in_testset_file, in_config,
         in_model_folder):
    """Build the datasets and train a standalone VRAE model.

    Args:
        in_trainset_file: path given to ``load_txt`` for the training split.
        in_devset_file: path given to ``load_txt`` for the dev split.
        in_testset_file: path given to ``load_txt`` for the test split.
        in_config: configuration mapping. 'max_vocabulary_size' and
            'max_sequence_length' are read from it, and 'vocabulary_size'
            is written into it in place.
        in_model_folder: forwarded to ``train`` unchanged (presumably the
            folder the model is written to -- confirm against ``train``).
    """
    # The body previously referred to an unbound name ``config``; the
    # configuration is the ``in_config`` argument, so bind it explicitly.
    config = in_config

    train_utterances = load_txt(in_trainset_file)
    dev_utterances = load_txt(in_devset_file)
    test_utterances = load_txt(in_testset_file)

    vocab, rev_vocab = make_vocabulary(train_utterances,
                                       config['max_vocabulary_size'],
                                       frequency_threshold=0,
                                       ngram_sizes=(1, ))
    config['vocabulary_size'] = len(vocab)

    train_X = make_variational_autoencoder_dataset(
        train_utterances, vocab, config['max_sequence_length'])
    dev_X = make_variational_autoencoder_dataset(
        dev_utterances, vocab, config['max_sequence_length'])
    # NOTE(review): the test split is encoded but never passed to train().
    test_X = make_variational_autoencoder_dataset(
        test_utterances, vocab, config['max_sequence_length'])

    # NOTE: saving the vocabulary/config via save_model() is disabled here.
    with tf.Session() as sess:
        model = VRAE(config, rev_vocab, sess, standalone=True)
        train(model, train_X, dev_X, config, in_model_folder)
Пример #3
0
def main(in_trainset_file, in_devset_file, in_testset_file, in_model_folder,
         in_config_file):
    """Build the datasets and train an RNNVAE model.

    The configuration is read from a JSON file; its 'max_vocabulary_size'
    and 'max_sequence_length' entries are used here, 'vocabulary_size' is
    filled in from the resulting vocabulary, and the whole dictionary is
    forwarded to ``train`` as keyword arguments.

    Args:
        in_trainset_file: path given to ``load_txt`` for the training split.
        in_devset_file: path given to ``load_txt`` for the dev split.
        in_testset_file: path given to ``load_txt`` for the test split.
        in_model_folder: forwarded to ``train`` unchanged (presumably the
            folder the model is written to -- confirm against ``train``).
        in_config_file: path of the JSON configuration file.
    """
    with open(in_config_file) as config_stream:
        config = json.load(config_stream)
    train_utterances, dev_utterances, test_utterances = [
        load_txt(path)
        for path in (in_trainset_file, in_devset_file, in_testset_file)
    ]

    # The vocabulary is derived from the training utterances only.
    vocab, rev_vocab = make_vocabulary(train_utterances,
                                       config['max_vocabulary_size'])
    config['vocabulary_size'] = len(vocab)

    def encode(utterances):
        # All three splits share the vocabulary and the length limit.
        return make_autoencoder_dataset(utterances, vocab,
                                        config['max_sequence_length'])

    train_data = encode(train_utterances)
    dev_data = encode(dev_utterances)
    # NOTE(review): the test split is encoded but never passed to train().
    test_data = encode(test_utterances)

    with tf.Session() as sess:
        ae = RNNVAE(config, rev_vocab)
        sess.run(tf.global_variables_initializer())
        train(sess, ae, train_data, dev_data, in_model_folder, **config)
Пример #4
0
def main(in_model_folder, in_devset_file, in_evalset_file, in_decision_type):
    """Tune an AEOODDetector on the dev set and print its evalset score.

    Args:
        in_model_folder: folder given to ``CompatibleRNNAutoencoder.load``.
        in_devset_file: path given to ``load_txt`` for the dev utterances.
        in_evalset_file: JSON file read with ``pd.read_json``; its
            ``utterance`` and ``label`` columns are used.
        in_decision_type: forwarded to ``tune_threshold`` unchanged.
    """
    dev_utterances = load_txt(in_devset_file)
    evalset = pd.read_json(in_evalset_file)

    # Eval utterances are lower-cased and split on whitespace.
    eval_utterances = [text.lower().split() for text in evalset.utterance]

    with tf.Session() as sess:
        ae = CompatibleRNNAutoencoder.load(in_model_folder, sess)
        rev_vocab = ae.vocab
        config = ae.config
        vocab = {token: position for position, token in enumerate(rev_vocab)}

        def encode(utterances):
            # Both sets share the loaded vocabulary and length limit.
            return make_variational_autoencoder_dataset(
                utterances, vocab, config['max_sequence_length'])

        # Only the first and third of the four returned items are used.
        dev_enc_inp, _, dev_dec_out, _ = encode(dev_utterances)
        eval_enc_inp, _, eval_dec_out, _ = encode(eval_utterances)

        ae_ood = AEOODDetector(ae)
        ae_ood.tune_threshold((dev_enc_inp, dev_dec_out), sess,
                              in_decision_type)
        accuracy = evaluate(sess, ae_ood,
                            (eval_enc_inp, eval_dec_out, evalset.label))
        print('Detector accuracy on the evalset: {:.3f}'.format(accuracy))
Пример #5
0
def main(in_model_folder, in_devset_file, in_evalset_file, in_decision_type):
    """Tune a VAEOODDetector on the dev set and print its evalset score.

    Args:
        in_model_folder: folder given to ``RNNVAE.load``.
        in_devset_file: path given to ``load_txt`` for the dev utterances.
        in_evalset_file: JSON file read with ``pd.read_json``; its
            ``utterance`` and ``label`` columns are used.
        in_decision_type: forwarded to ``tune_threshold`` unchanged.
    """
    dev_utterances = load_txt(in_devset_file)
    evalset = pd.read_json(in_evalset_file)

    # Eval utterances are split on whitespace (no case folding here).
    eval_utterances = [text.split() for text in evalset.utterance]

    with tf.Session() as sess:
        vae = RNNVAE.load(in_model_folder, sess)
        rev_vocab = vae.vocab
        config = vae.config
        vocab = {token: position for position, token in enumerate(rev_vocab)}

        def encode(utterances):
            # Both sets share the loaded vocabulary and length limit.
            return make_autoencoder_dataset(utterances, vocab,
                                            config['max_sequence_length'])

        dev_data = encode(dev_utterances)
        eval_data = encode(eval_utterances)

        vae_ood = VAEOODDetector(vae)
        vae_ood.tune_threshold(dev_data, sess, in_decision_type)
        accuracy = evaluate(sess, vae_ood, (eval_data, evalset.label))
        print('Detector accuracy on the evalset: {:.3f}'.format(accuracy))
Пример #6
0
def main(in_model_folder, in_devset_file, in_testset_file, in_decision_type):
    """Tune an AEOODDetector on the dev set and print test-set predictions.

    Prints the tuned decision threshold, a header line, and then one
    tab-separated line per test utterance with its loss and prediction.

    Args:
        in_model_folder: folder given to ``RNNAutoencoder.load``.
        in_devset_file: path given to ``load_txt`` for the dev utterances.
        in_testset_file: JSON file read with ``pd.read_json``; its
            ``utterance`` and ``label`` columns are used.
        in_decision_type: forwarded to ``tune_threshold`` unchanged.
    """
    dev_utterances = load_txt(in_devset_file)
    testset = pd.read_json(in_testset_file)

    # Test utterances are split on whitespace (no case folding here).
    test_utterances = [text.split() for text in testset.utterance]

    with tf.Session() as sess:
        ae = RNNAutoencoder.load(in_model_folder, sess)
        rev_vocab = ae.vocab
        config = ae.config
        vocab = {token: position for position, token in enumerate(rev_vocab)}

        def encode(utterances):
            # Both sets share the loaded vocabulary and length limit.
            return make_dataset(utterances, vocab,
                                config['max_sequence_length'])

        dev_X, dev_masks = encode(dev_utterances)
        test_X, test_masks = encode(test_utterances)

        ae_ood = AEOODDetector(ae)
        ae_ood.tune_threshold((dev_X, dev_masks), sess, in_decision_type)
        print('Decision threshold: {:.3f}'.format(ae_ood.threshold))
        print('Utterance\tloss\tprediction')

        losses, predictions = predict(sess, ae_ood,
                                      (test_X, test_masks, testset.label))
        row_template = '{}\t{:.3f}\t{}'
        for tokens, loss, prediction in zip(test_utterances, losses,
                                            predictions):
            print(row_template.format(' '.join(tokens), loss, prediction))