# Example #1
# 0
def main_request(debug_mode):
    """POST to URL and return (response body, session header dict, item count).

    The server is expected to answer with a JSON list and a 'Session'
    response header; that header value is echoed back as the auth header
    for subsequent requests.
    """
    response = requests.post(URL)

    body = response.json()
    headers = response.headers
    session_key = headers['Session']
    count = len(body)

    if debug_mode:
        utils.debug_data(body, headers)

    return body, {"Session": session_key}, count
def main():
    """Entry point for all training, evaluation, and model compression begins here.

    Dispatches on ``args.mode`` (one of 'train', 'dev', 'test', 'infer',
    'minimize'):
      * ``--minimize_graph``: write a compressed inference graph and exit.
      * 'dev' / 'test': run one evaluation pass and exit.
      * 'infer': run inference and exit.
      * 'train': full training loop with periodic BLEU/loss summaries,
        dev-set evaluation every 1000 steps, checkpoints every 5000 steps
        and at each epoch end, and a 10x learning-rate decay per epoch.
    """
    args = parse_arguments()
    # Pretrained embeddings plus the special-token ids used throughout.
    word_to_id, id_to_vocab, embeddings, start_id, end_id, unk_id, mask_id = load_sentence_embeddings(
    )
    vocab_size, embedding_size = embeddings.shape  # NOTE(review): unused below
    lr = args.lr  # decayed by 10x at the end of every epoch

    dataset = dataset_config()

    if args.mode not in set(['train', 'dev', 'test', 'infer', 'minimize']):
        raise ValueError("{} is not a valid mode".format(args.mode))

    with tf.Session() as sess:
        start = dt.datetime.now()  # timestamp shared by both log dirs below
        model = lstm_model(sess, args.mode, args.hidden_size, embeddings,
                           start_id, end_id, mask_id)

        # Saver object
        saver = tf.train.Saver()
        # NOTE(review): built but never used in this function.
        name_to_var_map = {var.op.name: var for var in tf.global_variables()}

        # Restore checkpoint
        if args.checkpoint:
            saver.restore(sess, args.checkpoint)

        # Save minimal graph
        if args.minimize_graph:
            compress_graph(sess, args, model)
            return

        # Load dataset only in train, dev, or test mode
        if args.mode in set(['train', 'dev', 'test']):
            logging.info("{}: Loading dataset into memory.".format(
                dt.datetime.now()))
            dataset_generator = ParaphraseDataset(dataset, args.batch_size,
                                                  embeddings, word_to_id,
                                                  start_id, end_id, unk_id,
                                                  mask_id)

        # Evaluate on dev or test
        if args.mode == 'dev' or args.mode == 'test':
            evaluate(sess, model, dataset_generator, args.mode, id_to_vocab)
            return

        # Perform inferencing
        if args.mode == 'infer':
            infer(sess, args, model, id_to_vocab, end_id)
            return

        ###################################
        # Training run proceeds from here #
        ###################################

        # Training summary writer
        train_logdir = os.path.join(args.log_dir,
                                    "train-" + start.strftime("%Y%m%d-%H%M%S"))
        train_writer = tf.summary.FileWriter(train_logdir)

        # Dev summary writer
        dev_logdir = os.path.join(args.log_dir,
                                  "dev-" + start.strftime("%Y%m%d-%H%M%S"))
        dev_writer = tf.summary.FileWriter(dev_logdir)

        # Smoothing avoids zero BLEU when higher-order n-gram counts are zero.
        chencherry = SmoothingFunction()
        global_step = 0
        tf.global_variables_initializer().run()
        # NOTE(review): presumably primes the graph / pins the sampling
        # temperature; semantics of model['dummy'] not visible here — confirm.
        sess.run(model['dummy'], {model['sampling_temperature']: 7.5})

        # Training per epoch
        for epoch in xrange(args.epochs):
            train_losses = []
            train_batch_generator = dataset_generator.generate_batch('train')
            for train_batch in train_batch_generator:
                seq_source_ids = train_batch['seq_source_ids']
                seq_source_words = train_batch['seq_source_words']
                seq_source_len = train_batch['seq_source_len']
                seq_ref_ids = train_batch['seq_ref_ids']
                seq_ref_words = train_batch['seq_ref_words']
                seq_ref_len = train_batch['seq_ref_len']

                feed_dict = {
                    model['lr']: lr,
                    model['seq_source_ids']: seq_source_ids,
                    model['seq_source_lengths']: seq_source_len,
                    model['seq_reference_ids']: seq_ref_ids,
                    model['seq_reference_lengths']: seq_ref_len,
                    model['keep_prob']: args.keep_prob
                }

                feeds = [
                    model['train_step'], model['loss'], model['predictions'],
                    model['summaries'], model['final_sequence_lengths']
                ]

                try:
                    _, batch_loss, predictions, summary, fsl = sess.run(
                        feeds, feed_dict)
                except Exception as e:
                    # Dump the offending batch before propagating the error.
                    debug_data(seq_source_ids, seq_ref_ids, seq_source_len,
                               seq_ref_len, id_to_vocab)
                    raise e

                train_losses.append(batch_loss)

                # Status update: summaries, train BLEU, and running mean loss
                # every 25 steps (including step 0).
                if global_step % 25 == 0:
                    train_writer.add_summary(summary, global_step)
                    train_writer.flush()
                    # corpus_bleu wants a list of reference *lists* per
                    # hypothesis, so wrap each single reference.
                    seq_ref_words = [[ref_words]
                                     for ref_words in seq_ref_words]
                    # Map predicted ids back to words, dropping unknown ids.
                    bleu_pred_words = [[
                        id_to_vocab[vocab_id] for vocab_id in prediction
                        if vocab_id in id_to_vocab
                    ] for prediction in predictions]
                    # Truncate each hypothesis at its first '<END>' token.
                    bleu_pred_words = [
                        pred_words[:pred_words.index('<END>') if '<END>' in
                                   pred_words else len(pred_words)]
                        for pred_words in bleu_pred_words
                    ]
                    bleu_score = corpus_bleu(
                        seq_ref_words,
                        bleu_pred_words,
                        smoothing_function=chencherry.method1)
                    summarize_scalar(train_writer, 'bleu_score', bleu_score,
                                     global_step)
                    # Mean loss over the epoch so far (not just this batch).
                    train_loss = sum(train_losses) / len(train_losses)
                    summarize_scalar(train_writer, 'loss', train_loss,
                                     global_step)
                    logging.info(
                        "step={} epoch={} batch_loss={:.4f} train_loss={:.4f} bleu={:.4f}"
                        .format(global_step, epoch, batch_loss, train_loss,
                                bleu_score))

                # Print predictions for this batch every 1000 steps
                # Evaluate on dev set
                if global_step % 1000 == 0 and global_step != 0:
                    debug_data(seq_source_ids, seq_ref_ids, seq_source_len,
                               seq_ref_len, id_to_vocab)
                    logging.info("PREDICTIONS!")
                    logging.info("final_seq_lengths: " + str(fsl))
                    logging.info("len(predictions): " + str(len(predictions)))
                    for prediction in predictions:
                        logging.info(
                            str(len(prediction)) + ' ' + ' '.join([
                                id_to_vocab[vocab_id]
                                for vocab_id in prediction
                                if vocab_id in id_to_vocab
                            ]))

                    dev_loss, bleu_score = evaluate(sess, model,
                                                    dataset_generator, 'dev',
                                                    id_to_vocab)
                    summarize_scalar(dev_writer, 'bleu_score', bleu_score,
                                     global_step)
                    summarize_scalar(dev_writer, 'loss', dev_loss, global_step)
                    dev_writer.flush()

                # Checkpoint every 5000 steps (skipping step 0).
                if global_step % 5000 == 0 and global_step != 0:
                    saver.save(sess,
                               os.path.join(train_logdir, 'model'),
                               global_step=global_step)

                global_step += 1
            # End train batch

            # Always checkpoint at the end of each epoch.
            saver.save(sess,
                       os.path.join(train_logdir, 'model'),
                       global_step=global_step)
            lr /= 10.
        # End epoch

        # Final evaluation on the held-out test set.
        evaluate(sess, model, dataset_generator, 'test', id_to_vocab)
def evaluate(sess, model, dataset_generator, mode, id_to_vocab):
    """Evaluate current model on the dev or test set.

    Args:
        sess: TensorFlow session
        model: dictionary containing model's tensors of interest for evaluation
        dataset_generator: dataset batch generator
        mode: 'dev' or 'test'
        id_to_vocab: vocabulary dictionary id -> word

    Returns:
        loss: the mean batch loss after evaluating the dataset
        bleu_score: corpus BLEU score after evaluation
    """

    batch_generator = dataset_generator.generate_batch(mode)
    # Smoothing avoids zero BLEU when higher-order n-gram counts are zero.
    chencherry = SmoothingFunction()
    batch_losses = []
    all_seq_ref_words = []
    all_bleu_pred_words = []

    for batch in batch_generator:
        seq_source_ids = batch['seq_source_ids']
        seq_source_len = batch['seq_source_len']
        seq_ref_ids = batch['seq_ref_ids']
        seq_ref_words = batch['seq_ref_words']
        seq_ref_len = batch['seq_ref_len']

        # No 'lr'/'keep_prob' feeds: evaluation runs the forward pass only.
        feed_dict = {
            model['seq_source_ids']: seq_source_ids,
            model['seq_source_lengths']: seq_source_len,
            model['seq_reference_ids']: seq_ref_ids,
            model['seq_reference_lengths']: seq_ref_len
        }

        feeds = [
            model['loss'], model['predictions'],
            model['final_sequence_lengths']
        ]

        try:
            batch_loss, predictions, fsl = sess.run(feeds, feed_dict)
        except Exception:
            # Dump the offending batch, then re-raise with the ORIGINAL
            # traceback intact (bare `raise` preserves it; `raise e` resets
            # it under Python 2).
            debug_data(seq_source_ids, seq_ref_ids, seq_source_len,
                       seq_ref_len, id_to_vocab)
            raise

        # batch losses
        batch_losses.append(batch_loss)

        # corpus_bleu expects a list of reference *lists* per hypothesis,
        # so wrap each single reference.
        seq_ref_words = [[ref_words] for ref_words in seq_ref_words]
        all_seq_ref_words.extend(seq_ref_words)

        # Map predicted ids back to words, dropping ids not in the vocab.
        bleu_pred_words = [[
            id_to_vocab[vocab_id] for vocab_id in prediction
            if vocab_id in id_to_vocab
        ] for prediction in predictions]
        # Truncate each hypothesis at its first '<END>' token, if present.
        bleu_pred_words = [
            pred_words[:pred_words.index('<END>') if '<END>' in
                       pred_words else len(pred_words)]
            for pred_words in bleu_pred_words
        ]
        all_bleu_pred_words.extend(bleu_pred_words)

    bleu_score = corpus_bleu(all_seq_ref_words,
                             all_bleu_pred_words,
                             smoothing_function=chencherry.method1)
    # NOTE(review): raises ZeroDivisionError if the generator yields no
    # batches — assumed non-empty by all callers; confirm upstream.
    loss = sum(batch_losses) / len(batch_losses)
    logging.info("{} : Evaluating on {} set loss={:.4f} bleu={:.4f}".format(
        dt.datetime.now(), mode, loss, bleu_score))
    return loss, bleu_score