Example 1
def score(source_data: str,
          target_data: str,
          load_from: str,
          corpus_average: bool,
          normalize: float = 0.6,
          **kwargs):
    """Scores a text using a trained translation model. See argument description in `bin/daikon`."""

    # fix batch size at 1 to get individual scores for sentences
    batch_size = 1

    source_vocab = Vocabulary()
    target_vocab = Vocabulary()
    source_vocab.load(os.path.join(load_from, C.SOURCE_VOCAB_FILENAME))
    target_vocab.load(os.path.join(load_from, C.TARGET_VOCAB_FILENAME))

    reader_ids = list(
        reader.read_parallel(source_data, target_data, source_vocab,
                             target_vocab, C.SCORE_MAX_LEN))

    encoder_inputs, decoder_targets, decoder_inputs, loss, _, _, _ = define_computation_graph(
        source_vocab.size, target_vocab.size, batch_size)

    saver = tf.train.Saver()

    with tf.Session() as session:
        # load model
        saver.restore(session, os.path.join(load_from, C.MODEL_FILENAME))

        losses = []
        total_iter = 0
        for x, y, z in reader.iterate(reader_ids, batch_size, shuffle=False):
            feed_dict = {
                encoder_inputs: x,
                decoder_inputs: y,
                decoder_targets: z
            }
            l = session.run([loss], feed_dict=feed_dict)

            # session.run returns a list of fetched values; take the loss (its only element)
            l = l[0]
            if normalize:
                # normalize by length of target sequence (including EOS token),
                # raised to the power `normalize`
                l /= np.power(y.shape[1], normalize)

            losses.append(l)
            total_iter += 1

        if corpus_average:
            total_loss = np.sum(losses)
            perplexity = np.exp(total_loss / total_iter)
            return perplexity
        else:
            return np.exp(losses)
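
A minimal usage sketch for the `score` function above, with placeholder file paths and model directory (these names are illustrative, not from the original project):

# Hypothetical invocation; "data/dev.src", "data/dev.trg" and "model" are placeholders.
sentence_perplexities = score(source_data="data/dev.src",
                              target_data="data/dev.trg",
                              load_from="model",
                              corpus_average=False,
                              normalize=0.6)
# With corpus_average=True a single corpus-level perplexity is returned instead.
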
Example 2
def score(source_data: str, target_data: str, load_from: str, corpus_average: bool, normalize: bool, **kwargs):
    """Scores a text using a trained translation model. See argument description in `bin/daikon`."""

    # fix batch size at 1 to get individual scores for sentences
    batch_size = 1

    source_vocab = Vocabulary()
    target_vocab = Vocabulary()
    source_vocab.load(os.path.join(load_from, C.SOURCE_VOCAB_FILENAME))
    target_vocab.load(os.path.join(load_from, C.TARGET_VOCAB_FILENAME))

    reader_ids = list(reader.read_parallel(source_data, target_data, source_vocab, target_vocab, C.SCORE_MAX_LEN))

    encoder_inputs, decoder_targets, decoder_inputs, loss, _, _, _ = define_computation_graph(source_vocab.size, target_vocab.size, batch_size)

    saver = tf.train.Saver()

    with tf.Session() as session:
        # load model
        saver.restore(session, os.path.join(load_from, C.MODEL_FILENAME))

        losses = []
        total_iter = 0
        for x, y, z in reader.iterate(reader_ids, batch_size, shuffle=False):
            feed_dict = {encoder_inputs: x,
                         decoder_inputs: y,
                         decoder_targets: z}
            l = session.run([loss], feed_dict=feed_dict)

            # session.run returns a list of fetched values; take the loss (its only element)
            l = l[0]
            if normalize:
                # normalize by length of target sequence (including EOS token)
                l /= y.shape[1]

            losses.append(l)
            total_iter += 1

        if corpus_average:
            total_loss = np.sum(losses)
            perplexity = np.exp(total_loss / total_iter)
            return perplexity
        else:
            return np.exp(losses)
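
The only substantive difference from Example 1 is the `normalize` argument: here it is a boolean and the sentence loss is divided by the raw target length, whereas Example 1 raises the length to a configurable power. A small self-contained sketch of both variants (the helper name `length_normalize` is illustrative, not part of the original code):

import numpy as np

def length_normalize(loss: float, target_len: int, alpha: float = 1.0) -> float:
    """Divide a sentence loss by target_len ** alpha.

    alpha = 1.0 matches the plain division in Example 2;
    alpha = 0.6 matches the default exponent in Example 1.
    """
    return loss / np.power(target_len, alpha)
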
Example 3
def train(source_data: str,
          target_data: str,
          epochs: int,
          batch_size: int,
          source_vocab_max_size: int,
          target_vocab_max_size: int,
          save_to: str,
          log_to: str,
          sample_after_epoch: bool,
          **kwargs) -> None:
    """Trains a language model. See argument description in `bin/romanesco`."""

    # create folders for model and logs if they don't exist yet
    for folder in [save_to, log_to]:
        if not os.path.exists(folder):
            os.makedirs(folder)

    logger.info("Creating vocabularies.")

    # create vocabulary to map words to ids, for source and target
    source_vocab = create_vocab(source_data, source_vocab_max_size, save_to, C.SOURCE_VOCAB_FILENAME)
    target_vocab = create_vocab(target_data, target_vocab_max_size, save_to, C.TARGET_VOCAB_FILENAME)

    logger.info("Source vocabulary: %s", source_vocab)
    logger.info("Target vocabulary: %s", target_vocab)

    # convert training data to list of word ids
    logger.info("Reading training data.")
    reader_ids = list(reader.read_parallel(source_data, target_data, source_vocab, target_vocab, C.MAX_LEN))

    # define computation graph
    logger.info("Building computation graph.")

    graph_components = define_computation_graph(source_vocab.size, target_vocab.size, batch_size)
    encoder_inputs, decoder_targets, decoder_inputs, loss, train_step, decoder_logits, summary = graph_components

    saver = tf.train.Saver()

    with tf.Session() as session:
        # init
        session.run(tf.global_variables_initializer())
        # write logs (@tensorboard)
        summary_writer = tf.summary.FileWriter(log_to, graph=tf.get_default_graph())

        logger.info("Starting training.")
        tic = time.time()
        num_batches = math.floor(len(reader_ids) / batch_size)

        prev_perplexity = 10000.0
        counter = 0
        # iterate over training data `epochs` times
        for epoch in range(1, epochs + 1):
            total_loss = 0.0
            total_iter = 0

            iter_tic = time.time()

            for x, y, z in reader.iterate(reader_ids, batch_size, shuffle=True):

                feed_dict = {encoder_inputs: x,
                             decoder_inputs: y,
                             decoder_targets: z}

                l, _, s = session.run([loss, train_step, summary],
                                      feed_dict=feed_dict)
                summary_writer.add_summary(s, total_iter)
                total_loss += l
                total_iter += 1
                if total_iter % C.LOGGING_INTERVAL == 0 or total_iter == num_batches:
                    iter_taken = time.time() - iter_tic
                    logger.debug("Epoch=%s, iteration=%s/%s, samples/second=%.2f", epoch, total_iter, num_batches, batch_size * C.LOGGING_INTERVAL / float(iter_taken))
                    iter_tic = time.time()
            perplexity = np.exp(total_loss / total_iter)
            logger.info("Perplexity on training data after epoch %s: %.2f", epoch, perplexity)
            saver.save(session, os.path.join(save_to, C.MODEL_FILENAME))

            if sample_after_epoch:
                # sample from model after epoch
                thread = threading.Thread(target=_sample_after_epoch, args=[reader_ids, source_vocab, target_vocab, save_to, epoch])
                thread.start()

            # early stopping: the counter goes up when training perplexity worsens
            # and down (to a floor of 0) when it improves; stop once it reaches 2
            if perplexity > prev_perplexity:
                counter += 1
            else:
                if counter != 0:
                    counter -= 1
            prev_perplexity = perplexity
            if counter == 2:
                taken = time.time() - tic
                m, s = divmod(taken, 60)
                h, m = divmod(m, 60)
                logger.info("Training finished early. Overall time taken to train: %d:%02d:%02d" % (h, m, s))
                return

        taken = time.time() - tic
        m, s = divmod(taken, 60)
        h, m = divmod(m, 60)

        logger.info("Training finished. Overall time taken to train: %d:%02d:%02d" % (h, m, s))
Example 4
def train(source_data: str,
          target_data: str,
          epochs: int,
          batch_size: int,
          source_vocab_max_size: int,
          target_vocab_max_size: int,
          save_to: str,
          log_to: str,
          sample_after_epoch: bool,
          source_val_data: str = None,
          target_val_data: str = None,
          val_epochs: int = C.VAL_EPOCHS,
          patience: int = C.PATIENCE,
          overwrite: bool = False,
          **kwargs) -> None:
    """Trains a translation model. See argument description in `bin/daikon`."""

    # enable early stopping if validation data files were specified
    early_stopping = source_val_data is not None and target_val_data is not None

    # create folders for model and logs if they don't exist yet
    for folder in [save_to, log_to]:
        if not os.path.exists(folder):
            os.makedirs(folder)

    if early_stopping:
        val_model_dir = os.path.join(save_to, C.VALIDATION_MODEL_DIR)
        if not os.path.exists(val_model_dir):
            os.makedirs(val_model_dir)

    # create a new graph if the overwrite option is enabled or there is no
    # existing model in the save_to directory
    checkpoint_file = os.path.join(save_to, C.MODEL_CHECKPOINT)
    initialize_graph = overwrite or not os.path.exists(checkpoint_file)

    source_vocab = Vocabulary()
    target_vocab = Vocabulary()

    if not initialize_graph:
        # load existing vocabulary that maps words to ids, for source and target
        logger.info("Loading vocabularies.")
        source_vocab.load(os.path.join(save_to, C.SOURCE_VOCAB_FILENAME))
        target_vocab.load(os.path.join(save_to, C.TARGET_VOCAB_FILENAME))
    else:
        # create vocabulary to map words to ids, for source and target
        logger.info("Creating vocabularies.")
        source_vocab = create_vocab(source_data, source_vocab_max_size,
                                    save_to, C.SOURCE_VOCAB_FILENAME)
        target_vocab = create_vocab(target_data, target_vocab_max_size,
                                    save_to, C.TARGET_VOCAB_FILENAME)

        logger.info("Source vocabulary: %s", source_vocab)
        logger.info("Target vocabulary: %s", target_vocab)

    if early_stopping:
        # create copies of vocabulary files used for checking validation
        # data performance
        source_vocab.save(os.path.join(val_model_dir, C.SOURCE_VOCAB_FILENAME))
        target_vocab.save(os.path.join(val_model_dir, C.TARGET_VOCAB_FILENAME))

    # convert training data to list of word ids
    logger.info("Reading training data.")
    reader_ids = list(
        reader.read_parallel(source_data, target_data, source_vocab,
                             target_vocab, C.MAX_LEN))

    # define computation graph
    logger.info("Building computation graph.")

    graph_components = define_computation_graph(source_vocab.size,
                                                target_vocab.size, batch_size)
    encoder_inputs, decoder_targets, decoder_inputs, loss, train_step, decoder_logits, summary = graph_components

    saver = tf.train.Saver()

    with tf.Session() as session:
        if initialize_graph:
            # init
            session.run(tf.global_variables_initializer())
        else:
            # load/restore model for further training
            saver.restore(session, os.path.join(save_to, C.MODEL_FILENAME))

        # write logs (@tensorboard)
        summary_writer = tf.summary.FileWriter(log_to,
                                               graph=tf.get_default_graph())

        logger.info("Starting training.")
        tic = time.time()
        num_batches = math.floor(len(reader_ids) / batch_size)

        if early_stopping:
            # initialize metrics for checking validation data performance
            best_val_loss = float("inf")
            epochs_without_improvement = 0

        # iterate over training data `epochs` times
        for epoch in range(1, epochs + 1):
            total_loss = 0.0
            total_iter = 0

            iter_tic = time.time()

            for x, y, z in reader.iterate(reader_ids, batch_size,
                                          shuffle=True):

                feed_dict = {
                    encoder_inputs: x,
                    decoder_inputs: y,
                    decoder_targets: z
                }

                l, _, s = session.run([loss, train_step, summary],
                                      feed_dict=feed_dict)
                summary_writer.add_summary(s, total_iter)
                total_loss += l
                total_iter += 1
                if total_iter % C.LOGGING_INTERVAL == 0 or total_iter == num_batches:
                    iter_taken = time.time() - iter_tic
                    logger.debug(
                        "Epoch=%s, iteration=%s/%s, samples/second=%.2f",
                        epoch, total_iter, num_batches,
                        batch_size * C.LOGGING_INTERVAL / float(iter_taken))
                    iter_tic = time.time()
            perplexity = np.exp(total_loss / total_iter)
            logger.info("Perplexity on training data after epoch %s: %.2f",
                        epoch, perplexity)

            save_model = True
            if early_stopping and epoch % val_epochs == 0:
                # save a copy of the current model that can be used to check
                # its performance for the validation data
                saver.save(session,
                           os.path.join(val_model_dir, C.MODEL_FILENAME))

                # spin off a thread to call score() for the validation data
                thread_pool = ThreadPool(processes=1)
                score_result = thread_pool.apply_async(
                    score, (source_val_data, target_val_data, val_model_dir,
                            True, False))
                latest_val_loss = score_result.get()
                logger.info(
                    "Current model perplexity on validation data: %.2f",
                    latest_val_loss)

                if latest_val_loss < best_val_loss:
                    logger.info(
                        "Lowest perplexity on validation data achieved")
                    best_val_loss = latest_val_loss
                    epochs_without_improvement = 0
                else:
                    save_model = False
                    epochs_without_improvement += 1
                    if epochs_without_improvement >= patience:
                        logger.info(
                            "No improvement in validation data perplexity for %d epochs: terminating training",
                            epochs_without_improvement)
                        return

            if save_model:
                saver.save(session, os.path.join(save_to, C.MODEL_FILENAME))

            if sample_after_epoch:
                # sample from model after epoch
                thread = threading.Thread(target=_sample_after_epoch,
                                          args=[
                                              reader_ids, source_vocab,
                                              target_vocab, save_to, epoch
                                          ])
                thread.start()

        taken = time.time() - tic
        m, s = divmod(taken, 60)
        h, m = divmod(m, 60)

        logger.info(
            "Training finished. Overall time taken to train: %d:%02d:%02d" %
            (h, m, s))
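
A hypothetical call that enables the validation-based early stopping shown above; all file paths and numeric values are placeholders, while the keyword names follow this `train` variant's signature:

train(source_data="data/train.src",
      target_data="data/train.trg",
      epochs=30,
      batch_size=64,
      source_vocab_max_size=50000,
      target_vocab_max_size=50000,
      save_to="model",
      log_to="logs",
      sample_after_epoch=False,
      source_val_data="data/dev.src",
      target_val_data="data/dev.trg",
      val_epochs=1,
      patience=3)
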
Example 5
def train(source_data: str, target_data: str, epochs: int, batch_size: int,
          vocab_max_size: int, save_to: str, log_to: str,
          sample_after_epoch: bool, **kwargs) -> None:
    """Trains a language model. See argument description in `bin/romanesco`."""

    # create folders for model and logs if they don't exist yet
    for folder in [save_to, log_to]:
        if not os.path.exists(folder):
            os.makedirs(folder)

    # create vocabulary to map words to ids, for source and target
    source_vocab = create_vocab(source_data, vocab_max_size, save_to,
                                C.SOURCE_VOCAB_FILENAME)
    target_vocab = create_vocab(target_data, vocab_max_size, save_to,
                                C.TARGET_VOCAB_FILENAME)

    # convert training data to list of word ids
    reader_ids = list(
        reader.read_parallel(source_data, target_data, source_vocab,
                             target_vocab, C.MAX_LEN))

    # define computation graph
    logging.info("Building computation graph.")

    graph_components = define_computation_graph(source_vocab.size,
                                                target_vocab.size, batch_size)
    encoder_inputs, decoder_targets, decoder_inputs, loss, train_step, decoder_logits, summary = graph_components

    saver = tf.train.Saver()

    with tf.Session() as session:
        # init
        session.run(tf.global_variables_initializer())
        # write logs (@tensorboard)
        summary_writer = tf.summary.FileWriter(log_to,
                                               graph=tf.get_default_graph())

        logging.info("Starting training.")

        # iterate over training data `epochs` times
        for epoch in range(1, epochs + 1):
            total_loss = 0.0
            total_iter = 0
            for x, y, z in reader.iterate(reader_ids, batch_size,
                                          shuffle=True):

                feed_dict = {
                    encoder_inputs: x,
                    decoder_inputs: y,
                    decoder_targets: z
                }

                l, _, s = session.run([loss, train_step, summary],
                                      feed_dict=feed_dict)
                summary_writer.add_summary(s, total_iter)
                total_loss += l
                total_iter += 1
                if total_iter % 100 == 0:
                    logging.debug("Epoch=%s, iteration=%s", epoch, total_iter)
            perplexity = np.exp(total_loss / total_iter)
            logging.info("Perplexity on training data after epoch %s: %.2f",
                         epoch, perplexity)
            saver.save(session, os.path.join(save_to, C.MODEL_FILENAME))

            if sample_after_epoch:
                # sample from model after epoch
                thread = threading.Thread(target=_sample_after_epoch,
                                          args=[
                                              reader_ids, source_vocab,
                                              target_vocab, save_to, epoch
                                          ])
                thread.start()

        logging.info("Training finished.")
Example 6
def train(source_data: str,
          target_data: str,
          epochs: int,
          batch_size: int,
          source_vocab_max_size: int,
          target_vocab_max_size: int,
          save_to: str,
          log_to: str,
          sample_after_epoch: bool,
          **kwargs) -> None:
    """Trains a language model. See argument description in `bin/romanesco`."""

    # create folders for model and logs if they don't exist yet
    for folder in [save_to, log_to]:
        if not os.path.exists(folder):
            os.makedirs(folder)

    logger.info("Creating vocabularies.")

    # create vocabulary to map words to ids, for source and target
    source_vocab = create_vocab(source_data, source_vocab_max_size, save_to, C.SOURCE_VOCAB_FILENAME)
    target_vocab = create_vocab(target_data, target_vocab_max_size, save_to, C.TARGET_VOCAB_FILENAME)

    logger.info("Source vocabulary: %s", source_vocab)
    logger.info("Target vocabulary: %s", target_vocab)

    # convert training data to list of word ids
    logger.info("Reading training data.")
    reader_ids = list(reader.read_parallel(source_data, target_data, source_vocab, target_vocab, C.MAX_LEN))

    # define computation graph
    logger.info("Building computation graph.")

    graph_components = define_computation_graph(source_vocab.size, target_vocab.size, batch_size)
    encoder_inputs, decoder_targets, decoder_inputs, loss, train_step, decoder_logits, summary = graph_components

    saver = tf.train.Saver()

    with tf.Session() as session:
        # init
        session.run(tf.global_variables_initializer())
        # write logs (@tensorboard)
        summary_writer = tf.summary.FileWriter(log_to, graph=tf.get_default_graph())

        logger.info("Starting training.")
        tic = time.time()
        num_batches = math.floor(len(reader_ids) / batch_size)

        # iterate over training data `epochs` times
        for epoch in range(1, epochs + 1):
            total_loss = 0.0
            total_iter = 0

            iter_tic = time.time()

            for x, y, z in reader.iterate(reader_ids, batch_size, shuffle=True):

                feed_dict = {encoder_inputs: x,
                             decoder_inputs: y,
                             decoder_targets: z}

                l, _, s = session.run([loss, train_step, summary],
                                      feed_dict=feed_dict)
                summary_writer.add_summary(s, total_iter)
                total_loss += l
                total_iter += 1
                if total_iter % C.LOGGING_INTERVAL == 0 or total_iter == num_batches:
                    iter_taken = time.time() - iter_tic
                    logger.debug("Epoch=%s, iteration=%s/%s, samples/second=%.2f", epoch, total_iter, num_batches, batch_size * C.LOGGING_INTERVAL / float(iter_taken))
                    iter_tic = time.time()
            perplexity = np.exp(total_loss / total_iter)
            logger.info("Perplexity on training data after epoch %s: %.2f", epoch, perplexity)
            saver.save(session, os.path.join(save_to, C.MODEL_FILENAME))

            if sample_after_epoch:
                # sample from model after epoch
                thread = threading.Thread(target=_sample_after_epoch, args=[reader_ids, source_vocab, target_vocab, save_to, epoch])
                thread.start()

        taken = time.time() - tic
        m, s = divmod(taken, 60)
        h, m = divmod(m, 60)

        logger.info("Training finished. Overall time taken to train: %d:%02d:%02d" % (h, m, s))