Example No. 1
def _sample_after_epoch(reader_ids: List[reader.ReaderTuple],
                        source_vocab: Vocabulary, target_vocab: Vocabulary,
                        load_from: str, epoch: int) -> None:
    """
    Samples translations during training. Three sentences are picked at random,
    translated with the current model and logged.
    """
    input_lines, output_lines = zip(*random.sample(reader_ids, 3))

    input_lines = [
        " ".join(source_vocab.get_words(input_line))
        for input_line in input_lines
    ]
    output_lines = [
        " ".join(target_vocab.get_words(output_line))
        for output_line in output_lines
    ]
    translations = translate_lines(load_from=load_from,
                                   input_lines=input_lines,
                                   train_mode=True)

    logger.debug("Sampled translations after epoch %s.", epoch)
    for input_line, output_line, translation in zip(input_lines, output_lines,
                                                    translations):
        logger.debug("-" * 30)
        logger.debug("Input:\t\t%s", input_line)
        logger.debug("Predicted output:\t%s", translation)
        logger.debug("Actual output:\t%s", output_line)
    logger.debug("-" * 30)
Example No. 2
def translate_line(session: tf.Session, line: str,
                   source_vocab: vocab.Vocabulary,
                   target_vocab: vocab.Vocabulary, encoder_inputs: tf.Tensor,
                   decoder_inputs: tf.Tensor, decoder_targets: tf.Tensor,
                   decoder_logits: tf.Tensor) -> str:
    """
    Translates a single input string.
    """

    source_ids = np.array(source_vocab.get_ids(line.split())).reshape(1, -1)

    translated_ids = []  # type: List[int]

    for _ in range(C.TRANSLATION_MAX_LEN):

        # target ids will serve as decoder inputs and decoder targets,
        # but decoder targets will not be used to compute logits
        target_ids = np.array([C.BOS_ID] + translated_ids).reshape(1, -1)

        feed_dict = {
            encoder_inputs: source_ids,
            decoder_inputs: target_ids,
            decoder_targets: target_ids
        }
        logits_result = session.run([decoder_logits], feed_dict=feed_dict)

        # first session result, first item in batch, target symbol at last position
        next_symbol_logits = logits_result[0][0][-1]

        next_id = np.argmax(next_symbol_logits)

        # # get the id with the highest logit while suppressing the <unk> token
        # # get the ids of the two items with the highest values
        # ind_candidates = np.argpartition(next_symbol_logits, -2)[-2:]
        # ind_candidates = ind_candidates[np.argsort(next_symbol_logits[ind_candidates])]  # sort candidates
        # if ind_candidates[-1] != C.UNK_ID:
        #     next_id = ind_candidates[-1]
        # else:
        #     next_id = ind_candidates[-2]
        #

        if next_id in [C.EOS_ID, C.PAD_ID]:
            break

        translated_ids.append(next_id)

    words = target_vocab.get_words(translated_ids)

    return ' '.join(words)
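The commented-out block in this example describes one way to keep <unk> out of the greedy output: look at the two highest-scoring ids and fall back to the runner-up whenever the winner is <unk>. Below is a minimal, self-contained sketch of that idea; the toy logits and the UNK_ID value are invented for illustration and do not come from the model above.

import numpy as np

UNK_ID = 0  # hypothetical id of the <unk> token


def pick_non_unk(next_symbol_logits: np.ndarray) -> int:
    """Return the argmax id, or the runner-up if the argmax is <unk>."""
    # indices of the two highest logits, ordered ascending by score
    top_two = np.argpartition(next_symbol_logits, -2)[-2:]
    top_two = top_two[np.argsort(next_symbol_logits[top_two])]
    return int(top_two[-1]) if top_two[-1] != UNK_ID else int(top_two[-2])


# toy logits over a 5-word vocabulary; <unk> (id 0) happens to score highest
logits = np.array([2.3, 0.1, 1.9, -0.5, 0.7])
print(pick_non_unk(logits))  # -> 2, the best non-<unk> id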
Example No. 3
def translate_line(session: tf.Session,
                   line: str,
                   source_vocab: vocab.Vocabulary,
                   target_vocab: vocab.Vocabulary,
                   encoder_inputs: tf.Tensor,
                   decoder_inputs: tf.Tensor,
                   decoder_targets: tf.Tensor,
                   decoder_logits: tf.Tensor) -> str:
    """
    Translates a single input string.
    """

    source_ids = np.array(source_vocab.get_ids(line.split())).reshape(1, -1)

    translated_ids = []  # type: List[int]

    for _ in range(C.TRANSLATION_MAX_LEN):

        # target ids will serve as decoder inputs and decoder targets,
        # but decoder targets will not be used to compute logits
        target_ids = np.array([C.BOS_ID] + translated_ids).reshape(1, -1)

        feed_dict = {encoder_inputs: source_ids,
                     decoder_inputs: target_ids,
                     decoder_targets: target_ids}
        logits_result = session.run([decoder_logits], feed_dict=feed_dict)

        # first session result, first item in batch, target symbol at last position
        next_symbol_logits = logits_result[0][0][-1]
        next_id = np.argmax(next_symbol_logits)

        if next_id in [C.EOS_ID, C.PAD_ID]:
            break
            
        if next_id == C.UNK_ID:
            # fall back to the best-scoring non-<unk> token: mask the <unk>
            # logit and take the argmax again
            next_symbol_logits[C.UNK_ID] = -np.inf
            next_id = np.argmax(next_symbol_logits)

        translated_ids.append(next_id)

    words = target_vocab.get_words(translated_ids)

    return ' '.join(words)
Example No. 4
def translate_line(session: tf.Session, line: str,
                   source_vocab: vocab.Vocabulary,
                   target_vocab: vocab.Vocabulary, encoder_inputs: tf.Tensor,
                   decoder_inputs: tf.Tensor, decoder_targets: tf.Tensor,
                   decoder_logits: tf.Tensor) -> str:
    """
    Translates a single input string.
    """

    source_ids = np.array(source_vocab.get_ids(line.split())).reshape(1, -1)

    translated_ids = []  # type: List[int]

    for _ in range(C.TRANSLATION_MAX_LEN):

        # target ids will serve as decoder inputs and decoder targets,
        # but decoder targets will not be used to compute logits
        target_ids = np.array([C.BOS_ID] + translated_ids).reshape(1, -1)

        feed_dict = {
            encoder_inputs: source_ids,
            decoder_inputs: target_ids,
            decoder_targets: target_ids
        }
        logits_result = session.run([decoder_logits], feed_dict=feed_dict)

        # first session result, first item in batch, target symbol at last position
        next_symbol_logits = logits_result[0][0][-1]

        # Original token selection with argmax
        next_id = np.argmax(next_symbol_logits)

        # if argmax picks the unknown-word id, fall back to the second-best id
        if next_id == C.UNK_ID:
            next_id = np.argsort(next_symbol_logits)[-2]

        if next_id in [C.EOS_ID, C.PAD_ID]:
            break

        translated_ids.append(next_id)

    words = target_vocab.get_words(translated_ids)

    return ' '.join(words)
Example No. 5
def score(source_data: str,
          target_data: str,
          load_from: str,
          corpus_average: bool,
          normalize: float = 0.6,
          **kwargs):
    """Scores a text using a trained translation model. See argument description in `bin/daikon`."""

    # fix batch size at 1 to get individual scores for sentences
    batch_size = 1

    source_vocab = Vocabulary()
    target_vocab = Vocabulary()
    source_vocab.load(os.path.join(load_from, C.SOURCE_VOCAB_FILENAME))
    target_vocab.load(os.path.join(load_from, C.TARGET_VOCAB_FILENAME))

    reader_ids = list(
        reader.read_parallel(source_data, target_data, source_vocab,
                             target_vocab, C.SCORE_MAX_LEN))

    encoder_inputs, decoder_targets, decoder_inputs, loss, _, _, _ = define_computation_graph(
        source_vocab.size, target_vocab.size, batch_size)

    saver = tf.train.Saver()

    with tf.Session() as session:
        # load model
        saver.restore(session, os.path.join(load_from, C.MODEL_FILENAME))

        losses = []
        total_iter = 0
        for x, y, z in reader.iterate(reader_ids, batch_size, shuffle=False):
            feed_dict = {
                encoder_inputs: x,
                decoder_inputs: y,
                decoder_targets: z
            }
            l = session.run([loss], feed_dict=feed_dict)

            # unpack the single fetched value
            l = l[0]
            if normalize:
                # normalize by the target sequence length (including the EOS
                # token), raised to the power `normalize`
                l /= np.power(y.shape[1], normalize)

            losses.append(l)
            total_iter += 1

        if corpus_average:
            total_loss = np.sum(losses)
            perplexity = np.exp(total_loss / total_iter)
            return perplexity
        else:
            return np.exp(losses)
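To make the arithmetic in score() easy to check, here is a small stand-alone sketch of what happens once the per-sentence losses have come back from the session; the loss values and target lengths are made up for illustration.

import numpy as np

# toy per-sentence cross-entropy losses and target lengths (values made up)
raw_losses = [2.1, 3.4, 1.8]
target_lengths = [5, 9, 4]
normalize = 0.6

# mirror the loop above: optionally divide each loss by len(target) ** normalize
losses = [l / np.power(n, normalize) if normalize else l
          for l, n in zip(raw_losses, target_lengths)]

# corpus_average=True: perplexity from the average (normalized) loss per batch
corpus_perplexity = np.exp(np.sum(losses) / len(losses))

# corpus_average=False: one exponentiated score per sentence
sentence_scores = np.exp(losses)

print(corpus_perplexity, sentence_scores)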
Example No. 6
def _sample_after_epoch(reader_ids: List[reader.ReaderTuple],
                        source_vocab: Vocabulary,
                        target_vocab: Vocabulary,
                        load_from: str,
                        epoch: int) -> None:
    """
    Samples translations during training. Three sentences are picked at random,
    translated with the current model and logged.
    """
    input_lines, output_lines = zip(*random.sample(reader_ids, 3))

    input_lines = [" ".join(source_vocab.get_words(input_line)) for input_line in input_lines]
    output_lines = [" ".join(target_vocab.get_words(output_line)) for output_line in output_lines]
    translations = translate_lines(load_from=load_from, input_lines=input_lines, train_mode=True)

    logger.debug("Sampled translations after epoch %s.", epoch)
    for input_line, output_line, translation in zip(input_lines, output_lines, translations):
        logger.debug("-" * 30)
        logger.debug("Input:\t\t%s", input_line)
        logger.debug("Predicted output:\t%s", translation)
        logger.debug("Actual output:\t%s", output_line)
    logger.debug("-" * 30)
Example No. 7
def score(source_data: str, target_data: str, load_from: str, corpus_average: bool, normalize: bool, **kwargs):
    """Scores a text using a trained translation model. See argument description in `bin/daikon`."""

    # fix batch size at 1 to get individual scores for sentences
    batch_size = 1

    source_vocab = Vocabulary()
    target_vocab = Vocabulary()
    source_vocab.load(os.path.join(load_from, C.SOURCE_VOCAB_FILENAME))
    target_vocab.load(os.path.join(load_from, C.TARGET_VOCAB_FILENAME))

    reader_ids = list(reader.read_parallel(source_data, target_data, source_vocab, target_vocab, C.SCORE_MAX_LEN))

    encoder_inputs, decoder_targets, decoder_inputs, loss, _, _, _ = define_computation_graph(source_vocab.size, target_vocab.size, batch_size)

    saver = tf.train.Saver()

    with tf.Session() as session:
        # load model
        saver.restore(session, os.path.join(load_from, C.MODEL_FILENAME))

        losses = []
        total_iter = 0
        for x, y, z in reader.iterate(reader_ids, batch_size, shuffle=False):
            feed_dict = {encoder_inputs: x,
                         decoder_inputs: y,
                         decoder_targets: z}
            l = session.run([loss], feed_dict=feed_dict)

            # unpack the single fetched value
            l = l[0]
            if normalize:
                # normalize by length of target sequence (including EOS token)
                l /= y.shape[1]

            losses.append(l)
            total_iter += 1

        if corpus_average:
            total_loss = np.sum(losses)
            perplexity = np.exp(total_loss / total_iter)
            return perplexity
        else:
            return np.exp(losses)
Example No. 8
def train(source_data: str,
          target_data: str,
          epochs: int,
          batch_size: int,
          source_vocab_max_size: int,
          target_vocab_max_size: int,
          save_to: str,
          log_to: str,
          sample_after_epoch: bool,
          source_val_data: str = None,
          target_val_data: str = None,
          val_epochs: int = C.VAL_EPOCHS,
          patience: int = C.PATIENCE,
          overwrite: bool = False,
          **kwargs) -> None:
    """Trains a translation model. See argument description in `bin/daikon`."""

    # enable early stopping if validation data files were specified
    early_stopping = source_val_data is not None and target_val_data is not None

    # create folders for model and logs if they don't exist yet
    for folder in [save_to, log_to]:
        if not os.path.exists(folder):
            os.makedirs(folder)

    if early_stopping:
        val_model_dir = os.path.join(save_to, C.VALIDATION_MODEL_DIR)
        if not os.path.exists(val_model_dir):
            os.makedirs(val_model_dir)

    # create a new graph if the overwrite option is enabled or there is no
    # existing model in the save_to directory
    checkpoint_file = os.path.join(save_to, C.MODEL_CHECKPOINT)
    initialize_graph = overwrite or not os.path.exists(checkpoint_file)

    source_vocab = Vocabulary()
    target_vocab = Vocabulary()

    if not initialize_graph:
        # load existing vocabulary that maps words to ids, for source and target
        logger.info("Loading vocabularies.")
        source_vocab.load(os.path.join(save_to, C.SOURCE_VOCAB_FILENAME))
        target_vocab.load(os.path.join(save_to, C.TARGET_VOCAB_FILENAME))
    else:
        # create vocabulary to map words to ids, for source and target
        logger.info("Creating vocabularies.")
        source_vocab = create_vocab(source_data, source_vocab_max_size,
                                    save_to, C.SOURCE_VOCAB_FILENAME)
        target_vocab = create_vocab(target_data, target_vocab_max_size,
                                    save_to, C.TARGET_VOCAB_FILENAME)

        logger.info("Source vocabulary: %s", source_vocab)
        logger.info("Target vocabulary: %s", target_vocab)

    if early_stopping:
        # create copies of vocabulary files used for checking validation
        # data performance
        source_vocab.save(os.path.join(val_model_dir, C.SOURCE_VOCAB_FILENAME))
        target_vocab.save(os.path.join(val_model_dir, C.TARGET_VOCAB_FILENAME))

    # convert training data to list of word ids
    logger.info("Reading training data.")
    reader_ids = list(
        reader.read_parallel(source_data, target_data, source_vocab,
                             target_vocab, C.MAX_LEN))

    # define computation graph
    logger.info("Building computation graph.")

    graph_components = define_computation_graph(source_vocab.size,
                                                target_vocab.size, batch_size)
    encoder_inputs, decoder_targets, decoder_inputs, loss, train_step, decoder_logits, summary = graph_components

    saver = tf.train.Saver()

    with tf.Session() as session:
        if initialize_graph:
            # init
            session.run(tf.global_variables_initializer())
        else:
            # load/restore model for further training
            saver.restore(session, os.path.join(save_to, C.MODEL_FILENAME))

        # write logs (@tensorboard)
        summary_writer = tf.summary.FileWriter(log_to,
                                               graph=tf.get_default_graph())

        logger.info("Starting training.")
        tic = time.time()
        num_batches = math.floor(len(reader_ids) / batch_size)

        if early_stopping:
            # initialize metrics for checking validation data performance
            best_val_loss = float("inf")
            epochs_without_improvement = 0

        # iterate over training data `epochs` times
        for epoch in range(1, epochs + 1):
            total_loss = 0.0
            total_iter = 0

            iter_tic = time.time()

            for x, y, z in reader.iterate(reader_ids, batch_size,
                                          shuffle=True):

                feed_dict = {
                    encoder_inputs: x,
                    decoder_inputs: y,
                    decoder_targets: z
                }

                l, _, s = session.run([loss, train_step, summary],
                                      feed_dict=feed_dict)
                summary_writer.add_summary(s, total_iter)
                total_loss += l
                total_iter += 1
                if total_iter % C.LOGGING_INTERVAL == 0 or total_iter == num_batches:
                    iter_taken = time.time() - iter_tic
                    logger.debug(
                        "Epoch=%s, iteration=%s/%s, samples/second=%.2f",
                        epoch, total_iter, num_batches,
                        batch_size * C.LOGGING_INTERVAL / float(iter_taken))
                    iter_tic = time.time()
            perplexity = np.exp(total_loss / total_iter)
            logger.info("Perplexity on training data after epoch %s: %.2f",
                        epoch, perplexity)

            save_model = True
            if early_stopping and epoch % val_epochs == 0:
                # save a copy of the current model that can be used to check
                # its performance for the validation data
                saver.save(session,
                           os.path.join(val_model_dir, C.MODEL_FILENAME))

                # run score() for the validation data in a worker thread
                with ThreadPool(processes=1) as thread_pool:
                    latest_val_loss = thread_pool.apply_async(
                        score, (source_val_data, target_val_data,
                                val_model_dir, True, False)).get()
                logger.info(
                    "Current model perplexity on validation data: %.2f",
                    latest_val_loss)

                if latest_val_loss < best_val_loss:
                    logger.info(
                        "New lowest perplexity on validation data achieved.")
                    best_val_loss = latest_val_loss
                    epochs_without_improvement = 0
                else:
                    save_model = False
                    epochs_without_improvement += 1
                    if epochs_without_improvement >= patience:
                        logger.info(
                            "No improvement in validation data perplexity for %d epochs: terminating training.",
                            epochs_without_improvement)
                        return

            if save_model:
                saver.save(session, os.path.join(save_to, C.MODEL_FILENAME))

            if sample_after_epoch:
                # sample from model after epoch
                thread = threading.Thread(target=_sample_after_epoch,
                                          args=[
                                              reader_ids, source_vocab,
                                              target_vocab, save_to, epoch
                                          ])
                thread.start()

        taken = time.time() - tic
        m, s = divmod(taken, 60)
        h, m = divmod(m, 60)

        logger.info(
            "Training finished. Overall time taken to train: %d:%02d:%02d" %
            (h, m, s))
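To follow the early-stopping behaviour in isolation, here is a short sketch of the patience bookkeeping the loop above performs after each validation run; the loss sequence and the PATIENCE value are invented for illustration.

# stand-in values, not taken from any real training run
PATIENCE = 3
val_losses = [12.4, 11.1, 11.3, 11.2, 11.0, 11.4, 11.5, 11.6]

best_val_loss = float("inf")
epochs_without_improvement = 0

for epoch, latest_val_loss in enumerate(val_losses, start=1):
    if latest_val_loss < best_val_loss:
        best_val_loss = latest_val_loss
        epochs_without_improvement = 0      # the model would be saved here
    else:
        epochs_without_improvement += 1     # keep the previous best model
        if epochs_without_improvement >= PATIENCE:
            print("stopping after epoch %d" % epoch)
            break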
Example No. 9
def translate_line(session: tf.Session, line: str,
                   source_vocab: vocab.Vocabulary,
                   target_vocab: vocab.Vocabulary, encoder_inputs: tf.Tensor,
                   decoder_inputs: tf.Tensor, decoder_targets: tf.Tensor,
                   decoder_logits: tf.Tensor) -> str:
    """
    Translates a single input string.
    """

    source_ids = np.array(source_vocab.get_ids(line.split())).reshape(1, -1)

    #print(line)

    # instead of a single list of ids, keep dictionaries that map the
    # probability of a hypothesis to its sequence of target ids
    #~ translated_ids = []  # type: List[int]

    # number of beams
    k = 5

    # active (unfinished) hypotheses: probability -> sequence of target ids
    sent_dict = {}

    # finished hypotheses (ending in <EOS> or <PAD>)
    finished_sent_dict = {}

    for _ in range(C.TRANSLATION_MAX_LEN):

        # target ids will serve as decoder inputs and decoder targets,
        # but decoder targets will not be used to compute logits

        potential_sentences = {}

        if k == 0:
            break

        if len(sent_dict) == 0:
            target_ids = np.array([C.BOS_ID]).reshape(1, -1)

            feed_dict = {
                encoder_inputs: source_ids,
                decoder_inputs: target_ids,
                decoder_targets: target_ids
            }
            logits_result = session.run([decoder_logits], feed_dict=feed_dict)

            next_symbol_logits = softmax(logits_result[0][0][-1])

            potential_next_ids = []

            # take the k highest-scoring ids: argmax, then mask the chosen id
            # so the next argmax returns a different one (deleting the entry
            # would shift the remaining indices)
            for __ in range(k):
                next_id = np.argmax(next_symbol_logits)
                next_id_value = next_symbol_logits[next_id]
                potential_next_ids.append((next_id, next_id_value))
                next_symbol_logits[next_id] = -np.inf

            #print("POTENTIAL NEXTS", potential_next_ids)
            #print(target_vocab.get_words([x[0] for x in potential_next_ids]))

            #print("POTENTIAL START", potential_next_ids)

            for new_id in potential_next_ids:
                # new_id is a (token id, probability) pair
                if new_id[0] not in [C.EOS_ID, C.PAD_ID]:
                    sent_dict[new_id[1]] = (new_id[0], )
            #print("START", sent_dict)

        else:
            for prob, sent in sent_dict.items():
                target_ids = np.array([C.BOS_ID] + list(sent)).reshape(1, -1)

                feed_dict = {
                    encoder_inputs: source_ids,
                    decoder_inputs: target_ids,
                    decoder_targets: target_ids
                }
                logits_result = session.run([decoder_logits],
                                            feed_dict=feed_dict)

                # first session result, first item in batch, target symbol at last position
                next_symbol_logits = softmax(logits_result[0][0][-1])
                # retrieve the k highest-scoring ids: repeatedly take the
                # argmax and mask the chosen id so the next argmax returns a
                # different one (deleting the entry would shift the indices)
                #~ next_id = np.argmax(next_symbol_logits)
                potential_next_ids = []

                for __ in range(k):
                    next_id = np.argmax(next_symbol_logits)
                    next_id_value = next_symbol_logits[next_id]
                    potential_next_ids.append((next_id, next_id_value))
                    next_symbol_logits[next_id] = -np.inf

                #print("POTENTIAL NEXTS", potential_next_ids)
                #print(target_vocab.get_words([x[0] for x in potential_next_ids]))

                for new_id in potential_next_ids:
                    #print(sent)
                    #print(new_id)
                    new_sent = list(sent)
                    new_sent.append(new_id[0])
                    #print(new_sent)
                    new_value = prob * new_id[1]
                    potential_sentences[new_value] = new_sent

            # clear sent dict for the next loop
            sent_dict = {}
            # decide which k sentences are taken
            potential_sentences = sorted(potential_sentences.items(),
                                         reverse=True)[:k]
            #print("CHOSEN:", potential_sentences)
            for val, sent in potential_sentences:
                #print(sent)
                # if ending in <EOS>, add to finished
                if sent[-1] in [C.EOS_ID, C.PAD_ID]:
                    finished_sent_dict[val] = sent
                    k -= 1
                # else continue
                else:
                    sent_dict[val] = sent

            #print("CHOSEN", sent_dict)

    # length-normalize the finished hypotheses: log-probability divided by
    # len(sentence) ** alpha, with alpha = 0.65
    norm_dict = {}
    for val, sent in finished_sent_dict.items():
        if len(sent) > 0:
            val = np.log10(val) / len(sent)**0.65
            norm_dict[val] = sent

    #print("LEN_NORM", norm_dict)

    # only return our best translation
    try:
        best_sent = sorted(norm_dict.items(), reverse=True)[0][1]
    except IndexError:
        # no hypothesis finished within the length limit
        print("empty line...")
        print(norm_dict)
        best_sent = []

    #print("BEST", best_sent)

    words = target_vocab.get_words(best_sent)

    #print("WORDS", words)

    return ' '.join(words)
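The argmax-and-mask loop above is a hand-rolled top-k; numpy can also produce the k best (id, probability) pairs directly, and the same length normalization can be applied to a finished hypothesis. The sketch below is only an illustration: k, alpha and the toy distribution are invented, not taken from the model.

import numpy as np


def top_k_ids(probs: np.ndarray, k: int):
    """Return the k highest-probability (id, probability) pairs, best first."""
    ids = np.argpartition(probs, -k)[-k:]    # k best ids, in no particular order
    ids = ids[np.argsort(probs[ids])][::-1]  # reorder them best first
    return [(int(i), float(probs[i])) for i in ids]


def length_normalized(prob: float, length: int, alpha: float = 0.65) -> float:
    """Log-probability divided by length ** alpha, as in the code above."""
    return np.log10(prob) / length ** alpha


# toy next-symbol distribution over a 6-word vocabulary
probs = np.array([0.05, 0.4, 0.1, 0.25, 0.15, 0.05])
print(top_k_ids(probs, k=3))               # [(1, 0.4), (3, 0.25), (4, 0.15)]
print(length_normalized(0.001, length=7))  # normalized score of a 7-token hypothesis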