Example #1
def main(_):
    assert FLAGS.checkpoint_dir, "--checkpoint_dir is required."
    assert FLAGS.source_test_path, "--source_test_path is required."
    assert FLAGS.target_test_path, "--target_test_path is required."
    assert FLAGS.reference_test_path, "--reference_test_path is required."
    assert FLAGS.source_vocab_path, "--source_vocab_path is required."
    assert FLAGS.target_vocab_path, "--target_vocab_path is required."

    # Read vocabularies.
    source_vocab, _ = utils.initialize_vocabulary(FLAGS.source_vocab_path)
    target_vocab, _ = utils.initialize_vocabulary(FLAGS.target_vocab_path)

    # Read test set.
    source_sentences, target_sentences, references = utils.read_data_with_ref(
        FLAGS.source_test_path, FLAGS.target_test_path,
        FLAGS.reference_test_path)

    # Convert sentences to token ids sequences.
    source_sentences_ids = [
        utils.sentence_to_token_ids(sent, source_vocab, FLAGS.max_seq_length)
        for sent in source_sentences
    ]
    target_sentences_ids = [
        utils.sentence_to_token_ids(sent, target_vocab, FLAGS.max_seq_length)
        for sent in target_sentences
    ]

    utils.reset_graph()
    with tf.Session() as sess:
        # Restore saved model.
        utils.restore_model(sess, FLAGS.checkpoint_dir)

        # Recover placeholders and ops for evaluation.
        x_source = sess.graph.get_tensor_by_name("x_source:0")
        source_seq_length = sess.graph.get_tensor_by_name(
            "source_seq_length:0")

        x_target = sess.graph.get_tensor_by_name("x_target:0")
        target_seq_length = sess.graph.get_tensor_by_name(
            "target_seq_length:0")

        labels = sess.graph.get_tensor_by_name("labels:0")

        placeholders = [
            x_source, source_seq_length, x_target, target_seq_length, labels
        ]

        probs = sess.graph.get_tensor_by_name("feed_forward/output/probs:0")

        # Run evaluation.
        evaluate(sess, source_sentences, target_sentences, references,
                 source_sentences_ids, target_sentences_ids, probs,
                 placeholders)
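
The `evaluate` helper called above is not part of this example. As a rough illustration of how the recovered placeholders and the `probs` tensor can be fed, here is a minimal, hypothetical scoring helper; the padding scheme, the `pad_id` default, and the dummy `labels` feed are assumptions rather than the repository's actual implementation.

import numpy as np

def score_batch(sess, probs, placeholders, source_ids, target_ids, pad_id=0):
    # Unpack the placeholders in the same order they were collected above.
    x_source, source_seq_length, x_target, target_seq_length, labels = placeholders

    # Pad every id sequence to the length of the longest one in the batch.
    # Assumes each sequence is a plain Python list of token ids.
    def pad(batch):
        max_len = max(len(seq) for seq in batch)
        padded = np.array([seq + [pad_id] * (max_len - len(seq)) for seq in batch])
        lengths = np.array([len(seq) for seq in batch])
        return padded, lengths

    source_batch, source_lengths = pad(source_ids)
    target_batch, target_lengths = pad(target_ids)

    # The labels placeholder exists in the graph but is unused when only
    # fetching probabilities; feed zeros to keep the feed_dict complete.
    dummy_labels = np.zeros(len(source_ids), dtype=np.float32)

    scores = sess.run(probs, feed_dict={
        x_source: source_batch,
        source_seq_length: source_lengths,
        x_target: target_batch,
        target_seq_length: target_lengths,
        labels: dummy_labels,
    })
    # probs is squeezed in the graph, so a batch of one yields a scalar.
    return np.atleast_1d(scores)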
Example #2
def main(_):
    assert FLAGS.checkpoint_dir, "--checkpoint_dir is required."
    assert FLAGS.extract_dir, "--extract_dir is required."
    assert FLAGS.source_vocab_path, "--source_vocab_path is required."
    assert FLAGS.target_vocab_path, "--target_vocab_path is required."
    assert FLAGS.source_output_path, "--source_output_path is required."
    assert FLAGS.target_output_path, "--target_output_path is required."
    assert FLAGS.score_output_path, "--score_output_path is required."
    assert FLAGS.source_language, "--source_language is required."
    assert FLAGS.target_language, "--target_language is required."

    # Read vocabularies.
    source_vocab, _ = utils.initialize_vocabulary(FLAGS.source_vocab_path)
    target_vocab, _ = utils.initialize_vocabulary(FLAGS.target_vocab_path)

    source_vocab_words = read_vocabulary(FLAGS.source_vocab_path)
    target_vocab_words = read_vocabulary(FLAGS.target_vocab_path)

    utils.reset_graph()
    with tf.Session() as sess:
        # Restore saved model.
        utils.restore_model(sess, FLAGS.checkpoint_dir)

        # Recover placeholders and ops for extraction.
        x_source = sess.graph.get_tensor_by_name("x_source:0")
        source_seq_length = sess.graph.get_tensor_by_name(
            "source_seq_length:0")

        x_target = sess.graph.get_tensor_by_name("x_target:0")
        target_seq_length = sess.graph.get_tensor_by_name(
            "target_seq_length:0")

        labels = sess.graph.get_tensor_by_name("labels:0")

        placeholders = [
            x_source, source_seq_length, x_target, target_seq_length, labels
        ]

        probs = sess.graph.get_tensor_by_name("feed_forward/output/probs:0")

        with open(FLAGS.source_output_path, mode="w", encoding="utf-8") as source_output_file, \
                open(FLAGS.target_output_path, mode="w", encoding="utf-8") as target_output_file, \
                open(FLAGS.score_output_path, mode="w", encoding="utf-8") as score_output_file:

            # List source and target article paths in the extraction directory.
            source_paths = []
            target_paths = []
            for file_name in os.listdir(FLAGS.extract_dir):
                if file_name.endswith(FLAGS.source_language):
                    source_paths.append(os.path.join(FLAGS.extract_dir, file_name))
                elif file_name.endswith(FLAGS.target_language):
                    target_paths.append(os.path.join(FLAGS.extract_dir, file_name))
            source_paths.sort()
            target_paths.sort()

            # Score every combination of source and target articles.
            for source_path, target_path in itertools.product(
                    source_paths, target_paths):
                # Read sentences from articles.
                source_sentences, target_sentences = read_articles(
                    source_path, target_path)

                # Convert sentences to token ids sequences.
                source_sentences_ids = [
                    utils.sentence_to_token_ids(sent, source_vocab,
                                                FLAGS.max_seq_length)
                    for sent in source_sentences
                ]
                target_sentences_ids = [
                    utils.sentence_to_token_ids(sent, target_vocab,
                                                FLAGS.max_seq_length)
                    for sent in target_sentences
                ]

                # Extract sentence pairs.
                pairs = extract_pairs(sess, source_sentences, target_sentences,
                                      source_sentences_ids,
                                      target_sentences_ids, probs,
                                      placeholders)
                if not pairs:
                    continue
                for source_sentence, target_sentence, score in pairs:
                    source_output_file.write(source_sentence)
                    target_output_file.write(target_sentence)
                    score_output_file.write(str(score) + "\n")
Example #3
    def build_graph(self):
        # Reset previous graph.
        reset_graph()

        # Placeholders.
        x_source = tf.placeholder(tf.int32,
                                  shape=[None, None],
                                  name="x_source")

        source_seq_length = tf.placeholder(tf.int32,
                                           shape=[None],
                                           name="source_seq_length")

        x_target = tf.placeholder(tf.int32,
                                  shape=[None, None],
                                  name="x_target")

        target_seq_length = tf.placeholder(tf.int32,
                                           shape=[None],
                                           name="target_seq_length")

        labels = tf.placeholder(tf.float32,
                                shape=[None],
                                name="labels")

        input_dropout = tf.placeholder_with_default(1.0,
                                                    shape=[],
                                                    name="input_dropout")

        output_dropout = tf.placeholder_with_default(1.0,
                                                     shape=[],
                                                     name="output_dropout")

        decision_threshold = tf.placeholder_with_default(0.5,
                                                         shape=[],
                                                         name="decision_threshold")

        # Embedding layer.
        with tf.variable_scope("embeddings"):
            if self.config.source_embeddings_path is not None and self.config.target_embeddings_path is not None:
                source_pretrained_embeddings,\
                target_pretrained_embeddings = get_pretrained_embeddings(
                    self.config.source_embeddings_path,
                    self.config.target_embeddings_path,
                    source_vocab,
                    target_vocab)
                assert source_pretrained_embeddings.shape[1] == target_pretrained_embeddings.shape[1]
                self.config.embedding_size = source_pretrained_embeddings.shape[1]
                if self.config.fix_pretrained:
                    source_embeddings = tf.get_variable(
                        name="source_embeddings_matrix",
                        shape=[self.config.source_vocab_size, self.config.embedding_size],
                        initializer=tf.constant_initializer(source_pretrained_embeddings),
                        trainable=False)
                    target_embeddings = tf.get_variable(
                        name="target_embeddings_matrix",
                        shape=[self.config.target_vocab_size, self.config.embedding_size],
                        initializer=tf.constant_initializer(target_pretrained_embeddings),
                        trainable=False)
                else:
                    source_embeddings = tf.get_variable(
                        name="source_embeddings_matrix",
                        shape=[self.config.source_vocab_size, self.config.embedding_size],
                        initializer=tf.constant_initializer(source_pretrained_embeddings))
                    target_embeddings = tf.get_variable(
                        name="target_embeddings_matrix",
                        shape=[self.config.target_vocab_size, self.config.embedding_size],
                        initializer=tf.constant_initializer(target_pretrained_embeddings))
            else:
                source_embeddings = tf.get_variable(
                    name="source_embeddings_matrix",
                    shape=[self.config.source_vocab_size, self.config.embedding_size])
                target_embeddings = tf.get_variable(
                    name="target_embeddings_matrix",
                    shape=[self.config.target_vocab_size, self.config.embedding_size])

            source_rnn_inputs = tf.nn.embedding_lookup(source_embeddings, x_source)
            target_rnn_inputs = tf.nn.embedding_lookup(target_embeddings, x_target)
            source_rnn_inputs = tf.nn.dropout(source_rnn_inputs,
                                              keep_prob=input_dropout,
                                              name="source_seq_embeddings")
            target_rnn_inputs = tf.nn.dropout(target_rnn_inputs,
                                              keep_prob=input_dropout,
                                              name="target_seq_embeddings")

        # BiRNN encoder.
        with tf.variable_scope("birnn") as scope:
            if self.config.use_lstm:
                cell_fw = tf.nn.rnn_cell.LSTMCell(self.config.state_size, use_peepholes=True)
                cell_bw = tf.nn.rnn_cell.LSTMCell(self.config.state_size, use_peepholes=True)
            else:
                cell_fw = tf.nn.rnn_cell.GRUCell(self.config.state_size)
                cell_bw = tf.nn.rnn_cell.GRUCell(self.config.state_size)

            cell_fw = tf.nn.rnn_cell.DropoutWrapper(cell_fw, output_keep_prob=output_dropout)
            cell_bw = tf.nn.rnn_cell.DropoutWrapper(cell_bw, output_keep_prob=output_dropout)

            if self.config.num_layers > 1:
                if self.config.use_lstm:
                    cell_fw = tf.nn.rnn_cell.MultiRNNCell([tf.nn.rnn_cell.LSTMCell(self.config.state_size,
                                                                                   use_peepholes=True)
                                                           for _ in range(self.config.num_layers)])
                    cell_bw = tf.nn.rnn_cell.MultiRNNCell([tf.nn.rnn_cell.LSTMCell(self.config.state_size,
                                                                                   use_peepholes=True)
                                                           for _ in range(self.config.num_layers)])
                else:
                    cell_fw = tf.nn.rnn_cell.MultiRNNCell([tf.nn.rnn_cell.GRUCell(self.config.state_size)
                                                           for _ in range(self.config.num_layers)])
                    cell_bw = tf.nn.rnn_cell.MultiRNNCell([tf.nn.rnn_cell.GRUCell(self.config.state_size)
                                                           for _ in range(self.config.num_layers)])

            with tf.variable_scope(scope):
                source_rnn_outputs, source_final_state = tf.nn.bidirectional_dynamic_rnn(
                    cell_fw=cell_fw,
                    cell_bw=cell_bw,
                    inputs=source_rnn_inputs,
                    sequence_length=source_seq_length,
                    dtype=tf.float32)

            with tf.variable_scope(scope, reuse=True):
                target_rnn_outputs, target_final_state = tf.nn.bidirectional_dynamic_rnn(
                    cell_fw=cell_fw,
                    cell_bw=cell_bw,
                    inputs=target_rnn_inputs,
                    sequence_length=target_seq_length,
                    dtype=tf.float32)

            self.config.state_size *= 2
            # Mean and max pooling only work for 1 layer BiRNN.
            if self.config.use_mean_pooling:
                source_final_state = self.average_pooling(source_rnn_outputs, source_seq_length)
                target_final_state = self.average_pooling(target_rnn_outputs, target_seq_length)
            elif self.config.use_max_pooling:
                source_final_state = self.max_pooling(source_rnn_outputs)
                target_final_state = self.max_pooling(target_rnn_outputs)
            else:
                source_final_state_fw, source_final_state_bw = source_final_state
                target_final_state_fw, target_final_state_bw = target_final_state
                if self.config.num_layers > 1:
                    source_final_state_fw = source_final_state_fw[-1]
                    source_final_state_bw = source_final_state_bw[-1]
                    target_final_state_fw = target_final_state_fw[-1]
                    target_final_state_bw = target_final_state_bw[-1]
                if self.config.use_lstm:
                    source_final_state_fw = source_final_state_fw.h
                    source_final_state_bw = source_final_state_bw.h
                    target_final_state_fw = target_final_state_fw.h
                    target_final_state_bw = target_final_state_bw.h
                source_final_state = tf.concat([source_final_state_fw, source_final_state_bw],
                                               axis=1, name="source_final_state_ph")
                target_final_state = tf.concat([target_final_state_fw, target_final_state_bw],
                                               axis=1)

        # Feed-forward neural network.
        with tf.variable_scope("feed_forward"):
            h_multiply = tf.multiply(source_final_state, target_final_state)
            h_abs_diff = tf.abs(tf.subtract(source_final_state, target_final_state))

            W_1 = tf.get_variable(name="W_1",
                                  shape=[self.config.state_size, self.config.hidden_size])
            W_2 = tf.get_variable(name="W_2",
                                  shape=[self.config.state_size, self.config.hidden_size])
            b_1 = tf.get_variable(name="b_1",
                                  shape=[self.config.hidden_size],
                                  initializer=tf.constant_initializer(0.0))

            h_semantic = tf.tanh(tf.matmul(h_multiply, W_1) + tf.matmul(h_abs_diff, W_2) + b_1)

            W_3 = tf.get_variable(name="W_3",
                                  shape=[self.config.hidden_size, 1])
            b_2 = tf.get_variable(name="b_2",
                                  shape=[1],
                                  initializer=tf.constant_initializer(0.0))

            logits = tf.matmul(h_semantic, W_3) + b_2
            logits = tf.squeeze(logits,
                                name="logits")

            # Sigmoid output layer.
            with tf.name_scope("output"):
                probs = tf.sigmoid(logits,
                                   name="probs")
                predicted_class = tf.cast(tf.greater(probs, decision_threshold),
                                          tf.float32,
                                          name="predicted_class")

        # Loss.
        with tf.name_scope("cross_entropy"):
            losses = tf.nn.sigmoid_cross_entropy_with_logits(
                logits=logits,
                labels=labels,
                name="cross_entropy_per_sequence")
            mean_loss = tf.reduce_mean(losses,
                                       name="cross_entropy_loss")

        # Optimization.
        with tf.name_scope("optimization"):
            global_step = tf.Variable(initial_value=0,
                                      trainable=False,
                                      name="global_step")
            optimizer = tf.train.AdamOptimizer(self.config.learning_rate)
            trainable_variables = tf.trainable_variables()
            gradients = tf.gradients(mean_loss, trainable_variables,
                                     name="gradients")
            clipped_gradients, global_norm = tf.clip_by_global_norm(
                gradients,
                clip_norm=self.config.max_gradient_norm,
                name="clipped_gradients")
            train_op = optimizer.apply_gradients(zip(clipped_gradients, trainable_variables),
                                                 global_step=global_step)

        # Evaluation metrics.
        accuracy = tf.metrics.accuracy(labels, predicted_class,
                                       name="accuracy")
        precision = tf.metrics.precision(labels, predicted_class,
                                         name="precision")
        recall = tf.metrics.recall(labels, predicted_class,
                                   name="recall")

        # Add summaries.
        tf.summary.scalar("loss", mean_loss)
        tf.summary.scalar("global_norm", global_norm)
        tf.summary.scalar("accuracy", accuracy[0])
        tf.summary.scalar("precision", precision[0])
        tf.summary.scalar("recall", recall[0])
        tf.summary.scalar("logits" + "/sparsity", tf.nn.zero_fraction(logits))
        tf.summary.histogram("logits" + "/activations", logits)
        tf.summary.histogram("probs", probs)

        # Add histogram for trainable variables.
        for var in trainable_variables:
            tf.summary.histogram(var.op.name, var)

        # Add histogram for gradients.
        for grad, var in zip(clipped_gradients, trainable_variables):
            if grad is not None:
                tf.summary.histogram(var.op.name + "/gradients", grad)

        # Assign placeholders and operations.
        self.x_source = x_source
        self.x_target = x_target
        self.source_seq_length = source_seq_length
        self.target_seq_length = target_seq_length
        self.labels = labels
        self.input_dropout = input_dropout
        self.output_dropout = output_dropout
        self.decision_threshold = decision_threshold
        self.train_op = train_op
        self.probs = probs
        self.predicted_class = predicted_class
        self.mean_loss = mean_loss
        self.accuracy = accuracy
        self.precision = precision
        self.recall = recall
        self.summaries = tf.summary.merge_all()
        self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=5)
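
`build_graph` refers to `self.average_pooling` and `self.max_pooling`, which are not shown in these examples. A hedged sketch of what such pooling helpers could look like for the `(fw, bw)` output tuple of `bidirectional_dynamic_rnn` follows; the masking details and function signatures are assumptions, not the original methods.

import tensorflow as tf

def average_pooling(rnn_outputs, seq_length):
    # rnn_outputs is the (forward, backward) tuple returned by
    # bidirectional_dynamic_rnn; concatenate along the feature axis to get
    # a [batch, time, 2 * state_size] tensor.
    outputs = tf.concat(rnn_outputs, axis=2)
    # Mask padded time steps so they do not contribute to the mean.
    mask = tf.sequence_mask(seq_length,
                            maxlen=tf.shape(outputs)[1],
                            dtype=tf.float32)
    summed = tf.reduce_sum(outputs * tf.expand_dims(mask, -1), axis=1)
    return summed / tf.expand_dims(tf.cast(seq_length, tf.float32), -1)

def max_pooling(rnn_outputs):
    outputs = tf.concat(rnn_outputs, axis=2)
    # Element-wise maximum over the time axis.
    return tf.reduce_max(outputs, axis=1)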
Example #4
def main(_):
    assert FLAGS.checkpoint_dir, "--checkpoint_dir is required."
    assert FLAGS.extract_dir, "--extract_dir is required."
    assert FLAGS.source_vocab_path, "--source_vocab_path is required."
    assert FLAGS.target_vocab_path, "--target_vocab_path is required."
    assert FLAGS.source_output_path, "--source_output_path is required."
    assert FLAGS.target_output_path, "--target_output_path is required."
    assert FLAGS.score_output_path, "--score_output_path is required."
    assert FLAGS.source_language, "--source_language is required."
    assert FLAGS.target_language, "--target_language is required."

    # Read vocabularies.
    source_vocab, _ = utils.initialize_vocabulary(FLAGS.source_vocab_path)
    target_vocab, _ = utils.initialize_vocabulary(FLAGS.target_vocab_path)

    # Read source and target paths for sentence extraction.
    source_paths = []
    target_paths = []
    for file in os.listdir(FLAGS.extract_dir):
        if file.endswith(FLAGS.source_language):
            source_paths.append(os.path.join(FLAGS.extract_dir, file))
        elif file.endswith(FLAGS.target_language):
            target_paths.append(os.path.join(FLAGS.extract_dir, file))
    source_paths.sort()
    target_paths.sort()

    utils.reset_graph()
    with tf.Session() as sess:
        # Restore saved model.
        utils.restore_model(sess, FLAGS.checkpoint_dir)

        # Recover placeholders and ops for extraction.
        x_source = sess.graph.get_tensor_by_name("x_source:0")
        source_seq_length = sess.graph.get_tensor_by_name("source_seq_length:0")

        x_target = sess.graph.get_tensor_by_name("x_target:0")
        target_seq_length = sess.graph.get_tensor_by_name("target_seq_length:0")

        labels = sess.graph.get_tensor_by_name("labels:0")

        placeholders = [x_source, source_seq_length, x_target, target_seq_length, labels]

        probs = sess.graph.get_tensor_by_name("feed_forward/output/probs:0")

        source_final_state_ph = sess.graph.get_tensor_by_name("birnn/source_final_state_ph:0")

        with open(FLAGS.source_output_path, mode="w", encoding="utf-8") as source_output_file,\
             open(FLAGS.target_output_path, mode="w", encoding="utf-8") as target_output_file,\
             open(FLAGS.score_output_path, mode="w", encoding="utf-8") as score_output_file:

            for source_path, target_path in zip(source_paths, target_paths):
                # Read sentences from articles.
                source_sentences, target_sentences = read_articles(source_path, target_path)

                # Convert sentences to token ids sequences.
                source_sentences_ids = [utils.sentence_to_token_ids(sent, source_vocab, FLAGS.max_seq_length)
                                        for sent in source_sentences]
                target_sentences_ids = [utils.sentence_to_token_ids(sent, target_vocab, FLAGS.max_seq_length)
                                        for sent in target_sentences]

                # Extract sentence pairs.
                pairs = extract_pairs(sess, source_sentences, target_sentences,
                                      source_sentences_ids, target_sentences_ids,
                                      probs, placeholders, source_final_state_ph)
                if not pairs:
                    continue
                for source_sentence, target_sentence, score in pairs:
                    source_output_file.write(source_sentence)
                    target_output_file.write(target_sentence)
                    score_output_file.write(str(score) + "\n")
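
`extract_pairs` itself is not defined in any of these examples. The sketch below shows one plausible shape for it, matching the call in Example #2: score the i-th source sentence against every target sentence and keep the best match when its probability clears a threshold. The batching, the `threshold` default, and the greedy argmax matching are all assumptions.

import numpy as np

def extract_pairs(sess, source_sentences, target_sentences,
                  source_sentences_ids, target_sentences_ids,
                  probs, placeholders, threshold=0.99):
    x_source, source_seq_length, x_target, target_seq_length, labels = placeholders
    pairs = []
    if not source_sentences_ids or not target_sentences_ids:
        return pairs

    # Pad the target sentences once; they are reused for every source sentence.
    max_len = max(len(seq) for seq in target_sentences_ids)
    target_batch = np.array([seq + [0] * (max_len - len(seq))
                             for seq in target_sentences_ids])
    target_lengths = np.array([len(seq) for seq in target_sentences_ids])
    n_targets = len(target_sentences_ids)

    for i, source_ids in enumerate(source_sentences_ids):
        # Repeat the source sentence so it is scored against every target.
        source_batch = np.array([source_ids] * n_targets)
        scores = np.atleast_1d(sess.run(probs, feed_dict={
            x_source: source_batch,
            source_seq_length: np.array([len(source_ids)] * n_targets),
            x_target: target_batch,
            target_seq_length: target_lengths,
            labels: np.zeros(n_targets, dtype=np.float32),
        }))
        best = int(np.argmax(scores))
        if scores[best] >= threshold:
            pairs.append((source_sentences[i],
                          target_sentences[best],
                          float(scores[best])))
    return pairs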