Example #1
    def return_simple_data(self, debug, data, model, name, train):
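        """Prepare the data paths and return the requested split.

        `name` selects "train", "dev" or "test"; the dev/test branches return
        None when the corresponding file does not exist.
        """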
        data_paths = self._prepare_paths(data, debug)

        if name == "train":
            train = morpho_dataset.MorphoDataset(
                data_paths[0],
                embeddings=None,
                bert=model,
                lemma_re_strip=r"(?<=.)(?:`|_|-[^0-9]).*$",
                lemma_rule_min=2,
                simple=True)

        if name == "dev":
            if os.path.exists(data_paths[1]):
                dev = morpho_dataset.MorphoDataset(data_paths[1],
                                                   train=train,
                                                   shuffle_batches=False,
                                                   bert=model,
                                                   simple=True)
            else:
                dev = None
            return dev

        if name == "test":
            if os.path.exists(data_paths[2]):
                test = morpho_dataset.MorphoDataset(data_paths[2],
                                                    train=train,
                                                    shuffle_batches=False,
                                                    bert=model,
                                                    simple=True)
            else:
                test = None
            return test

        return train
Example #2
                        default=64,
                        type=int,
                        help="Word embedding dimension.")
    args = parser.parse_args()

    # Create logdir name
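    # (re.sub below abbreviates each argument name to the first letter of every
    #  underscore-separated part, e.g. "batch_size" -> "bs")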
    args.logdir = "logs/{}-{}-{}".format(
        os.path.basename(__file__),
        datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S"), ",".join(
            ("{}={}".format(re.sub("(.)[^_]*_?", r"\1", key), value)
             for key, value in sorted(vars(args).items()))))
    if not os.path.exists("logs"):
        os.mkdir("logs")  # TF 1.6 will do this by itself

    # Load the data
    train = morpho_dataset.MorphoDataset("czech-cac-train.txt",
                                         max_sentences=5000)
    dev = morpho_dataset.MorphoDataset("czech-cac-dev.txt",
                                       train=train,
                                       shuffle_batches=False)

    # Construct the network
    network = Network(threads=args.threads)
    network.construct(args, len(train.factors[train.FORMS].words),
                      len(train.factors[train.FORMS].alphabet),
                      len(train.factors[train.TAGS].words))

    # Train
    for i in range(args.epochs):
        network.train_epoch(train, args.batch_size)

        accuracy = network.evaluate("dev", dev, args.batch_size)
Example #3
                        default=1,
                        type=int,
                        help="Maximum number of threads to use.")
    args = parser.parse_args()

    # Create logdir name
    args.logdir = "logs/{}-{}-{}".format(
        os.path.basename(__file__),
        datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S"), ",".join(
            ("{}={}".format(re.sub("(.)[^_]*_?", r"\1", key), value)
             for key, value in sorted(vars(args).items()))))
    if not os.path.exists("logs"):
        os.mkdir("logs")  # TF 1.6 will do this by itself

    home = expanduser('~')
    train = morpho_dataset.MorphoDataset(home + "/data/cs/czech-pdt-train.txt",
                                         lowercase=True)
    dev = morpho_dataset.MorphoDataset(home + "/data/cs/czech-pdt-dev.txt",
                                       train=train,
                                       shuffle_batches=False,
                                       lowercase=True)
    test = morpho_dataset.MorphoDataset(home + "/data/cs/czech-pdt-test.txt",
                                        train=train,
                                        shuffle_batches=False,
                                        lowercase=True)
    #train = morpho_dataset.MorphoDataset(home + "/data/cs/czech-pdt-train.txt", lowercase=False)

    #train = morpho_dataset.MorphoDataset("czech-pdt-train.txt")
    #dev = morpho_dataset.MorphoDataset("czech-pdt-dev.txt", train=train, shuffle_batches=False)
    #test = morpho_dataset.MorphoDataset("czech-pdt-test.txt", train=train, shuffle_batches=False)

    batches = len(train.sentence_lens) // args.batch_size
Example #4
                param['logdir'] = logdir
                param['epochs'] = args.epochs
                param['threads'] = args.threads
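                # Freeze the sampled hyper-parameters into an immutable namedtuple,
                # so they can be accessed as param.<name> below.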
                param = namedtuple('Params', param.keys())(*param.values())
                break
            num_retry += 1
            if num_retry > n_params:
                exit(111)

    os.makedirs(param.logdir)

    print("=====================================================")
    print(param.logdir)
    print("=====================================================")
    # Load the data
    train = morpho_dataset.MorphoDataset("czech-pdt-train.txt")
    dev = morpho_dataset.MorphoDataset("czech-pdt-dev.txt",
                                       train=train,
                                       shuffle_batches=False)
    test = morpho_dataset.MorphoDataset("czech-pdt-test.txt",
                                        train=train,
                                        shuffle_batches=False)

    analyzer_dictionary = MorphoAnalyzer("czech-pdt-analysis-dictionary.txt")
    analyzer_guesser = MorphoAnalyzer("czech-pdt-analysis-guesser.txt")

    # Construct the network
    network = get_model(param.name)(param,
                                    len(train.factors[train.FORMS].words),
                                    len(train.factors[train.FORMS].alphabet),
                                    len(train.factors[train.TAGS].words))
Example #5
class Network:
    def __init__(self, threads, seed=42):
        # Create an empty graph and a session
        graph = tf.Graph()
        graph.seed = seed
        self.session = tf.Session(graph = graph, config=tf.ConfigProto(inter_op_parallelism_threads=threads,
                                                                       intra_op_parallelism_threads=threads))

    def construct(self, args, source_chars, target_chars, bow, eow):
        with self.session.graph.as_default():
            if args.recodex:
                tf.get_variable_scope().set_initializer(tf.glorot_uniform_initializer(seed=42))

            # Inputs
            self.sentence_lens = tf.placeholder(tf.int32, [None], name="sentence_lens")
            self.source_ids = tf.placeholder(tf.int32, [None, None], name="source_ids")
            self.source_seqs = tf.placeholder(tf.int32, [None, None], name="source_seqs")
            self.source_seq_lens = tf.placeholder(tf.int32, [None], name="source_seq_lens")
            self.target_ids = tf.placeholder(tf.int32, [None, None], name="target_ids")
            self.target_seqs = tf.placeholder(tf.int32, [None, None], name="target_seqs")
            self.target_seq_lens = tf.placeholder(tf.int32, [None], name="target_seq_lens")

            # Append EOW after target_seqs
            target_seqs = tf.reverse_sequence(self.target_seqs, self.target_seq_lens, 1)
            target_seqs = tf.pad(target_seqs, [[0, 0], [1, 0]], constant_values=eow)
            target_seq_lens = self.target_seq_lens + 1
            target_seqs = tf.reverse_sequence(target_seqs, target_seq_lens, 1)

            # Encoder
            # TODO: Generate source embeddings for source chars, of shape [source_chars, args.char_dim].

            # TODO: Embed the self.source_seqs using the source embeddings.

            # TODO: Using a GRU with dimension args.rnn_dim, process the embedded self.source_seqs
            # using bidirectional RNN. Store the summed fwd and bwd outputs in `source_encoded`
            # and the summed fwd and bwd states into `source_states`.
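            # A possible sketch of the three encoder TODOs above (an illustration,
            # not the reference solution): character embeddings plus a bidirectional
            # GRU whose forward and backward outputs/states are summed.
            source_embeddings = tf.get_variable("source_embeddings", [source_chars, args.char_dim])
            source_embedded = tf.nn.embedding_lookup(source_embeddings, self.source_seqs)
            (fwd_out, bwd_out), (fwd_state, bwd_state) = tf.nn.bidirectional_dynamic_rnn(
                tf.nn.rnn_cell.GRUCell(args.rnn_dim), tf.nn.rnn_cell.GRUCell(args.rnn_dim),
                source_embedded, sequence_length=self.source_seq_lens, dtype=tf.float32)
            source_encoded = fwd_out + bwd_out
            source_states = fwd_state + bwd_state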

            # Index the unique words using self.source_ids and self.target_ids.
            sentence_mask = tf.sequence_mask(self.sentence_lens)
            source_encoded = tf.boolean_mask(tf.nn.embedding_lookup(source_encoded, self.source_ids), sentence_mask)
            source_states = tf.boolean_mask(tf.nn.embedding_lookup(source_states, self.source_ids), sentence_mask)
            source_lens = tf.boolean_mask(tf.nn.embedding_lookup(self.source_seq_lens, self.source_ids), sentence_mask)

            target_seqs = tf.boolean_mask(tf.nn.embedding_lookup(target_seqs, self.target_ids), sentence_mask)
            target_lens = tf.boolean_mask(tf.nn.embedding_lookup(target_seq_lens, self.target_ids), sentence_mask)

            # Decoder
            # TODO: Generate target embeddings for target chars, of shape [target_chars, args.char_dim].

            # TODO: Embed the target_seqs using the target embeddings.

            # TODO: Generate a decoder GRU with dimension args.rnn_dim.

            # TODO: Create a `decoder_layer` -- a fully connected layer with
            # target_chars neurons used in the decoder to classify into target characters.
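            # A possible sketch of the decoder TODOs above (names follow the TODO
            # comments; `decoder_rnn_cell` is an assumed name for the decoder GRU).
            target_embeddings = tf.get_variable("target_embeddings", [target_chars, args.char_dim])
            target_embedded = tf.nn.embedding_lookup(target_embeddings, target_seqs)
            decoder_rnn_cell = tf.nn.rnn_cell.GRUCell(args.rnn_dim)
            decoder_layer = tf.layers.Dense(target_chars)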

            # Attention
            # TODO: Generate three fully connected layers without activations:
            # - `source_layer` with args.rnn_dim units
            # - `state_layer` with args.rnn_dim units
            # - `weight_layer` with 1 unit
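            # One way to create the three attention layers described above (sketch only):
            source_layer = tf.layers.Dense(args.rnn_dim)
            state_layer = tf.layers.Dense(args.rnn_dim)
            weight_layer = tf.layers.Dense(1)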

            def with_attention(inputs, states):
                # Generate the attention

                # TODO: Project source_encoded using source_layer.

                # TODO: Change shape of states from [a, b] to [a, 1, b] and project it using state_layer.

                # TODO: Sum the two above projections, apply tf.tanh and project the result using weight_layer.
                # The result has shape [x, y, 1].

                # TODO: Apply tf.nn.softmax to the latest result, using axis corresponding to source characters.

                # TODO: Multiply the source_encoded by the latest result, and sum the results with respect
                # to the axis corresponding to source characters. This is the final attention.

                # TODO: Return concatenation of inputs and the computed attention.
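                # A sketch of one possible attention implementation following the
                # TODOs above (uses the source/state/weight layers sketched earlier).
                projected_source = source_layer(source_encoded)
                projected_states = state_layer(tf.expand_dims(states, 1))
                weights = weight_layer(tf.tanh(projected_source + projected_states))
                weights = tf.nn.softmax(weights, axis=1)
                attention = tf.reduce_sum(source_encoded * weights, axis=1)
                return tf.concat([inputs, attention], axis=1)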

            # The DecoderTraining will be used during training. It will output logits for each
            # target character.
            class DecoderTraining(tf.contrib.seq2seq.Decoder):
                @property
                def batch_size(self): return # TODO: Return size of the batch, using for example source_states size
                @property
                def output_dtype(self): return tf.float32 # Type for logits of target characters
                @property
                def output_size(self): return target_chars # Length of logits for every output

                def initialize(self, name=None):
                    finished = # TODO: False if target_lens > 0, True otherwise
                    states = # TODO: Initial decoder state to use
                    inputs = # TODO: Call with_attention on the embedded BOW characters of shape [self.batch_size].
                             # You can use tf.fill to generate BOWs of appropriate size.
                    return finished, inputs, states

                def step(self, time, inputs, states, name=None):
                    outputs, states = # TODO: Run the decoder GRU cell using inputs and states.
                    outputs = # TODO: Apply the decoder_layer on outputs.
                    next_input = # TODO: Next input is with_attention called on words with index `time` in target_embedded.
                    finished = # TODO: False if target_lens > time + 1, True otherwise.
                    return outputs, states, next_input, finished
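                # One possible way to fill the TODOs above (sketch only, relying on
                # the decoder_rnn_cell/decoder_layer/target_embedded sketches earlier):
                #   batch_size: tf.shape(source_states)[0]
                #   initialize: finished = target_lens <= 0
                #               states = source_states
                #               inputs = with_attention(tf.nn.embedding_lookup(
                #                   target_embeddings, tf.fill([self.batch_size], bow)), states)
                #   step:       outputs, states = decoder_rnn_cell(inputs, states)
                #               outputs = decoder_layer(outputs)
                #               next_input = with_attention(target_embedded[:, time], states)
                #               finished = target_lens <= time + 1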
            output_layer, _, _ = tf.contrib.seq2seq.dynamic_decode(DecoderTraining())
            self.predictions_training = tf.argmax(output_layer, axis=2, output_type=tf.int32)

            # The DecoderPrediction will be used during prediction. It will
            # directly output the predicted target characters.
            class DecoderPrediction(tf.contrib.seq2seq.Decoder):
                @property
                def batch_size(self): return # TODO: Return size of the batch, using for example source_states size
                @property
                def output_dtype(self): return tf.int32 # Type for predicted target characters
                @property
                def output_size(self): return 1 # Will return just one output

                def initialize(self, name=None):
                    finished = # TODO: False of shape [self.batch_size].
                    states = # TODO: Initial decoder state to use.
                    inputs = # TODO: Call with_attention on the embedded BOW characters of shape [self.batch_size].
                             # You can use tf.fill to generate BOWs of appropriate size.
                    return finished, inputs, states

                def step(self, time, inputs, states, name=None):
                    outputs, states = # TODO: Run the decoder GRU cell using inputs and states.
                    outputs = # TODO: Apply the decoder_layer on outputs.
                    outputs = # TODO: Use tf.argmax to choose most probable class (supply parameter `output_type=tf.int32`).
                    next_input = # TODO: Embed `outputs` using target_embeddings and pass it to with_attention.
                    finished = # TODO: True where outputs==eow, False otherwise
                               # Use tf.equal for the comparison, Python's '==' is not overloaded
                    return outputs, states, next_input, finished
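                # A sketch of one possible fill for the TODOs above: argmax over the
                # decoder_layer logits, embedding the prediction for the next step,
                # and finishing once <eow> is produced.
                #   initialize: finished = tf.fill([self.batch_size], False)
                #               states = source_states
                #               inputs = with_attention(tf.nn.embedding_lookup(
                #                   target_embeddings, tf.fill([self.batch_size], bow)), states)
                #   step:       outputs, states = decoder_rnn_cell(inputs, states)
                #               outputs = tf.argmax(decoder_layer(outputs), axis=1, output_type=tf.int32)
                #               next_input = with_attention(tf.nn.embedding_lookup(target_embeddings, outputs), states)
                #               finished = tf.equal(outputs, eow)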
            self.predictions, _, self.prediction_lens = tf.contrib.seq2seq.dynamic_decode(
                DecoderPrediction(), maximum_iterations=tf.reduce_max(source_lens) + 10)

            # Training
            weights = tf.sequence_mask(target_lens, dtype=tf.float32)
            loss = tf.losses.sparse_softmax_cross_entropy(target_seqs, output_layer, weights=weights)
            global_step = tf.train.create_global_step()
            self.training = tf.train.AdamOptimizer().minimize(loss, global_step=global_step, name="training")

            # Summaries
            accuracy_training = tf.reduce_all(tf.logical_or(
                tf.equal(self.predictions_training, target_seqs),
                tf.logical_not(tf.sequence_mask(target_lens))), axis=1)
            self.current_accuracy_training, self.update_accuracy_training = tf.metrics.mean(accuracy_training)

            minimum_length = tf.minimum(tf.shape(self.predictions)[1], tf.shape(target_seqs)[1])
            accuracy = tf.logical_and(
                tf.equal(self.prediction_lens, target_lens),
                tf.reduce_all(tf.logical_or(
                    tf.equal(self.predictions[:, :minimum_length], target_seqs[:, :minimum_length]),
                    tf.logical_not(tf.sequence_mask(target_lens, maxlen=minimum_length))), axis=1))
            self.current_accuracy, self.update_accuracy = tf.metrics.mean(accuracy)

            self.current_loss, self.update_loss = tf.metrics.mean(loss, weights=tf.reduce_sum(weights))
            self.reset_metrics = tf.variables_initializer(tf.get_collection(tf.GraphKeys.METRIC_VARIABLES))

            summary_writer = tf.contrib.summary.create_file_writer(args.logdir, flush_millis=10 * 1000)
            self.summaries = {}
            with summary_writer.as_default(), tf.contrib.summary.record_summaries_every_n_global_steps(10):
                self.summaries["train"] = [tf.contrib.summary.scalar("train/loss", self.update_loss),
                                           tf.contrib.summary.scalar("train/accuracy", self.update_accuracy_training)]
            with summary_writer.as_default(), tf.contrib.summary.always_record_summaries():
                for dataset in ["dev", "test"]:
                    self.summaries[dataset] = [tf.contrib.summary.scalar(dataset + "/loss", self.current_loss),
                                               tf.contrib.summary.scalar(dataset + "/accuracy", self.current_accuracy)]

            # Initialize variables
            self.session.run(tf.global_variables_initializer())
            with summary_writer.as_default():
                tf.contrib.summary.initialize(session=self.session, graph=self.session.graph)

    def train_epoch(self, train, batch_size):
        import sys

        while not train.epoch_finished():
            sentence_lens, _, charseq_ids, charseqs, charseq_lens = train.next_batch(batch_size, including_charseqs=True)
            self.session.run(self.reset_metrics)
            predictions, _, _ = self.session.run(
                [self.predictions_training, self.training, self.summaries["train"]],
                {self.sentence_lens: sentence_lens,
                 self.source_ids: charseq_ids[train.FORMS], self.target_ids: charseq_ids[train.LEMMAS],
                 self.source_seqs: charseqs[train.FORMS], self.target_seqs: charseqs[train.LEMMAS],
                 self.source_seq_lens: charseq_lens[train.FORMS], self.target_seq_lens: charseq_lens[train.LEMMAS]})

            form, gold_lemma, system_lemma = "", "", ""
            for i in range(charseq_lens[train.FORMS][0]):
                form += train.factors[train.FORMS].alphabet[charseqs[train.FORMS][0][i]]
            for i in range(charseq_lens[train.LEMMAS][0]):
                gold_lemma += train.factors[train.LEMMAS].alphabet[charseqs[train.LEMMAS][0][i]]
                system_lemma += train.factors[train.LEMMAS].alphabet[predictions[0][i]]
            print("Gold form: {}, gold lemma: {}, predicted lemma: {}".format(form, gold_lemma, system_lemma), file=sys.stderr)

    def evaluate(self, dataset_name, dataset, batch_size):
        self.session.run(self.reset_metrics)
        while not dataset.epoch_finished():
            sentence_lens, _, charseq_ids, charseqs, charseq_lens = dataset.next_batch(batch_size, including_charseqs=True)
            self.session.run([self.update_accuracy, self.update_loss],
                             {self.sentence_lens: sentence_lens,
                              self.source_ids: charseq_ids[dataset.FORMS], self.target_ids: charseq_ids[dataset.LEMMAS],
                              self.source_seqs: charseqs[dataset.FORMS], self.target_seqs: charseqs[dataset.LEMMAS],
                              self.source_seq_lens: charseq_lens[dataset.FORMS], self.target_seq_lens: charseq_lens[dataset.LEMMAS]})
        return self.session.run([self.current_accuracy, self.summaries[dataset_name]])[0]


if __name__ == "__main__":
    import argparse
    import datetime
    import os
    import re

    # Fix random seed
    np.random.seed(42)

    # Parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--batch_size", default=10, type=int, help="Batch size.")
    parser.add_argument("--char_dim", default=64, type=int, help="Character embedding dimension.")
    parser.add_argument("--epochs", default=10, type=int, help="Number of epochs.")
    parser.add_argument("--recodex", default=False, action="store_true", help="ReCodEx mode.")
    parser.add_argument("--rnn_dim", default=64, type=int, help="Dimension of the encoder and the decoder.")
    parser.add_argument("--threads", default=1, type=int, help="Maximum number of threads to use.")
    args = parser.parse_args()

    # Create logdir name
    args.logdir = "logs/{}-{}-{}".format(
        os.path.basename(__file__),
        datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S"),
        ",".join(("{}={}".format(re.sub("(.)[^_]*_?", r"\1", key), value) for key, value in sorted(vars(args).items())))
    )
    if not os.path.exists("logs"): os.mkdir("logs") # TF 1.6 will do this by itself

    # Load the data
    train = morpho_dataset.MorphoDataset("czech-cac-train.txt", max_sentences=5000)
    dev = morpho_dataset.MorphoDataset("czech-cac-dev.txt", train=train, shuffle_batches=False)

    # Construct the network
    network = Network(threads=args.threads)
    network.construct(args, len(train.factors[train.FORMS].alphabet), len(train.factors[train.LEMMAS].alphabet),
                      train.factors[train.LEMMAS].alphabet_map["<bow>"], train.factors[train.LEMMAS].alphabet_map["<eow>"])

    # Train
    for i in range(args.epochs):
        network.train_epoch(train, args.batch_size)

        accuracy = network.evaluate("dev", dev, args.batch_size)
        print("{:.2f}".format(100 * accuracy))
Example #6
                        type=int,
                        help="Maximum number of threads to use.")
    args = parser.parse_args()

    # Create logdir name
    args.logdir = "logs/{}-{}-{}".format(
        os.path.basename(__file__),
        datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S"), ",".join(
            ("{}={}".format(re.sub("(.)[^_]*_?", r"\1", key), value)
             for key, value in sorted(vars(args).items()))))
    if not os.path.exists("logs"):
        os.mkdir("logs")  # TF 1.6 will do this by itself

    # Load the data
    if not args.recodex:
        train = morpho_dataset.MorphoDataset(
            "../19_lemmatizer_noattn/czech-cac-train.txt", max_sentences=5000)
        dev = morpho_dataset.MorphoDataset(
            "../19_lemmatizer_noattn/czech-cac-dev.txt",
            train=train,
            shuffle_batches=False)
    else:
        train = morpho_dataset.MorphoDataset("czech-cac-train.txt",
                                             max_sentences=5000)
        dev = morpho_dataset.MorphoDataset("czech-cac-dev.txt",
                                           train=train,
                                           shuffle_batches=False)

    # Construct the network
    network = Network(threads=args.threads)
    network.construct(args, len(train.factors[train.FORMS].alphabet),
                      len(train.factors[train.LEMMAS].alphabet),
Example #7
        # Dump passed options to allow future prediction.
        with open("{}/options.json".format(args.logdir),
                  mode="w") as options_file:
            json.dump(vars(args), options_file, sort_keys=True)

    # Postprocess args
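    # args.epochs has the form "epochs:lr,epochs:lr,...",
    # e.g. "40:1e-3,20:1e-4" -> [(40, 0.001), (20, 0.0001)]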
    args.epochs = [(int(epochs), float(lr))
                   for epochs, lr in (epochs_lr.split(":")
                                      for epochs_lr in args.epochs.split(","))]

    # Load the data
    seq2seq = args.decoding == "seq2seq"
    train = morpho_dataset.MorphoDataset(
        args.train_data,
        max_sentences=args.max_sentences,
        seq2seq=seq2seq,
        bert_embeddings_filename=args.bert_embeddings_train,
        flair_filename=args.flair_train,
        elmo_filename=args.elmo_train)
    if args.dev_data:
        dev = morpho_dataset.MorphoDataset(
            args.dev_data,
            train=train,
            shuffle_batches=False,
            seq2seq=seq2seq,
            bert_embeddings_filename=args.bert_embeddings_dev,
            flair_filename=args.flair_dev,
            elmo_filename=args.elmo_dev)
    test = morpho_dataset.MorphoDataset(
        args.test_data,
        train=train,
Example #8
            else:
                f.read(binary_len)  # skip

        return we  #, word_to_index (optional)

    #sess.run(cnn.W.assign(initW))


if __name__ == "__main__":

    import numpy as np
    import tensorflow as tf
    from tensorflow.contrib import learn
    import morpho_dataset

    train = morpho_dataset.MorphoDataset("/home/liefe/data/cs/train.txt",
                                         lowercase=True)

    # To read as text
    #file = 'word2vec_cs.txt'
    #we, index_to_word, word_to_index = get_params(file)
    #print(we)
    #print(index_to_word[14])
    #print(word_to_index['odkazy'])

    # Read bin file
    with open('wv_we', 'wb') as f:
        file = 'word2vec_cs.bin'
        we = load(file)
        print(we.shape)
        #print(index_to_word[14])
        idx = train.factors[train.FORMS].words_map.get('odkazy')
Example #9
                        type=float,
                        help="Norm for gradient clipping.")

    args = parser.parse_args()

    # Create logdir name
    args.logdir = "logs/{}-{}-{}".format(
        os.path.basename(__file__),
        datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S"), ",".join(
            ("{}={}".format(re.sub("(.)[^_]*_?", r"\1", key), value)
             for key, value in sorted(vars(args).items()))))
    if not os.path.exists("logs"):
        os.mkdir("logs")  # TF 1.6 will do this by itself

    # Load the data
    train = morpho_dataset.MorphoDataset("esp.train")
    dev = morpho_dataset.MorphoDataset("esp.testa",
                                       train=train,
                                       shuffle_batches=False)
    test = morpho_dataset.MorphoDataset("esp.testb",
                                        train=train,
                                        shuffle_batches=False)

    print(len(train.factors[train.FORMS].words),
          len(train.factors[train.FORMS].alphabet),
          len(train.factors[train.NE].words))

    print(train.factors[train.NE].words)

    # Construct the network
    network = Network(threads=args.threads)
Example #10
                        default="GRU",
                        type=str,
                        help="RNN cell type.")
    parser.add_argument("--rnn_cell_dim",
                        default=100,
                        type=int,
                        help="RNN cell dimension.")
    parser.add_argument("--threads",
                        default=1,
                        type=int,
                        help="Maximum number of threads to use.")
    args = parser.parse_args()

    # Load the data
    print("Loading the data.", file=sys.stderr)
    data_train = morpho_dataset.MorphoDataset(args.data_train,
                                              add_bow_eow=True)
    data_dev = morpho_dataset.MorphoDataset(args.data_dev,
                                            add_bow_eow=True,
                                            train=data_train)
    data_test = morpho_dataset.MorphoDataset(args.data_test,
                                             add_bow_eow=True,
                                             train=data_train)
    bow_char = data_train.alphabet.index("<bow>")
    eow_char = data_train.alphabet.index("<eow>")

    # Construct the network
    print("Constructing the network.", file=sys.stderr)
    expname = "lemmatizer-{}{}-bs{}-epochs{}".format(args.rnn_cell,
                                                     args.rnn_cell_dim,
                                                     args.batch_size,
                                                     args.epochs)
Example #11
def main(args):
    import argparse
    import datetime
    import json
    import os
    import re

    np.random.seed(42)
    tf.random.set_seed(42)

    #command_line = " ".join(sys.argv[1:])

    # Parse arguments
    parser = argparse.ArgumentParser()
    # parser.add_argument("--threads", default=4, type=int, help="Maximum number of threads to use.")
    parser.add_argument("--accu",
                        default=1,
                        type=int,
                        help="accumulate batch size")
    parser.add_argument("--batch_size",
                        default=64,
                        type=int,
                        help="Batch size.")
    parser.add_argument("--bert",
                        default=None,
                        type=str,
                        help="Bert model for embeddings")
    parser.add_argument("--bert_model",
                        default=None,
                        type=str,
                        help="Bert model for training")
    parser.add_argument("--beta_2",
                        default=0.99,
                        type=float,
                        help="Adam beta 2")
    parser.add_argument("--char_dropout",
                        default=0,
                        type=float,
                        help="Character dropout")
    parser.add_argument("--checkp",
                        default=None,
                        type=str,
                        help="Checkpoint name")
    parser.add_argument("--cle_dim",
                        default=256,
                        type=int,
                        help="Character-level embedding dimension.")
    parser.add_argument("--cont",
                        default=0,
                        type=int,
                        help="load finetuned model and continue training?")
    parser.add_argument("--debug",
                        default=0,
                        type=int,
                        help="debug on small dataset")
    parser.add_argument("--dropout", default=0.5, type=float, help="Dropout")
    parser.add_argument("--embeddings",
                        default=None,
                        type=str,
                        help="External embeddings to use.")
    parser.add_argument("--epochs",
                        default="40:1e-3,20:1e-4",
                        type=str,
                        help="Epochs and learning rates.")
    parser.add_argument("--exp",
                        default=None,
                        type=str,
                        help="Experiment name.")
    parser.add_argument("--factor_layers",
                        default=1,
                        type=int,
                        help="Per-factor layers.")
    parser.add_argument("--factors",
                        default="Lemmas,Tags",
                        type=str,
                        help="Factors to predict.")
    parser.add_argument("--fine_lr",
                        default=0,
                        type=float,
                        help="Learning rate for bert layers")
    parser.add_argument("--label_smoothing",
                        default=0.00,
                        type=float,
                        help="Label smoothing.")
    parser.add_argument("--layers",
                        default=None,
                        type=str,
                        help="Which layers should be used")
    parser.add_argument("--lemma_re_strip",
                        default=r"(?<=.)(?:`|_|-[^0-9]).*$",
                        type=str,
                        help="RE suffix to strip from lemma.")
    parser.add_argument("--lemma_rule_min",
                        default=2,
                        type=int,
                        help="Minimum occurences to keep a lemma rule.")
    # parser.add_argument("--min_epoch_batches", default=300, type=int, help="Minimum number of batches per epoch.")
    parser.add_argument("--predict",
                        default=None,
                        type=str,
                        help="Predict using the passed model.")
    parser.add_argument("--rnn_cell",
                        default="LSTM",
                        type=str,
                        help="RNN cell type.")
    parser.add_argument("--rnn_cell_dim",
                        default=512,
                        type=int,
                        help="RNN cell dimension.")
    parser.add_argument("--rnn_layers",
                        default=3,
                        type=int,
                        help="RNN layers.")
    parser.add_argument("--test_only",
                        default=None,
                        type=str,
                        help="Only test evaluation")
    parser.add_argument("--warmup_decay",
                        default=None,
                        type=str,
                        help="Warmup schedule as '<type>:<steps>' with type 'i' or 'c'; "
                        "after the warmup steps, inverse square root decay is applied.")
    parser.add_argument("--we_dim",
                        default=512,
                        type=int,
                        help="Word embedding dimension.")
    parser.add_argument("--word_dropout",
                        default=0.2,
                        type=float,
                        help="Word dropout")
    parser.add_argument("data", type=str, help="Input data")

    args = parser.parse_args(args)
    args.debug = args.debug == 1
    args.cont = args.cont == 1
    # Postprocess args
    args.factors = args.factors.split(",")
    args.epochs = [(int(epochs), float(lr))
                   for epochs, lr in (epochs_lr.split(":")
                                      for epochs_lr in args.epochs.split(","))]
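    # e.g. the default "40:1e-3,20:1e-4" -> [(40, 0.001), (20, 0.0001)]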

    if args.warmup_decay is not None:
        print("decay is not none")
        print(args.warmup_decay)
        args.warmup_decay = args.warmup_decay.split(":")
        args.decay_type = args.warmup_decay[0]
        args.warmup_decay = int(args.warmup_decay[1])
    else:
        args.decay_type = None

    args.bert_load = None
    name = None
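    # Both --bert and --bert_model accept either "model" or "load_checkpoint:model";
    # the optional part before ":" is stored in args.bert_load.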
    if args.bert or args.bert_model:
        if args.bert_model:
            print("před parsovanim")
            print(args.bert_model)
            args.bert_model = args.bert_model.split(":")
            if len(args.bert_model) > 1:
                args.bert_load = args.bert_model[0]
                print(args.bert_load)
                print("load")
                args.bert_model = args.bert_model[1]
            else:
                args.bert_model = args.bert_model[0]
            name = args.bert_model
        elif args.bert:
            args.bert = args.bert.split(":")
            if len(args.bert) > 1:
                args.bert_load = args.bert[0]
                print(args.bert_load)
                print("load")
                args.bert = args.bert[1]
            else:
                args.bert = args.bert[0]
            name = args.bert

    if name is not None and "robeczech" in name:
        sys.path.append(name)
        import tokenizer.robeczech_tokenizer

    # TODO: resolve the thread settings below
    # tf.config.threading.set_inter_op_parallelism_threads(args.threads)
    # tf.config.threading.set_intra_op_parallelism_threads(args.threads)
    # tf.config.set_soft_device_placement(True)

    if args.predict is None:
        # Create logdir name
        if args.exp is None:
            args.exp = "{}-{}".format(
                os.path.basename(__file__),
                datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S"))

        do_not_log = {
            "exp", "legtomma_re_strip", "predict", "threads", "bert_model",
            "bert"
        }
        args.logdir = "models/{}".format(
            args.exp
            # ",".join(("{}={}".format(re.sub("(.)[^_]*_?", r"\1", key),
            #                          re.sub("[^,]*/", "", value) if type(value) == str else value)
            #           for key, value in sorted(vars(args).items()) if key not in do_not_log))
        )
        if not os.path.exists("models"): os.mkdir("models")
        if not os.path.exists(args.logdir): os.mkdir(args.logdir)

        # Dump passed options
        with open("{}/options.json".format(args.logdir),
                  mode="w") as options_file:
            json.dump(vars(args), options_file, sort_keys=True)

    # Load embeddings
    if args.embeddings:
        with np.load(args.embeddings, allow_pickle=True) as embeddings_npz:
            args.embeddings_words = embeddings_npz["words"]
            args.embeddings_data = embeddings_npz["embeddings"]
            args.embeddings_size = args.embeddings_data.shape[1]

    # We do not want both of these at once
    if args.bert and args.bert_model:
        warnings.warn(
            "embeddings and whole bert model training are both selected.")
    model_bert = None
    if args.bert or args.bert_model:
        model_bert = BertModel(name, args)

    if args.predict:
        # Load training dataset maps from the checkpoint
        saved = args.exp
        args.train = morpho_dataset.MorphoDataset.load_mappings(
            "models/{}/mappings.pickle".format(saved))  # To je ulozeno v
        # models/jmeno experimentu a checkpoints, predict bude jmneo modelu, v data bude cele jeno vcetne test.txt
        # Load input data
        predict = morpho_dataset.MorphoDataset(args.data,
                                               train=args.train,
                                               shuffle_batches=False,
                                               bert=model_bert)
    else:
        # Load input data
        data_paths = [None] * 3
        if args.debug:
            print("DEBUG MODE")
            data_paths[0] = "{}-train-small.txt".format(args.data)
            data_paths[1] = "{}-dev-small.txt".format(args.data)
            data_paths[2] = "{}-test-small.txt".format(args.data)
        else:
            data_paths[0] = "{}-train.txt".format(args.data)
            data_paths[1] = "{}-dev.txt".format(args.data)
            data_paths[2] = "{}-test.txt".format(args.data)

        args.train = morpho_dataset.MorphoDataset(
            data_paths[0],
            embeddings=args.embeddings_words if args.embeddings else None,
            bert=model_bert,
            lemma_re_strip=args.lemma_re_strip,
            lemma_rule_min=args.lemma_rule_min)

        if os.path.exists(data_paths[1]):
            args.dev = morpho_dataset.MorphoDataset(data_paths[1],
                                                    train=args.train,
                                                    shuffle_batches=False,
                                                    bert=model_bert)
        else:
            args.dev = None

        if os.path.exists(data_paths[2]):
            args.test = morpho_dataset.MorphoDataset(data_paths[2],
                                                     train=args.train,
                                                     shuffle_batches=False,
                                                     bert=model_bert)
        else:
            args.test = None

    print(args.bert_load)
    print("again")
    # TODO: read the size from the model instead of hard-coding it
    args.bert_size = 768
    if args.decay_type is not None:
        args.steps_in_epoch = math.floor(
            len(args.train.factors[1].word_strings) /
            (args.batch_size * args.accu))
    network = Network(
        args=args,
        num_words=len(args.train.factors[args.train.FORMS].words),
        num_chars=len(args.train.factors[args.train.FORMS].alphabet),
        factor_words=dict(
            (factor,
             len(args.train.factors[args.train.FACTORS_MAP[factor]].words))
            for factor in args.factors),
        model=model_bert)

    if args.debug:
        ...
        # tf.keras.utils.plot_model(network.outer_model, "my_first_model_with_shape_info.svg", show_shapes=True)

    if args.fine_lr > 0:
        args.lr_split = len(network.outer_model.trainable_variables) - len(
            network.model.trainable_variables)

    # print("model variables:")
    # print(str(network.model.trainable_variables))
    # print("outer model variables:")
    # print(str(network.outer_model.trainable_variables))
    network.args = args
    if args.predict:
        # network.saver_inference.restore(network.session, "{}/checkpoint-inference".format(args.predict))
        network.outer_model.load_weights(args.predict)
        network.predict(predict,
                        args,
                        open(saved + "_vystup", "w"),
                        compare=True)

    else:
        log_file = open("{}/log".format(args.logdir), "w")
        for factor in args.factors:
            print("{}: {}".format(
                factor,
                len(args.train.factors[args.train.FACTORS_MAP[factor]].words)),
                  file=log_file,
                  flush=True)
        print("Tagging with args:",
              "\n".join(("{}: {}".format(key, value)
                         for key, value in sorted(vars(args).items())
                         if key not in [
                             "embeddings_data", "embeddings_words", "train",
                             "test", "dev"
                         ])),
              flush=True)

        def test_eval(predict=None):
            metrics = network.evaluate(args.test, "test", args, predict)
            metrics_log = ", ".join(
                ("{}: {:.2f}".format(metric, 100 * metrics[metric])
                 for metric in metrics))
            for f in [sys.stderr, log_file]:
                print("Test, epoch {}, lr {}, {}".format(
                    epoch + 1, learning_rate, metrics_log),
                      file=f,
                      flush=True)

        for i, (epochs, learning_rate) in enumerate(args.epochs):
            tf.summary.experimental.set_step(0)
            epoch = 0
            test_eval()
            for epoch in range(epochs):
                network.train_epoch(args.train, args, learning_rate)

                if args.dev:
                    print("evaluate")
                    metrics = network.evaluate(args.dev, "dev", args)
                    metrics_log = ", ".join(
                        ("{}: {:.2f}".format(metric, 100 * metrics[metric])
                         for metric in metrics))
                    for f in [sys.stderr, log_file]:
                        print("Dev, epoch {}, lr {}, {}".format(
                            epoch + 1, learning_rate, metrics_log),
                              file=f,
                              flush=True)

                if args.cont and args.test:
                    test_eval()

            args.train.save_mappings("{}/mappings.pickle".format(args.logdir))
            if args.checkp:
                checkp = args.checkp
            else:
                checkp = args.logdir.split("/")[1]

        network.outer_model.save_weights('./checkpoints/' + checkp)
        output_file = args.logdir.split("/")[1]
        print(output_file)

        if args.test:
            test_eval(predict=open("./" + output_file + "_vysledky", "w"))
Example #12
class Network:

    MAX_GEN_LEN = 99
    EMBEDDING_SIZE = 100
    ALIGNMENT_SIZE = 100

    def __init__(self,
                 encoder, decoder,
                 rnn_cell, rnn_cell_dim,
                 chars_size, words_size, tags_size,
                 bow_char, eow_char,
                 logdir, expname,
                 threads=1, seed=42):
        # Create an empty graph and a session
        graph = tf.Graph()
        graph.seed = seed
        self.session = tf.Session(
                        graph=graph,
                        config=tf.ConfigProto(
                                    inter_op_parallelism_threads=threads,
                                    intra_op_parallelism_threads=threads))

        timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S")
        self.summary_writer = tf.summary.FileWriter("{}/{}-{}".format(logdir, timestamp, expname), flush_secs=10)

        # Construct the graph
        with self.session.graph.as_default():
            if rnn_cell == "LSTM":
                rnn_cell = tf.contrib.rnn.LSTMCell(rnn_cell_dim)
            elif rnn_cell == "GRU":
                rnn_cell = tf.contrib.rnn.GRUCell(rnn_cell_dim)
            else:
                raise ValueError("Unknown rnn_cell {}".format(rnn_cell))

            self.global_step = tf.Variable(0, dtype=tf.int64, trainable=False, name="global_step")
            self.sentence_lens = tf.placeholder(tf.int32, [None], name="sent_lens")
            self.lemma_ids = tf.placeholder(tf.int32, [None, None], name="lemma_ids")
            self.lemmas = tf.placeholder(tf.int64, [None, None], name="lemmas")
            self.lemma_lens = tf.placeholder(tf.int32, [None], name="lemma_lens")
            self.tag_ids = tf.placeholder(tf.int32, [None, None], name="tag_ids")
            self.tags = tf.placeholder(tf.int64, [None, None], name="tags")
            self.tag_lens = tf.placeholder(tf.int32, [None], name="tag_lens")
            self.form_ids = tf.placeholder(tf.int32, [None, None], name="form_ids")
            self.forms = tf.placeholder(tf.int64, [None, None], name="forms")
            self.form_lens = tf.placeholder(tf.int32, [None], name="form_lens")

            self.alphabet_len = chars_size
            self.word_vocab_len = words_size
            self.tag_vocab_len = tags_size

            self.dummy_inputs = tf.zeros([tf.shape(self.sentence_lens)[0], self.MAX_GEN_LEN], name="inference_shape")

            self.char_embedding_matrix = tf.get_variable(
                                            "char_embeddings",
                                            [self.alphabet_len, self.EMBEDDING_SIZE],
                                            initializer=tf.random_normal_initializer(stddev=0.01),
                                            dtype=tf.float32)
            self.we_lookup_matrix = tf.get_variable(
                                        "we_lookup_matrix",
                                        [self.word_vocab_len, self.EMBEDDING_SIZE],
                                        initializer=tf.random_normal_initializer(stddev=0.01),
                                        dtype=tf.float32,
                                        trainable=True)
            self.tag_lookup_matrix = tf.get_variable(
                                        "tag_lookup_matrix",
                                        [self.tag_vocab_len, self.EMBEDDING_SIZE],
                                        initializer=tf.random_normal_initializer(stddev=0.01),
                                        dtype=tf.float32,
                                        trainable=True)
           
            # Encode words
            with tf.variable_scope("encoder"):
                self.char_embeddings = tf.nn.embedding_lookup(self.char_embedding_matrix, self.lemmas)
                ch_rnn_cell = tf.contrib.rnn.GRUCell(rnn_cell_dim)
                hidden_states, final_states = tf.nn.bidirectional_dynamic_rnn(
                                                    cell_fw=ch_rnn_cell,
                                                    cell_bw=ch_rnn_cell,
                                                    inputs=self.char_embeddings,
                                                    sequence_length=self.lemma_lens,
                                                    dtype=tf.float32,
                                                    scope="char_BiRNN")

            self.sentence_mask = tf.sequence_mask(self.sentence_lens)

            # Create decoder input
            self.we_encoder_matrix = tf_layers.linear(
                                        tf.concat(axis=1, values=final_states),
                                        self.EMBEDDING_SIZE,
                                        scope="we_encoder_matrix")
            self.encoder_output = tf.nn.embedding_lookup(self.we_encoder_matrix, self.lemma_ids)
            self.encoder_output = tf.reshape(
                                    tf.boolean_mask(self.encoder_output, self.sentence_mask),
                                    [-1, self.EMBEDDING_SIZE],
                                    name="encoder_output_flat")

            # Encode tags
            self.tags_embedded = tf.nn.embedding_lookup(self.tag_lookup_matrix, self.tag_ids)
            self.tags_embedded = tf.reshape(
                                    tf.boolean_mask(self.tags_embedded, self.sentence_mask),
                                    [-1, self.EMBEDDING_SIZE],
                                    name="tag_embeddings_flat")

            # Combine encoder_output with tag embedding
            self.encoder_output = tf_layers.linear(
                                    tf.concat(axis=1, values=[self.encoder_output, self.tags_embedded]),
                                    self.EMBEDDING_SIZE,
                                    scope="encoder_output_with_tags")

            # Create annotations for attention
            self.annot_matrix = tf_layers.linear(
                                    tf.concat(axis=2, values=hidden_states),
                                    self.EMBEDDING_SIZE,
                                    scope="annot_matrix")
            self.annotations = tf.nn.embedding_lookup(self.annot_matrix, self.lemma_ids)
            self.annotations = tf.reshape(
                                tf.boolean_mask(self.annotations, self.sentence_mask),
                                [-1, tf.shape(self.annot_matrix)[1], self.EMBEDDING_SIZE],
                                name="annotations_flat")

            # Reshape form values
            self.forms_flat = tf.nn.embedding_lookup(self.forms, self.form_ids)
            self.forms_flat = tf.reshape(
                                    tf.boolean_mask(self.forms_flat, self.sentence_mask),
                                    [-1, tf.shape(self.forms)[1]],
                                    name="forms_flat")
            self.forms_flat_lens = tf.nn.embedding_lookup(self.form_lens, self.form_ids)
            self.forms_flat_lens = tf.reshape(
                                        tf.boolean_mask(self.forms_flat_lens, self.sentence_mask),
                                        [-1],
                                        name="lemmas_flat_lens")

            self.attention_fn = None
            if decoder in ["individual", "individual_attention", "combined_attention", "combined_attention_birnn"]:
                if decoder in ["individual_attention", "combined_attention", "combined_attention_birnn"]:
                    #self.attention_fn = self.attention_fn_builder(self.annotations)
                if decoder == "combined_attention":
                    word_embeddings = tf.nn.embedding_lookup(self.we_lookup_matrix, self.lemma_ids)
                    word_embeddings = tf.reshape(
                                        tf.boolean_mask(word_embeddings, self.sentence_mask),
                                        [-1, self.EMBEDDING_SIZE],
                                        name="word_embeddings_flat")
                    self.encoder_output = tf_layers.linear(
                                            tf.concat(axis=1, values=[self.encoder_output, word_embeddings]),
                                            self.EMBEDDING_SIZE,
                                            scope="combined_encoder_output")
                if decoder == "combined_attention_rnn":
            else:
                raise ValueError("Unknown decoder ({}).".format(decoder))

            # Decoder training
            with tf.variable_scope("decoder"):
                if decoder == "individual":
                    self.training_logits, states = tf_seq2seq.rnn_decoder(
                                                    decoder_inputs=self.forms_flat,
                                                    initial_state=self.encoder_output,
                                                    cell=rnn_cell)
                else:
                    self.training_logits, states = tf_seq2seq.attention_decoder(
                                                    decoder_inputs=self.forms_flat,
                                                    initial_state=self.encoder_output,
                                                    attention_states=self.annotations,
                                                    cell=rnn_cell)
                                                

                
                #self.training_logits, states = tf_seq2seq.dynamic_rnn_decoder(
                                                #cell=rnn_cell,
                                                #decoder_fn=self.decoder_fn_train(
                                                #    self.encoder_output,
                                                #    self.output_fn_builder(),
                                                #    self.input_fn_builder(self.char_embedding_matrix, self.attention_fn)),
                                                #inputs=tf.expand_dims(self.forms_flat, -1),
                                                #sequence_length=self.forms_flat_lens)

            
            # Decoder inference
            with tf.variable_scope("decoder", reuse=True):
                if decoder == "individual":
                    self.training_logits, states = tf_seq2seq.rnn_decoder(
                                                    decoder_inputs=self.dummy_inputs,
                                                    initial_state=self.encoder_output,
                                                    cell=rnn_cell,
                                                    loop_function=decoder_fn)
                else:
                    self.training_logits, states = tf_seq2seq.attention_decoder(
                                                    decoder_inputs=self.dummy_inputs,
                                                    initial_state=self.encoder_output,
                                                    attention_states=self.annotations,
                                                    cell=rnn_cell,
                                                    loop_function=decoder_fn)

                #self.inference_logits, states = tf_seq2seq.dynamic_rnn_decoder(
                                                    #cell=rnn_cell,
                                                    #decoder_fn=self.decoder_fn_inference(
                                                    #    self.encoder_output,
                                                    #    self.output_fn_builder(),
                                                    #    self.input_fn_builder(self.char_embedding_matrix, self.attention_fn),
                                                    #bow_char,
                                                    #eow_char,
                                                    #self.MAX_GEN_LEN))

            self.predictions = tf.argmax(self.inference_logits, 2)
            loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.training_logits, labels=self.forms_flat[:,1:]))
            self.training = tf.train.AdamOptimizer().minimize(loss, global_step=self.global_step)

            self.forms_flat = tf.cond(
                                tf.reduce_max(self.forms_flat_lens) > self.MAX_GEN_LEN,
                                lambda: tf.slice(self.forms_flat, [0, 0], [-1, self.MAX_GEN_LEN]),
                                lambda: self.forms_flat)

            self.pred_padded = tf.pad(
                            self.predictions,
                            [[0,0],[0, self.MAX_GEN_LEN - tf.shape(self.predictions)[1]]],
                            mode="CONSTANT")
            self.forms_padded = tf.pad(
                                self.forms_flat,
                                [[0,0],[0, self.MAX_GEN_LEN - tf.shape(self.forms_flat)[1] + 1]],
                                mode="CONSTANT")

            self.char_accuracy = tf_metrics.accuracy(self.pred_padded, self.forms_padded[:,1:])
            self.word_accuracy = tf.reduce_mean(tf.reduce_min(tf.cast(tf.equal(self.pred_padded, self.forms_padded[:,1:]), tf.float32), axis=1))


            self.summary = {}
            for dataset_name in ["train", "dev"]:
                self.summary[dataset_name] = tf.summary.merge([tf.summary.scalar(dataset_name+"/loss", loss),
                                             tf.summary.scalar(dataset_name+"/char_accuracy", self.char_accuracy),
                                             tf.summary.scalar(dataset_name+"/word_accuracy", self.word_accuracy)])

            # Initialize variables
            self.session.run(tf.global_variables_initializer())
            if self.summary_writer:
                self.summary_writer.add_graph(self.session.graph)


    # Simple decoder for training
    def decoder_fn_train(self, encoder_state, output_fn, input_fn, name=None):
        def decoder_fn(time, cell_state, next_id, cell_output, context_state):
            cell_output = output_fn(cell_output)
            reuse = True
            if cell_state is None:  # first call, return encoder_state
                cell_state = encoder_state
                reuse = None
            next_input = input_fn(tf.squeeze(next_id, [1]), cell_state, reuse)
            
            return (None, cell_state, next_input, cell_output, context_state)

        return decoder_fn

    # TODO: Beam search
    # Simple decoder for inference
    def decoder_fn_inference(self, encoder_state, output_fn, input_fn,
                         beginning_of_word="<bow>", end_of_word="<eow>", maximum_length=MAX_GEN_LEN):
        batch_size = tf.shape(encoder_state)[0]
        def decoder_fn(time, cell_state, cell_input, cell_output, context_state):
            cell_output = output_fn(cell_output)
            if cell_state is None:
                cell_state = encoder_state
                next_id = tf.tile([beginning_of_word], [batch_size])
                done = tf.zeros([batch_size], dtype=tf.bool)
            else:
                next_id = tf.argmax(cell_output, 1)
                done = tf.equal(next_id, end_of_word)
                done = tf.cond(
                        tf.greater_equal(time, maximum_length), # return true if time >= maxlen
                        lambda: tf.ones([batch_size], dtype=tf.bool),
                        lambda: done)
            next_input = input_fn(next_id, cell_state, True)

            return (done, cell_state, next_input, cell_output, context_state)

        return decoder_fn

    def decoder_fn_builder(self, encoder_state, output_fn, input_fn,
                        beginning_of_word="<bow>", end_of_word="<eow>", maximum_length=MAX_GEN_LEN):
        def decoder_fn(cell_output, i):
            cell_output = output_fn(cell_output)
            next_input = tf.argmax(cell_output, 1)
            next_input = input_fn(next_input)
            # Feed the embedded greedy prediction back as the next RNN input.
            return next_input

        return decoder_fn

    # TODO: dropout
    def attention_fn_builder(self, annotations):
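        # Additive attention: score every encoder annotation against the current
        # decoder state via e = v^T tanh(W*state + U*annotation), softmax the scores
        # into weights, and return a learned projection of the weighted annotation sum.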
        def attention_fn(state):
            batch_size = tf.shape(state)[0]
            annot_len = tf.shape(annotations)[1]

            annot_dim = annotations.get_shape().as_list()[2]
            state_dim = state.get_shape().as_list()[1]
            e_dim = self.ALIGNMENT_SIZE

            a = tf.reshape(annotations, [-1, annot_dim])

            U = tf.get_variable(
                    "annot_weight",
                    shape=[annot_dim, e_dim],
                    initializer=tf.random_normal_initializer(stddev=0.1),
                    trainable=True)
            U_b = tf.get_variable(
                    "annot_bias",
                    shape=[e_dim],
                    initializer=tf.constant_initializer(0.1)) 

            W = tf.get_variable(
                    "state_weight",
                    shape=[state_dim, e_dim],
                    initializer=tf.random_normal_initializer(stddev=0.1),
                    trainable=True)
            W_b = tf.get_variable(
                    "state_bias",
                    shape=[e_dim],
                    initializer=tf.constant_initializer(0.1))

            v = tf.get_variable(
                    "lin_combo",
                    shape=[e_dim, 1],
                    initializer=tf.random_normal_initializer(stddev=0.1),
                    trainable=True)

            w_res = tf.matmul(state, W) + W_b
            w_res = tf.tile(tf.reshape(w_res, [-1, 1]), [1, annot_len])

            u_res = tf.matmul(a, U) + U_b
            u_res = tf.reshape(u_res, [-1, annot_len])

            e = tf.matmul(tf.tanh(tf.reshape(w_res + u_res, [-1, e_dim])), v)
            e = tf.reshape(e, [batch_size, -1])

            alpha = tf.nn.softmax(e)
            alpha = tf.tile(tf.reshape(alpha, [-1, 1]), [1, annot_dim])
            c = tf.multiply(alpha, a)
            c = tf.reduce_sum(tf.reshape(c, [batch_size, -1, annot_dim]), 1)

            C = tf.get_variable(
                    "attention_weight",
                    shape=[state_dim, state_dim],
                    initializer=tf.random_normal_initializer(stddev=0.1),
                    trainable=True)
            C_b = tf.get_variable(
                    "attention_bias",
                    shape=[state_dim],
                    initializer=tf.constant_initializer(0.1))

            return tf.add(tf.matmul(c, C), C_b)

        return attention_fn


    # Output function builder (makes logits out of rnn outputs)
    def output_fn_builder(self):
        def output_fn(cell_output):
            if cell_output is None:
                return tf.zeros([self.alphabet_len], tf.float32) # only used for shape inference
            else:
                return tf_layers.linear(
                            cell_output,
                            num_outputs=self.alphabet_len,
                            scope="decoder_output")

        return output_fn

    # Input function builder (makes the rnn input from a word id). The cell_state
    # and reuse arguments are accepted for compatibility with the decoder fns above
    # but ignored; the attention-aware variant is kept commented out below.
    def input_fn_builder(self, embeddings):
        def input_fn(next_id, cell_state=None, reuse=None):
            return tf.nn.embedding_lookup(embeddings, next_id)

        return input_fn


    # Input function builder (makes rnn input from word id and cell state)
    #def input_fn_builder(self, embeddings, attention_fn=None):
    #    def input_fn(next_id, cell_state, reuse=True):
    #        if attention_fn is not None:
    #            with tf.variable_scope("attention", reuse=reuse):
    #                return tf.add(
    #                            tf.nn.embedding_lookup(embeddings, next_id),
    #                            attention_fn(cell_state))
    #        else:
    #            return tf.nn.embedding_lookup(embeddings, next_id)
    #
    #    return input_fn

    @property
    def training_step(self):
        return self.session.run(self.global_step)

    def train(self,
              sentence_lens,
              forms, form_ids, form_lens,
              tags, tag_ids, tag_lens,
              lemmas, lemma_ids, lemma_lens):
        try:
            _, summary, pred = self.session.run([self.training, self.summary, self.predictions],
                                      {self.sentence_lens: sentence_lens,
                                       self.forms: forms,
                                       self.form_ids: form_ids,
                                       self.form_lens: form_lens,
                                       self.tags: tags,
                                       self.tag_ids: tag_ids,
                                       self.tag_lens: tag_lens,
                                       self.lemmas: lemmas,
                                       self.lemma_ids: lemma_ids,
                                       self.lemma_lens: lemma_lens})
        except Exception as e:
            import pdb; pdb.set_trace()
            raise e

        self.summary_writer.add_summary(summary["train"], self.training_step)

    def evaluate(self,
                 sentence_lens,
                 forms, form_ids, form_lens,
                 tags, tag_ids, tag_lens,
                 lemmas, lemma_ids, lemma_lens):
        try:
            ch_acc, w_acc, summary, pred = self.session.run([self.char_accuracy, self.word_accuracy, self.summary, self.predictions],
                                             {self.sentence_lens: sentence_lens,
                                              self.forms: forms,
                                              self.form_ids: form_ids,
                                              self.form_lens: form_lens,
                                              self.tags: tags,
                                              self.tag_ids: tag_ids,
                                              self.tag_lens: tag_lens,
                                              self.lemmas: lemmas,
                                              self.lemma_ids: lemma_ids,
                                              self.lemma_lens: lemma_lens})
        except Exception as e:
            import pdb; pdb.set_trace()
            raise e

        self.summary_writer.add_summary(summary["dev"], self.training_step)
        return ch_acc, w_acc

    def predict(self,
                sentence_lens,
                lemmas, lemma_ids, lemma_lens,
                tags, tag_ids, tag_lens):
        predictions = self.session.run(self.predictions,
                                {self.sentence_lens: sentence_lens,
                                 self.lemmas: lemmas,
                                 self.lemma_ids: lemma_ids,
                                 self.lemma_lens: lemma_lens,
                                 self.tags: tags,
                                 self.tag_ids: tag_ids,
                                 self.tag_lens: tag_lens})
        return predictions

if __name__ == "__main__":
    # Fix random seed
    np.random.seed(42)

    # Parse arguments
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--batch_size", default=64, type=int, help="Batch size.")
    parser.add_argument("--data_train", default="data/en-train-gen.txt", type=str, help="Training data file.")
    parser.add_argument("--data_dev", default="data/en-dev.txt", type=str, help="Development data file.")
    parser.add_argument("--data_test", default="data/en-test-gen.txt", type=str, help="Testing data file.")
    parser.add_argument("--epochs", default=10, type=int, help="Number of epochs.")
    parser.add_argument("--logdir", default="logs", type=str, help="Logdir name.")
    parser.add_argument("--rnn_cell", default="GRU", type=str, help="RNN cell type.")
    parser.add_argument("--rnn_cell_dim", default=100, type=int, help="RNN cell dimension.")
    parser.add_argument("--encoder", default="simple", type=str, help="Which encoder should we use.")
    parser.add_argument("--decoder", default="individual", type=str, help="Which decoder should we use.")
    parser.add_argument("--threads", default=1, type=int, help="Maximum number of threads to use.")
    args = parser.parse_args()

    # Load the data
    print("Loading the data.", file=sys.stderr)
    data_train = morpho_dataset.MorphoDataset(args.data_train, add_bow_eow=True)
    data_dev = morpho_dataset.MorphoDataset(args.data_dev, add_bow_eow=True, train=data_train)
    data_test = morpho_dataset.MorphoDataset(args.data_test, add_bow_eow=True, train=data_train)
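    # Indices of the <bow>/<eow> markers in the alphabet shared with the training data.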
    bow_char = data_train.alphabet.index("<bow>")
    eow_char = data_train.alphabet.index("<eow>")

    # Construct the network
    print("Constructing the network.", file=sys.stderr)
    expname = "generator-{}{}-bs{}-epochs{}".format(args.rnn_cell, args.rnn_cell_dim, args.batch_size, args.epochs)
    network = Network(rnn_cell=args.rnn_cell,
                      encoder=args.encoder,
                      decoder=args.decoder,
                      rnn_cell_dim=args.rnn_cell_dim,
                      chars_size=len(data_train.alphabet),
                      words_size=len(data_train.factors[data_train.FORMS]['words']),
                      tags_size=len(data_train.factors[data_train.TAGS]['words']),
                      bow_char=bow_char,
                      eow_char=eow_char,
                      logdir=args.logdir,
                      expname=expname,
                      threads=args.threads)

    # Train
    best_dev_ch_acc = 0
    best_dev_w_acc = 0
    test_predictions = None

    for epoch in range(args.epochs):
        print("Training epoch {}".format(epoch + 1), file=sys.stderr)
        while not data_train.epoch_finished():
            sentence_lens, form_ids, charseq_ids, charseqs, charseq_lens = \
                data_train.next_batch(args.batch_size, including_charseqs=True)

            network.train(
                sentence_lens,
                charseqs[data_train.FORMS],
                charseq_ids[data_train.FORMS],
                charseq_lens[data_train.FORMS],
                charseqs[data_train.TAGS],
                charseq_ids[data_train.TAGS],
                charseq_lens[data_train.TAGS],
                charseqs[data_train.LEMMAS],
                charseq_ids[data_train.LEMMAS],
                charseq_lens[data_train.LEMMAS])

        sentence_lens, form_ids, charseq_ids, charseqs, charseq_lens = data_dev.whole_data_as_batch(including_charseqs=True)
        dev_ch_acc, dev_w_acc = network.evaluate(
                                    sentence_lens,
                                    charseqs[data_train.FORMS],
                                    charseq_ids[data_train.FORMS],
                                    charseq_lens[data_train.FORMS],
                                    charseqs[data_train.TAGS],
                                    charseq_ids[data_train.TAGS],
                                    charseq_lens[data_train.TAGS],
                                    charseqs[data_train.LEMMAS],
                                    charseq_ids[data_train.LEMMAS],
                                    charseq_lens[data_train.LEMMAS])

        print("Development ch_acc after epoch {} is {:.2f}, w_acc is {:.2f}.".format(epoch + 1, 100. * dev_ch_acc, 100. * dev_w_acc), file=sys.stderr)

        if dev_w_acc > best_dev_w_acc or (dev_w_acc == best_dev_w_acc and dev_ch_acc > best_dev_ch_acc):
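            # New best dev word accuracy (ties broken by character accuracy): keep these test predictions.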
            best_dev_w_acc = dev_w_acc
            best_dev_ch_acc = dev_ch_acc

            sentence_lens, form_ids, charseq_ids, charseqs, charseq_lens = data_test.whole_data_as_batch(including_charseqs=True)
            test_predictions = network.predict(
                                    sentence_lens,
                                    charseqs[data_train.LEMMAS],
                                    charseq_ids[data_train.LEMMAS],
                                    charseq_lens[data_train.LEMMAS],
                                    charseqs[data_train.TAGS],
                                    charseq_ids[data_train.TAGS],
                                    charseq_lens[data_train.TAGS])

    # Print test predictions
    test_forms = data_test.factors[data_test.FORMS]['strings'] # We use strings instead of words, because words can be <unk>
    test_predictions = list(test_predictions)
    for i in range(len(data_test.sentence_lens)):
        for j in range(data_test.sentence_lens[i]):
            form = ''
            pred = test_predictions.pop(0)
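            # Convert the predicted character ids back into a string, stopping at the first <eow>.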
            for k in range(len(pred)):
                if pred[k] == eow_char:
                    break
                form += data_test.alphabet[pred[k]]
            print("{}\t{}\t_".format(test_forms[i][j], form))
        print()

    print("Final best dev set accuracy: {:.2f}".format(100. * best_dev_w_acc))
Example #13
0
    # np.random.seed(42)

    # Parse arguments
    # parser = argparse.ArgumentParser()
    # parser.add_argument("--batch_size", default=10, type=int, help="Batch size.")
    # parser.add_argument("--epochs", default=10, type=int, help="Number of epochs.")
    # parser.add_argument("--threads", default=8, type=int, help="Maximum number of threads to use.")
    parser = argparse.ArgumentParser()
    parser.add_argument("best", type=str, help="dev.")
    parser.add_argument("prediction",  type=str, help="prediction.")
    args = parser.parse_args()

    analyzer_dictionary = MorphoAnalyzer("../18_tagger_sota/czech-pdt-analysis-dictionary.txt")
    analyzer_guesser = MorphoAnalyzer("../18_tagger_sota/czech-pdt-analysis-guesser.txt")

    prediction = morpho_dataset.MorphoDataset(args.prediction)

    dir = os.path.dirname(args.prediction)
    f = os.path.basename(args.prediction)

    with open("{}/a_{}.txt".format(dir, f), "w", encoding="utf-8") as test_file:
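        # For every token, write "form<TAB>analyze(...)<TAB>_"; analyze() is presumably
        # defined earlier in the full script and combines the prediction with the
        # dictionary/guesser analyses.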
        forms = prediction.factors[prediction.FORMS].strings
        tags = prediction.factors[prediction.LEMMAS].strings
        for s in range(len(forms)):
            for j in range(len(forms[s])):
                print("{}\t{}\t_".format(forms[s][j], analyze(forms[s][j], tags[s][j], analyzer_dictionary, analyzer_guesser)), file=test_file)
            print("", file=test_file)

    print("Puvodni")
    os.system('python morpho_eval.py ' + args.best + " " + args.prediction)
    print("+Analyzer")
Example #14
0
                        default=8,
                        type=int,
                        help="Maximum number of threads to use.")
    args = parser.parse_args()

    # Create logdir name
    args.logdir = "logs/{}-{}-{}".format(
        os.path.basename(__file__),
        datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S"), ",".join(
            ("{}={}".format(re.sub("(.)[^_]*_?", r"\1", key), value)
             for key, value in sorted(vars(args).items()))))
    if not os.path.exists("logs"):
        os.mkdir("logs")  # TF 1.6 will do this by itself

    # Load the data
    train = morpho_dataset.MorphoDataset(
        "../18_tagger_sota/czech-pdt-train.txt", max_sentences=1000)
    dev = morpho_dataset.MorphoDataset("../18_tagger_sota/czech-pdt-dev.txt",
                                       train=train,
                                       shuffle_batches=False)
    test = morpho_dataset.MorphoDataset("../18_tagger_sota/czech-pdt-test.txt",
                                        train=train,
                                        shuffle_batches=False)

    analyzer_dictionary = MorphoAnalyzer(
        "../18_tagger_sota/czech-pdt-analysis-dictionary.txt")
    analyzer_guesser = MorphoAnalyzer(
        "../18_tagger_sota/czech-pdt-analysis-guesser.txt")

    # Construct the network
    network = Network(threads=args.threads)
    network.construct(args, len(train.factors[train.FORMS].alphabet),
Example #15
0
    #train = morpho_dataset.MorphoDataset("/home/liefe/data/cs/train.txt", lowercase=True)
    #train = morpho_dataset.MorphoDataset("/afs/ms/u/l/liefe/data/cs/train.txt", lowercase=True)
    # To read as text
    #file = 'word2vec_cs.txt'
    #we, index_to_word, word_to_index = get_params(file)
    #print(we)
    #print(index_to_word[14])
    #print(word_to_index['odkazy'])

    # Read bin file
    model_file = sys.argv[1]
    train_file = sys.argv[2]
    home = expanduser('~')
    train_file = home + '/data/cs/' + train_file
    train = morpho_dataset.MorphoDataset(train_file, lowercase=False)
    #train = morpho_dataset.MorphoDataset(train_file, lowercase=True)

    # Save file in numpy format
    #with open(model_file, 'wb') as f:
    #file = '/home/liefe/py/wv_data/word2vec_cs64.bin'

    #model = load_text(model_file) # read text file
    model = load_bin(model_file)  # read binary file

    print('model shape:', model.shape)
    #print(index_to_word[14])
    #print('done embedding..testing')
    idx = train.factors[train.FORMS].words_map.get('odkazy')
    print('odkazy:', idx)
Example #16
0
    
    #train = morpho_dataset.MorphoDataset("/home/liefe/data/cs/train.txt", lowercase=True)
    #train = morpho_dataset.MorphoDataset("/afs/ms/u/l/liefe/data/cs/train.txt", lowercase=True)
    # To read as text
    #file = 'word2vec_cs.txt'
    #we, index_to_word, word_to_index = get_params(file)
    #print(we)
    #print(index_to_word[14])
    #print(word_to_index['odkazy'])
    
    # Read bin file
    model_file = sys.argv[1]
    train_file = sys.argv[2]
    home = expanduser('~')
    train_file = home + '/data/cs/' + train_file 
    train = morpho_dataset.MorphoDataset(train_file, lowercase=True)
            

    # Save file in numpy format
    #with open(model_file, 'wb') as f:
        #file = '/home/liefe/py/wv_data/word2vec_cs64.bin'
        
    model = load_text(model_file) # read text file
    print('model shape: ', model.shape)
    #print(index_to_word[14])
    print('done embedding..testing')
    idx = train.factors[train.FORMS].words_map.get('odkazy')
    print('odkazy: {}, we={}'.format(idx, model[idx,:])) 
    print('saving model')
    np.save(model_file + '_embedded', model)