Example #1
    def initialize(self, hidden_size, projection_size, in_vocabulary, out_vocabulary, batch_size, hidden_activation="Tanh", bptt_steps=5, use_pauses=False):

        self.hidden_size = hidden_size
        self.projection_size = projection_size
        self.bptt_steps = bptt_steps
        self.batch_size = batch_size
        self.use_pauses = use_pauses

        self.in_vocabulary = in_vocabulary
        self.out_vocabulary = out_vocabulary

        self.hidden_activation_name = hidden_activation
        self.hidden_activation = getattr(activation_functions, hidden_activation)
        
        self.We = self.weights(get_vocabulary_size(self.in_vocabulary), self.projection_size)
        self.Wp = self.weights(1, self.projection_size)
        self.W = self.weights(self.projection_size, self.hidden_size*4)
        self.Wip = self.weights(1, self.hidden_size)
        self.Wfp = self.weights(1, self.hidden_size)
        self.Wop = self.weights(1, self.hidden_size)
        self.Wr = self.weights(self.hidden_size, self.hidden_size*4)
        self.Wy = self.weights(self.hidden_size, get_vocabulary_size(self.out_vocabulary))

        # AdaGrad: sum of squares of per-feature historical gradients
        for p in self.params:
            setattr(self, p+"_hg", np.zeros_like(getattr(self, p)))

        self.reset_state()

        self.initialized = True
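Examples #1-#5 and #9 call get_vocabulary_size on a vocabulary object taken from the prepared data files. The helper itself is not shown in this listing; a minimal sketch, assuming the vocabulary is simply a dict that maps tokens to integer IDs (an assumption, not confirmed by the source), could be:

def get_vocabulary_size(vocabulary):
    # Assumed: one ID per known token; adjust if extra IDs (e.g. for <UNK>) are reserved.
    return len(vocabulary)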
Example #2
def train(model_name, p1_train_data, p1_dev_data, p2_train_data, p2_dev_data):

    ### PHASE 1 ###    

    training_data = np.load(p1_train_data)
    validation_data = np.load(p1_dev_data)

    assert training_data["batch_size"] == validation_data["batch_size"]
    assert training_data["vocabulary"] == validation_data["vocabulary"]
    assert training_data["punctuations"] == validation_data["punctuations"]

    print "1st phase data loaded..."

    print "Vocabulary size is %d" % utils.get_vocabulary_size(validation_data["vocabulary"])
    print "Training set size is %d" % training_data["total_size"]
    print "Validation set size is %d" % validation_data["total_size"]

    net = models.T_LSTM()
    net.initialize(hidden_size=conf.PHASE1["HIDDEN_SIZE"],
                   projection_size=conf.PHASE1["PROJECTION_SIZE"],
                   in_vocabulary=training_data["vocabulary"],
                   out_vocabulary=training_data["punctuations"],
                   batch_size=training_data["batch_size"],
                   hidden_activation=conf.PHASE1["HIDDEN_ACTIVATION"],
                   bptt_steps=conf.PHASE1["BPTT_STEPS"],
                   use_pauses=False)

    _train(net, training_data, validation_data, model_name, conf.PHASE1["LEARNING_RATE"], conf.PHASE1["MAX_EPOCHS"], conf.PHASE1["MIN_IMPROVEMENT"])

    ### PHASE 2 ###

    if not os.path.isfile(p2_train_data) or not os.path.isfile(p2_dev_data):
        print "No second phase data."
        return

    training_data = np.load(p2_train_data)
    validation_data = np.load(p2_dev_data)

    assert training_data["batch_size"] == validation_data["batch_size"] == net.batch_size
    assert training_data["vocabulary"] == validation_data["vocabulary"] == net.in_vocabulary
    assert training_data["punctuations"] == validation_data["punctuations"] == net.out_vocabulary

    print "2nd phase data loaded..."

    print "Training set size is %d" % training_data["total_size"]
    print "Validation set size is %d" % validation_data["total_size"]
    print "Trainging %s pause durations." % ("with" if conf.PHASE2["USE_PAUSES"] else "without")

    t_lstm = net

    net = models.TA_LSTM()
    net.initialize(hidden_size=conf.PHASE2["HIDDEN_SIZE"],
                   t_lstm=t_lstm,
                   out_vocabulary=training_data["punctuations"],
                   batch_size=training_data["batch_size"],
                   hidden_activation=conf.PHASE2["HIDDEN_ACTIVATION"],
                   bptt_steps=conf.PHASE2["BPTT_STEPS"],
                   use_pauses=conf.PHASE2["USE_PAUSES"])

    _train(net, training_data, validation_data, model_name, conf.PHASE2["LEARNING_RATE"], conf.PHASE2["MAX_EPOCHS"], conf.PHASE2["MIN_IMPROVEMENT"])
Example #3
def train(model_name, p1_train_data, p1_dev_data, p2_train_data, p2_dev_data):

    ### PHASE 1 ###

    training_data = np.load(p1_train_data)
    validation_data = np.load(p1_dev_data)

    assert training_data["batch_size"] == validation_data["batch_size"]
    assert training_data["vocabulary"] == validation_data["vocabulary"]
    assert training_data["punctuations"] == validation_data["punctuations"]

    print("1st phase data loaded...")

    print("Vocabulary size is %d" %
          utils.get_vocabulary_size(validation_data["vocabulary"]))
    print("Training set size is %d" % training_data["total_size"])
    print("Validation set size is %d" % validation_data["total_size"])

    net = models.T_LSTM()
    net.initialize(hidden_size=conf.PHASE1["HIDDEN_SIZE"],
                   projection_size=conf.PHASE1["PROJECTION_SIZE"],
                   in_vocabulary=training_data["vocabulary"],
                   out_vocabulary=training_data["punctuations"],
                   batch_size=training_data["batch_size"],
                   hidden_activation=conf.PHASE1["HIDDEN_ACTIVATION"],
                   gate_activation=conf.GATE_ACTIVATION,
                   bptt_steps=conf.PHASE1["BPTT_STEPS"],
                   use_pauses=False)

    _train(net, training_data, validation_data, model_name,
           conf.PHASE1["LEARNING_RATE"], conf.PHASE1["MAX_EPOCHS"],
           conf.PHASE1["MIN_IMPROVEMENT"])
Example #4
    def initialize(self,
                   hidden_size,
                   projection_size,
                   in_vocabulary,
                   out_vocabulary,
                   batch_size,
                   hidden_activation="Tanh",
                   gate_activation="Sigmoid",
                   bptt_steps=5,
                   use_pauses=False):

        self.hidden_size = hidden_size
        self.projection_size = projection_size
        self.bptt_steps = bptt_steps
        self.batch_size = batch_size
        self.use_pauses = use_pauses

        self.in_vocabulary = in_vocabulary
        self.out_vocabulary = out_vocabulary

        self.hidden_activation_name = hidden_activation
        self.hidden_activation = getattr(activation_functions,
                                         hidden_activation)

        self.gate_activation_name = gate_activation
        self.gate_activation = getattr(activation_functions, gate_activation)

        self.We = self.weights(get_vocabulary_size(self.in_vocabulary),
                               self.projection_size)
        self.Wp = self.weights(1, self.projection_size)
        self.W = self.weights(self.projection_size, self.hidden_size * 4)
        self.Wip = self.weights(1, self.hidden_size)
        self.Wfp = self.weights(1, self.hidden_size)
        self.Wop = self.weights(1, self.hidden_size)
        self.Wr = self.weights(self.hidden_size, self.hidden_size * 4)
        self.Wy = self.weights(self.hidden_size,
                               get_vocabulary_size(self.out_vocabulary))

        # AdaGrad: sum of squares of per-feature historical gradients
        for p in self.params:
            setattr(self, p + "_hg", np.zeros_like(getattr(self, p)))

        self.reset_state()

        self.initialized = True
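The *_hg buffers initialized above are AdaGrad's per-parameter sums of squared historical gradients. As a reminder of how such accumulators are typically consumed, here is a generic AdaGrad step (a sketch, not the update code from this repository):

import numpy as np

def adagrad_update(param, grad, hist_grad, learning_rate, eps=1e-8):
    # Accumulate the squared gradient, then scale the step per feature.
    hist_grad += grad ** 2
    param -= learning_rate * grad / (np.sqrt(hist_grad) + eps)
    return param, hist_grad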
Example #5
    def initialize(self,
                   hidden_size,
                   t_lstm,
                   out_vocabulary,
                   batch_size,
                   hidden_activation="Tanh",
                   bptt_steps=5,
                   use_pauses=False):

        assert isinstance(t_lstm, T_LSTM)

        self.hidden_size = hidden_size
        self.t_lstm = t_lstm
        self.bptt_steps = bptt_steps
        self.batch_size = batch_size
        self.use_pauses = use_pauses

        self.in_vocabulary = self.t_lstm.in_vocabulary
        self.out_vocabulary = out_vocabulary

        self.hidden_activation_name = hidden_activation
        self.hidden_activation = getattr(activation_functions,
                                         hidden_activation)

        self.W = self.weights(self.t_lstm.hidden_size, self.hidden_size * 4)
        self.Wp = self.weights(1, self.hidden_size * 4)
        self.Wy = self.weights(self.hidden_size,
                               get_vocabulary_size(self.out_vocabulary))
        self.Wip = self.weights(1, self.hidden_size)
        self.Wfp = self.weights(1, self.hidden_size)
        self.Wop = self.weights(1, self.hidden_size)
        self.Wr = self.weights(self.hidden_size, self.hidden_size * 4)

        # AdaGrad: sum of squares of per-feature historical gradients
        for p in self.params:
            setattr(self, p + "_hg", np.zeros_like(getattr(self, p)))

        self.reset_state()

        self.initialized = True
Example #6
SEQUENCE_LENGTH = 250
EMBEDDING_DIM = 100
HIDDEN_SIZE = 150
ATTENTION_SIZE = 50
KEEP_PROB = 0.8
BATCH_SIZE = 256
NUM_EPOCHS = 3  # The model easily overfits without pre-trained word embeddings, so train for only a few epochs
DELTA = 0.5
MODEL_PATH = './model'

# Load the data set
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=NUM_WORDS,
                                                      index_from=INDEX_FROM)

# Sequences pre-processing
vocabulary_size = get_vocabulary_size(X_train)
X_test = fit_in_vocabulary(X_test, vocabulary_size)
X_train = zero_pad(X_train, SEQUENCE_LENGTH)
X_test = zero_pad(X_test, SEQUENCE_LENGTH)

# Different placeholders
with tf.name_scope('Inputs'):
    batch_ph = tf.placeholder(tf.int32, [None, SEQUENCE_LENGTH],
                              name='batch_ph')
    target_ph = tf.placeholder(tf.float32, [None], name='target_ph')
    seq_len_ph = tf.placeholder(tf.int32, [None], name='seq_len_ph')
    keep_prob_ph = tf.placeholder(tf.float32, name='keep_prob_ph')

# Embedding layer
with tf.name_scope('Embedding_layer'):
    embeddings_var = tf.Variable(tf.random_uniform(
        [vocabulary_size, EMBEDDING_DIM], -1.0, 1.0), trainable=True)
    batch_embedded = tf.nn.embedding_lookup(embeddings_var, batch_ph)
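The snippet above also depends on fit_in_vocabulary, zero_pad and a sequence-based get_vocabulary_size, none of which appear in the listing. One plausible implementation, assuming X is a list of lists of word indices (an assumption for illustration):

import numpy as np

def get_vocabulary_size(X):
    # Largest word index plus one for the 0 padding index.
    return max(max(seq) for seq in X) + 1

def fit_in_vocabulary(X, voc_size):
    # Drop indices that fall outside the vocabulary size used for the embedding matrix.
    return [[w for w in seq if w < voc_size] for seq in X]

def zero_pad(X, seq_len):
    # Truncate or right-pad every sequence to exactly seq_len entries.
    return np.array([seq[:seq_len] + [0] * max(seq_len - len(seq), 0) for seq in X])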
Example #7
    def read_sparse_input(lang_code, concepts_to_include, validation_set):
        '''Reads data for language lang_code from a file in the sparse matrix folder.
        All concepts not found in the set concepts_to_include are excluded.
        Additionally, concepts found in validation_set are also excluded from the read data
        (even if they are found in concepts_to_include).'''

        rows = []
        columns = []
        data = []
        concept_id_2_index = {}

        intersection_with_concept_validation_set = set()  # Sanity check

        with gzip.open(utils.get_sparse_matrix_file_path(lang_code), 'r') as f:
            # Used to check for repeated assignment of same cell (which should not happen)
            non_zero = set()
            # Used to check that word IDs stay within the vocabulary size
            vocabulary_size = utils.get_vocabulary_size(lang_code)

            documents_counter = 0
            for line in codecs.getreader("utf-8")(f):
                parts = line.split()

                if len(parts) != 3:
                    raise Exception(
                        "Line formatting not as expected! Line: %s" % line)

                concept = parts[0]
                word_id = int(parts[1])

                if concept not in concepts_to_include:
                    continue

                if concept in validation_set:
                    intersection_with_concept_validation_set.add(concept)
                    continue

                # Keeps track of which concept is described in the given document
                if concept not in concept_id_2_index:
                    if documents_counter % 100000 == 0:
                        print("%d documents processed" % documents_counter)
                        sys.stdout.flush()

                    concept_id_2_index[concept] = documents_counter
                    documents_counter = documents_counter + 1

                r = concept_id_2_index[concept]

                if word_id > vocabulary_size:
                    raise Exception('Word_ID higher than vocabulary size!')

                c = word_id
                d = int(parts[2])

                field = (r, c)
                if field in non_zero:
                    # raise Exception('Repeated cell assignment!')
                    print('Repeated cell assignment!')
                    print(field)
                else:
                    non_zero.add(field)

                    rows.append(r)
                    columns.append(c)
                    data.append(d)

        index_2_concept_id = {
            index: concept_id
            for concept_id, index in concept_id_2_index.items()
        }

        # Generating the word index
        word_id_2_index = {}

        # Remove zero columns by first generating a new index for the nonzero columns,
        # then updating the document representation to comply with the new index,
        # which contains only the nonzero columns.

        c = 0
        non_zero_word_ids = sorted(set(columns))
        for word_id in non_zero_word_ids:
            word_id_2_index[word_id] = c
            c += 1

        for i in range(len(columns)):
            word_id = columns[i]
            columns[i] = word_id_2_index[word_id]

        index_2_word_id = {
            index: word_id
            for word_id, index in word_id_2_index.items()
        }

        print("Intersection with validation set in %s counts %d elements" %
              (lang_code,
               len(intersection_with_concept_validation_set)))  # Sanity check
        sys.stdout.flush()

        temp_processed_data = {
            'rows': np.array(rows),
            'columns': np.array(columns),
            'data': np.array(data),
            'concept_id_2_index': concept_id_2_index,
            'index_2_concept_id': index_2_concept_id,
            'word_id_2_index': word_id_2_index,
            'index_2_word_id': index_2_word_id
        }

        return temp_processed_data
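The rows/columns/data lists returned above are COO-style triplets. A typical next step, not shown in this listing, would be to assemble them into a sparse document-term matrix, for example with scipy (a sketch; the function name is illustrative):

from scipy.sparse import coo_matrix

def to_document_term_matrix(processed):
    # Rows are concepts (documents); columns are the re-indexed nonzero word IDs.
    n_rows = len(processed['concept_id_2_index'])
    n_cols = len(processed['word_id_2_index'])
    return coo_matrix(
        (processed['data'], (processed['rows'], processed['columns'])),
        shape=(n_rows, n_cols)).tocsr()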
Example #8
def train_RNN(SEQUENCE_LENGTH, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE,
              KEEP_PROB, BATCH_SIZE, NUM_EPOCHS, DELTA, LEARNING_RATE,
              ALPHA_DIVIDER, MAXIMUM_DATA_NUM):
    t1 = time.time()
    # Load preprocessed data
    with open('raw_sums', 'rb') as fp:
        raw_sums = pickle.load(fp)

    with open('raw_texts', 'rb') as fp:
        raw_texts = pickle.load(fp)

    with open('vocab_limit', 'rb') as fp:
        vocab_limit = pickle.load(fp)

    # Remove stopwords for the y_in of the ARNN (read the stopword list once)
    stopwords = set(open('english.txt').read().split())
    y_trains = []
    for line in raw_sums:
        line = [word for word in line if word not in stopwords]
        y_trains.append(line)
    print(y_trains[0])

    # Embed vocabulary (important so that our int2word later has only one embedding per word)
    embd_vocab = []
    max_words = len(vocab_limit)
    tokenizer = Tokenizer(num_words=max_words)
    # This builds the word index
    tokenizer.fit_on_texts(vocab_limit)
    # This turns strings into lists of integer indices.
    embd_vocab = tokenizer.texts_to_sequences(vocab_limit)
    embd_vocab = list(itertools.chain(*embd_vocab))
    #embd_vocab.append(encoded_docs)
    print('TRAIN_ARNN: vocab embedded.')
    print('EMBD_VOCAB length: ' + str(len(embd_vocab)))

    # Saving
    with open('tokenizer.pickle', 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

    keys = vocab_limit[:]
    values = embd_vocab[:]
    dictionary = dict(zip(keys, values))

    def word2int(texts):
        # Map each word to its integer ID; words missing from the dictionary are skipped
        embd_texts = []
        for each_text in texts:
            embd_text = [dictionary[word] for word in each_text
                         if word in dictionary]
            embd_texts.append(embd_text)
        return embd_texts

    embd_texts = word2int(raw_texts)
    embd_sums = word2int(raw_sums)
    embd_summaries = word2int(y_trains)

    #%% ADDED BY MKLOENNE from summarization model -- Creating train and test sets

    train_len = int((.7) * len(embd_sums))

    train_texts = embd_texts[0:train_len]
    train_summaries = embd_sums[0:train_len]
    train_sums = embd_summaries[0:train_len]

    val_len = int((.15) * len(embd_sums))
    '''
    val_texts = embd_texts[train_len:train_len+val_len]
    val_summaries = embd_sums[train_len:train_len+val_len]
    '''
    test_texts = embd_texts[train_len + val_len:len(embd_sums)]
    test_summaries = embd_sums[train_len + val_len:len(embd_sums)]
    test_sums = embd_summaries[train_len + val_len:len(embd_summaries)]

    # Load the dataset
    (X_train, y_train), (X_test, y_tests) = (train_texts,
                                             train_summaries), (test_texts,
                                                                test_sums)
    y_trains = train_sums

    # Convert text lists into numpy arrays to get shape like in the imdb dataset for train and test data
    def convert2array(listtoconvert, comparing):
        converted_arr = []
        maxList = max(max(len(x) for x in listtoconvert),
                      max(len(x) for x in comparing))
        pre_array = np.asarray([np.asarray(x) for x in listtoconvert])
        for arr in pre_array:
            arr = np.lib.pad(arr, (0, maxList - len(arr)),
                             'constant',
                             constant_values=0)
            text_with_zero = np.concatenate([[1], arr])
            text_with_zero = np.concatenate([text_with_zero, [0]])
            converted_arr.append(text_with_zero)
        converted_arr = np.asarray(converted_arr)
        return converted_arr

    max_y = max(max(len(x)
                    for x in train_sums), max(len(x) for x in test_sums)) + 2
    y_trains = convert2array(train_sums, test_sums)
    y_tests = convert2array(test_sums, train_sums)
    X_test = convert2array(test_texts, train_texts)
    X_train = convert2array(train_texts, test_texts)
    y_test = convert2array(test_summaries, train_summaries)
    y_train = convert2array(train_summaries, test_summaries)
    print(str(X_train[0]))

    #%%

    def normalize(i, mini, maxi):
        norm = (np.float64(i) - mini) / (maxi - mini)
        return norm

    vocabulary_size = int(get_vocabulary_size(X_train))
    print('SIZE X_TRAIN:' + str(vocabulary_size))

    # Different placeholders
    batch_ph = tf.placeholder(tf.int32, [None, None], name="X")
    target_ph = tf.placeholder(tf.float32, [None, None],
                               name="Y")  #adapted for y
    seq_len_ph = tf.placeholder(tf.int32, [None], name="SEQ_LEN")
    keep_prob_ph = tf.placeholder(tf.float32, name="KEEP_PROB")

    with tf.name_scope("Embedding-Layer"):
        # Embedding layer
        embeddings_var = tf.Variable(tf.random_uniform(
            [vocabulary_size, EMBEDDING_DIM], 0.0, 1.0),
                                     trainable=True,
                                     name="embedding")
        batch_embedded = tf.nn.embedding_lookup(embeddings_var, batch_ph)

    with tf.name_scope("RNN"):
        # (Bi-)RNN layer(-s)
        rnn_outputs, _ = bi_rnn(GRUCell(HIDDEN_SIZE),
                                GRUCell(HIDDEN_SIZE),
                                inputs=batch_embedded,
                                sequence_length=seq_len_ph,
                                dtype=tf.float32)
        #, _ = rnn(GRUCell(HIDDEN_SIZE), inputs=batch_embedded, sequence_length=seq_len_ph, dtype=tf.float32)

    with tf.name_scope("Attention-Layer"):
        # Attention layer
        attention_output, alphas = attention(rnn_outputs,
                                             ATTENTION_SIZE,
                                             return_alphas=True)

    with tf.name_scope("Dropout"):
        # Dropout
        drop = tf.nn.dropout(attention_output, keep_prob_ph)

    with tf.name_scope("Dense"):
        # Fully connected layer
        W = tf.Variable(
            tf.truncated_normal(shape=[HIDDEN_SIZE * 2, max_y], stddev=0.1),
            name="weights")  # Hidden size is multiplied by 2 for Bi-RNN
        b = tf.Variable(tf.constant(0., shape=[max_y]), name="bias")
        y_hat = tf.nn.xw_plus_b(drop, W, b)
        y_hat = tf.squeeze(y_hat)

    with tf.name_scope("loss"):
        # Cross-entropy loss and optimizer initialization
        loss = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(logits=y_hat,
                                                    labels=target_ph))
        # = tf.reduce_mean(tf.nn.nce_loss(weights=W, biases=b, labels=target_ph, inputs=y_hat))
    tf.summary.scalar('loss', loss)
    optimizer = tf.train.AdamOptimizer(
        learning_rate=LEARNING_RATE).minimize(loss)

    with tf.name_scope("Accuracy"):
        # Accuracy metric
        accuracy = tf.reduce_mean(
            tf.cast(tf.equal(tf.round(tf.sigmoid(y_hat)), target_ph),
                    tf.float32))
    tf.summary.scalar('Accuracy', accuracy)

    #    rec, rec_op = tf.metrics.recall(labels=target_ph, predictions=y_hat)
    #    pre, pre_op = tf.metrics.precision(labels=target_ph, predictions=y_hat)

    # Batch generators
    train_batch_generator = batch_generator(X_train, y_trains, BATCH_SIZE)
    test_batch_generator = batch_generator(X_test, y_tests, BATCH_SIZE)

    #%%

    print(type([x for x in X_train[0]]))
    saver = tf.train.Saver()
    # Train session
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        train_writer = tf.summary.FileWriter('./logs/rnn_logs' + '/train')
        train_writer.add_graph(sess.graph)
        test_writer = tf.summary.FileWriter('./logs/rnn_logs' + '/test')
        test_writer.add_graph(sess.graph)
        merged = tf.summary.merge_all()
        losses = []

        print("TRAIN_ARNN: Start learning...")
        for epoch in range(NUM_EPOCHS):
            loss_train = 0
            loss_test = 0
            accuracy_train = 0
            accuracy_test = 0

            print("epoch: {}\t".format(epoch), end="")

            # Training
            num_batches = X_train.shape[0] // BATCH_SIZE
            for b in range(num_batches):
                x_batch, y_batch = next(train_batch_generator)
                maxValue_y = max([max(sublist) for sublist in y_batch])
                minValue_y = min([min(sublist) for sublist in y_batch])
                j = 0
                y_batch_normed = []
                for each_y_batch in y_batch:
                    norm_y = [
                        normalize(i, minValue_y, maxValue_y)
                        for i in each_y_batch
                    ]
                    y_batch_normed.append(norm_y)
                    j += 1
                seq_len = np.array([list(x).index(0) + 1 for x in x_batch
                                    ])  # actual lengths of sequences
                summary, loss_tr, acc, _ = sess.run(
                    [merged, loss, accuracy, optimizer],
                    feed_dict={
                        batch_ph: x_batch,
                        target_ph: y_batch_normed,
                        seq_len_ph: seq_len,
                        keep_prob_ph: KEEP_PROB
                    })

                accuracy_train += acc
                loss_train = loss_tr * DELTA + loss_train * (1 - DELTA)
                train_writer.add_summary(summary, epoch * num_batches + b)
            if accuracy_train != 0:
                accuracy_train /= num_batches
            saver.save(
                sess,
                "/home/kloe_mr/AnacondaProjects/tf-rnn-attention/Bachelorarbeit/BA/buffering-model-rnn.ckpt"
            )
            # Testing
            print(X_test.shape)
            num_batches = X_test.shape[0] // BATCH_SIZE
            for b in range(num_batches):
                x_batch, y_batch = next(test_batch_generator)
                maxValue_y = max([max(sublist) for sublist in y_batch])
                minValue_y = min([min(sublist) for sublist in y_batch])
                j = 0
                y_batch_normed = []
                for each_y_batch in y_batch:
                    norm_y = [
                        normalize(i, minValue_y, maxValue_y)
                        for i in each_y_batch
                    ]
                    y_batch_normed.append(norm_y)
                    j += 1
                seq_len = np.array([list(x).index(0) + 1 for x in x_batch
                                    ])  # actual lengths of sequences
                summary, loss_test_batch, acc = sess.run(
                    [merged, loss, accuracy],
                    feed_dict={
                        batch_ph: x_batch,
                        target_ph: y_batch_normed,
                        seq_len_ph: seq_len,
                        keep_prob_ph: 1.0
                    })
                accuracy_test += acc
                loss_test += loss_test_batch
                test_writer.add_summary(summary, epoch * num_batches + b)
            if loss_test == 0:
                print('loss_test is zero!')
                break
            if accuracy_train != 0:
                accuracy_test /= num_batches
            loss_test /= num_batches
            losses.append(loss_test)
            print(
                "loss: {:.3f}, val_loss: {:.3f}, acc: {:.3f}, val_acc: {:.3f}".
                format(loss_train, loss_test, accuracy_train, accuracy_test))

        #saver.save(sess, "H:/AnacondaProjects/tf-rnn-attention/tf-rnn-attention-master/model.ckpt") # "model")
        saver.save(
            sess,
            "/home/kloe_mr/AnacondaProjects/tf-rnn-attention/tf-rnn-attention-master/model.ckpt"
        )  # "model")

        #%% Restore session
        with tf.Session() as sess:
            saver.restore(
                sess,
                "/home/kloe_mr/AnacondaProjects/tf-rnn-attention/tf-rnn-attention-master/model.ckpt"
            )  #"model")
            #saver.restore(sess, "H:/AnacondaProjects/tf-rnn-attention/tf-rnn-attention-master/model.ckpt")#"model")
            max_y = max(max(len(x) for x in train_summaries),
                        max(len(x) for x in test_summaries)) + 2
            x_batch_train, y_batch_train = X_train[:len(
                X_train)], y_trains[:len(y_trains)]
            seq_len_test = np.array(
                [list(x).index(0) + 1 for x in x_batch_train])
            maxValue_y = max([max(sublist) for sublist in y_batch_train])
            minValue_y = min([min(sublist) for sublist in y_batch_train])
            j = 0
            y_batch_normed = []
            for each_y_batch in y_batch_train:
                norm_y = [
                    normalize(i, minValue_y, maxValue_y) for i in each_y_batch
                ]
                y_batch_normed.append(norm_y)
                j += 1
            alphas_train = sess.run(
                [alphas],
                feed_dict={
                    batch_ph: x_batch_train,
                    target_ph: y_batch_normed,
                    seq_len_ph: seq_len_test,
                    keep_prob_ph: 1.0
                })

#%% Restore session
        with tf.Session() as sess:
            saver.restore(
                sess,
                "/home/kloe_mr/AnacondaProjects/tf-rnn-attention/tf-rnn-attention-master/model.ckpt"
            )  #"model")
            #saver.restore(sess, "H:/AnacondaProjects/tf-rnn-attention/tf-rnn-attention-master/model.ckpt")#"model")
            max_y = max(max(len(x) for x in train_summaries),
                        max(len(x) for x in test_summaries)) + 2
            x_batch_test, y_batch_test = X_test[:len(X_test
                                                     )], y_tests[:len(y_tests)]
            seq_len_test = np.array(
                [list(x).index(0) + 1 for x in x_batch_test])
            maxValue_y = max([max(sublist) for sublist in y_batch_test])
            minValue_y = min([min(sublist) for sublist in y_batch_test])
            j = 0
            y_batch_normed = []
            for each_y_batch in y_batch_test:
                norm_y = [
                    normalize(i, minValue_y, maxValue_y) for i in each_y_batch
                ]
                y_batch_normed.append(norm_y)
                j += 1
            alphas_test = sess.run(
                [alphas],
                feed_dict={
                    batch_ph: x_batch_test,
                    target_ph: y_batch_normed,
                    seq_len_ph: seq_len_test,
                    keep_prob_ph: 1.0
                })

#%% setting up the generated outputs by choosing the most weighted highlights out of the text for the LSTM input

        alphas_values = alphas_train[:][0]
        y_batch = y_train[:len(y_train)]
        # Save visualization as HTML
        rnn_outs = []
        #with open("visualization.html", "w") as html_file:
        for words, alphas in zip(x_batch_train, alphas_values):
            rnn_out = []
            Largest_alphas = len(words) // ALPHA_DIVIDER
            if Largest_alphas == 0:
                continue
            min_value = min(heapq.nlargest(Largest_alphas, alphas, key=None))
            for word, alpha in zip(
                    words, alphas):  #_values):# / alphas_values.max()):
                if alpha > min_value:
                    if word not in rnn_out:  # think about whether it makes sense to remove the 2nd appearance, maybe check for frequency in listheap
                        rnn_out.append(word)
                if word == 0:
                    break
                #html_file.write('<font style="background: rgba(255, 255, 0, %f)">%s</font>\n' % (alpha, word))
            rnn_outs.append(rnn_out)
        print('RNN_OUTS: ' + str(len(rnn_outs)))

        alphas_values = alphas_test[:][0]
        #y_batch = y_test[:len(y_test)]
        #print(alphas_values[0])
        # Save visualization as HTML
        rnntest_outs = []
        #with open("visualization.html", "w") as html_file:
        for words, alphas in zip(x_batch_test, alphas_values):
            rnn_out = []
            Largest_alphas = len(words) // ALPHA_DIVIDER
            if Largest_alphas == 0:
                continue
            min_value = min(heapq.nlargest(Largest_alphas, alphas, key=None))
            for word, alpha in zip(
                    words, alphas):  #_values):# / alphas_values.max()):
                if alpha > min_value:
                    if word not in rnn_out:  # think about whether it makes sense to remove the 2nd appearance, maybe check for frequency in listheap
                        rnn_out.append(word)
                if word == 0:
                    break
                #html_file.write('<font style="background: rgba(255, 255, 0, %f)">%s</font>\n' % (alpha, word))
            rnntest_outs.append(rnn_out)

    # Creates list of lists with reembedded words
    def int2word(data):
        lists = []
        for paragraph in data:
            words = []
            for word in paragraph:
                if word == 1:
                    continue
                if word == 0:
                    break
                words.append(
                    list(dictionary.keys())[list(
                        dictionary.values()).index(word)])
            lists.append(words)
        return lists

    rnn_out = int2word(rnn_outs)
    batch_words = int2word(y_batch_train)
    y_batch = int2word(y_batch)
    print(rnn_out[0])
    print(batch_words[0])
    mean_testloss = np.mean(losses)
    avg_testloss = np.mean(losses)
    std_testloss = np.std(losses)

    ## POSSIBLY ALSO FOR TEST DATA (????)
    # pack into a csv file
    t2 = time.time()
    t = t2 - t1
    r = Rouge()

    def rouge_score(rnn_outs, rnn_out, batch_words):
        d = []
        for i in range(len(rnn_outs)):
            system_generated_summary = rnn_out[i]
            manual_summary = batch_words[i]
            try:
                [precision, recall,
                 f_score] = r.rouge_l([system_generated_summary],
                                      [manual_summary])
            except ZeroDivisionError:
                continue
            d.append(
                (batch_words[i], rnn_out[i], precision, recall, f_score, t,
                 EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE, KEEP_PROB,
                 BATCH_SIZE, NUM_EPOCHS, DELTA, LEARNING_RATE, ALPHA_DIVIDER,
                 MAXIMUM_DATA_NUM, mean_testloss, avg_testloss, std_testloss))
            print("Summary" + str(i) + "\nPrecision is :" + str(precision) +
                  "\nRecall is :" + str(recall) + "\nF Score is :" +
                  str(f_score))
        return d

    d = rouge_score(rnn_outs, rnn_out, batch_words)
    df = pd.DataFrame(
        d,
        columns=('Original Sum', 'Predicted Sum', 'Precision', 'Recall',
                 'F_Score', 'RUNTIME/sec', 'EMBEDDING_DIM', 'HIDDEN_SIZE',
                 'ATTENTION_SIZE', 'KEEP_PROB', 'BATCH_SIZE', 'EPOCHS',
                 'DELTA', 'LEARNING_RATE', 'ALPHA_DIVIDER', 'MAXIMUM_DATA_NUM',
                 'MEAN_TESTLOSS', 'AVG_TESTLOSS', 'STD_TESTLOSS'))

    print(df.head(2))
    df.to_csv("arnn_train_1" + str(BATCH_SIZE) + "_batches" + str(NUM_EPOCHS) +
              "_epochs.csv",
              sep=',')
    #%%
    # saving
    with open('rnn_out.pickle', 'wb') as handle:
        pickle.dump(rnn_out, handle, protocol=pickle.HIGHEST_PROTOCOL)
    # saving
    with open('batch_words.pickle', 'wb') as handle:
        pickle.dump(y_batch, handle, protocol=pickle.HIGHEST_PROTOCOL)

    rnntest_out = int2word(rnntest_outs)
    batch_words = int2word(y_batch_test)
    d = rouge_score(rnntest_outs, rnntest_out, batch_words)
    df = pd.DataFrame(
        d,
        columns=('Original Sum', 'Predicted Sum', 'Precision', 'Recall',
                 'F_Score', 'RUNTIME/sec', 'EMBEDDING_DIM', 'HIDDEN_SIZE',
                 'ATTENTION_SIZE', 'KEEP_PROB', 'BATCH_SIZE', 'EPOCHS',
                 'DELTA', 'LEARNING_RATE', 'ALPHA_DIVIDER', 'MAXIMUM_DATA_NUM',
                 'MEAN_TESTLOSS', 'AVG_TESTLOSS', 'STD_TESTLOSS'))

    print(df.head(2))
    df.to_csv("arnn_test_1" + str(BATCH_SIZE) + "_batches" + str(NUM_EPOCHS) +
              "_epochs.csv",
              sep=',')
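Example #8 also relies on helpers such as batch_generator, attention and Rouge that are not included here. A common shape for the batch generator it assumes (shuffle, then yield mini-batches indefinitely) is sketched below; the actual helper may differ:

import numpy as np

def batch_generator(X, y, batch_size):
    # Endlessly yield shuffled (X, y) mini-batches of size batch_size.
    size = X.shape[0]
    indices = np.random.permutation(size)
    X_shuf, y_shuf = X[indices], y[indices]
    i = 0
    while True:
        if i + batch_size <= size:
            yield X_shuf[i:i + batch_size], y_shuf[i:i + batch_size]
            i += batch_size
        else:
            i = 0
            indices = np.random.permutation(size)
            X_shuf, y_shuf = X[indices], y[indices]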
Example #9
def train(model_name, p1_train_data, p1_dev_data, p2_train_data, p2_dev_data):

    ### PHASE 1 ###

    training_data = np.load(p1_train_data)
    validation_data = np.load(p1_dev_data)

    assert training_data["batch_size"] == validation_data["batch_size"]
    assert training_data["vocabulary"] == validation_data["vocabulary"]
    assert training_data["punctuations"] == validation_data["punctuations"]

    print "1st phase data loaded..."

    print "Vocabulary size is %d" % utils.get_vocabulary_size(
        validation_data["vocabulary"])
    print "Training set size is %d" % training_data["total_size"]
    print "Validation set size is %d" % validation_data["total_size"]

    net = models.T_LSTM()
    net.initialize(hidden_size=conf.PHASE1["HIDDEN_SIZE"],
                   projection_size=conf.PHASE1["PROJECTION_SIZE"],
                   in_vocabulary=training_data["vocabulary"],
                   out_vocabulary=training_data["punctuations"],
                   batch_size=training_data["batch_size"],
                   hidden_activation=conf.PHASE1["HIDDEN_ACTIVATION"],
                   bptt_steps=conf.PHASE1["BPTT_STEPS"],
                   use_pauses=False)

    _train(net, training_data, validation_data, model_name,
           conf.PHASE1["LEARNING_RATE"], conf.PHASE1["MAX_EPOCHS"],
           conf.PHASE1["MIN_IMPROVEMENT"])

    ### PHASE 2 ###

    if not os.path.isfile(p2_train_data) or not os.path.isfile(p2_dev_data):
        print "No second phase data."
        return

    training_data = np.load(p2_train_data)
    validation_data = np.load(p2_dev_data)

    assert training_data["batch_size"] == validation_data[
        "batch_size"] == net.batch_size
    assert training_data["vocabulary"] == validation_data[
        "vocabulary"] == net.in_vocabulary
    assert training_data["punctuations"] == validation_data[
        "punctuations"] == net.out_vocabulary

    print "2nd phase data loaded..."

    print "Training set size is %d" % training_data["total_size"]
    print "Validation set size is %d" % validation_data["total_size"]
    print "Trainging %s pause durations." % (
        "with" if conf.PHASE2["USE_PAUSES"] else "without")

    t_lstm = net

    net = models.TA_LSTM()
    net.initialize(hidden_size=conf.PHASE2["HIDDEN_SIZE"],
                   t_lstm=t_lstm,
                   out_vocabulary=training_data["punctuations"],
                   batch_size=training_data["batch_size"],
                   hidden_activation=conf.PHASE2["HIDDEN_ACTIVATION"],
                   bptt_steps=conf.PHASE2["BPTT_STEPS"],
                   use_pauses=conf.PHASE2["USE_PAUSES"])

    _train(net, training_data, validation_data, model_name,
           conf.PHASE2["LEARNING_RATE"], conf.PHASE2["MAX_EPOCHS"],
           conf.PHASE2["MIN_IMPROVEMENT"])