Example #1
def stats():
    '''
    Helper function to print descriptive statistics of training data
    '''

    with open('train_data.pkl', 'rb') as f:
        data_sequences = pkl.load(f)
    with open('train_labels.pkl', 'rb') as f:
        labels = pkl.load(f)

    _, _, lengths, _, _ = build_dictionary(data_sequences)

    bins = [0, 100, 500, 1000, 1500, 2000]
    labels_string = ['cyto', 'secreted', 'mito', 'nucleus']

    df = pd.DataFrame({'length': lengths, 'label': labels})
    table = pd.crosstab(np.digitize(df.length, bins), df.label)

    table.index = pd.Index([
        '[0, 100)', '[100, 500)', '[500, 1000)', '[1000, 1500)',
        '[1500, 2000)', '[2000, inf)'
    ], name="Bin")
    table.columns = pd.Index(labels_string, name="Class")

    sum_row = {col: table[col].sum() for col in table}
    sum_df = pd.DataFrame(sum_row, index=["Total"])
    table = pd.concat([table, sum_df])
    table['Total'] = table.sum(axis=1)

    print('\n~~~~~~~ Summary stats for training set ~~~~~~~')
    print('\nCount of sequence lengths by class')
    print(table)
    print('\nDescriptive statistics')
    print(df.describe())
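
The binning in Example #1 combines np.digitize with pd.crosstab; the following minimal sketch shows that pattern on toy data (all names and values below are illustrative, not part of the original module):

import numpy as np
import pandas as pd

# Toy lengths and class labels, purely for illustration
toy_lengths = [50, 120, 600, 1200, 1800, 2500]
toy_labels = ['cyto', 'mito', 'cyto', 'nucleus', 'secreted', 'cyto']
toy_bins = [0, 100, 500, 1000, 1500, 2000]

toy_df = pd.DataFrame({'length': toy_lengths, 'label': toy_labels})

# np.digitize maps each length to the index of the bin it falls into;
# pd.crosstab then counts sequences per (bin, class) pair
toy_table = pd.crosstab(np.digitize(toy_df.length, toy_bins), toy_df.label)
print(toy_table)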
Example #2
dataset = 'wikipedia'

files = [
    'tagged.en/englishEtiquetado_' + str(ind * 10000) + '_' +
    str(ind * 10000 + 10000) for ind in range(10)
]

vocabulary_size = 20000

words = []

for filename in files:
    with open(data_path + dataset + '/' + filename, mode='r') as txtfile:

        # Each line is one tagged token; keep the surface form (first column)
        for line in txtfile:
            if not line.strip():
                continue
            tokens = line.split()
            words.append(tokens[0].lower())

print('Total number of words is %d' % len(words))

count, dictionary, reverse_dictionary = build_dictionary(
    words, vocabulary_size)
voc_dict = dict(dic=dictionary, rev_dic=reverse_dictionary, freq=count)
with open(data_path + dataset + '/voc_dict.pkl', 'wb') as f:
    pickle.dump(voc_dict, f)
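
build_dictionary itself is not shown in these snippets; in Example #2 it is assumed to return (count, dictionary, reverse_dictionary) for the vocabulary_size most frequent words. A minimal sketch of such a helper, in the spirit of the usual word2vec preprocessing (hypothetical, not the original implementation):

import collections

def build_vocab_sketch(words, vocabulary_size):
    # Keep the most frequent words; everything else is counted as 'UNK'
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
    dictionary = {word: idx for idx, (word, _) in enumerate(count)}
    count[0][1] = sum(1 for word in words if word not in dictionary)
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return count, dictionary, reverse_dictionary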
Example #3
    def run(self):

        # Load corpus
        corpus = import_data(self.corpus)
        self.dictionary, self.reverse_dictionary, sent_lengths, self.max_sent_len, enc_data, dec_data, dec_lab = build_dictionary(
            corpus)

        # Save metadata for visualisation of embedding matrix
        meta_data = sorted(self.dictionary, key=self.dictionary.get)
        print(len(meta_data))
        with open('meta_data.tsv', 'w') as f:
            tsv_writer = csv.writer(f, dialect='excel')
            # One vocabulary entry per row, as TensorBoard projector metadata expects
            for word in meta_data:
                tsv_writer.writerow([word])

        # np.savetxt("meta_data.tsv", meta_data, fmt="%s")

        self.dictionary = sorted(self.dictionary.items(),
                                 key=operator.itemgetter(1))
        self.vocabulary_size = len(self.dictionary)
        self.max_sent_len += 1

        # Create datasets for encoder and decoders
        enc_data = enc_data[1:-1]
        enc_lengths = sent_lengths[1:-1]
        post_lengths = sent_lengths[2:] + 1
        post_data = dec_data[2:]
        post_lab = dec_lab[2:]
        pre_lengths = sent_lengths[:-2] + 1
        pre_data = dec_data[:-2]
        pre_lab = dec_lab[:-2]

        # Print summary statistics
        self.corpus_length = len(enc_data)
        self.corpus_stats()

        self.graph = tf.Graph()

        with self.graph.as_default():

            print('\r~~~~~~~ Building model ~~~~~~~\r')
            self.initializer = tf.random_normal_initializer()

            # Variables
            self.word_embeddings = tf.get_variable(
                'embeddings', [self.vocabulary_size, self.embedding_size],
                tf.float32,
                initializer=self.initializer)
            self.W_pre = tf.get_variable(
                'precoder/weight', [self.embedding_size, self.vocabulary_size],
                tf.float32,
                initializer=self.initializer)
            self.b_pre = tf.get_variable('precoder/bias',
                                         [self.vocabulary_size],
                                         tf.float32,
                                         initializer=self.initializer)
            self.W_post = tf.get_variable(
                'postcoder/weight',
                [self.embedding_size, self.vocabulary_size],
                tf.float32,
                initializer=self.initializer)
            self.b_post = tf.get_variable('postcoder/bias',
                                          [self.vocabulary_size],
                                          tf.float32,
                                          initializer=self.initializer)

            global_step = tf.Variable(0, name='global_step', trainable=False)

            # Encoder placeholders
            sentences = tf.placeholder(tf.int32, [None, None], "sentences")
            sentences_lengths = tf.placeholder(tf.int32, [None],
                                               "sentences_lengths")

            # Postcoder placeholders
            post_inputs = tf.placeholder(tf.int32, [None, None], "post_inputs")
            post_labels = tf.placeholder(tf.int32, [None, None], "post_labels")
            post_sentences_lengths = tf.placeholder(tf.int32, [None],
                                                    "post_sentences_lengths")

            # Precoder placeholders
            pre_inputs = tf.placeholder(tf.int32, [None, None], "pre_inputs")
            pre_labels = tf.placeholder(tf.int32, [None, None], "pre_labels")
            pre_sentences_lengths = tf.placeholder(tf.int32, [None],
                                                   "pre_sentences_lengths")

            # Embed sentences
            sentences_embedded = self.embed_data(sentences)
            post_inputs_embedded = self.embed_data(post_inputs)
            pre_inputs_embedded = self.embed_data(pre_inputs)

            # Encoder
            encoded_sentences = self.encoder(sentences_embedded,
                                             sentences_lengths,
                                             self.bidirectional)

            # Decoder for following sentence
            post_logits_projected, post_logits = self.decoder(
                decoder_inputs=post_inputs_embedded,
                encoder_state=encoded_sentences,
                name='postcoder',
                lengths=post_sentences_lengths,
                train=True)

            # Decoder for previous sentence
            pre_logits_projected, pre_logits = self.decoder(
                decoder_inputs=pre_inputs_embedded,
                encoder_state=encoded_sentences,
                name='precoder',
                lengths=pre_sentences_lengths,
                train=True)

            # Compute loss
            if self.loss_function == 'softmax':
                post_loss = self.get_softmax_loss(post_labels,
                                                  post_logits_projected)
                pre_loss = self.get_softmax_loss(pre_labels,
                                                 pre_logits_projected)
            else:
                post_loss = self.get_sampled_softmax_loss(post_labels,
                                                          post_logits,
                                                          name='postcoder')
                pre_loss = self.get_sampled_softmax_loss(pre_labels,
                                                         pre_logits,
                                                         name='precoder')

            loss = pre_loss + post_loss
            opt_op = tf.contrib.layers.optimize_loss(
                loss=loss,
                global_step=global_step,
                learning_rate=self.learning_rate,
                optimizer='Adam',
                clip_gradients=2.0,
                learning_rate_decay_fn=None,
                summaries=['loss'])

            # Decode sentences at prediction time
            pre_predict = self.decoder(decoder_inputs=pre_inputs_embedded,
                                       encoder_state=encoded_sentences,
                                       name='precoder',
                                       lengths=pre_sentences_lengths,
                                       train=False)
            post_predict = self.decoder(decoder_inputs=post_inputs_embedded,
                                        encoder_state=encoded_sentences,
                                        name='postcoder',
                                        lengths=post_sentences_lengths,
                                        train=False)
            predict = [pre_predict, post_predict]

        with tf.Session(graph=self.graph) as session:

            self.a = tf.contrib.graph_editor.get_tensors(self.graph)
            train_loss_writer = tf.summary.FileWriter(
                './tensorboard/train_loss', session.graph)

            # Use the same LOG_DIR where you stored your checkpoint.
            embedding_writer = tf.summary.FileWriter('./tensorboard/',
                                                     session.graph)

            config = projector.ProjectorConfig()
            embedding = config.embeddings.add()
            embedding.tensor_name = self.word_embeddings.name
            # Link this tensor to its metadata file (e.g. labels).
            embedding.metadata_path = os.path.join('./meta_data.tsv')

            # Saves a configuration file that TensorBoard will read during startup.
            projector.visualize_embeddings(embedding_writer, config)

            merged = tf.summary.merge_all()

            print('\r~~~~~~~ Initializing variables ~~~~~~~\r')
            tf.global_variables_initializer().run()

            print('\r~~~~~~~ Starting training ~~~~~~~\r')
            start_time = time.time()

            try:
                train_summaryIndex = -1

                for epoch in range(self.num_epochs):
                    self.is_train = True
                    epoch_time = time.time()
                    print('----- Epoch', epoch, '-----')
                    print('Shuffling dataset')

                    perm = np.random.permutation(self.corpus_length)

                    enc_lengths_perm = enc_lengths[perm]
                    enc_data_perm = enc_data[perm]
                    post_lengths_perm = post_lengths[perm]
                    post_inputs_perm = np.array(post_data)[perm]
                    post_labels_perm = np.array(post_lab)[perm]
                    pre_lengths_perm = pre_lengths[perm]
                    pre_inputs_perm = np.array(pre_data)[perm]
                    pre_labels_perm = np.array(pre_lab)[perm]

                    total_loss = 0
                    predict_step = 50

                    for step in range(self.corpus_length // self.batch_size):
                        begin = step * self.batch_size
                        end = (step + 1) * self.batch_size

                        batch_enc_lengths = enc_lengths_perm[begin:end]
                        batch_enc_inputs = enc_data_perm[begin:end]
                        batch_post_lengths = post_lengths_perm[begin:end]
                        batch_post_inputs = post_inputs_perm[
                            begin:end, :np.max(batch_post_lengths)]
                        batch_post_labels = post_labels_perm[
                            begin:end, :np.max(batch_post_lengths)]
                        batch_pre_lengths = pre_lengths_perm[begin:end]
                        batch_pre_inputs = pre_inputs_perm[
                            begin:end, :np.max(batch_pre_lengths)]
                        batch_pre_labels = pre_labels_perm[
                            begin:end, :np.max(batch_pre_lengths)]

                        train_dict = {
                            sentences: batch_enc_inputs,
                            sentences_lengths: batch_enc_lengths,
                            post_inputs: batch_post_inputs,
                            post_labels: batch_post_labels,
                            post_sentences_lengths: batch_post_lengths,
                            pre_inputs: batch_pre_inputs,
                            pre_labels: batch_pre_labels,
                            pre_sentences_lengths: batch_pre_lengths
                        }

                        _, loss_val, batch_summary, glob_step = session.run(
                            [opt_op, loss, merged, global_step],
                            feed_dict=train_dict)
                        train_loss_writer.add_summary(
                            batch_summary, step +
                            (self.corpus_length // self.batch_size) * epoch)

                        total_loss += loss_val

                        if glob_step % predict_step == 0:
                            # if step > 0:
                            print("Average loss at step ", glob_step, ": ",
                                  total_loss / predict_step)
                            total_loss = 0

                            print('\nOriginal sequence:\n')
                            print(
                                self.print_sentence(batch_pre_inputs[0, 1:],
                                                    batch_pre_lengths[0] - 1))
                            print(
                                self.print_sentence(batch_enc_inputs[0],
                                                    batch_enc_lengths[0]))
                            print(
                                self.print_sentence(batch_post_inputs[0, 1:],
                                                    batch_post_lengths[0] - 1))

                            test_enc_lengths = np.expand_dims(
                                batch_enc_lengths[0], 0)
                            test_enc_inputs = np.expand_dims(
                                batch_enc_inputs[0], 0)
                            test_post_lengths = np.expand_dims(
                                batch_post_lengths[0], 0)
                            test_post_inputs = np.expand_dims(
                                batch_post_inputs[0], 0)
                            test_post_labels = np.expand_dims(
                                batch_post_labels[0], 0)
                            test_pre_lengths = np.expand_dims(
                                batch_pre_lengths[0], 0)
                            test_pre_inputs = np.expand_dims(
                                batch_pre_inputs[0], 0)
                            test_pre_labels = np.expand_dims(
                                batch_pre_labels[0], 0)

                            test_dict = {
                                sentences_lengths: test_enc_lengths,
                                sentences: test_enc_inputs,
                                post_sentences_lengths: test_post_lengths,
                                post_inputs: test_post_inputs,
                                post_labels: test_post_labels,
                                pre_sentences_lengths: test_pre_lengths,
                                pre_inputs: test_pre_inputs,
                                pre_labels: test_pre_labels
                            }

                            pre_prediction, post_prediction = session.run(
                                [predict], feed_dict=test_dict)[0]

                            print(
                                '\nPredicted previous and following sequence around original sentence:\n'
                            )
                            print(
                                self.print_sentence(pre_prediction[0],
                                                    len(pre_prediction[0])))
                            print(
                                self.print_sentence(batch_enc_inputs[0],
                                                    batch_enc_lengths[0]))
                            print(
                                self.print_sentence(post_prediction[0],
                                                    len(post_prediction[0])))

                            end_time = time.time()
                            print('\nTime for %d steps: %0.2f seconds' %
                                  (predict_step, end_time - start_time))
                            start_time = time.time()
                            print(
                                '\n\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'
                            )

                    saver = tf.train.Saver()
                    saver.save(session,
                               os.path.join('./tensorboard/', 'model.ckpt'))

            except KeyboardInterrupt:
                save = input('save?')
                if 'y' in save:
                    self.save_model(session, 0)
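
The slicing near the top of run() in Example #3 (enc_data[1:-1], dec_data[2:], dec_data[:-2]) lines the corpus up into (previous, current, next) sentence triplets for the two skip-thought-style decoders; a toy illustration of that alignment (illustrative only):

# Toy corpus of five "sentences" to show the (pre, current, post) alignment
sents = ['s0', 's1', 's2', 's3', 's4']

enc = sents[1:-1]   # current sentences:            ['s1', 's2', 's3']
post = sents[2:]    # following sentence for each:  ['s2', 's3', 's4']
pre = sents[:-2]    # preceding sentence for each:  ['s0', 's1', 's2']

for p, c, n in zip(pre, enc, post):
    print(p, '<-', c, '->', n)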
Example #4
    def run(self):
        '''
        Runs the model according to the specified settings
        -   If mode = Train: Train a GRU model using the training data
        -   If mode = Val: Load the saved GRU model and evaluate it on the validation fold
        -   If mode = Test: Load the saved GRU model and evaluate it on the blind test set
        '''

        self.is_train = (self.mode == 'Train')

        if not os.path.exists(self.path):
            os.mkdir(self.path)

        # Load the training data
        with open('train_data.pkl', 'rb') as f:
            data_sequences = pkl.load(f)
        with open('train_labels.pkl', 'rb') as f:
            data_labels = pkl.load(f)

        dictionary, reverse_dictionary, data_lengths, self.max_seq_len, enc_sequences = build_dictionary(
            data_sequences)
        self.dictionary = sorted(dictionary.items(),
                                 key=operator.itemgetter(1))
        print(self.dictionary)
        self.vocabulary_size = len(dictionary)
        self.val_size = len(data_sequences) // self.folds
        fold = 1
        print('Training fold number %d. Each fold of size %d' %
              (fold, len(data_sequences) // self.folds))

        # Truncates sequences at length 2000 and returns descriptive statistics.
        # This is done by concatenating the first 1900 and the last 100 amino acids.

        if self.is_train:
            self.max_seq_len = 2000
            original_lengths = copy(data_lengths)

            data_sequences = enc_sequences[:, :self.max_seq_len]
            for i in range(len(data_lengths)):
                if data_lengths[i] > self.max_seq_len:
                    data_sequences[i] = np.concatenate(
                        (enc_sequences[i, :self.max_seq_len - 100],
                         enc_sequences[i, -100:]),
                        axis=0)
                    data_lengths[i] = self.max_seq_len

            if self.folds == 1:
                val_mask = np.array([False])
            else:
                val_mask = np.arange(self.val_size * (fold - 1),
                                     self.val_size * (fold))

            # Use seed to ensure same randomisation is applied for each fold
            np.random.seed(4)
            perm = np.random.permutation(len(data_sequences))
            data_labels = np.array(data_labels)

            data_sequences = data_sequences[perm]
            data_labels = data_labels[perm]
            data_lengths = data_lengths[perm]
            original_lengths = original_lengths[perm]

            self.val_data = data_sequences[val_mask]
            self.val_labels = data_labels[val_mask]
            self.val_lengths = data_lengths[val_mask]
            self.val_original_lengths = original_lengths[val_mask]

            self.train_data = np.delete(data_sequences, val_mask, axis=0)
            self.train_labels = np.delete(data_labels, val_mask, axis=0)
            self.train_lengths = np.delete(data_lengths, val_mask, axis=0)
            self.train_original_lengths = np.delete(original_lengths,
                                                    val_mask,
                                                    axis=0)

            self.train_statistics, self.train_frame = self.summary_stats(
                self.train_lengths, self.train_labels, 'train')
            if self.folds == 1:
                self.val_statistics = np.array([])
                self.val_frame = np.array([])
                self.val_original_lengths = np.array([])
            else:
                self.val_statistics, self.val_frame = self.summary_stats(
                    self.val_lengths, self.val_labels, 'validation')

            this_data = [
                self.train_data, self.train_labels, self.train_lengths,
                self.val_data, self.val_labels, self.val_lengths,
                self.train_statistics, self.train_frame, self.val_statistics,
                self.val_frame, self.train_original_lengths,
                self.val_original_lengths
            ]

            with open(self.path + 'this_data.pkl', 'wb') as f:
                pkl.dump(this_data, f)

        else:
            with open(self.path + 'this_data.pkl', 'rb') as f:
                self.train_data, self.train_labels, self.train_lengths, self.val_data, self.val_labels, self.val_lengths, self.train_statistics, self.train_frame, self.val_statistics, self.val_frame, self.train_original_lengths, self.val_original_lengths = pkl.load(
                    f)

        # Now construct the Tensorflow graph
        print('\r~~~~~~~ Building model ~~~~~~~\r')

        # Define placeholders and variables
        initializer = tf.random_normal_initializer()
        self.word_embeddings = tf.get_variable(
            'embeddings', [self.vocabulary_size, self.embedding_size],
            tf.float32,
            initializer=initializer)
        sequences = tf.placeholder(tf.int32, [None, None], "sequences")
        sequences_lengths = tf.placeholder(tf.int32, [None],
                                           "sequences_lengths")
        labels = tf.placeholder(tf.int64, [None], "labels")
        keep_prob_dropout = tf.placeholder(tf.float32, name='dropout')
        global_step = tf.Variable(0, name='global_step', trainable=False)

        # Embed and encode sequences
        sequences_embedded = self.embed_data(sequences)
        encoded_sequences = self.encoder(sequences_embedded,
                                         sequences_lengths,
                                         keep_prob_dropout,
                                         bidirectional=self.bidirectional)

        # Take last hidden state of GRU and put them through a nonlinear and a linear FC layer
        with tf.name_scope('non_linear_layer'):
            encoded_sentences_BN = self.batch_norm_wrapper(
                encoded_sequences, self.is_train)
            non_linear = tf.nn.dropout(tf.nn.relu(
                tf.contrib.layers.linear(encoded_sentences_BN, 64)),
                                       keep_prob=keep_prob_dropout)

        with tf.name_scope('final_layer'):
            non_linear_BN = self.batch_norm_wrapper(non_linear, self.is_train)
            logits = tf.contrib.layers.linear(non_linear_BN, 4)

        # Compute mean loss on this batch, consisting of cross entropy loss and L2 loss
        CE_loss = self.get_CE_loss(labels, logits)
        L2_loss = self.get_L2_loss()
        loss = CE_loss + L2_loss

        # Perform training operation
        learning_rate = tf.train.exponential_decay(self.learning_rate,
                                                   global_step,
                                                   100,
                                                   0.96,
                                                   staircase=True)
        opt_op = tf.contrib.layers.optimize_loss(loss=loss,
                                                 global_step=global_step,
                                                 learning_rate=learning_rate,
                                                 optimizer='Adam',
                                                 clip_gradients=2.0,
                                                 learning_rate_decay_fn=None,
                                                 summaries=None)

        # Define scalars for Tensorboard
        tf.summary.scalar('CE_loss', CE_loss)
        tf.summary.scalar('L2_loss', L2_loss)
        tf.summary.scalar('loss', loss)
        tf.summary.scalar('learning_rate', learning_rate)

        # Compute accuracy of prediction
        probs = tf.nn.softmax(logits)
        with tf.name_scope('accuracy'):
            pred = tf.argmax(logits, 1)
            correct_prediction = tf.equal(labels, pred)
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
            tf.summary.scalar('accuracy', accuracy)

        # If in training mode:
        # - shuffle data set before each epoch
        # - train model using mini batches
        # - track performance on train and validation set throughout training

        if self.is_train:
            with tf.Session() as session:
                train_loss_writer = tf.summary.FileWriter(
                    str(self.path + 'tensorboard/train_loss'), session.graph)
                train_summary_writer = tf.summary.FileWriter(
                    str(self.path + 'tensorboard/train_summary'),
                    session.graph)
                val_summary_writer = tf.summary.FileWriter(
                    str(self.path + 'tensorboard/val_summary'), session.graph)

                # Use the same LOG_DIR where you stored your checkpoint.
                embedding_writer = tf.summary.FileWriter(
                    str(self.path + 'tensorboard/'), session.graph)

                config = projector.ProjectorConfig()
                embedding = config.embeddings.add()
                embedding.tensor_name = self.word_embeddings.name
                # Link this tensor to its metadata file (e.g. labels).
                embedding.metadata_path = os.path.join('./metadata.tsv')

                # Saves a configuration file that TensorBoard will read during startup.
                projector.visualize_embeddings(embedding_writer, config)

                merged = tf.summary.merge_all()
                print('\r~~~~~~~ Initializing variables ~~~~~~~\r')
                tf.global_variables_initializer().run()

                start_time = time.time()
                min_train_loss = np.inf
                batch_times = []
                n = self.train_data.shape[0]
                print('\r~~~~~~~ Starting training ~~~~~~~\r')
                try:
                    train_summaryIndex = -1

                    for epoch in range(self.num_epochs):
                        self.is_train = True
                        epoch_time = time.time()
                        print('----- Epoch', epoch, '-----')
                        print('Shuffling dataset')

                        perm = np.random.permutation(len(self.train_data))
                        self.train_data_perm = self.train_data[perm]
                        self.train_labels_perm = self.train_labels[perm]
                        self.train_lengths_perm = self.train_lengths[perm]

                        total_loss = 0

                        for i in range(n // self.batch_size):
                            batch_start = time.time()
                            batch_data = self.train_data_perm[i *
                                                              self.batch_size:
                                                              (i + 1) *
                                                              self.batch_size]
                            batch_lengths = self.train_lengths_perm[
                                i * self.batch_size:(i + 1) * self.batch_size]
                            batch_labels = self.train_labels_perm[
                                i * self.batch_size:(i + 1) * self.batch_size]

                            train_dict = {
                                sequences: batch_data,
                                sequences_lengths: batch_lengths,
                                labels: batch_labels,
                                keep_prob_dropout: self.keep_prob_dropout
                            }

                            _, batch_loss, batch_accuracy, batch_summary = session.run(
                                [opt_op, loss, accuracy, merged],
                                feed_dict=train_dict)
                            total_loss += batch_loss
                            batch_times.append(time.time() - batch_start)

                            train_loss_writer.add_summary(
                                batch_summary,
                                i + (n // self.batch_size) * epoch)

                            if i % 10 == 0 and i > 0:
                                # Print loss every 10 batches
                                time_per_epoch = np.mean(batch_times) * (
                                    n // self.batch_size)
                                remaining_time = int(time_per_epoch -
                                                     time.time() + epoch_time)
                                string_out = '\rEnd of batch ' + str(
                                    i) + '    Train loss:   ' + str(
                                        total_loss / (i * self.batch_size)
                                    ) + '    Accuracy:   ' + str(
                                        batch_accuracy)
                                string_out += '  Elapsed training time : ' + str(
                                    int(time.time() - start_time)) + "s, "
                                string_out += str(
                                    remaining_time
                                ) + "s remaining for this epoch"
                                string_out += '  (' + str(
                                    time_per_epoch * 100 / 60 // 1 /
                                    100) + ' min/epoch)'
                                stdout.write(string_out)

                        # Train accuracy
                        train_dict = {
                            sequences: self.train_data_perm[:1000],
                            sequences_lengths: self.train_lengths_perm[:1000],
                            labels: self.train_labels_perm[:1000],
                            keep_prob_dropout: 1.0
                        }

                        train_summary, train_loss, train_accuracy = session.run(
                            [merged, loss, accuracy], feed_dict=train_dict)
                        train_summary_writer.add_summary(train_summary, epoch)
                        print('\nEpoch train loss: ', train_loss,
                              'Epoch train accuracy: ', train_accuracy)

                        # Val accuracy
                        val_dict = {
                            sequences: self.val_data,
                            sequences_lengths: self.val_lengths,
                            labels: self.val_labels,
                            keep_prob_dropout: 1.0
                        }
                        val_summary, val_loss, val_accuracy = session.run(
                            [merged, loss, accuracy], feed_dict=val_dict)
                        val_summary_writer.add_summary(val_summary, epoch)
                        print('\nEpoch val loss: ', val_loss,
                              'Epoch val accuracy: ', val_accuracy)

                        self.save_model(session, epoch)

                        saver = tf.train.Saver()
                        saver.save(
                            session,
                            os.path.join(self.path + '/tensorboard/',
                                         'model.ckpt'))

                except KeyboardInterrupt:
                    save = input('save?')
                    if 'y' in save:
                        self.save_model(session, epoch)

        # If in validation mode:
        # - Load saved model and evaluate on validation fold
        # - Return list containing confusion matrices, and accuracy measures such as FPR and TPR

        elif self.mode == 'Val':
            with tf.Session() as session:
                print('Restoring model...')
                saver = tf.train.Saver()
                saver.restore(session, self.path + 'model.checkpoint')
                print('Model restored!')

                val_dict = {
                    sequences: self.val_data,
                    sequences_lengths: self.val_lengths,
                    labels: self.val_labels,
                    keep_prob_dropout: 1.0
                }

                self.val_pred, self.val_accuracy, self.val_probs = session.run(
                    [pred, accuracy, probs], feed_dict=val_dict)

                _ = self.summary_stats(self.val_lengths, self.val_labels,
                                       'val')

                print('\nConfusion matrix (all sequence lengths):')
                val_confusion_1 = self.confusion(
                    gold=self.val_labels,
                    prediction=self.val_pred,
                    lengths=self.val_original_lengths,
                    min_length=0,
                    max_length=np.inf)
                print(val_confusion_1)

                print('\nConfusion matrix (sequence length < 2000):')
                val_confusion_2 = self.confusion(
                    gold=self.val_labels,
                    prediction=self.val_pred,
                    lengths=self.val_original_lengths,
                    min_length=0,
                    max_length=2000)
                print(val_confusion_2)

                print('\nConfusion matrix (sequence length > 2000):')
                val_confusion_3 = self.confusion(
                    gold=self.val_labels,
                    prediction=self.val_pred,
                    lengths=self.val_original_lengths,
                    min_length=2000,
                    max_length=np.inf)
                print(val_confusion_3)

                print('\n Val accuracy:', self.val_accuracy)
                print(
                    '\n Val accuracy when length <2000:',
                    np.sum((self.val_pred == self.val_labels) *
                           (self.val_original_lengths <= 2000)) /
                    np.sum(self.val_original_lengths <= 2000))
                print(
                    '\n Val accuracy when length >2000:',
                    np.sum((self.val_pred == self.val_labels) *
                           (self.val_original_lengths > 2000)) /
                    np.sum(self.val_original_lengths > 2000))

                this_sum = np.zeros([3, 5])
                this_auc = np.zeros([1, 5])
                this_TPR = []
                this_FPR = []

                total_tp = 0
                total_fp = 0
                total_fn = 0
                total_tn = 0

                for i in range(4):
                    tp = np.sum((self.val_labels == i) * (self.val_pred == i))
                    fp = np.sum((self.val_labels != i) * (self.val_pred == i))
                    fn = np.sum((self.val_labels == i) * (self.val_pred != i))
                    tn = np.sum((self.val_labels != i) * (self.val_pred != i))

                    total_tp += tp
                    total_fp += fp
                    total_fn += fn
                    total_tn += tn
                    prec = tp / (tp + fp) if (tp + fp) > 0 else 0.0
                    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
                    f1 = 2 * prec * recall / (
                        prec + recall) if prec * recall > 0 else 0.0
                    this_sum[:, i] = np.array([prec, recall, f1])
                    this_auc[:, i] = roc_auc_score(self.val_labels == i,
                                                   self.val_pred == i)
                    if i < 4:
                        this_FPR.append(
                            roc_curve(self.val_labels == i,
                                      self.val_probs[:, i])[0])
                        this_TPR.append(
                            roc_curve(self.val_labels == i,
                                      self.val_probs[:, i])[1])

                prec = total_tp / (total_tp + total_fp) if (
                    total_tp + total_fp) > 0 else 0.0
                recall = total_tp / (total_tp + total_fn) if (
                    total_tp + total_fn) > 0 else 0.0
                f1 = 2 * prec * recall / (prec +
                                          recall) if prec * recall > 0 else 0.0
                this_sum[:, 4] = np.array([prec, recall, f1])
                this_sum = np.concatenate((this_sum, this_auc), 0)

                self.this_sum = pd.DataFrame(this_sum)
                self.this_sum.index = pd.Index(
                    ['Precision', 'Recall', 'F1', 'AUC'])
                self.this_sum.columns = pd.Index(
                    ['cyto', 'secreted', 'mito', 'nucleus', 'Total'])

                print(self.this_sum)

                if not self.is_train:
                    return [
                        val_confusion_1, val_confusion_2, val_confusion_3,
                        self.this_sum, this_FPR, this_TPR
                    ]

        # If in test model:
        # - Load saved model and evaluate on test set
        # - Print predicted probabilities for each protein in the test set

        elif self.mode == 'Test':
            with tf.Session() as session:
                print('Restoring model...')
                saver = tf.train.Saver()
                saver.restore(session, self.path + 'model.checkpoint')
                print('Model restored!')

                with open('test_data.pkl', 'rb') as f:
                    test_sequences = pkl.load(f)
                with open('test_labels.pkl', 'rb') as f:
                    test_labels = pkl.load(f)

                _, _, data_lengths, _, enc_sequences = build_dictionary(
                    test_sequences, vocab=dictionary)

                test_dict = {
                    sequences: enc_sequences,
                    sequences_lengths: data_lengths,
                    keep_prob_dropout: 1.0
                }

                self.probs, self.pred = session.run([probs, pred],
                                                    feed_dict=test_dict)
                result = pd.DataFrame(
                    np.concatenate((self.probs, np.expand_dims(self.pred, 1)),
                                   1))
                result.columns = pd.Index(
                    ['cyto', 'secreted', 'mito', 'nucleus', 'prediction'])
                print(result)
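
Example #4 computes per-class precision, recall and F1 by hand from true/false positive counts; scikit-learn's precision_recall_fscore_support gives the same numbers and is a convenient cross-check. A sketch on toy arrays (assuming 1-D integer label and prediction arrays over the four classes; not part of the original code):

import numpy as np
from sklearn.metrics import precision_recall_fscore_support

# Toy gold labels and predictions, for illustration only
toy_labels = np.array([0, 1, 2, 3, 0, 1])
toy_pred = np.array([0, 1, 2, 0, 0, 2])

prec, recall, f1, _ = precision_recall_fscore_support(
    toy_labels, toy_pred, labels=[0, 1, 2, 3], zero_division=0)
print(prec, recall, f1)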
Example #5
        exit(1)
    validation_path = options.validation_path
    model_dir = options.model_dir
    if model_dir is None:
        parser.print_help()
        exit(1)
    if not os.path.isdir(model_dir):
        os.makedirs(model_dir)
    training_iters = options.training_iters
    if not training_iters:
        training_iters = 30
    training_iters = int(training_iters)

    # config
    n_steps = 30  # time steps
    padd = '\t'  # special padding character
    char_dic = util.build_dictionary(train_path, padd)
    n_input = len(char_dic)  # input dimension, vocab size
    n_hidden = 8  # hidden layer size
    n_classes = 2  # output classes: space or not
    vocab_size = n_input
    # util.test_next_batch(train_path, char_dic, vocab_size, n_steps, padd)
    x = tf.placeholder(tf.float32, [None, n_steps, n_input])
    y_ = tf.placeholder(tf.int32, [None, n_steps])
    early_stop = tf.placeholder(tf.int32)

    # LSTM layer
    # 2 x n_hidden length (state & cell)
    istate = tf.placeholder(tf.float32, [None, 2 * n_hidden])
    weights = {
Example #6
    def __init__(self, corpus, parameters):
        self.corpus = corpus
        self.para = parameters
        self.dictionary, self.reverse_dictionary, sent_lengths, self.max_sent_len, enc_data, dec_data, dec_lab = build_dictionary(
            import_data(self.corpus))
        self.dictionary_sorted = sorted(self.dictionary.items(),
                                        key=operator.itemgetter(1))
        self.vocabulary_size = len(self.dictionary_sorted)
        self.max_sent_len += 1
        self.data = autoencoder_data(enc_data=enc_data,
                                     dec_data=dec_data,
                                     dec_lab=dec_lab,
                                     sent_lengths=sent_lengths)

        print('\r~~~~~~~ Building graph ~~~~~~~\r')
        self.graph = tf.get_default_graph()
        self.initializer = tf.random_normal_initializer()

        # Variables
        self.word_embeddings = tf.get_variable(
            'embeddings', [self.vocabulary_size, self.para.embedding_size],
            tf.float32,
            initializer=self.initializer)
        self.W = tf.get_variable(
            'decoder/weight', [self.para.embedding_size, self.vocabulary_size],
            tf.float32,
            initializer=self.initializer)
        self.b = tf.get_variable('decoder/bias', [self.vocabulary_size],
                                 tf.float32,
                                 initializer=self.initializer)
        self.global_step = tf.Variable(0, name='global_step', trainable=False)

        # Encoder placeholders
        self.enc_inputs = tf.placeholder(tf.int32, [None, None], "enc_inputs")
        self.enc_input_lengths = tf.placeholder(tf.int32, [None],
                                                "enc_input_lengths")

        # Decoder placeholders
        self.dec_inputs = tf.placeholder(tf.int32, [None, None], "dec_inputs")
        self.dec_labels = tf.placeholder(tf.int32, [None, None], "dec_labels")
        self.dec_input_lengths = tf.placeholder(tf.int32, [None],
                                                "dec_input_lengths")

        # Embed sentences
        enc_inputs_embedded = self.embed_data(self.enc_inputs)
        dec_inputs_embedded = self.embed_data(self.dec_inputs)

        # Encoder
        self.encoded_sentences = self.encoder(enc_inputs_embedded,
                                              self.enc_input_lengths,
                                              self.para.bidirectional)

        # Decoder for reconstructing the input sentence
        dec_logits_projected, dec_logits = self.decoder(
            decoder_inputs=dec_inputs_embedded,
            encoder_state=self.encoded_sentences,
            name='decoder',
            lengths=self.dec_input_lengths,
            train=True)

        # Compute loss
        if self.para.loss_function == 'softmax':
            self.loss = self.get_softmax_loss(self.dec_labels,
                                              dec_logits_projected)
        else:
            self.loss = self.get_sampled_softmax_loss(self.dec_labels,
                                                      dec_logits,
                                                      name='decoder')

        self.opt_op = tf.contrib.layers.optimize_loss(
            loss=self.loss,
            global_step=self.global_step,
            learning_rate=self.para.learning_rate,
            optimizer='Adam',
            clip_gradients=2.0,
            learning_rate_decay_fn=None,
            summaries=['loss'])

        # Decode sentences at prediction time
        self.predict = self.decoder(decoder_inputs=dec_inputs_embedded,
                                    encoder_state=self.encoded_sentences,
                                    name='decoder',
                                    lengths=self.dec_input_lengths,
                                    train=False)
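
embed_data, encoder and decoder are defined elsewhere in these classes; for embed_data the examples only require looking up token ids in the embedding matrix, so a plausible one-line sketch (hypothetical, not the original helper) is:

import tensorflow as tf

def embed_data_sketch(word_embeddings, token_ids):
    # Map integer token ids to their embedding vectors
    return tf.nn.embedding_lookup(word_embeddings, token_ids)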