def stats():
    ''' Helper function to print descriptive statistics of training data '''
    # Assumes module-level imports: pickle as pkl, pandas as pd, numpy as np,
    # and the build_dictionary helper.
    with open('train_data.pkl', 'rb') as f:
        data_sequences = pkl.load(f)
    with open('train_labels.pkl', 'rb') as f:
        labels = pkl.load(f)
    _, _, lengths, _, _ = build_dictionary(data_sequences)

    # Bin sequence lengths and cross-tabulate against the four class labels
    bins = [0, 100, 500, 1000, 1500, 1999]
    labels_string = ['cyto', 'secreted', 'mito', 'nucleus']
    df = pd.DataFrame({'length': lengths, 'label': labels})
    table = pd.crosstab(np.digitize(df.length, bins), df.label)
    table.index = pd.Index([
        '[0, 100)', '[100, 500)', '[500, 1000)', '[1000, 1500)',
        '[1500, 2000)', '[2000, inf)'
    ], name="Bin")
    table.columns = pd.Index(labels_string, name="Class")

    # Append a row and a column of totals
    sum_row = {col: table[col].sum() for col in table}
    sum_df = pd.DataFrame(sum_row, index=["Total"])
    table = table.append(sum_df)
    table['Total'] = table.sum(axis=1)

    print('\n~~~~~~~ Summary stats for train set ~~~~~~~')
    print('\nCount of sequence lengths by class')
    print(table)
    print('\nDescriptive statistics')
    print(df.describe())
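# --- Illustrative sketch (not part of the repo) --------------------------------
# A minimal standalone example of the binning logic used in stats() above,
# on toy data, to show how np.digitize and pd.crosstab combine into the
# length-by-class table. Values and labels here are made up.
import numpy as np
import pandas as pd

toy_lengths = np.array([50, 120, 700, 1600, 2500])
toy_labels = ['cyto', 'cyto', 'mito', 'nucleus', 'secreted']

toy_bins = [0, 100, 500, 1000, 1500, 1999]
toy_df = pd.DataFrame({'length': toy_lengths, 'label': toy_labels})

# np.digitize returns, for each length, the index of the bin it falls into
# (1 = [0, 100), 2 = [100, 500), ..., 6 = [1999, inf)).
toy_table = pd.crosstab(np.digitize(toy_df.length, toy_bins), toy_df.label)
print(toy_table)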
dataset = 'wikipedia'
files = [
    'tagged.en/englishEtiquetado_' + str(ind * 10000) + '_' +
    str(ind * 10000 + 10000) for ind in xrange(10)
]
vocabulary_size = 20000

# Read the first token of every non-empty line of the tagged Wikipedia files
words = []
for filename in files:
    with open(data_path + dataset + '/' + filename, mode='r') as txtfile:
        while True:
            line = txtfile.readline()
            if line == '':
                break
            if line == '\n':
                continue
            tokens = line.split()
            words.append(tokens[0].lower())
print('Total number of words is %d' % len(words))

# Build and save the vocabulary (word frequencies, word -> id, id -> word)
count, dictionary, reverse_dictionary = build_dictionary(words, vocabulary_size)
voc_dict = dict(dic=dictionary, rev_dic=reverse_dictionary, freq=count)
pickle.dump(voc_dict, open(data_path + dataset + '/voc_dict.pkl', 'wb'))
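# --- Illustrative sketch (not part of the repo) --------------------------------
# Shows how a downstream script could load voc_dict.pkl saved above and map a
# sentence to word ids. The data_path value and the fallback id 0 for unknown
# words are assumptions for illustration only.
import pickle

data_path = './data/'
dataset = 'wikipedia'

with open(data_path + dataset + '/voc_dict.pkl', 'rb') as f:
    voc_dict = pickle.load(f)

dictionary = voc_dict['dic']              # word -> integer id
reverse_dictionary = voc_dict['rev_dic']  # integer id -> word
count = voc_dict['freq']                  # word frequency counts

sentence = 'the cat sat on the mat'.split()
ids = [dictionary.get(w, 0) for w in sentence]  # assumed fallback id for OOV words
print(ids)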
def run(self):
    # Load corpus
    corpus = import_data(self.corpus)
    self.dictionary, self.reverse_dictionary, sent_lengths, self.max_sent_len, \
        enc_data, dec_data, dec_lab = build_dictionary(corpus)

    # Save metadata for visualisation of embedding matrix
    meta_data = sorted(self.dictionary, key=self.dictionary.get)
    print(len(meta_data))
    with open('meta_data.tsv', 'w') as f:
        tsv_writer = csv.writer(f, dialect='excel')
        # One vocabulary entry per row, as expected by the TensorBoard projector
        for word in meta_data:
            tsv_writer.writerow([word])
    # np.savetxt("meta_data.tsv", meta_data, fmt="%s")

    self.dictionary = sorted(self.dictionary.items(), key=operator.itemgetter(1))
    self.vocabulary_size = len(self.dictionary)
    self.max_sent_len += 1

    # Create datasets for encoder and decoders:
    # the encoder sees sentence i, the 'post' decoder predicts sentence i+1,
    # and the 'pre' decoder predicts sentence i-1.
    enc_data = enc_data[1:-1]
    enc_lengths = sent_lengths[1:-1]
    post_lengths = sent_lengths[2:] + 1
    post_data = dec_data[2:]
    post_lab = dec_lab[2:]
    pre_lengths = sent_lengths[:-2] + 1
    pre_data = dec_data[:-2]
    pre_lab = dec_lab[:-2]

    # Print summary statistics
    self.corpus_length = len(enc_data)
    self.corpus_stats()

    self.graph = tf.Graph()
    with self.graph.as_default():
        print('\r~~~~~~~ Building model ~~~~~~~\r')
        self.initializer = tf.random_normal_initializer()

        # Variables
        self.word_embeddings = tf.get_variable(
            'embeddings', [self.vocabulary_size, self.embedding_size],
            tf.float32, initializer=self.initializer)
        self.W_pre = tf.get_variable(
            'precoder/weight', [self.embedding_size, self.vocabulary_size],
            tf.float32, initializer=self.initializer)
        self.b_pre = tf.get_variable(
            'precoder/bias', [self.vocabulary_size],
            tf.float32, initializer=self.initializer)
        self.W_post = tf.get_variable(
            'postcoder/weight', [self.embedding_size, self.vocabulary_size],
            tf.float32, initializer=self.initializer)
        self.b_post = tf.get_variable(
            'postcoder/bias', [self.vocabulary_size],
            tf.float32, initializer=self.initializer)
        global_step = tf.Variable(0, name='global_step', trainable=False)

        # Encoder placeholders
        sentences = tf.placeholder(tf.int32, [None, None], "sentences")
        sentences_lengths = tf.placeholder(tf.int32, [None], "sentences_lengths")

        # Postcoder placeholders
        post_inputs = tf.placeholder(tf.int32, [None, None], "post_inputs")
        post_labels = tf.placeholder(tf.int32, [None, None], "post_labels")
        post_sentences_lengths = tf.placeholder(tf.int32, [None], "post_sentences_lengths")

        # Precoder placeholders
        pre_inputs = tf.placeholder(tf.int32, [None, None], "pre_inputs")
        pre_labels = tf.placeholder(tf.int32, [None, None], "pre_labels")
        pre_sentences_lengths = tf.placeholder(tf.int32, [None], "pre_sentences_lengths")

        # Embed sentences
        sentences_embedded = self.embed_data(sentences)
        post_inputs_embedded = self.embed_data(post_inputs)
        pre_inputs_embedded = self.embed_data(pre_inputs)

        # Encoder
        encoded_sentences = self.encoder(sentences_embedded, sentences_lengths,
                                         self.bidirectional)

        # Decoder for following sentence
        post_logits_projected, post_logits = self.decoder(
            decoder_inputs=post_inputs_embedded,
            encoder_state=encoded_sentences,
            name='postcoder',
            lengths=post_sentences_lengths,
            train=True)

        # Decoder for previous sentence
        pre_logits_projected, pre_logits = self.decoder(
            decoder_inputs=pre_inputs_embedded,
            encoder_state=encoded_sentences,
            name='precoder',
            lengths=pre_sentences_lengths,
            train=True)

        # Compute loss
        if self.loss_function == 'softmax':
            post_loss = self.get_softmax_loss(post_labels, post_logits_projected)
            pre_loss = self.get_softmax_loss(pre_labels, pre_logits_projected)
        else:
            post_loss = self.get_sampled_softmax_loss(post_labels, post_logits,
                                                      name='postcoder')
            pre_loss = self.get_sampled_softmax_loss(pre_labels, pre_logits,
                                                     name='precoder')
        loss = pre_loss + post_loss

        opt_op = tf.contrib.layers.optimize_loss(
            loss=loss,
            global_step=global_step,
            learning_rate=self.learning_rate,
            optimizer='Adam',
            clip_gradients=2.0,
            learning_rate_decay_fn=None,
            summaries=['loss'])

        # Decode sentences at prediction time
        pre_predict = self.decoder(decoder_inputs=pre_inputs_embedded,
                                   encoder_state=encoded_sentences,
                                   name='precoder',
                                   lengths=pre_sentences_lengths,
                                   train=False)
        post_predict = self.decoder(decoder_inputs=post_inputs_embedded,
                                    encoder_state=encoded_sentences,
                                    name='postcoder',
                                    lengths=post_sentences_lengths,
                                    train=False)
        predict = [pre_predict, post_predict]

    with tf.Session(graph=self.graph) as session:
        self.a = tf.contrib.graph_editor.get_tensors(self.graph)
        train_loss_writer = tf.summary.FileWriter('./tensorboard/train_loss',
                                                  session.graph)

        # Use the same LOG_DIR where you stored your checkpoint.
        embedding_writer = tf.summary.FileWriter('./tensorboard/', session.graph)
        config = projector.ProjectorConfig()
        embedding = config.embeddings.add()
        embedding.tensor_name = self.word_embeddings.name
        # Link this tensor to its metadata file (e.g. labels).
        embedding.metadata_path = os.path.join('./meta_data.tsv')
        # Saves a configuration file that TensorBoard will read during startup.
        projector.visualize_embeddings(embedding_writer, config)

        merged = tf.summary.merge_all()
        print('\r~~~~~~~ Initializing variables ~~~~~~~\r')
        tf.global_variables_initializer().run()
        print('\r~~~~~~~ Starting training ~~~~~~~\r')
        start_time = time.time()

        try:
            train_summaryIndex = -1
            for epoch in range(self.num_epochs):
                self.is_train = True
                epoch_time = time.time()
                print('----- Epoch', epoch, '-----')
                print('Shuffling dataset')

                perm = np.random.permutation(self.corpus_length)
                enc_lengths_perm = enc_lengths[perm]
                enc_data_perm = enc_data[perm]
                post_lengths_perm = post_lengths[perm]
                post_inputs_perm = np.array(post_data)[perm]
                post_labels_perm = np.array(post_lab)[perm]
                pre_lengths_perm = pre_lengths[perm]
                pre_inputs_perm = np.array(pre_data)[perm]
                pre_labels_perm = np.array(pre_lab)[perm]

                total_loss = 0
                predict_step = 50

                for step in range(self.corpus_length // self.batch_size):
                    begin = step * self.batch_size
                    end = (step + 1) * self.batch_size

                    # Slice the current batch and trim padding to the longest
                    # sequence in the batch
                    batch_enc_lengths = enc_lengths_perm[begin:end]
                    batch_enc_inputs = enc_data_perm[begin:end]
                    batch_post_lengths = post_lengths_perm[begin:end]
                    batch_post_inputs = post_inputs_perm[begin:end, :np.max(batch_post_lengths)]
                    batch_post_labels = post_labels_perm[begin:end, :np.max(batch_post_lengths)]
                    batch_pre_lengths = pre_lengths_perm[begin:end]
                    batch_pre_inputs = pre_inputs_perm[begin:end, :np.max(batch_pre_lengths)]
                    batch_pre_labels = pre_labels_perm[begin:end, :np.max(batch_pre_lengths)]

                    train_dict = {
                        sentences: batch_enc_inputs,
                        sentences_lengths: batch_enc_lengths,
                        post_inputs: batch_post_inputs,
                        post_labels: batch_post_labels,
                        post_sentences_lengths: batch_post_lengths,
                        pre_inputs: batch_pre_inputs,
                        pre_labels: batch_pre_labels,
                        pre_sentences_lengths: batch_pre_lengths
                    }
                    _, loss_val, batch_summary, glob_step = session.run(
                        [opt_op, loss, merged, global_step], feed_dict=train_dict)
                    train_loss_writer.add_summary(
                        batch_summary,
                        step + (self.corpus_length // self.batch_size) * epoch)
                    total_loss += loss_val

                    if glob_step % predict_step == 0:
                        # if step > 0:
                        print("Average loss at step ", glob_step, ": ",
                              total_loss / predict_step)
                        total_loss = 0

                        print('\nOriginal sequence:\n')
                        print(self.print_sentence(batch_pre_inputs[0, 1:],
                                                  batch_pre_lengths[0] - 1))
                        print(self.print_sentence(batch_enc_inputs[0],
                                                  batch_enc_lengths[0]))
                        print(self.print_sentence(batch_post_inputs[0, 1:],
                                                  batch_post_lengths[0] - 1))

                        # Predict the previous and following sentence for the
                        # first example of the current batch
                        test_enc_lengths = np.expand_dims(batch_enc_lengths[0], 0)
                        test_enc_inputs = np.expand_dims(batch_enc_inputs[0], 0)
                        test_post_lengths = np.expand_dims(batch_post_lengths[0], 0)
                        test_post_inputs = np.expand_dims(batch_post_inputs[0], 0)
                        test_post_labels = np.expand_dims(batch_post_labels[0], 0)
                        test_pre_lengths = np.expand_dims(batch_pre_lengths[0], 0)
                        test_pre_inputs = np.expand_dims(batch_pre_inputs[0], 0)
                        test_pre_labels = np.expand_dims(batch_pre_labels[0], 0)
                        test_dict = {
                            sentences_lengths: test_enc_lengths,
                            sentences: test_enc_inputs,
                            post_sentences_lengths: test_post_lengths,
                            post_inputs: test_post_inputs,
                            post_labels: test_post_labels,
                            pre_sentences_lengths: test_pre_lengths,
                            pre_inputs: test_pre_inputs,
                            pre_labels: test_pre_labels
                        }
                        pre_prediction, post_prediction = session.run(
                            [predict], feed_dict=test_dict)[0]

                        print('\nPredicted previous and following sequence around original sentence:\n')
                        print(self.print_sentence(pre_prediction[0], len(pre_prediction[0])))
                        print(self.print_sentence(batch_enc_inputs[0], batch_enc_lengths[0]))
                        print(self.print_sentence(post_prediction[0], len(post_prediction[0])))

                        end_time = time.time()
                        print('\nTime for %d steps: %0.2f seconds' %
                              (predict_step, end_time - start_time))
                        start_time = time.time()
                        print('\n\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')

                # Checkpoint at the end of each epoch
                saver = tf.train.Saver()
                saver.save(session, os.path.join('./tensorboard/', 'model.ckpt'))

        except KeyboardInterrupt:
            save = input('save?')
            if 'y' in save:
                self.save_model(session, 0)
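# --- Illustrative sketch (not part of the repo) --------------------------------
# Makes the slicing in run() concrete: a toy list of sentence ids showing how
# enc_data[1:-1], dec_data[:-2] and dec_data[2:] line up, so that each encoded
# sentence is paired with its previous and following neighbours. Plain Python
# lists stand in for the repo's encoded arrays.
toy_sentences = ['s0', 's1', 's2', 's3', 's4']

toy_enc = toy_sentences[1:-1]   # encoder input:    s1, s2, s3
toy_pre = toy_sentences[:-2]    # previous target:  s0, s1, s2
toy_post = toy_sentences[2:]    # following target: s2, s3, s4

for e, p, q in zip(toy_enc, toy_pre, toy_post):
    print('encode %s -> predict previous %s and following %s' % (e, p, q))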
def run(self):
    '''
    Runs the model according to the specified settings
    - If mode = Train: Train a GRU model using the training data
    - If mode = Val:   Load the saved GRU model and evaluate it on the validation fold
    - If mode = Test:  Load the saved GRU model and evaluate it on the blind test set
    '''
    self.is_train = (self.mode == 'Train')
    if not os.path.exists(self.path):
        os.mkdir(self.path)

    # Load the training data
    with open('train_data.pkl', 'rb') as f:
        data_sequences = pkl.load(f)
    with open('train_labels.pkl', 'rb') as f:
        data_labels = pkl.load(f)
    dictionary, reverse_dictionary, data_lengths, self.max_seq_len, enc_sequences = build_dictionary(
        data_sequences)
    self.dictionary = sorted(dictionary.items(), key=operator.itemgetter(1))
    print(self.dictionary)
    self.vocabulary_size = len(dictionary)
    self.val_size = len(data_sequences) // self.folds
    fold = 1
    print('Training fold number %d. Each fold of size %d' %
          (fold, len(data_sequences) // self.folds))

    # Truncate sequences at length 2000 and return descriptive statistics.
    # This is done by concatenating the first 1900 and the last 100 amino acids.
    if self.is_train:
        self.max_seq_len = 2000
        original_lengths = copy(data_lengths)
        data_sequences = enc_sequences[:, :self.max_seq_len]
        for i in range(len(data_lengths)):
            if data_lengths[i] > self.max_seq_len:
                data_sequences[i] = np.concatenate(
                    (enc_sequences[i, :self.max_seq_len - 100],
                     enc_sequences[i, -100:]), axis=0)
                data_lengths[i] = self.max_seq_len

        if self.folds == 1:
            val_mask = np.array([False])
        else:
            val_mask = np.arange(self.val_size * (fold - 1),
                                 self.val_size * (fold))

        # Use seed to ensure same randomisation is applied for each fold
        np.random.seed(4)
        perm = np.random.permutation(len(data_sequences))
        data_labels = np.array(data_labels)
        data_sequences = data_sequences[perm]
        data_labels = data_labels[perm]
        data_lengths = data_lengths[perm]
        original_lengths = original_lengths[perm]

        # Split off the validation fold
        self.val_data = data_sequences[val_mask]
        self.val_labels = data_labels[val_mask]
        self.val_lengths = data_lengths[val_mask]
        self.val_original_lengths = original_lengths[val_mask]
        self.train_data = np.delete(data_sequences, val_mask, axis=0)
        self.train_labels = np.delete(data_labels, val_mask, axis=0)
        self.train_lengths = np.delete(data_lengths, val_mask, axis=0)
        self.train_original_lengths = np.delete(original_lengths, val_mask, axis=0)

        self.train_statistics, self.train_frame = self.summary_stats(
            self.train_lengths, self.train_labels, 'train')
        if self.folds == 1:
            self.val_statistics = np.array([])
            self.val_frame = np.array([])
            self.val_original_lengths = np.array([])
        else:
            self.val_statistics, self.val_frame = self.summary_stats(
                self.val_lengths, self.val_labels, 'validation')

        this_data = [
            self.train_data, self.train_labels, self.train_lengths,
            self.val_data, self.val_labels, self.val_lengths,
            self.train_statistics, self.train_frame, self.val_statistics,
            self.val_frame, self.train_original_lengths,
            self.val_original_lengths
        ]
        with open(self.path + 'this_data.pkl', 'wb') as f:
            pkl.dump(this_data, f)
    else:
        with open(self.path + 'this_data.pkl', 'rb') as f:
            (self.train_data, self.train_labels, self.train_lengths,
             self.val_data, self.val_labels, self.val_lengths,
             self.train_statistics, self.train_frame, self.val_statistics,
             self.val_frame, self.train_original_lengths,
             self.val_original_lengths) = pkl.load(f)

    # Now construct the Tensorflow graph
    print('\r~~~~~~~ Building model ~~~~~~~\r')

    # Define placeholders and variables
    initializer = tf.random_normal_initializer()
    self.word_embeddings = tf.get_variable(
        'embeddings', [self.vocabulary_size, self.embedding_size],
        tf.float32, initializer=initializer)
    sequences = tf.placeholder(tf.int32, [None, None], "sequences")
    sequences_lengths = tf.placeholder(tf.int32, [None], "sequences_lengths")
    labels = tf.placeholder(tf.int64, [None], "labels")
    keep_prob_dropout = tf.placeholder(tf.float32, name='dropout')
    global_step = tf.Variable(0, name='global_step', trainable=False)

    # Embed and encode sequences
    sequences_embedded = self.embed_data(sequences)
    encoded_sequences = self.encoder(sequences_embedded, sequences_lengths,
                                     keep_prob_dropout,
                                     bidirectional=self.bidirectional)

    # Take the last hidden state of the GRU and put it through a nonlinear and a linear FC layer
    with tf.name_scope('non_linear_layer'):
        encoded_sentences_BN = self.batch_norm_wrapper(encoded_sequences,
                                                       self.is_train)
        non_linear = tf.nn.dropout(
            tf.nn.relu(tf.contrib.layers.linear(encoded_sentences_BN, 64)),
            keep_prob=keep_prob_dropout)

    with tf.name_scope('final_layer'):
        non_linear_BN = self.batch_norm_wrapper(non_linear, self.is_train)
        logits = tf.contrib.layers.linear(non_linear_BN, 4)

    # Compute mean loss on this batch, consisting of cross entropy loss and L2 loss
    CE_loss = self.get_CE_loss(labels, logits)
    L2_loss = self.get_L2_loss()
    loss = CE_loss + L2_loss

    # Perform training operation
    learning_rate = tf.train.exponential_decay(self.learning_rate, global_step,
                                               100, 0.96, staircase=True)
    opt_op = tf.contrib.layers.optimize_loss(loss=loss,
                                             global_step=global_step,
                                             learning_rate=learning_rate,
                                             optimizer='Adam',
                                             clip_gradients=2.0,
                                             learning_rate_decay_fn=None,
                                             summaries=None)

    # Define scalars for Tensorboard
    tf.summary.scalar('CE_loss', CE_loss)
    tf.summary.scalar('L2_loss', L2_loss)
    tf.summary.scalar('loss', loss)
    tf.summary.scalar('learning_rate', learning_rate)

    # Compute accuracy of prediction
    probs = tf.nn.softmax(logits)
    with tf.name_scope('accuracy'):
        pred = tf.argmax(logits, 1)
        correct_prediction = tf.equal(labels, pred)
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    tf.summary.scalar('accuracy', accuracy)

    # If in training mode:
    # - shuffle data set before each epoch
    # - train model using mini batches
    # - track performance on train and validation set throughout training
    if self.is_train == True:
        with tf.Session() as session:
            train_loss_writer = tf.summary.FileWriter(
                str(self.path + 'tensorboard/train_loss'), session.graph)
            train_summary_writer = tf.summary.FileWriter(
                str(self.path + 'tensorboard/train_summary'), session.graph)
            val_summary_writer = tf.summary.FileWriter(
                str(self.path + 'tensorboard/val_summary'), session.graph)

            # Use the same LOG_DIR where you stored your checkpoint.
            embedding_writer = tf.summary.FileWriter(
                str(self.path + 'tensorboard/'), session.graph)
            config = projector.ProjectorConfig()
            embedding = config.embeddings.add()
            embedding.tensor_name = self.word_embeddings.name
            # Link this tensor to its metadata file (e.g. labels).
            embedding.metadata_path = os.path.join('./metadata.tsv')
            # Saves a configuration file that TensorBoard will read during startup.
            projector.visualize_embeddings(embedding_writer, config)

            merged = tf.summary.merge_all()
            print('\r~~~~~~~ Initializing variables ~~~~~~~\r')
            tf.global_variables_initializer().run()

            start_time = time.time()
            min_train_loss = np.inf
            batch_times = []
            n = self.train_data.shape[0]
            print('\r~~~~~~~ Starting training ~~~~~~~\r')
            try:
                train_summaryIndex = -1
                for epoch in range(self.num_epochs):
                    self.is_train = True
                    epoch_time = time.time()
                    print('----- Epoch', epoch, '-----')
                    print('Shuffling dataset')

                    perm = np.random.permutation(len(self.train_data))
                    self.train_data_perm = self.train_data[perm]
                    self.train_labels_perm = self.train_labels[perm]
                    self.train_lengths_perm = self.train_lengths[perm]

                    total_loss = 0
                    for i in range(n // self.batch_size):
                        batch_start = time.time()
                        batch_data = self.train_data_perm[i * self.batch_size:
                                                          (i + 1) * self.batch_size]
                        batch_lengths = self.train_lengths_perm[i * self.batch_size:
                                                                (i + 1) * self.batch_size]
                        batch_labels = self.train_labels_perm[i * self.batch_size:
                                                              (i + 1) * self.batch_size]
                        train_dict = {
                            sequences: batch_data,
                            sequences_lengths: batch_lengths,
                            labels: batch_labels,
                            keep_prob_dropout: self.keep_prob_dropout
                        }
                        _, batch_loss, batch_accuracy, batch_summary = session.run(
                            [opt_op, loss, accuracy, merged], feed_dict=train_dict)
                        total_loss += batch_loss
                        batch_times.append(time.time() - batch_start)
                        train_loss_writer.add_summary(
                            batch_summary, i + (n // self.batch_size) * epoch)

                        if i % 10 == 0 and i > 0:
                            # Print loss every 10 batches
                            time_per_epoch = np.mean(batch_times) * (n // self.batch_size)
                            remaining_time = int(time_per_epoch - time.time() + epoch_time)
                            string_out = '\rEnd of batch ' + str(i)
                            string_out += ' Train loss: ' + str(total_loss / (i * self.batch_size))
                            string_out += ' Accuracy: ' + str(batch_accuracy)
                            string_out += ' Elapsed training time : ' + str(
                                int(time.time() - start_time)) + "s, "
                            string_out += str(remaining_time) + "s remaining for this epoch"
                            string_out += ' (' + str(
                                time_per_epoch * 100 / 60 // 1 / 100) + ' min/epoch)'
                            stdout.write(string_out)

                    # Train accuracy
                    train_dict = {
                        sequences: self.train_data_perm[:1000],
                        sequences_lengths: self.train_lengths_perm[:1000],
                        labels: self.train_labels_perm[:1000],
                        keep_prob_dropout: 1.0
                    }
                    train_summary, train_loss, train_accuracy = session.run(
                        [merged, loss, accuracy], feed_dict=train_dict)
                    train_summary_writer.add_summary(train_summary, epoch)
                    print('\nEpoch train loss: ', train_loss,
                          'Epoch train accuracy: ', train_accuracy)

                    # Val accuracy
                    val_dict = {
                        sequences: self.val_data,
                        sequences_lengths: self.val_lengths,
                        labels: self.val_labels,
                        keep_prob_dropout: 1.0
                    }
                    val_summary, val_loss, val_accuracy = session.run(
                        [merged, loss, accuracy], feed_dict=val_dict)
                    val_summary_writer.add_summary(val_summary, epoch)
                    print('\nEpoch val loss: ', val_loss,
                          'Epoch val accuracy: ', val_accuracy)

                    self.save_model(session, epoch)
                    saver = tf.train.Saver()
                    saver.save(session,
                               os.path.join(self.path + '/tensorboard/', 'model.ckpt'))

            except KeyboardInterrupt:
                save = input('save?')
                if 'y' in save:
                    self.save_model(session, epoch)

    # If in validation mode:
    # - Load saved model and evaluate on validation fold
    # - Return list containing confusion matrices, and accuracy measures such as FPR and TPR
    elif self.mode == 'Val':
        with tf.Session() as session:
            print('Restoring model...')
            saver = tf.train.Saver()
            saver.restore(session, self.path + 'model.checkpoint')
            print('Model restored!')

            val_dict = {
                sequences: self.val_data,
                sequences_lengths: self.val_lengths,
                labels: self.val_labels,
                keep_prob_dropout: 1.0
            }
            self.val_pred, self.val_accuracy, self.val_probs = session.run(
                [pred, accuracy, probs], feed_dict=val_dict)
            _ = self.summary_stats(self.val_lengths, self.val_labels, 'val')

            # Confusion matrices, overall and split by original sequence length
            print('\nConfusion matrix (all sequence lengths):')
            val_confusion_1 = self.confusion(gold=self.val_labels,
                                             prediction=self.val_pred,
                                             lengths=self.val_original_lengths,
                                             min_length=0,
                                             max_length=np.inf)
            print(val_confusion_1)

            print('\nConfusion matrix (sequence length < 2000):')
            val_confusion_2 = self.confusion(gold=self.val_labels,
                                             prediction=self.val_pred,
                                             lengths=self.val_original_lengths,
                                             min_length=0,
                                             max_length=2000)
            print(val_confusion_2)

            print('\nConfusion matrix (sequence length > 2000):')
            val_confusion_3 = self.confusion(gold=self.val_labels,
                                             prediction=self.val_pred,
                                             lengths=self.val_original_lengths,
                                             min_length=2000,
                                             max_length=np.inf)
            print(val_confusion_3)

            print('\n Val accuracy:', self.val_accuracy)
            print('\n Val accuracy when length <2000:',
                  np.sum((self.val_pred == self.val_labels) *
                         (self.val_original_lengths <= 2000)) /
                  np.sum(self.val_original_lengths <= 2000))
            print('\n Val accuracy when length >2000:',
                  np.sum((self.val_pred == self.val_labels) *
                         (self.val_original_lengths > 2000)) /
                  np.sum(self.val_original_lengths > 2000))

            # Per-class precision, recall, F1 and AUC, plus ROC curves
            this_sum = np.zeros([3, 5])
            this_auc = np.zeros([1, 5])
            this_TPR = []
            this_FPR = []
            total_tp = 0
            total_fp = 0
            total_fn = 0
            total_tn = 0
            for i in range(4):
                tp = np.sum((self.val_labels == i) * (self.val_pred == i))
                fp = np.sum((self.val_labels != i) * (self.val_pred == i))
                fn = np.sum((self.val_labels == i) * (self.val_pred != i))
                tn = np.sum((self.val_labels != i) * (self.val_pred != i))
                total_tp += tp
                total_fp += fp
                total_fn += fn
                total_tn += tn
                prec = tp / (tp + fp) if (tp + fp) > 0 else 0.0
                recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
                f1 = 2 * prec * recall / (prec + recall) if prec * recall > 0 else 0.0
                this_sum[:, i] = np.array([prec, recall, f1])
                this_auc[:, i] = roc_auc_score(self.val_labels == i,
                                               self.val_pred == i)
                if i < 4:
                    this_FPR.append(
                        roc_curve(self.val_labels == i, self.val_probs[:, i])[0])
                    this_TPR.append(
                        roc_curve(self.val_labels == i, self.val_probs[:, i])[1])

            # Micro-averaged totals
            prec = total_tp / (total_tp + total_fp) if (total_tp + total_fp) > 0 else 0.0
            recall = total_tp / (total_tp + total_fn) if (total_tp + total_fn) > 0 else 0.0
            f1 = 2 * prec * recall / (prec + recall) if prec * recall > 0 else 0.0
            this_sum[:, 4] = np.array([prec, recall, f1])
            this_sum = np.concatenate((this_sum, this_auc), 0)

            self.this_sum = pd.DataFrame(this_sum)
            self.this_sum.index = pd.Index(['Precision', 'Recall', 'F1', 'AUC'])
            self.this_sum.columns = pd.Index(
                ['cyto', 'secreted', 'mito', 'nucleus', 'Total'])
            print(self.this_sum)

            if self.is_train == False:
                return [
                    val_confusion_1, val_confusion_2, val_confusion_3,
                    self.this_sum, this_FPR, this_TPR
                ]

    # If in test mode:
    # - Load saved model and evaluate on test set
    # - Print predicted probabilities for each protein in the test set
    elif self.mode == 'Test':
        with tf.Session() as session:
            print('Restoring model...')
            saver = tf.train.Saver()
            saver.restore(session, self.path + 'model.checkpoint')
            print('Model restored!')

            with open('test_data.pkl', 'rb') as f:
                test_sequences = pkl.load(f)
            with open('test_labels.pkl', 'rb') as f:
                test_labels = pkl.load(f)
            _, _, data_lengths, _, enc_sequences = build_dictionary(
                test_sequences, vocab=dictionary)

            test_dict = {
                sequences: enc_sequences,
                sequences_lengths: data_lengths,
                keep_prob_dropout: 1.0
            }
            self.probs, self.pred = session.run([probs, pred],
                                                feed_dict=test_dict)
            result = pd.DataFrame(
                np.concatenate((self.probs, np.expand_dims(self.pred, 1)), 1))
            result.columns = pd.Index(
                ['cyto', 'secreted', 'mito', 'nucleus', 'prediction'])
            print(result)
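# --- Illustrative sketch (not part of the model code) --------------------------
# Shows how a `result` frame like the one printed in Test mode above could be
# scored against gold labels, assuming the same 0-3 class encoding
# (cyto, secreted, mito, nucleus). Toy numbers only.
import numpy as np
import pandas as pd

toy_result = pd.DataFrame(
    [[0.7, 0.1, 0.1, 0.1, 0],
     [0.2, 0.5, 0.2, 0.1, 1],
     [0.1, 0.2, 0.3, 0.4, 3]],
    columns=['cyto', 'secreted', 'mito', 'nucleus', 'prediction'])
toy_gold = np.array([0, 2, 3])

toy_pred = toy_result['prediction'].values.astype(int)
print('Accuracy: %.2f' % np.mean(toy_pred == toy_gold))
print(pd.crosstab(toy_gold, toy_pred, rownames=['gold'], colnames=['pred']))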
    exit(1)

validation_path = options.validation_path
model_dir = options.model_dir
if model_dir == None:
    parser.print_help()
    exit(1)
if not os.path.isdir(model_dir):
    os.makedirs(model_dir)
training_iters = options.training_iters
if not training_iters:
    training_iters = 30
training_iters = int(training_iters)

# config
n_steps = 30                 # time steps
padd = '\t'                  # special padding character
char_dic = util.build_dictionary(train_path, padd)
n_input = len(char_dic)      # input dimension, vocab size
n_hidden = 8                 # hidden layer size
n_classes = 2                # output classes, space or not
vocab_size = n_input
'''
util.test_next_batch(train_path, char_dic, vocab_size, n_steps, padd)
'''

x = tf.placeholder(tf.float32, [None, n_steps, n_input])
y_ = tf.placeholder(tf.int32, [None, n_steps])
early_stop = tf.placeholder(tf.int32)

# LSTM layer
# 2 x n_hidden length (state & cell)
istate = tf.placeholder(tf.float32, [None, 2 * n_hidden])

weights = {
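# --- Illustrative sketch (not part of the repo) --------------------------------
# Shows the input format implied by the placeholder x above: each example is a
# sequence of n_steps one-hot character vectors of size n_input. The char_dic
# here is a toy dictionary; the repo's util.build_dictionary may differ.
import numpy as np

toy_char_dic = {'\t': 0, 'h': 1, 'e': 2, 'l': 3, 'o': 4}
toy_n_steps = 30
toy_n_input = len(toy_char_dic)
toy_padd = '\t'

sentence = 'hello'
# Right-pad the sentence to n_steps characters with the padding symbol
padded = sentence + toy_padd * (toy_n_steps - len(sentence))

# One training example: one-hot encode each character position
batch_x = np.zeros((1, toy_n_steps, toy_n_input), dtype=np.float32)
for t, ch in enumerate(padded):
    batch_x[0, t, toy_char_dic[ch]] = 1.0

print(batch_x.shape)  # (1, 30, 5)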
def __init__(self, corpus, parameters):
    self.corpus = corpus
    self.para = parameters
    self.dictionary, self.reverse_dictionary, sent_lengths, self.max_sent_len, \
        enc_data, dec_data, dec_lab = build_dictionary(import_data(self.corpus))
    self.dictionary_sorted = sorted(self.dictionary.items(),
                                    key=operator.itemgetter(1))
    self.vocabulary_size = len(self.dictionary_sorted)
    self.max_sent_len += 1
    self.data = autoencoder_data(enc_data=enc_data,
                                 dec_data=dec_data,
                                 dec_lab=dec_lab,
                                 sent_lengths=sent_lengths)

    print('\r~~~~~~~ Building graph ~~~~~~~\r')
    self.graph = tf.get_default_graph()
    self.initializer = tf.random_normal_initializer()

    # Variables
    self.word_embeddings = tf.get_variable(
        'embeddings', [self.vocabulary_size, self.para.embedding_size],
        tf.float32, initializer=self.initializer)
    self.W = tf.get_variable(
        'decoder/weight', [self.para.embedding_size, self.vocabulary_size],
        tf.float32, initializer=self.initializer)
    self.b = tf.get_variable('decoder/bias', [self.vocabulary_size],
                             tf.float32, initializer=self.initializer)
    self.global_step = tf.Variable(0, name='global_step', trainable=False)

    # Encoder placeholders
    self.enc_inputs = tf.placeholder(tf.int32, [None, None], "enc_inputs")
    self.enc_input_lengths = tf.placeholder(tf.int32, [None], "enc_input_lengths")

    # Decoder placeholders
    self.dec_inputs = tf.placeholder(tf.int32, [None, None], "dec_inputs")
    self.dec_labels = tf.placeholder(tf.int32, [None, None], "dec_labels")
    self.dec_input_lengths = tf.placeholder(tf.int32, [None], "dec_input_lengths")

    # Embed sentences
    enc_inputs_embedded = self.embed_data(self.enc_inputs)
    dec_inputs_embedded = self.embed_data(self.dec_inputs)

    # Encoder
    self.encoded_sentences = self.encoder(enc_inputs_embedded,
                                          self.enc_input_lengths,
                                          self.para.bidirectional)

    # Decoder that reconstructs the input sentence
    dec_logits_projected, dec_logits = self.decoder(
        decoder_inputs=dec_inputs_embedded,
        encoder_state=self.encoded_sentences,
        name='decoder',
        lengths=self.dec_input_lengths,
        train=True)

    # Compute loss
    if self.para.loss_function == 'softmax':
        self.loss = self.get_softmax_loss(self.dec_labels, dec_logits_projected)
    else:
        self.loss = self.get_sampled_softmax_loss(self.dec_labels, dec_logits,
                                                  name='decoder')

    self.opt_op = tf.contrib.layers.optimize_loss(
        loss=self.loss,
        global_step=self.global_step,
        learning_rate=self.para.learning_rate,
        optimizer='Adam',
        clip_gradients=2.0,
        learning_rate_decay_fn=None,
        summaries=['loss'])

    # Decode sentences at prediction time
    self.predict = self.decoder(decoder_inputs=dec_inputs_embedded,
                                encoder_state=self.encoded_sentences,
                                name='decoder',
                                lengths=self.dec_input_lengths,
                                train=False)
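# --- Illustrative sketch (not part of the repo) --------------------------------
# The __init__ above expects a `parameters` object exposing at least
# embedding_size, learning_rate, bidirectional and loss_function (other fields
# such as batch_size are likely used elsewhere). A namedtuple is one way to
# provide such an object; the field list and values here are assumptions.
from collections import namedtuple

Parameters = namedtuple(
    'Parameters',
    ['embedding_size', 'learning_rate', 'bidirectional', 'loss_function'])

para = Parameters(embedding_size=128,
                  learning_rate=0.001,
                  bidirectional=False,
                  loss_function='sampled_softmax')

# Hypothetical usage; the class name is not shown in this excerpt:
# model = SomeAutoencoderClass(corpus='./corpus.txt', parameters=para)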