def test_non_padded_dataset(self): # Set up test data. test_data_directory = os.path.join( FLAGS.test_srcdir, './testdata' ) label_vocab_array = [ 'EMBL:AE017224', 'RefSeq:WP_002966386.1', 'ProteinModelPortal:P0CB34', 'SMR:P0CB34', 'EnsemblBacteria:AAX75635', 'GeneID:29595679', 'KEGG:bmb:BruAb2_0191', 'HOGENOM:HOG000133897', 'KO:K04078', 'OMA:PGRIDDN', 'Proteomes:UP000000540', 'GO:GO:0005737', 'GO:GO:0005524', 'GO:GO:0006457', 'CDD:cd00320', 'Gene3D:2.30.33.40', 'HAMAP:MF_00580', 'InterPro:IPR020818', 'InterPro:IPR037124', 'InterPro:IPR018369', 'InterPro:IPR011032', 'PANTHER:PTHR10772', 'Pfam:PF00166', 'PRINTS:PR00297', 'SMART:SM00883', 'SUPFAM:SSF50129', 'PROSITE:PS00681' ] with tf.Graph().as_default(): sess = tf.Session() dataset = protein_dataset.non_batched_dataset( # Dev fold instead of train fold because the train fold is repeated. train_dev_or_test=protein_dataset.DEV_FOLD, label_vocab=label_vocab_array, data_root_dir=test_data_directory) example_itr = dataset.make_initializable_iterator() sess.run(tf.tables_initializer()) sess.run(tf.global_variables_initializer()) sess.run(example_itr.initializer) # Compute actual output actual_examples = _dataset_iterator_to_list(example_itr, sess) expected_length = 4 # Compute expected values expected_sequence = 'MADIKFRPLHDRVVVRRVESEAKTAGGIIIPDTAKEKPQEGEVVAAGAGARDEAGKLVPLDVKAGDRVLFGKWSGTEVKIGGEDLLIMKESDILGIVG' expected_sequence_indexes = [ utils.AMINO_ACID_VOCABULARY.index(x) for x in expected_sequence ] expected_sequence_one_hot = _numpy_one_hot( expected_sequence_indexes, depth=len(utils.AMINO_ACID_VOCABULARY)) # Because the label vocab is exactly the labels in the first example, we # just get range(len(label_vocab_array)) expected_label_indexes = range(len(label_vocab_array)) expected_id = b'P0CB34' # Assert values correct self.assertLen(actual_examples, expected_length) np.testing.assert_equal(actual_examples[0][protein_dataset.SEQUENCE_KEY], expected_sequence_one_hot) np.testing.assert_equal( actual_examples[0][protein_dataset.SEQUENCE_LENGTH_KEY], len(expected_sequence)) np.testing.assert_equal(actual_examples[0][protein_dataset.LABEL_KEY], expected_label_indexes) np.testing.assert_equal(actual_examples[0][protein_dataset.SEQUENCE_ID_KEY], expected_id)
def test_padded_dataset(self): # Set up test data. test_data_directory = os.path.join( FLAGS.test_srcdir, './testdata' ) label_vocab_array = ['EMBL:AE017224'] batch_size = 3 with tf.Graph().as_default(): sess = tf.Session() non_padded_dataset = protein_dataset.non_batched_dataset( # Dev fold instead of train fold because the train fold is repeated. train_dev_or_test=protein_dataset.DEV_FOLD, label_vocab=label_vocab_array, data_root_dir=test_data_directory) batched_dataset = protein_dataset.batched_dataset( non_padded_dataset, batch_size=batch_size, bucket_boundaries=[11000]) batch_itr = batched_dataset.make_initializable_iterator() sess.run(tf.tables_initializer()) sess.run(tf.global_variables_initializer()) sess.run(batch_itr.initializer) # Compute actual output actual_examples = _dataset_iterator_to_list(batch_itr, sess) # Examine correctness of first element. actual_sequence_batch_shape = actual_examples[0][ protein_dataset.SEQUENCE_KEY].shape expected_longest_sequence_len_in_first_batch = 98 expected_first_batch_sequence_shape = ( batch_size, expected_longest_sequence_len_in_first_batch, len(utils.AMINO_ACID_VOCABULARY)) self.assertEqual(actual_sequence_batch_shape, expected_first_batch_sequence_shape) actual_label_batch_shape = actual_examples[0][ protein_dataset.LABEL_KEY].shape # Because the label vocab contains the labels in the first example, we # get len(label_vocab_array) as the number of labels. expected_batch_label_shape = (batch_size, len(label_vocab_array)) self.assertEqual(actual_label_batch_shape, expected_batch_label_shape)