def test_make_tfexample_and_write(self):
  """Round-trips embeddings and labels through a TFRecord file.

  Writes three (embedding, label) pairs with
  `eval_downstream_embedding_fidelity.make_tfexample_and_write`, reads them
  back with `sklearn_utils.tfexamples_to_nps`, and checks that the embeddings
  come back unchanged and that the labels ['1', '1', '2'] come back as
  [0, 0, 1] for `label_list=['1', '2']`.
  """
  expected = [[1., 2., 3.], [4., 5., 6.], [-1., -2., -3]]
  embs = tf.constant(expected, tf.float32)
  expected_lbl = ['1', '1', '2']
  lbls = tf.constant(expected_lbl, tf.string)
  speaker_id = tf.constant('speakr')

  # NOTE: despite the variable name, this is the path of the TFRecord file
  # itself, not of a directory.
  embeddings_output_dir = os.path.join(
      absltest.get_default_test_tmpdir(), 'embeddings2')

  # Use the writer as a context manager so that it is closed (and the file
  # handle released) even if writing one of the examples raises.
  with tf.io.TFRecordWriter(embeddings_output_dir) as tfrecord_writer:
    for emb, lbl in zip(embs, lbls):
      eval_downstream_embedding_fidelity.make_tfexample_and_write(
          emb, lbl, speaker_id, 'speaker_id_name', tfrecord_writer)
  self.assertTrue(tf.io.gfile.exists(embeddings_output_dir))

  # Check that they can be read correctly.
  embs_np, lbls_np, _ = sklearn_utils.tfexamples_to_nps(
      path=f'{embeddings_output_dir}*',
      embedding_name=eval_downstream_embedding_fidelity.EMBEDDING_KEY_,
      label_name=eval_downstream_embedding_fidelity.LABEL_KEY_,
      label_list=['1', '2'],
      l2_normalization=False)

  # Check correctness.
  np.testing.assert_array_equal(expected, embs_np)
  np.testing.assert_array_equal([0, 0, 1], lbls_np)
def _read_glob(glob, name):
  """Reads embeddings and labels matching `glob` and logs the elapsed time.

  Relies on `embedding_name`, `label_name`, `label_list`, `l2_normalization`,
  `speaker_id_name` and `_cur_s` being available from the enclosing scope.

  Args:
    glob: File pattern forwarded to `sklearn_utils.tfexamples_to_nps`.
    name: Label for the data being read; only used in the log message.

  Returns:
    A `(npx, npy)` tuple holding the first two values returned by
    `sklearn_utils.tfexamples_to_nps`.
  """
  s = time.time()
  # `tfexamples_to_nps` is unpacked into three values at its other call sites,
  # so a plain two-name unpack would raise ValueError here. Star-unpacking
  # keeps the first two values and discards any extra trailing outputs.
  npx, npy, *_ = sklearn_utils.tfexamples_to_nps(
      glob, embedding_name, label_name, label_list, l2_normalization,
      speaker_id_name)
  logging.info('Finished reading %s data: %.2f sec.', name, _cur_s(s))
  return npx, npy
def test_tfexample_to_nps(self, l2_normalization,
                          int64_label_instead_of_bytes):
  """Checks that `tfexamples_to_nps` recovers embeddings and label indices.

  Writes three fake tf.Examples to a TFRecord file, reads them back with
  `sklearn_utils.tfexamples_to_nps`, and compares against the original data.

  Args:
    l2_normalization: Forwarded to `tfexamples_to_nps`. When true, the
      expected embeddings are L2-normalized per row before comparison.
    int64_label_instead_of_bytes: When true, labels are stored in the
      example's int64 list (with `label_list` ['0', '1']); otherwise they are
      stored as UTF-8 bytes (with `label_list` ['yes', 'no']).
  """
  path = os.path.join(absltest.get_default_test_tmpdir(), 'dummy_tfrecords')
  embedding_name = 'fake_emb'
  label_name = 'label/fake_lbl'
  if int64_label_instead_of_bytes:
    label_list = ['0', '1']
  else:
    label_list = ['yes', 'no']

  np.random.seed(10)
  # Generate fake embeddings and labels. The second tuple element is an index
  # into `label_list`.
  fake_data = [
      (np.random.rand(100), 1),
      (np.random.rand(100), 0),
      (np.random.rand(100), 1),
  ]

  def _emb_lbl_i_to_tfexample(emb, label_index):
    """Package fake data as a tf.Example."""
    ex = tf.train.Example()
    # The embedding is stored under an 'embedding/'-prefixed feature key.
    ex.features.feature[
        f'embedding/{embedding_name}'].float_list.value.extend(emb)
    if int64_label_instead_of_bytes:
      ex.features.feature[label_name].int64_list.value.append(
          int(label_list[label_index]))
    else:
      ex.features.feature[label_name].bytes_list.value.append(
          label_list[label_index].encode('utf-8'))
    return ex

  # Write TFRecord of tf.Examples to disk. `tf.io.TFRecordWriter` is used
  # rather than the TF1-only `tf.python_io` alias so that this works under
  # both TF2 and `tensorflow.compat.v1`.
  with tf.io.TFRecordWriter(path) as writer:
    for emb, label_index in fake_data:
      ex = _emb_lbl_i_to_tfexample(emb, label_index)
      writer.write(ex.SerializeToString())

  # Convert them back.
  npx, npy, _ = sklearn_utils.tfexamples_to_nps(path, embedding_name,
                                                label_name, label_list,
                                                l2_normalization)

  # Check that output is correct.
  expected_embs = np.array([d[0] for d in fake_data], np.float32)
  if l2_normalization:
    expected_embs /= np.linalg.norm(expected_embs, axis=1, ord=2,
                                    keepdims=True)
  self.assertAllEqual(npx, expected_embs)
  self.assertAllEqual(npy, (1, 0, 1))