def test_read_random_slice_1seq():
    """Random-slice mode: equal-length slices each epoch, different data across epochs."""
    sr = SequenceReader(h5_file, key_file, shuffle_seqs=False, batch_size=5,
                        max_seq_length=20, min_seq_length=20,
                        seq_split_mode='random_slice_1seq')
    print(sr.num_batches)

    def read_epoch():
        # Collect one full epoch, checking every batch holds 5 sequences.
        seqs = []
        for _ in xrange(sr.num_batches):
            batch = sr.read()[0]
            assert (len(batch) == 5)
            seqs += batch
        return seqs

    x1 = read_epoch()
    x2 = read_epoch()

    assert (int(len(x1) / 5) == sr.num_batches)
    assert (len(x1) == sr.num_seqs)
    assert (len(x1) == sr.num_total_subseqs)
    assert (len(x1) == len(x2))
    for a, b in zip(x1, x2):
        # Every slice is padded/cut to the same length, but the random
        # slice positions must differ between epochs.
        assert (a.shape[0] == sr.max_batch_seq_length)
        assert (b.shape[0] == sr.max_batch_seq_length)
        assert (np.all(a != b))
def test_read_random_samples():
    """Random-samples mode: same subsequence length, but epochs must not repeat data."""
    sr = SequenceReader(h5_file, key_file, batch_size=5, max_seq_length=20,
                        min_seq_length=20, seq_split_mode='random_samples',
                        seq_split_overlap=5)

    epochs = ([], [])
    for xs in epochs:
        # One full epoch; every batch carries exactly 5 subsequences.
        for _ in xrange(sr.num_batches):
            batch = sr.read()[0]
            assert (len(batch) == 5)
            xs += batch

    x1, x2 = epochs
    # Only whole batches are produced, so the epoch size is a multiple of batch_size.
    assert (len(x1) == int(sr.num_total_subseqs / sr.batch_size) * sr.batch_size)
    assert (len(x1) == len(x2))
    for a, b in zip(x1, x2):
        assert (a.shape[0] == sr.max_batch_seq_length)
        assert (np.any(a != b))
def test_read_full_seq():
    """Without splitting, both epochs must return the full sequences in order."""
    create_dataset()
    sr = SequenceReader(h5_file, key_file, shuffle_seqs=False, batch_size=5)
    seq_length = sr.seq_length

    def one_epoch():
        out = []
        for _ in xrange(sr.num_batches):
            batch = sr.read()[0]
            assert (len(batch) == 5)
            out += batch
        return out

    x1 = one_epoch()
    x2 = one_epoch()

    assert (len(x1) == len(x2))
    for i, (a, b) in enumerate(zip(x1, x2)):
        # Full sequences keep their original length and are deterministic
        # across epochs when shuffling is off.
        assert (a.shape[0] == seq_length[i])
        assert (np.all(a == b))
def test_read_sequential():
    """Sequential splitting is deterministic: epochs must match element-wise."""
    sr = SequenceReader(h5_file, key_file, shuffle_seqs=False, batch_size=5,
                        max_seq_length=20, seq_split_mode='sequential',
                        seq_split_overlap=5)

    def one_epoch():
        out = []
        for _ in xrange(sr.num_batches):
            batch = sr.read()[0]
            assert (len(batch) == 5)
            out += batch
        return out

    x1 = one_epoch()
    x2 = one_epoch()

    assert (len(x1) == sr.num_total_subseqs)
    assert (len(x1) == len(x2))
    for a, b in zip(x1, x2):
        # Sequential subsequences may be shorter than the cap (tail pieces).
        assert (a.shape[0] <= sr.max_batch_seq_length)
        assert (np.all(a == b))
def test_num_subseqs():
    """num_subseqs must match hand-computed counts for several split settings."""
    create_dataset()

    # A min_seq_length just above the shortest sequence drops only that one.
    sr = SequenceReader(h5_file, key_file, min_seq_length=min_seq_length + 1)
    expected = np.ones((num_seqs,), dtype=int)
    expected[0] = 0
    assert (np.all(sr.num_subseqs == expected))

    sr = SequenceReader(h5_file, key_file, max_seq_length=delta * 2)
    expected = np.array([5, 6, 6, 7, 7, 8, 8, 9, 9, 10], dtype=int)
    assert (np.all(sr.num_subseqs == expected))

    sr = SequenceReader(h5_file, key_file,
                        max_seq_length=delta * 2, min_seq_length=delta * 2)
    expected = np.array([5, 5, 6, 6, 7, 7, 8, 8, 9, 9], dtype=int)
    assert (np.all(sr.num_subseqs == expected))

    # Overlapping splits yield more subsequences per sequence.
    sr = SequenceReader(h5_file, key_file,
                        max_seq_length=delta * 2, min_seq_length=delta * 2,
                        seq_split_overlap=delta / 2)
    expected = np.array([6, 7, 7, 8, 9, 9, 10, 11, 11, 12], dtype=int)
    print(sr.num_subseqs)
    assert (np.all(sr.num_subseqs == expected))
def test_max_batch_seq_length():
    """max_batch_seq_length defaults to the longest sequence and is capped by max_seq_length."""
    create_dataset()

    sr = SequenceReader(h5_file, key_file)
    assert (sr.max_batch_seq_length == max_seq_length)

    cap = min_seq_length / 4
    sr = SequenceReader(h5_file, key_file, max_seq_length=cap)
    assert (sr.max_batch_seq_length == cap)
def extract_ivector(seq_file, file_list, gmm_file, model_file, preproc_file,
                    output_path, qy_only, **kwargs):
    """Extract one i-vector per sequence with a TVAE-YZ model and a GMM.

    For each sequence, GMM responsibilities are computed and fed, together
    with the features, to the model's qy network; the resulting y vectors
    are written to output_path.

    NOTE(review): qy_only is currently unused — the TVAEY branch is
    commented out, so TVAEYZ is always loaded.
    """
    set_float_cpu('float32')
    sr_args = SR.filter_eval_args(**kwargs)
    if preproc_file is not None:
        preproc = TransformList.load(preproc_file)
    else:
        preproc = None

    gmm = DiagGMM.load_from_kaldi(gmm_file)

    # batch_size=1 and no shuffling: sequences are processed one at a time
    # in file order.
    sr = SR(seq_file, file_list, batch_size=1, shuffle_seqs=False,
            preproc=preproc, **sr_args)

    t1 = time.time()
    # if qy_only:
    #     model = TVAEY.load(model_file)
    # else:
    model = TVAEYZ.load(model_file)
    model.build(max_seq_length=sr.max_batch_seq_length)

    y = np.zeros((sr.num_seqs, model.y_dim), dtype=float_keras())
    # Reusable zero-padded input buffers for one sequence (features and
    # GMM responsibilities); cleared at every iteration below.
    xx = np.zeros((1, sr.max_batch_seq_length, model.x_dim), dtype=float_keras())
    rr = np.zeros((1, sr.max_batch_seq_length, model.r_dim), dtype=float_keras())
    keys = []
    for i in xrange(sr.num_seqs):
        ti1 = time.time()
        x, key = sr.read_next_seq()
        ti2 = time.time()
        r = gmm.compute_z(x)
        ti3 = time.time()
        logging.info('Extracting i-vector %d/%d for %s, num_frames: %d' %
                     (i, sr.num_seqs, key, x.shape[0]))
        keys.append(key)
        # Zero-pad up to max_batch_seq_length before the forward pass.
        xx[:,:,:] = 0
        rr[:,:,:] = 0
        xx[0,:x.shape[0]] = x
        rr[0,:x.shape[0]] = r
        y[i] = model.compute_qy_x([xx, rr], batch_size=1)[0]
        ti4 = time.time()
        logging.info('Elapsed time i-vector %d/%d for %s, total: %.2f read: %.2f, gmm: %.2f, vae: %.2f' %
                     (i, sr.num_seqs, key, ti4-ti1, ti2-ti1, ti3-ti2, ti4-ti3))

    logging.info('Extract elapsed time: %.2f' % (time.time() - t1))
    hw = HypDataWriter(output_path)
    hw.write(keys, '', y)
def test_reset():
    """reset() must reshuffle the scp and recompute lengths and subsequence counts."""
    create_dataset()
    sr = SequenceReader(h5_file, key_file,
                        max_seq_length=delta * 2, min_seq_length=delta * 2,
                        seq_split_overlap=delta / 2)

    scp_before = copy.deepcopy(sr.scp)
    length_before = sr.seq_length
    subseqs_before = sr.num_subseqs

    sr.reset()

    # After a reset the reader's internal ordering and derived stats change.
    assert (scp_before != sr.scp)
    assert (not np.all(length_before == sr.seq_length))
    assert (not np.all(subseqs_before == sr.num_subseqs))
def extract_ivector(seq_file, file_list, model_file, preproc_file, output_path,
                    qy_only, **kwargs):
    """Extract one i-vector per sequence with a TVAE model and write them out."""
    set_float_cpu('float32')
    sr_args = SR.filter_eval_args(**kwargs)
    preproc = TransformList.load(preproc_file) if preproc_file is not None else None

    sr = SR(seq_file, file_list, batch_size=1, shuffle_seqs=False,
            preproc=preproc, **sr_args)

    t1 = time.time()
    # Posterior-only model vs. full y/z model.
    model_class = TVAEY if qy_only else TVAEYZ
    model = model_class.load(model_file)
    model.build(max_seq_length=sr.max_batch_seq_length)
    logging.info(time.time() - t1)
    logging.info(model.y_dim)

    y = np.zeros((sr.num_seqs, model.y_dim), dtype=float_keras())
    # Reusable zero-padded feature buffer for one sequence.
    xx = np.zeros((1, sr.max_batch_seq_length, model.x_dim), dtype=float_keras())
    keys = []
    for i in xrange(sr.num_seqs):
        x, key = sr.read_next_seq()
        logging.info('Extracting i-vector %d/%d for %s\n' % (i, sr.num_seqs, key))
        keys.append(key)
        xx[:, :, :] = 0
        xx[0, :x.shape[0]] = x
        y[i] = model.compute_qy_x(xx, batch_size=1)[0]

    logging.info('Extract elapsed time: %.2f' % (time.time() - t1))
    hw = HypDataWriter(output_path)
    hw.write(keys, '', y)
def eval_elbo(seq_file, file_list, model_file, preproc_file, output_file,
              ubm_type, **kwargs):
    """Compute per-sequence GMM ELBOs and write total / frame-normalized values."""
    sr_args = SR.filter_eval_args(**kwargs)
    preproc = TransformList.load(preproc_file) if preproc_file is not None else None

    sr = SR(seq_file, file_list, batch_size=1, shuffle_seqs=False,
            preproc=preproc, **sr_args)

    t1 = time.time()
    # Native model file vs. Kaldi-exported UBM.
    if ubm_type == 'diag-gmm':
        model = DiagGMM.load(model_file)
    else:
        model = DiagGMM.load_from_kaldi(model_file)
    model.initialize()

    elbo = np.zeros((sr.num_seqs,), dtype=float_cpu())
    num_frames = np.zeros((sr.num_seqs,), dtype=int)
    keys = []
    for i in xrange(sr.num_seqs):
        x, key = sr.read_next_seq()
        keys.append(key)
        elbo[i] = model.elbo(x)
        num_frames[i] = x.shape[0]

    num_total_frames = np.sum(num_frames)
    total_elbo = np.sum(elbo)
    total_elbo_norm = total_elbo / num_total_frames
    logging.info('Extract elapsed time: %.2f' % (time.time() - t1))
    s = 'Total ELBO: %f\nELBO_NORM %f' % (total_elbo, total_elbo_norm)
    logging.info(s)
    with open(output_file, 'w') as f:
        f.write(s)
def compute_gmm_post(seq_file, file_list, model_file, preproc_file, output_path,
                     num_comp, **kwargs):
    """Compute per-frame GMM posteriors, sparsify each frame to its top
    num_comp components and write responsibilities ('.r') plus component
    indices ('.index') per key.

    Fixes: ``logging.degug`` (an AttributeError the first time the debug
    branch ran) -> ``logging.debug``; removed a dead pre-allocation of
    ``index`` that was always overwritten inside the loop.
    """
    sr_args = SR.filter_eval_args(**kwargs)
    if preproc_file is not None:
        preproc = TransformList.load(preproc_file)
    else:
        preproc = None

    gmm = DiagGMM.load_from_kaldi(model_file)

    sr = SR(seq_file, file_list, batch_size=1, shuffle_seqs=False,
            preproc=preproc, **sr_args)

    t1 = time.time()
    logging.info(time.time() - t1)

    hw = HypDataWriter(output_path)
    for i in xrange(sr.num_seqs):
        x, key = sr.read_next_seq()
        logging.info('Extracting i-vector %d/%d for %s, num_frames: %d' %
                     (i, sr.num_seqs, key, x.shape[0]))
        r = gmm.compute_z(x)
        r_s, index = to_sparse(r, num_comp)
        if i == 0:
            # Sanity check on the first sequence: densifying the sparse
            # posteriors must reproduce the largest responsibilities.
            r2 = to_dense(r_s, index, r.shape[1])
            logging.debug(np.sort(r[0, :])[-12:])
            logging.debug(np.sort(r2[0, :])[-12:])
            logging.debug(np.argsort(r[0, :])[-12:])
            logging.debug(np.argsort(r2[0, :])[-12:])
        hw.write([key], '.r', [r_s])
        hw.write([key], '.index', [index])

    logging.info('Extract elapsed time: %.2f' % (time.time() - t1))
def init_ubm(seq_file, train_list, x_dim, num_comp, output_path, **kwargs):
    """Initialize a UBM.

    When no sequence file is given, a single-component GMM is created from
    x_dim, saved to output_path, and the function returns; otherwise a
    SequenceReader is opened on the training data.

    Fix: the original fell through after saving and then constructed SR
    with seq_file=None; an early return prevents that.

    NOTE(review): num_comp is currently ignored in the no-data branch
    (DiagGMM is always built with num_comp=1) — confirm intent.
    """
    if seq_file is None:
        model = DiagGMM(x_dim=x_dim, num_comp=1)
        model.initialize()
        model.save(output_path)
        return

    sr_args = SR.filter_args(**kwargs)
    sr = SR(seq_file, train_list, batch_size=1, **sr_args)
def test_num_batches():
    """With overlap delta/2 the dataset must split into exactly 18 batches."""
    create_dataset()
    sr = SequenceReader(h5_file, key_file, batch_size=5,
                        max_seq_length=delta * 2, min_seq_length=delta * 2,
                        seq_split_overlap=delta / 2)
    print(sr.num_batches)
    assert (sr.num_batches == 18)
def extract_ivector(seq_file, file_list, gmm_file, model_file, preproc_file,
                    output_path, qy_only, **kwargs):
    """Extract i-vectors via a hand-built qy network (experimental variant).

    Instead of model.build(), input placeholders of the reader's maximum
    length are wired directly to the model's qy sub-network and that Keras
    Model is used for prediction.

    NOTE(review): qy_only is unused (TVAEY branch commented out), and
    model dims are hard-coded below rather than coming from build() —
    presumably matched to a specific 60-dim feature / 2048-component GMM /
    400-dim i-vector setup; TODO confirm.
    """
    set_float_cpu('float32')
    sr_args = SR.filter_eval_args(**kwargs)
    if preproc_file is not None:
        preproc = TransformList.load(preproc_file)
    else:
        preproc = None

    gmm = DiagGMM.load_from_kaldi(gmm_file)

    sr = SR(seq_file, file_list, batch_size=1, shuffle_seqs=False,
            preproc=preproc, **sr_args)

    t1 = time.time()
    # if qy_only:
    #     model = TVAEY.load(model_file)
    # else:
    model = TVAEYZ.load(model_file)
    #model.build(max_seq_length=sr.max_batch_seq_length)
    #model.build(max_seq_length=1)
    model.x_dim = 60
    model.r_dim = 2048
    model.y_dim = 400

    y = np.zeros((sr.num_seqs, model.y_dim), dtype=float_keras())
    # Reusable zero-padded buffers for features and GMM responsibilities.
    xx = np.zeros((1, sr.max_batch_seq_length, model.x_dim), dtype=float_keras())
    rr = np.zeros((1, sr.max_batch_seq_length, model.r_dim), dtype=float_keras())
    keys = []

    # Build the qy prediction network once, outside the sequence loop.
    xp = Input(shape=(sr.max_batch_seq_length, model.x_dim,))
    rp = Input(shape=(sr.max_batch_seq_length, model.r_dim,))
    qy_param = model.qy_net([xp, rp])
    qy_net = Model([xp, rp], qy_param)

    for i in xrange(sr.num_seqs):
        ti1 = time.time()
        x, key = sr.read_next_seq()
        ti2 = time.time()
        r = gmm.compute_z(x)
        ti3 = time.time()
        logging.info('Extracting i-vector %d/%d for %s, num_frames: %d' %
                     (i, sr.num_seqs, key, x.shape[0]))
        keys.append(key)
        # Earlier experiment: rebuild the network per-sequence at the exact
        # length (kept for reference).
        # xp = Input(shape=(x.shape[0], model.x_dim,))
        # rp = Input(shape=(x.shape[0], model.r_dim,))
        # qy_param = model.qy_net([xp, rp])
        ti5 = time.time()
        xx[:, :, :] = 0
        rr[:, :, :] = 0
        xx[0, :x.shape[0]] = x
        rr[0, :x.shape[0]] = r
        # x = np.expand_dims(x, axis=0)
        # r = np.expand_dims(r, axis=0)
        # qy_net = Model([xp, rp], qy_param)
        y[i] = qy_net.predict([xx, rr], batch_size=1)[0]
        # del qy_net
        # y[i] = model.compute_qy_x2([x, r], batch_size=1)[0]
        #for i in xrange(10):
        #gc.collect()
        ti4 = time.time()
        logging.info(
            'Elapsed time i-vector %d/%d for %s, total: %.2f read: %.2f, gmm: %.2f, vae: %.2f qy: %.2f'
            % (i, sr.num_seqs, key, ti4 - ti1, ti2 - ti1, ti3 - ti2,
               ti4 - ti5, ti5 - ti3))
        # print('Elapsed time i-vector %d/%d for %s, total: %.2f read: %.2f, gmm: %.2f, vae: %.2f' %
        #       (i, sr.num_seqs, key, ti4-ti1, ti2-ti1, ti3-ti2, ti4-ti3))

    logging.info('Extract elapsed time: %.2f' % (time.time() - t1))
    hw = HypDataWriter(output_path)
    hw.write(keys, '', y)
if __name__ == "__main__":
    # Command-line entry point for TVAE i-vector extraction.
    # NOTE(review): this block appears truncated in this chunk — no
    # parse_args()/dispatch call is visible after the last add_argument.
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        fromfile_prefix_chars='@',
        description='Extract TVAE i-vectors')

    parser.add_argument('--seq-file', dest='seq_file', required=True)
    parser.add_argument('--file-list', dest='file_list', required=True)
    parser.add_argument('--preproc-file', dest='preproc_file', default=None)
    parser.add_argument('--gmm-file', dest='gmm_file', required=True)
    parser.add_argument('--model-file', dest='model_file', required=True)
    parser.add_argument('--output-path', dest='output_path', required=True)

    # Adds the SequenceReader evaluation options to the parser.
    SR.add_argparse_eval_args(parser)

    parser.add_argument('--qy-only', dest='qy_only', default=False,
                        action='store_true')
    # parser.add_argument('--batch-size',dest='batch_size',default=512,type=int,
    #                     help=('Batch size (default: %(default)s)'))
    parser.add_argument('--rng-seed', dest='rng_seed', default=1024, type=int,
                        help=('Seed for the random number generator '
                              '(default: %(default)s)'))
# NOTE(review): the next two statements look like the tail of an
# init_ubm-style function whose header is outside this chunk — kwargs,
# seq_file and train_list are not defined at this level.
sr_args = SR.filter_args(**kwargs)
sr = SR(seq_file, train_list, batch_size=1, **sr_args)

if __name__ == "__main__":
    # Command-line entry point: parse arguments and initialize the UBM.
    parser=argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        fromfile_prefix_chars='@',
        description='Initializes UBM')

    parser.add_argument('--seq-file', dest='seq_file', default=None)
    parser.add_argument('--train-list', dest='train_list', default=None)
    parser.add_argument('--x-dim', dest='x_dim', type=int, required=True)
    parser.add_argument('--num-comp', dest='num_comp', default=1)
    parser.add_argument('--output-path', dest='output_path', required=True)
    parser.add_argument('-v', '--verbose', dest='verbose', default=1,
                        choices=[0, 1, 2, 3], type=int)

    # Adds the SequenceReader training options to the parser.
    SR.add_argparse_args(parser)

    args=parser.parse_args()
    config_logger(args.verbose)
    # verbose only configures logging; drop it before forwarding the rest.
    del args.verbose
    logging.debug(args)

    init_ubm(**vars(args))
def extract_embed(seq_file, file_list, model_file, preproc_file, output_path,
                  max_length, layer_names, **kwargs):
    """Extract one embedding per sequence from the given layers of a
    SeqEmbed model and write them to output_path.

    Sequences longer than max_length are split into near-equal chunks and
    the chunk embeddings are averaged.
    """
    set_float_cpu('float32')
    sr_args = SR.filter_eval_args(**kwargs)
    if preproc_file is not None:
        preproc = TransformList.load(preproc_file)
    else:
        preproc = None

    sr = SR(seq_file, file_list, batch_size=1, shuffle_seqs=False,
            preproc=preproc, **sr_args)

    t1 = time.time()
    model = SeqEmbed.load(model_file)
    model.build()
    print(layer_names)
    model.build_embed(layer_names)
    y_dim = model.embed_dim

    # Never feed the network more frames than the longest sequence the
    # reader can produce.
    max_length = np.minimum(sr.max_batch_seq_length, max_length)

    y = np.zeros((sr.num_seqs, y_dim), dtype=float_keras())
    # Reusable zero-padded input buffer.
    xx = np.zeros((1, max_length, model.x_dim), dtype=float_keras())
    keys = []
    for i in xrange(sr.num_seqs):
        ti1 = time.time()
        x, key = sr.read_next_seq()
        ti2 = time.time()
        print('Extracting embeddings %d/%d for %s, num_frames: %d' %
              (i, sr.num_seqs, key, x.shape[0]))
        keys.append(key)
        xx[:, :, :] = 0
        if x.shape[0] <= max_length:
            # Short sequence: zero-pad and embed in a single pass.
            xx[0, :x.shape[0]] = x
            y[i] = model.predict_embed(xx, batch_size=1)
        else:
            # Long sequence: average the embeddings of num_chunks chunks of
            # size chunk_size. The last chunk is taken from the end of the
            # sequence and may overlap the previous one, so every frame is
            # covered.
            num_chunks = int(np.ceil(float(x.shape[0]) / max_length))
            chunk_size = int(np.ceil(float(x.shape[0]) / num_chunks))
            for j in xrange(num_chunks - 1):
                start = j * chunk_size
                xx[0, :chunk_size] = x[start:start + chunk_size]
                y[i] += model.predict_embed(xx, batch_size=1).ravel()
            xx[0, :chunk_size] = x[-chunk_size:]
            y[i] += model.predict_embed(xx, batch_size=1).ravel()
            y[i] /= num_chunks
        ti4 = time.time()
        print(
            'Elapsed time embeddings %d/%d for %s, total: %.2f read: %.2f, vae: %.2f'
            % (i, sr.num_seqs, key, ti4 - ti1, ti2 - ti1, ti4 - ti2))

    print('Extract elapsed time: %.2f' % (time.time() - t1))
    hw = HypDataWriter(output_path)
    hw.write(keys, '', y)
def extract_ivector(seq_file, file_list, gmm_file, model_file, preproc_file,
                    output_path, qy_only, max_length, **kwargs):
    """Extract TVAE i-vectors with a fixed-length qy network, averaging qy
    over chunks when a sequence is longer than max_length.

    Fix: the chunk count used ``x.shape[0] / max_length``, which under
    Python 2 (and with numpy integer max_length) is floor division, so
    ``np.ceil`` was a no-op and long sequences got one chunk too few; the
    numerator is now cast to float, matching extract_embed.

    NOTE(review): qy_only is unused — the TVAEY branch is commented out.
    """
    set_float_cpu('float32')
    sr_args = SR.filter_eval_args(**kwargs)
    if preproc_file is not None:
        preproc = TransformList.load(preproc_file)
    else:
        preproc = None

    gmm = DiagGMM.load_from_kaldi(gmm_file)

    sr = SR(seq_file, file_list, batch_size=1, shuffle_seqs=False,
            preproc=preproc, **sr_args)

    t1 = time.time()
    # if qy_only:
    #     model = TVAEY.load(model_file)
    # else:
    model = TVAEYZ.load(model_file)
    #model.build(max_seq_length=sr.max_batch_seq_length)
    model.build(max_seq_length=1)

    # Cap the network input length by what the reader can actually produce.
    max_length = np.minimum(sr.max_batch_seq_length, max_length)
    y = np.zeros((sr.num_seqs, model.y_dim), dtype=float_keras())
    # Reusable zero-padded buffers for features and GMM responsibilities.
    xx = np.zeros((1, max_length, model.x_dim), dtype=float_keras())
    rr = np.zeros((1, max_length, model.r_dim), dtype=float_keras())
    keys = []

    # Build the fixed-length qy prediction network once, outside the loop.
    xp = Input(shape=(max_length, model.x_dim,))
    rp = Input(shape=(max_length, model.r_dim,))
    qy_param = model.qy_net([xp, rp])
    qy_net = Model([xp, rp], qy_param)

    for i in xrange(sr.num_seqs):
        ti1 = time.time()
        x, key = sr.read_next_seq()
        ti2 = time.time()
        r = gmm.compute_z(x)
        ti3 = time.time()
        logging.info('Extracting i-vector %d/%d for %s, num_frames: %d' %
                     (i, sr.num_seqs, key, x.shape[0]))
        keys.append(key)
        xx[:, :, :] = 0
        rr[:, :, :] = 0
        if x.shape[0] <= max_length:
            # Short sequence: zero-pad and run a single forward pass.
            xx[0, :x.shape[0]] = x
            rr[0, :x.shape[0]] = r
            y[i] = qy_net.predict([xx, rr], batch_size=1)[0]
        else:
            # Long sequence: average qy over max_length-sized windows; the
            # last window is taken from the end and may overlap the previous.
            num_batches = int(np.ceil(float(x.shape[0]) / max_length))
            for j in xrange(num_batches - 1):
                start = j * max_length
                xx[0] = x[start:start + max_length]
                rr[0] = r[start:start + max_length]
                y[i] += qy_net.predict([xx, rr], batch_size=1)[0].ravel()
            xx[0] = x[-max_length:]
            rr[0] = r[-max_length:]
            y[i] += qy_net.predict([xx, rr], batch_size=1)[0].ravel()
            y[i] /= num_batches
        ti4 = time.time()
        logging.info(
            'Elapsed time i-vector %d/%d for %s, total: %.2f read: %.2f, gmm: %.2f, vae: %.2f'
            % (i, sr.num_seqs, key, ti4 - ti1, ti2 - ti1, ti3 - ti2, ti4 - ti3))

    logging.info('Extract elapsed time: %.2f' % (time.time() - t1))
    hw = HypDataWriter(output_path)
    hw.write(keys, '', y)
def train_tvae(seq_file, train_list, val_list, decoder_file, qy_file, qz_file,
               epochs, batch_size, preproc_file, output_path,
               num_samples_y, num_samples_z,
               px_form, qy_form, qz_form, min_kl, **kwargs):
    """Train a TVAE (y-only or y/z) from architecture files and save it.

    Fix: ``validation_steps=sr_val.num_batches`` was evaluated even when
    val_list was None, raising NameError because sr_val is only created
    inside the validation branch; the step count is now computed there and
    passed as None otherwise.
    """
    set_float_cpu(float_keras())

    sr_args = SR.filter_args(**kwargs)
    sr_val_args = SR.filter_val_args(**kwargs)
    opt_args = KOF.filter_args(**kwargs)
    cb_args = KCF.filter_args(**kwargs)

    if preproc_file is not None:
        preproc = TransformList.load(preproc_file)
    else:
        preproc = None

    sr = SR(seq_file, train_list, batch_size=batch_size,
            preproc=preproc, **sr_args)
    max_length = sr.max_batch_seq_length

    gen_val = None
    val_steps = None
    if val_list is not None:
        # Deterministic, non-overlapping sequential splits for validation.
        sr_val = SR(seq_file, val_list, batch_size=batch_size,
                    preproc=preproc, shuffle_seqs=False,
                    seq_split_mode='sequential', seq_split_overlap=0,
                    reset_rng=True, **sr_val_args)
        max_length = max(max_length, sr_val.max_batch_seq_length)
        gen_val = data_generator(sr_val, max_length)
        val_steps = sr_val.num_batches

    gen_train = data_generator(sr, max_length)

    t1 = time.time()
    decoder = load_model_arch(decoder_file)
    qy = load_model_arch(qy_file)
    if qz_file is None:
        # No z encoder given: train the y-only variant.
        vae = TVAEY(qy, decoder, px_cond_form=px_form,
                    qy_form=qy_form, min_kl=min_kl)
        vae.build(num_samples=num_samples_y, max_seq_length=max_length)
    else:
        qz = load_model_arch(qz_file)
        vae = TVAEYZ(qy, qz, decoder, px_cond_form=px_form,
                     qy_form=qy_form, qz_form=qz_form, min_kl=min_kl)
        vae.build(num_samples_y=num_samples_y, num_samples_z=num_samples_z,
                  max_seq_length=max_length)
    logging.info(time.time() - t1)

    cb = KCF.create_callbacks(vae, output_path, **cb_args)
    opt = KOF.create_optimizer(**opt_args)

    h = vae.fit_generator(gen_train, x_val=gen_val,
                          steps_per_epoch=sr.num_batches,
                          validation_steps=val_steps,
                          optimizer=opt, epochs=epochs,
                          callbacks=cb, max_q_size=10)

    # if vae.x_chol is not None:
    #     x_chol = np.array(K.eval(vae.x_chol))
    #     logging.info(x_chol[:4,:4])

    logging.info('Train elapsed time: %.2f' % (time.time() - t1))
    vae.save(output_path + '/model')
def test_num_seqs():
    """The reader must report one entry per sequence in the dataset."""
    create_dataset()
    reader = SequenceReader(h5_file, key_file)
    assert (reader.num_seqs == num_seqs)
def train_tvae(seq_file, train_list, val_list, gmm_file, decoder_file, qy_file,
               qz_file, init_path, epochs, batch_size, preproc_file,
               output_path, num_samples_y, num_samples_z,
               px_form, qy_form, qz_form, min_kl, **kwargs):
    """Train a TVAE-YZ conditioned on GMM responsibilities, save it, and
    dump eigen-spectrum diagnostics of the validation embeddings.

    The model is either assembled from architecture files or warm-started
    from init_path.
    """
    set_float_cpu(float_keras())

    sr_args = SR.filter_args(**kwargs)
    sr_val_args = SR.filter_val_args(**kwargs)
    opt_args = KOF.filter_args(**kwargs)
    cb_args = KCF.filter_args(**kwargs)

    if preproc_file is not None:
        preproc = TransformList.load(preproc_file)
    else:
        preproc = None

    gmm = DiagGMM.load_from_kaldi(gmm_file)

    sr = SR(seq_file, train_list, batch_size=batch_size,
            preproc=preproc, **sr_args)
    max_length = sr.max_batch_seq_length

    gen_val = None
    if val_list is not None:
        # Deterministic, non-overlapping sequential splits for validation.
        sr_val = SR(seq_file, val_list, batch_size=batch_size,
                    preproc=preproc, shuffle_seqs=False,
                    seq_split_mode='sequential', seq_split_overlap=0,
                    reset_rng=True, **sr_val_args)
        max_length = max(max_length, sr_val.max_batch_seq_length)
        gen_val = data_generator(sr_val, gmm, max_length)

    gen_train = data_generator(sr, gmm, max_length)

    t1 = time.time()
    if init_path is None:
        decoder = load_model_arch(decoder_file)
        qy = load_model_arch(qy_file)
        # if qz_file is None:
        #     vae = TVAEY(qy, decoder, px_cond_form=px_form,
        #                 qy_form=qy_form, min_kl=min_kl)
        #     vae.build(num_samples=num_samples_y,
        #               max_seq_length = max_length)
        # else:
        qz = load_model_arch(qz_file)
        vae = TVAEYZ(qy, qz, decoder, px_cond_form=px_form,
                     qy_form=qy_form, qz_form=qz_form, min_kl=min_kl)
    else:
        # Warm start from a previously saved model.
        vae = TVAEYZ.load(init_path)

    vae.build(num_samples_y=num_samples_y, num_samples_z=num_samples_z,
              max_seq_length=max_length)
    logging.info(time.time() - t1)

    cb = KCF.create_callbacks(vae, output_path, **cb_args)
    opt = KOF.create_optimizer(**opt_args)

    # NOTE(review): sr_val is dereferenced unconditionally here and in the
    # diagnostics below — this raises NameError when val_list is None;
    # confirm callers always pass a validation list.
    h = vae.fit_generator(gen_train, x_val=gen_val,
                          steps_per_epoch=sr.num_batches,
                          validation_steps=sr_val.num_batches,
                          optimizer=opt, epochs=epochs,
                          callbacks=cb, max_queue_size=10)

    # if vae.x_chol is not None:
    #     x_chol = np.array(K.eval(vae.x_chol))
    #     logging.info(x_chol[:4,:4])

    logging.info('Train elapsed time: %.2f' % (time.time() - t1))
    vae.save(output_path + '/model')

    # Post-training diagnostic: eigen-spectra of the validation embeddings
    # from the full encoder (l1.txt) and from the qy network alone (l2.txt).
    sr_val.reset()
    y_val, sy_val, z_val, srz_val = vae.encoder_net.predict_generator(
        gen_val, steps=400)

    from scipy import linalg as la
    yy = y_val - np.mean(y_val, axis=0)
    cy = np.dot(yy.T, yy) / yy.shape[0]
    l, v = la.eigh(cy)
    np.savetxt(output_path + '/l1.txt', l)

    sr_val.reset()
    y_val2, sy_val2 = vae.qy_net.predict_generator(gen_val, steps=400)
    # NOTE(review): this centers y_val2 with the mean of y_val — presumably
    # intentional for comparability, but verify.
    yy = y_val2 - np.mean(y_val, axis=0)
    cy = np.dot(yy.T, yy) / yy.shape[0]
    l, v = la.eigh(cy)
    np.savetxt(output_path + '/l2.txt', l)
    logging.info(y_val - y_val2)
def test_seq_length():
    """Per-sequence lengths and their sum must match the generated dataset."""
    create_dataset()
    reader = SequenceReader(h5_file, key_file)
    assert (np.all(reader.seq_length == seq_length))
    assert (reader.total_length == np.sum(seq_length))