def test_augmentation_data_generator(tmpdir):
    dist = DistributionCollection(examplary_tag_distribution())
    dset_fname = str(tmpdir.join("dset.hdf5"))
    samples = 6000
    dset = DistributionHDF5Dataset(dset_fname, nb_samples=samples,
                                   distribution=dist)
    labels = dist.sample(samples)
    labels = dist.normalize(labels)
    fake = np.random.random((samples, 1, 8, 8))
    discriminator = np.random.random((samples, 1))
    dset.append(labels=labels, fake=fake, discriminator=discriminator)
    dset.close()

    dset = DistributionHDF5Dataset(dset_fname)
    bs = 32
    names = ['labels', 'fake']
    assert 'labels' in next(dset.iter(bs, names))
    assert next(dset.iter(bs))['labels'].dtype.names == tuple(dist.names)

    dset_iters = [lambda bs: bit_split(dataset_iterator(dset, bs))]
    data_gen = lambda bs: zip_dataset_iterators(dset_iters, bs)
    label_names = ['bit_0', 'bit_11', 'x_rotation']
    aug_gen = augmentation_data_generator(data_gen, 'fake', label_names)
    outs = next(aug_gen(bs))
    assert len(outs[0]) == bs
    assert len(outs[1]) == len(label_names)

    gen = aug_gen(bs)
    for i, batch in enumerate(gen):
        if i == 2 * samples // bs:
            break
        assert batch is not None
        assert batch[0].shape == (bs, 1, 8, 8)
        assert len(batch[1]) == len(label_names)
def data_generator_factory(self):
    datasets = [DistributionHDF5Dataset(fname) for fname in self.train_sets]
    dist = None
    for dset in datasets:
        if dist is None:
            dist = dset.get_tag_distribution()
        elif dist != dset.get_tag_distribution():
            raise Exception("Distribution of datasets must match")
    label_output_sizes = self.get_label_output_sizes()
    all_label_names = ['bit_{}'.format(i) for i in range(12)] + \
        [n for n, _ in label_output_sizes]
    dataset_names = ['labels', self.data_name]
    if 'discriminator' in list(dset.keys()):
        dataset_names.append('discriminator')
    # Bind ``dset`` as a default argument: a plain closure would be
    # late-bound and every iterator would read from the last dataset.
    dataset_iterators = [
        lambda bs, dset=dset: bit_split(
            dataset_iterator(dset, bs, dataset_names,
                             self.discriminator_threshold,
                             self.shuffle_data))
        for dset in datasets
    ]
    augmentation = self.augmentation()

    def wrapper(bs):
        for batch in zip_dataset_iterators(dataset_iterators, bs):
            data = batch[self.data_name]
            labels = [batch[l] for l in all_label_names]
            yield augmentation(data), labels

    return wrapper
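# Usage sketch (hypothetical, not part of the original module): the factory
# returns a callable that yields (augmented_data, labels) batches endlessly,
# suitable for e.g. Keras' fit_generator. ``trainer`` stands in for an
# instance of the class defining data_generator_factory.
data_gen = trainer.data_generator_factory()
data, labels = next(data_gen(32))
# ``data`` is the augmented batch; ``labels`` holds one array per entry in
# all_label_names (the 12 bits plus the configured label outputs).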
def run(gt_file, videos, images, visualize_debug, output, fix_utc_2014,
        nb_bits=12):
    """Converts bb_binary ground truth Cap'n Proto files to hdf5 files and
    extracts the corresponding rois from videos or images."""

    def get_filenames(f):
        if f is None:
            return []
        return [line.rstrip('\n') for line in f.readlines()]

    gen_factory = FrameGeneratorFactory(get_filenames(videos),
                                        get_filenames(images))
    if os.path.exists(output):
        os.remove(output)
    distribution = DistributionCollection([('bits', Bernoulli(), nb_bits)])
    dset = DistributionHDF5Dataset(output, distribution)
    camIdxs = []
    periods = []
    for fname in gt_file:
        fc = load_frame_container(fname)
        camIdx, start_dt, end_dt = parse_video_fname(fname)
        if fix_utc_2014 and start_dt.year == 2014:
            start_dt -= timedelta(hours=2)
        gt_frames = []
        gen = gen_factory.get_generator(camIdx, start_dt)
        for frame, (video_frame, video_filename) in zip(fc.frames, gen):
            gt = {}
            np_frame = convert_frame_to_numpy(frame)
            rois, mask, positions = extract_gt_rois(np_frame, video_frame,
                                                    start_dt)
            for name in np_frame.dtype.names:
                gt[name] = np_frame[name][mask]
            bits = [int_id_to_binary(id)[::-1] for id in gt["decodedId"]]
            # map bits and rois to [-1, 1]
            gt["bits"] = 2 * np.array(bits, dtype=float) - 1
            gt["tags"] = 2 * (rois / 255.).astype(np.float16) - 1
            gt['filename'] = os.path.basename(video_filename)
            gt['camIdx'] = camIdx
            gt_frames.append(gt)
            print('.', end='', flush=True)
        print()
        gt_period = GTPeriod(camIdx, start_dt, end_dt, fname, gt_frames)
        periods.append([int(gt_period.start.timestamp()),
                        int(gt_period.end.timestamp())])
        camIdxs.append(gt_period.camIdx)
        append_gt_to_hdf5(gt_period, dset)

    dset.attrs['periods'] = np.array(periods)
    dset.attrs['camIdxs'] = np.array(camIdxs)
    visualize_detection_tiles(dset, os.path.splitext(output)[0])
    dset.close()
def run(g_weights_fname, d_weights_fname, selected_outputs, nb_samples,
        out_fname):
    generator = load_model(g_weights_fname, render_gan_custom_objects())
    discriminator = load_model(d_weights_fname, render_gan_custom_objects())
    generator._make_predict_function()
    discriminator._make_predict_function()
    dist_json = get_hdf5_attr(g_weights_fname, 'distribution').decode('utf-8')
    dist = diktya.distributions.load_from_json(dist_json)
    os.makedirs(os.path.dirname(out_fname), exist_ok=True)
    dset = DistributionHDF5Dataset(out_fname, mode='w', nb_samples=nb_samples,
                                   distribution=dist)
    batch_size = 100
    available_datasets = [name for name in generator.output_names
                          if name != 'labels']
    print("Available outputs: " + ", ".join(available_datasets))
    generator_predict = predict_wrapper(
        lambda x: generator.predict(x, batch_size), generator.output_names)

    def sample_generator():
        z_shape = get_layer(generator.inputs[0]).batch_input_shape
        while True:
            z = np.random.uniform(-1, 1, (batch_size,) + z_shape[1:])
            outs = generator_predict(z)
            # split the flat label matrix into the named fields of the
            # distribution's structured dtype
            raw_labels = outs.pop('labels')
            pos = 0
            labels = np.zeros(len(raw_labels), dtype=dist.norm_dtype)
            for name, size in dist.norm_nb_elems.items():
                labels[name] = raw_labels[:, pos:pos + size]
                pos += size
            deleted_keys = []
            if selected_outputs != 'all':
                for name in list(outs.keys()):
                    if name not in selected_outputs:
                        del outs[name]
                        deleted_keys.append(name)
            if not outs:
                raise Exception(
                    "Got no outputs. Removed {}. Selected outputs {}"
                    .format(deleted_keys, selected_outputs))
            outs['labels'] = labels
            # 'fake' must be among the selected outputs for scoring
            outs['discriminator'] = discriminator.predict(outs['fake'])
            yield outs

    bar = progressbar.ProgressBar(max_value=nb_samples)
    for batch in sample_generator():
        pos = dset.append(**batch)
        bar.update(pos)
        if pos >= nb_samples:
            break
    dset.close()
    print("Saved dataset with fakes and labels to: {}".format(out_fname))
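# Read-back sketch (assumption: the dataset layout written above; the path
# is hypothetical). Stored fakes, labels, and discriminator scores come back
# batch-wise via DistributionHDF5Dataset.iter, as exercised in the tests.
dset = DistributionHDF5Dataset("fakes.hdf5")
for batch in dset.iter(batch_size=100):
    fake = batch['fake']             # generated images
    scores = batch['discriminator']  # realism scores from the discriminator
    labels = batch['labels']         # structured array; fields depend on
                                     # the distribution, e.g. labels['bits']
    break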
def get_label_distributions(self):
    # open the datasets only on a cache miss
    if self._label_distribution is None:
        datasets = [DistributionHDF5Dataset(fname)
                    for fname in self.train_sets]
        for dset in datasets:
            if self._label_distribution is None:
                self._label_distribution = dset.get_tag_distribution()
            elif self._label_distribution != dset.get_tag_distribution():
                raise Exception("Distribution of datasets must match")
    return self._label_distribution
def test_distribution_hdf5_dataset(tmpdir):
    # a distribution is required
    with pytest.raises(Exception):
        DistributionHDF5Dataset(
            str(tmpdir.join('dataset_no_distribution.hdf5')), nb_samples=1000)

    dist = DistributionCollection(examplary_tag_distribution(nb_bits=12))
    labels = dist.sample(32)
    image = np.random.random((32, 1, 8, 8))
    dset = DistributionHDF5Dataset(str(tmpdir.join('dataset.hdf5')),
                                   distribution=dist, nb_samples=1000)
    dset.append(labels=labels, image=image)
    # every label field gets its own hdf5 dataset ...
    for name in dist.names:
        assert name in dset
    # ... but iteration returns them bundled into one structured 'labels' array
    for batch in dset.iter(batch_size=32):
        for name in dist.names:
            assert name not in batch
        assert 'labels' in batch
        assert batch['labels'].dtype == dist.norm_dtype
        break
def data_generator_factory(self, additional_datasets=(), with_batch=False):
    datasets = [DistributionHDF5Dataset(fname) for fname in self.train_sets]
    dist = None
    for dset in datasets:
        if dist is None:
            dist = dset.get_tag_distribution()
        elif dist != dset.get_tag_distribution():
            raise Exception("Distribution of datasets must match")
    label_output_sizes = self.get_label_output_sizes()
    all_label_names = ['bit_{}'.format(i) for i in range(12)] + \
        [n for n, _ in label_output_sizes]
    dataset_names = (['labels'] + self.iterator_data_names() +
                     list(additional_datasets))
    print("Used datasets: " + str(dataset_names))
    if 'discriminator' in list(dset.keys()):
        dataset_names.append('discriminator')
    # Bind ``dset`` as a default argument to avoid the late-binding closure
    # pitfall (see the sketch after this function).
    dataset_iterators = [
        lambda bs, dset=dset: bit_split(
            dataset_iterator(dset, bs, dataset_names,
                             self.discriminator_threshold,
                             self.shuffle_data))
        for dset in datasets
    ]
    augmentation = self.augmentation()
    handmade_augmentation = self.get_handmade_augmentation()

    def wrapper(iterator):
        def data_gen(bs):
            for batch in iterator(bs):
                data = batch[self.data_name]
                labels = [batch[l] for l in all_label_names]
                label_mask = [np.ones(l.shape[0], dtype=np.float32)
                              for l in labels]
                if self.use_handmade_augmentation:
                    data = handmade_augmentation(batch)
                to_yield = [augmentation(data), labels, label_mask]
                if with_batch:
                    to_yield.append(batch)
                yield to_yield
        return data_gen

    return lambda bs: zip_dataset_iterators(
        list(map(wrapper, dataset_iterators)), bs)
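# Minimal sketch (not from the original module) of why the ``dset=dset``
# default argument above is needed: lambdas in a comprehension close over
# the *variable*, not its value at iteration time.
closures_buggy = [lambda: i for i in range(3)]
closures_fixed = [lambda i=i: i for i in range(3)]
assert [f() for f in closures_buggy] == [2, 2, 2]  # all see the last i
assert [f() for f in closures_fixed] == [0, 1, 2]  # value bound per lambda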
def store(self, outputs, fname):
    store_outputs = {}
    for name, arr in outputs.items():
        if name == 'labels':
            # reinterpret the flat label matrix as the distribution's
            # structured dtype
            labels = arr.view(dtype=self.distribution.norm_dtype)
            store_outputs['labels'] = labels
        else:
            store_outputs[name] = arr
    nb_samples = len(store_outputs['fake'])
    if os.path.exists(fname) and self.overwrite:
        os.remove(fname)
    dset = DistributionHDF5Dataset(fname, self.distribution,
                                   nb_samples=nb_samples)
    dset.append(**store_outputs)
    dset.close()
def run(tag_dist, output_fname, force, nb_samples):
    os.makedirs(os.path.dirname(output_fname), exist_ok=True)
    if os.path.exists(output_fname) and force:
        print("Deleted {}".format(output_fname))
        os.remove(output_fname)
    else:
        assert not os.path.exists(output_fname), \
            "File {} already exists. Use --force to override it".format(
                output_fname)
    basename, _ = os.path.splitext(output_fname)
    anti_name = basename + "_anti_{}.png"
    hist_name = basename + "_hist_{}.png"
    for antialiasing in (1, 2, 4, 8):
        plot_anitaliasing(tag_dist, anti_name, antialiasing)

    labels, masks, _ = next(generator(tag_dist, 10000, antialiasing=2))
    for key in labels.dtype.names:
        m = labels[key].mean()
        s = labels[key].std()
        print("{}: {:.3f}, {:.3f}".format(key, m, s))
        assert abs(m) <= 0.03

    for label_name in sorted(set(labels.dtype.names) - set(['bits'])):
        x = labels[label_name]
        plt.hist(x.flatten(), bins=40, normed=True)
        plt.savefig(hist_name.format(label_name))
        plt.clf()

    dset = DistributionHDF5Dataset(output_fname, distribution=tag_dist,
                                   nb_samples=nb_samples, mode='w')
    progbar = Progbar(nb_samples)
    batch_size = min(25000, nb_samples)
    for labels, tags, depth_map in generator(tag_dist, batch_size,
                                             antialiasing=4):
        pos = dset.append(labels=labels, tag3d=tags, depth_map=depth_map)
        progbar.update(pos)
        if pos >= nb_samples:
            break
    print("Saved tag 3d dataset to: {}".format(output_fname))
    dist_fname = basename + "_distribution.json"
    with open(dist_fname, "w+") as dist_f:
        dist_f.write(tag_dist.to_json())
    print("Saved distribution to: {}".format(dist_fname))
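# Usage sketch (assumption, path hypothetical): the JSON written above can
# be loaded back into a distribution with diktya's load_from_json, the same
# call the GAN sampling script uses for the hdf5 'distribution' attribute.
import diktya.distributions

with open("tags3d_distribution.json") as f:
    tag_dist = diktya.distributions.load_from_json(f.read())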
def _calculate_predictions():
    from deepdecoder.scripts.train_decoder import save_samples
    name = get_marker(data_set)
    print("Loading GT data: {}".format(data_set))
    h5_truth = h5py.File(data_set, 'r')
    decoder = Decoder(tp.model_fname(), {
        'ScaleInTestPhase': ScaleInTestPhase,
        'RandomSwitch': RandomSwitch,
    })
    print("Loaded decoder. Got model with {:.2f} million parameters."
          .format(decoder.model.count_params() / 1e6))
    dist = decoder.distribution
    assert decoder.distribution == tp.get_label_distributions()
    fill_zeros = [(n, s) for (n, s) in dist.norm_nb_elems.items()
                  if n != 'bits']
    batch_size = 128
    nb_bits = 12
    gen_factory = tp.truth_generator_factory(h5_truth, fill_zeros, tags_name)
    tp.check_generator(gen_factory, os.path.basename(data_set), 400)
    tags, bits, _ = next(gen_factory(20**2))
    bits = np.array(bits[:nb_bits]).T
    normalize_bits = bits.min() == -1
    print(bits.min())
    print(bits.max())
    save_samples(tags[:, 0], bits, tp.outname("evaluate_{}.png".format(name)))
    gen = gen_factory(batch_size)
    bits_true = []
    bits_pred = []
    total_time = 0
    nb_samples = 0
    total_samples = min(len(h5_truth[tags_name]), 100000)
    h5_fname = tp.outname("gt_prediction_{}.hdf5".format(name))
    if os.path.exists(h5_fname):
        os.remove(h5_fname)
    dset_output = DistributionHDF5Dataset(h5_fname, decoder.distribution,
                                          nb_samples=total_samples)
    with ProgressBar(max_value=total_samples) as pbar:
        for tags, gt, _ in gen:
            if nb_samples + len(tags) > total_samples:
                break
            nb_samples += len(tags)
            bits = np.array(gt[:nb_bits]).T
            if normalize_bits:
                # map bits from [-1, 1] to [0, 1]
                bits = bits / 2. + 0.5
            bits_true.append(bits)
            # time only the decoder forward pass, not the hdf5 write
            start = time.time()
            outputs = decoder.predict(tags)
            total_time += time.time() - start
            dset_output.append(outputs, tag=tags)
            bits_pred.append(outputs['bits'] / 2. + 0.5)
            pbar.update(nb_samples)
    time_per_sample = total_time / nb_samples
    return (np.concatenate(bits_true), np.concatenate(bits_pred),
            time_per_sample)
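# Follow-up sketch (assumption: called from the enclosing evaluation routine,
# where _calculate_predictions is defined). With bits mapped to [0, 1], the
# mean hamming distance between truth and prediction is the per-bit error.
bits_true, bits_pred, time_per_sample = _calculate_predictions()
bit_error_rate = np.mean(np.round(bits_pred) != np.round(bits_true))
print("bit error rate: {:.4f}, {:.2f} ms/sample"
      .format(bit_error_rate, 1000 * time_per_sample))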
def run(output_dir, force, tags_3d_hdf5_fname, nb_units, depth, nb_epoch,
        filter_size, project_factor, nb_dense):
    batch_size = 64
    basename = "network_tags3d_n{}_d{}_e{}".format(nb_units, depth, nb_epoch)
    output_basename = os.path.join(output_dir, basename)
    tag_dataset = DistributionHDF5Dataset(tags_3d_hdf5_fname)
    tag_dataset._dataset_created = True
    print("Got {} images from the 3d model".format(tag_dataset.nb_samples))
    weights_fname = output_basename + ".hdf5"
    if os.path.exists(weights_fname) and not force:
        raise OSError("File {} already exists. Use --force to override it"
                      .format(weights_fname))
    elif os.path.exists(weights_fname) and force:
        os.remove(weights_fname)
    os.makedirs(output_dir, exist_ok=True)

    def generator(batch_size):
        for batch in tag_dataset.iter(batch_size):
            labels = []
            for name in batch['labels'].dtype.names:
                labels.append(batch['labels'][name])
            assert not np.isnan(batch['tag3d']).any()
            assert not np.isnan(batch['depth_map']).any()
            labels = np.concatenate(labels, axis=-1)
            yield labels, [batch['tag3d'], batch['depth_map']]

    labels = next(generator(batch_size))[0]
    print("labels.shape ", labels.shape)
    print("labels.dtype ", labels.dtype)
    nb_input = labels.shape[1]
    x = Input(shape=(nb_input,))
    tag3d, depth_map = tag3d_network_dense(x, nb_units=nb_units, depth=depth,
                                           nb_dense_units=nb_dense)
    g = Model(x, [tag3d, depth_map])
    # optimizer = SGD(momentum=0.8, nesterov=True)
    optimizer = Nadam()
    g.compile(optimizer, loss=['mse', 'mse'], loss_weights=[1, 1 / 3.])
    scheduler = AutomaticLearningRateScheduler(optimizer, 'loss',
                                               epoch_patience=5,
                                               min_improvement=0.0002)
    history = HistoryPerBatch()
    save = SaveModels({basename + '_snapshot_{epoch:^03}.hdf5': g},
                      output_dir=output_dir,
                      hdf5_attrs=tag_dataset.get_distribution_hdf5_attrs())
    history_plot = history.plot_callback(fname=output_basename + "_loss.png",
                                         every_nth_epoch=10)
    g.fit_generator(generator(batch_size),
                    samples_per_epoch=800 * batch_size,
                    nb_epoch=nb_epoch, verbose=1,
                    callbacks=[scheduler, save, history, history_plot])

    nb_visualize = 18**2
    vis_labels, (tags_3d, depth_map) = next(generator(nb_visualize))
    predict_tags_3d, predict_depth_map = g.predict(vis_labels)

    def zip_and_save(fname, *args):
        clipped = list(map(lambda x: np.clip(x, 0, 1)[:, 0], args))
        print(clipped[0].shape)
        tiled = zip_tile(*clipped)
        print(tiled.shape)
        scipy.misc.imsave(fname, tiled)

    zip_and_save(output_basename + "_predict_tags.png",
                 tags_3d, predict_tags_3d)
    zip_and_save(output_basename + "_predict_depth_map.png",
                 depth_map, predict_depth_map)
    save_model(g, weights_fname,
               attrs=tag_dataset.get_distribution_hdf5_attrs())
    with open(output_basename + '.json', 'w+') as f:
        f.write(g.to_json())
    with open(output_basename + '_loss_history.json', 'w+') as f:
        json.dump(history.history, f)
    fig, _ = history.plot()
    fig.savefig(output_basename + "_loss.png")
    print("Saved weights to: {}".format(weights_fname))