import numpy as np

from range_coder import prob_to_cum_freq, cum_freq_to_prob


def test_prob_to_cum_freq():
    """
    Tests whether prob_to_cum_freq produces a table with the expected number
    of entries and number of samples, and whether non-zero probabilities are
    represented by non-zero increases in frequency. Also tests that
    cum_freq_to_prob is normalized and consistent with prob_to_cum_freq.
    """
    randomState = np.random.RandomState(190)
    resolution = 1024

    p0 = randomState.dirichlet([.1] * 50)
    cumFreq0 = prob_to_cum_freq(p0, resolution)

    p1 = cum_freq_to_prob(cumFreq0)
    cumFreq1 = prob_to_cum_freq(p1, resolution)

    # the number of hypothetical samples should correspond to the resolution
    assert cumFreq0[-1] == resolution
    assert len(cumFreq0) == len(p0) + 1

    # non-zero probabilities should have non-zero frequencies
    assert np.all(np.diff(cumFreq0)[p0 > 0.] > 0)

    # probabilities should be normalized
    assert np.isclose(np.sum(p1), 1.)

    # while the probabilities might change, the frequencies should not
    assert cumFreq0 == cumFreq1
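# For intuition, here is a minimal pure-Python sketch of the invariants the
# test above relies on. The helpers `_prob_to_cum_freq` and `_cum_freq_to_prob`
# are hypothetical stand-ins, not the library's actual implementation: they
# only illustrate that every non-zero probability must receive at least one
# count and that the counts must sum to `resolution`.
import numpy as np


def _prob_to_cum_freq(prob, resolution):
    prob = np.asarray(prob, dtype=np.float64)
    freq = np.zeros(len(prob), dtype=np.int64)
    # every non-zero probability gets at least one count
    freq[prob > 0.] = 1
    # distribute the remaining counts roughly proportionally
    remainder = resolution - freq.sum()
    freq += np.floor(prob * remainder).astype(np.int64)
    # hand any leftover counts to the largest probabilities
    for i in np.argsort(prob)[::-1][:resolution - freq.sum()]:
        freq[i] += 1
    return [0] + np.cumsum(freq).tolist()


def _cum_freq_to_prob(cumFreq):
    # frequencies are the increments of the cumulative table
    freq = np.diff(cumFreq)
    return freq / freq.sum()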
from range_coder import prob_to_cum_freq


def test_prob_to_cum_freq_zero_prob():
    """
    Tests whether prob_to_cum_freq handles zero probabilities as expected.
    """
    prob1 = [0.5, 0.25, 0.25]
    cumFreq1 = prob_to_cum_freq(prob1, resolution=8)

    prob0 = [0.5, 0., 0.25, 0.25, 0., 0.]
    cumFreq0 = prob_to_cum_freq(prob0, resolution=8)

    # after removing the entries corresponding to zero probabilities,
    # both tables should be identical
    assert [cumFreq0[0]] + [
        cumFreq0[i + 1] for i, p in enumerate(prob0) if p > 0.] == cumFreq1
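# A hypothetical worked example of the property checked above: with exactly
# representable probabilities and resolution=8, one would expect tables like
#   prob1 = [0.5, 0.25, 0.25]              -> cumFreq1 = [0, 4, 6, 8]
#   prob0 = [0.5, 0., 0.25, 0.25, 0., 0.]  -> cumFreq0 = [0, 4, 4, 6, 8, 8, 8]
# i.e. zero-probability symbols contribute zero-width slots, so dropping them
# recovers cumFreq1. The exact values depend on the library's rounding.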
def build_latent_distribution(self, alpha: int = 1):
    """
    Two passes over the training data:

    1. compute the minimum latent value across the entire training
       distribution;
    2. shift all latent values by |min| so that they are >= 0, count the
       discrete values with torch.bincount, Laplace-smooth the counts, and
       convert them into a CDF.

    If a code consists of multiple parts, e.g. lateral FPN features, they
    are flattened and concatenated.
    """
    self.cdf = dict()
    self.min_val = torch.tensor(0.0).to(self.device).long()
    num_images = 0
    self.model.eval()

    # first pass: find the minimum latent value
    if self.negative_codes:
        for batch in self.train_loader:
            with torch.no_grad():
                out_dict = self.model(batch)
            for code_feat in self.code_feats:
                self.min_val = torch.min(
                    self.min_val,
                    out_dict[code_feat].long().min(),
                )
            num_images += len(batch)
            if num_images > self.num_train_images:
                break
        self.min_val = self.min_val.abs()

    # second pass: accumulate counts of the shifted latent values
    self.bins = torch.tensor([0.0]).to(self.device).long()
    num_images = 0
    for batch in self.train_loader:
        with torch.no_grad():
            out_dict = self.model(batch)
        flat_codes = []
        for code_feat in self.code_feats:
            flat_codes.append(out_dict[code_feat].long().flatten() + self.min_val)
        batch_bins = torch.bincount(torch.cat(flat_codes))
        # counts from different batches may cover different value ranges
        if len(batch_bins) > len(self.bins):
            batch_bins[:len(self.bins)] += self.bins
            self.bins = batch_bins
        elif len(self.bins) > len(batch_bins):
            self.bins[:len(batch_bins)] += batch_bins
        else:
            self.bins += batch_bins
        num_images += len(batch)
        if num_images > self.num_train_images:
            break

    bins = self.bins.float()
    # additive (Laplace) smoothing of the counts using alpha
    bins_smooth = ((bins + alpha) / (bins.sum() + len(bins) * alpha)).cpu()
    # convert pdf -> cdf
    self.cdf = rc.prob_to_cum_freq(bins_smooth, resolution=2 * len(bins_smooth))
    self.model.train()
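# A self-contained sketch of the smoothing/CDF step above, assuming `rc` is
# the range_coder package; the counts below are hypothetical:
import torch
import range_coder as rc

counts = torch.tensor([0., 12., 30., 5., 0., 3.])
alpha = 1.0
# Laplace smoothing guarantees every bin a non-zero probability
probs = (counts + alpha) / (counts.sum() + len(counts) * alpha)
# prob_to_cum_freq expects a sequence of probabilities
cdf = rc.prob_to_cum_freq(probs.tolist(), resolution=2 * len(probs))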
import numpy as np
import range_coder


def apply_range_encoder(seq_data, encodepath, args, config):
    resolution = config['resolution']
    prob = np.load('data_info/distribution_info_{}.npy'.format(args.model_num))

    # avoid zero probabilities: give every symbol at least one count
    modified_freq = prob * resolution + 1
    modified_prob = modified_freq / np.sum(modified_freq)
    cum_freq = range_coder.prob_to_cum_freq(modified_prob, resolution=resolution)

    range_encoder = range_coder.RangeEncoder(encodepath)
    # TODO: check whether the cum_freq resolution influences performance
    range_encoder.encode(seq_data, cum_freq)
    range_encoder.close()
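# Worked example of the zero-frequency fix above (values hypothetical):
import numpy as np

prob = np.array([0.5, 0.5, 0.0])
resolution = 8
modified_freq = prob * resolution + 1                # [5., 5., 1.]
modified_prob = modified_freq / modified_freq.sum()  # [5/11, 5/11, 1/11]
# the zero-probability symbol now gets a small but non-zero slot, so the
# cumulative frequency table can always represent it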
import numpy as np
import pytest

from tempfile import mkstemp
from range_coder import RangeEncoder, prob_to_cum_freq


def test_range_coder_overflow():
    """
    Cumulative frequencies must fit in an unsigned integer (assumed to be
    represented by 32 bits). This test checks that an OverflowError is
    raised if the frequencies exceed that limit.
    """
    numBytes = 17
    filepath = mkstemp()[1]

    prob = [4, 6, 8]
    prob = np.asarray(prob, dtype=np.float64) / np.sum(prob)

    # force the total frequency past the 32-bit limit
    cumFreq = prob_to_cum_freq(prob, 128)
    cumFreq[-1] = 2**32

    sequence = [2, 2]
    data = sequence * numBytes

    encoder = RangeEncoder(filepath)
    with pytest.raises(OverflowError):
        encoder.encode(data, cumFreq)
    encoder.close()
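# The boundary being probed: an unsigned 32-bit integer holds at most
# 2**32 - 1 = 4294967295, so a total frequency of 2**32 is one past the
# representable range, which is why encode() is expected to raise.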
import numpy as np
import range_coder


def apply_range_decoder(seq_data_len, decodepath, config):
    resolution = config['resolution']
    prob = np.load('data_info/distribution_info.npy')

    # avoid zero probabilities: give every symbol at least one count
    modified_freq = prob * resolution + 1
    modified_prob = modified_freq / np.sum(modified_freq)
    cum_freq = range_coder.prob_to_cum_freq(modified_prob, resolution=resolution)

    range_decoder = range_coder.RangeDecoder(decodepath)
    # TODO: check whether the cum_freq resolution influences performance
    seq_data = range_decoder.decode(seq_data_len, cum_freq)

    return seq_data
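# Note: range decoding only reconstructs the original data if this cum_freq
# table is identical to the one used at encode time, so the distribution file
# loaded here must match the one loaded in apply_range_encoder (which is
# suffixed with args.model_num).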
import os

from range_coder import RangeEncoder, RangeDecoder, prob_to_cum_freq

data = [2, 0, 1, 0, 0, 0, 1, 2, 2]
prob = [0.5, 0.2, 0.3]

# convert probabilities to a cumulative integer frequency table
cumFreq = prob_to_cum_freq(prob, resolution=4)
print(cumFreq)

filepath = 'output.txt'

# encode data
encoder = RangeEncoder(filepath)
encoder.encode(data, cumFreq)
encoder.close()

# decode data
decoder = RangeDecoder(filepath)
dataRec = decoder.decode(len(data), cumFreq)
decoder.close()

print(os.stat(filepath))
print(dataRec)
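# Design note: resolution=4 is about the smallest table that still gives each
# of the three symbols a non-zero slot; a larger resolution approximates
# `prob` more closely and typically yields compression closer to the entropy
# of the source distribution.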