def generate_dataset(num_rows, num_cols, num_cats=4, rate=1.0):
    """Generate a random dataset.

    Returns:
        A dataset dict with fields 'schema' and 'table'.
    """
    set_random_seed(0)
    N = num_rows
    V = num_cols
    K = V * (V - 1) // 2
    ragged_index = np.arange(0, num_cats * (V + 1), num_cats, np.int32)
    ragged_index.flags.writeable = False
    data = np.zeros((N, V * num_cats), np.int8)
    for v in range(V):
        beg, end = ragged_index[v:v + 2]
        column = data[:, beg:end]
        # Draw each feature's category distribution from a Dirichlet prior,
        # then fill each cell with multinomial counts of a Poisson total.
        probs = np.random.dirichlet(np.zeros(num_cats) + 0.5)
        for n in range(N):
            count = np.random.poisson(rate)
            column[n, :] = np.random.multinomial(count, probs)
    data.flags.writeable = False
    feature_types = [TY_MULTINOMIAL] * V
    table = Table(feature_types, ragged_index, data)
    dataset = {
        'schema': {
            'ragged_index': ragged_index,
            'tree_prior': np.zeros(K, np.float32),
        },
        'table': table,
    }
    return dataset

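# Hedged usage sketch for generate_dataset; the shapes follow directly from
# the code above and the example sizes are arbitrary:
def example_generate_dataset():
    dataset = generate_dataset(num_rows=10, num_cols=3, num_cats=4)
    table = dataset['table']
    # One block of num_cats columns per feature.
    assert table.data.shape == (10, 3 * 4)
    assert dataset['schema']['ragged_index'].shape == (3 + 1, )
    # One prior logit per edge of the complete graph on num_cols vertices.
    assert dataset['schema']['tree_prior'].shape == (3 * (3 - 1) // 2, )
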
def test_estimate_tree(num_edges):
    set_random_seed(0)
    E = num_edges
    V = 1 + E
    grid = make_complete_graph(V)
    K = grid.shape[1]
    edge_logits = np.random.random([K]) - 0.5
    edges = estimate_tree(grid, edge_logits)

    # Check size.
    assert len(edges) == E
    for v in range(V):
        assert any(v in edge for edge in edges)

    # Check optimality.
    edges = tuple(edges)
    if V < len(TREE_GENERATORS):
        all_trees = get_spanning_trees(V)
        assert edges in all_trees
        all_trees = list(all_trees)
        logits = []
        for tree in all_trees:
            logits.append(
                sum(edge_logits[find_complete_edge(u, v)]
                    for (u, v) in tree))
        expected = all_trees[np.argmax(logits)]
        assert edges == expected

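# A small sketch of the complete-graph layout these tests appear to assume:
# make_complete_graph(V) seems to return a [3, K]-shaped grid whose columns
# are (edge_index, vertex1, vertex2) with K = V * (V - 1) // 2 (compare the
# `for k, v1, v2 in grid.T` unpacking in test_sample_tree_gof below), and
# find_complete_edge is assumed consistent with that ordering:
def example_complete_graph():
    V = 4
    grid = make_complete_graph(V)
    assert grid.shape == (3, V * (V - 1) // 2)
    for k, v1, v2 in grid.T:
        assert k == find_complete_edge(v1, v2)
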
def split_data(ragged_index, num_rows, num_parts, partid):
    """Split a dataset into training + holdout for n-fold crossvalidation.

    This splits a dataset into num_parts disjoint parts by randomly holding
    out cells. Note that whereas supervised crossvalidation typically holds
    out entire rows, our unsupervised crossvalidation is intended to
    evaluate a model of the full joint distribution.

    Args:
        ragged_index: A [V+1]-shaped numpy array of indices into the ragged
            data array, where V is the number of features.
        num_rows: An integer, the number of rows in the dataset.
        num_parts: An integer, the number of folds in n-fold
            crossvalidation.
        partid: An integer in [0, num_parts).

    Returns:
        A [N, R]-shaped mask where True means held-out and False means
        training. Here N = num_rows and R = ragged_index[-1].
    """
    set_random_seed(0)
    assert 0 <= partid < num_parts
    N = num_rows
    V = ragged_index.shape[0] - 1
    R = ragged_index[-1]
    dense_mask = (partid == np.random.randint(num_parts, size=(N, V)))
    ragged_mask = make_ragged_mask(ragged_index, dense_mask.T).T
    assert ragged_mask.shape == (N, R)
    return ragged_mask

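# Hedged usage sketch for split_data. Because split_data reseeds with
# set_random_seed(0), the num_parts masks are consistent across calls and
# partition the cells, so every cell is held out in exactly one fold:
def example_split_data():
    ragged_index = np.arange(0, 4 * (3 + 1), 4, np.int32)  # 3 features, 4 cats.
    data = np.ones((10, ragged_index[-1]), np.int8)  # Stand-in for real counts.
    holdout = split_data(ragged_index, num_rows=10, num_parts=5, partid=0)
    train = np.where(holdout, 0, data)  # Zero out held-out cells for training.
    assert train.shape == data.shape
    total = sum(split_data(ragged_index, 10, 5, p).sum() for p in range(5))
    assert total == data.size
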
def test_server_conditional_gof(N, V, C, M):
    set_random_seed(make_seed(N, V, C, M, 1))
    model = generate_fake_model(N, V, C, M)
    config = TINY_CONFIG.copy()
    config['model_num_clusters'] = M
    model['config'] = config
    server = TreeCatServer(model)
    validate_gof(N, V, C, M, server, conditional=True)

def test_ensemble_latent_perplexity(N, V, C, M):
    set_random_seed(make_seed(N, V, C, M))
    ensemble = generate_fake_ensemble(N, V, C, M)
    server = EnsembleServer(ensemble)
    perplexity = server.latent_perplexity()
    print(perplexity)
    assert perplexity.shape == (V, )
    assert np.all(1 <= perplexity)
    assert np.all(perplexity <= M)

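# Why the [1, M] bounds asserted above hold, assuming latent_perplexity is
# the usual exp(entropy): a distribution over M latent classes has entropy
# in [0, log(M)]. A quick numeric check of the upper bound at the uniform
# distribution:
def example_perplexity_bounds(M=7):
    p = np.full(M, 1.0 / M)
    perplexity = np.exp(-(p * np.log(p)).sum())
    assert np.isclose(perplexity, M)
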
def train(self):
    """Train a model using subsample-annealed MCMC.

    Returns:
        A trained model as a dictionary with keys:
            config: A global config dict.
            tree: A TreeStructure instance with the learned latent
                structure.
            edge_logits: A [K]-shaped array of all edge logits.
    """
    logger.info('TreeTrainer.train')
    set_random_seed(self._config['seed'])
    init_epochs = self._config['learning_init_epochs']
    full_epochs = self._config['learning_full_epochs']
    sample_tree_rate = self._config['learning_sample_tree_rate']
    num_rows = self._num_rows

    # Initialize using subsample annealing.
    assert len(self._added_rows) == 0
    schedule = make_annealing_schedule(num_rows, init_epochs,
                                       sample_tree_rate)
    for action, row_id in schedule:
        if action == 'add_row':
            self.add_row(row_id)
        elif action == 'remove_row':
            self.remove_row(row_id)
        elif action == 'sample_tree':
            edges, edge_logits = self.sample_tree()
            self.set_edges(edges)
        else:
            raise ValueError(action)

    # Run full Gibbs scans.
    assert len(self._added_rows) == num_rows
    for step in range(full_epochs):
        edges, edge_logits = self.sample_tree()
        self.set_edges(edges)
        for row_id in range(num_rows):
            self.remove_row(row_id)
            self.add_row(row_id)

    # Compute the optimal tree.
    assert len(self._added_rows) == num_rows
    edges, edge_logits = self.estimate_tree()
    if self._config['learning_estimate_tree']:
        self.set_edges(edges)
    self._tree.gc()
    return {
        'config': self._config,
        'tree': self._tree,
        'edge_logits': edge_logits,
    }

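# Hedged end-to-end sketch, reusing names that appear elsewhere in this code
# base (generate_dataset, make_config, train_model); which extra keys the
# returned dict carries beyond 'tree' and 'edge_logits' is an assumption:
def example_train():
    dataset = generate_dataset(num_rows=20, num_cols=4)
    config = make_config(model_num_clusters=3)
    model = train_model(dataset['table'], dataset['schema']['tree_prior'],
                        config)
    assert {'tree', 'edge_logits'} <= set(model)
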
def test_sample_from_probs_gof(size):
    set_random_seed(size)
    probs = np.exp(2 * np.random.random(size)).astype(np.float32)
    counts = np.zeros(size, dtype=np.int32)
    num_samples = 2000 * size
    for _ in range(num_samples):
        counts[sample_from_probs(probs)] += 1
    probs /= probs.sum()  # Normalize afterwards.
    print(counts)
    print(probs * num_samples)
    gof = multinomial_goodness_of_fit(probs, counts, num_samples, plot=True)
    assert 1e-2 < gof

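# A minimal sketch of the Pearson chi-squared check behind these gof tests,
# assuming multinomial_goodness_of_fit returns a p-value (the tests compare
# it against a 1e-2 significance level); scipy is used only for illustration:
def example_pearson_gof(probs, counts, num_samples):
    from scipy.stats import chi2
    expected = probs * num_samples
    stat = ((counts - expected)**2 / expected).sum()
    dof = len(probs) - 1
    return chi2.sf(stat, dof)  # Small p-values indicate a poor fit.
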
def generate_fake_ensemble(num_rows, num_cols, num_cats, num_components):
    dataset = generate_dataset(num_rows, num_cols, num_cats)
    ensemble = []
    config = make_config(model_num_clusters=num_components, seed=0)
    for sub_seed in range(3):
        sub_config = config.copy()
        sub_config['seed'] += sub_seed
        set_random_seed(sub_config['seed'])
        model = generate_fake_model(num_rows, num_cols, num_cats,
                                    num_components, dataset)
        model['config'] = sub_config
        ensemble.append(model)
    return ensemble

def test_sample_from_probs2_gof(size):
    set_random_seed(size)
    probs = np.exp(2 * np.random.random(size)).astype(np.float32)
    num_samples = 2000 * size
    probs2 = np.tile(probs, (num_samples, 1))
    samples = sample_from_probs2(probs2)
    probs /= probs.sum()  # Normalize afterwards.
    counts = np.bincount(samples, minlength=size)
    print(counts)
    print(probs * num_samples)
    gof = multinomial_goodness_of_fit(probs, counts, num_samples, plot=True)
    assert 1e-2 < gof

def test_ensemble_latent_correlation(N, V, C, M):
    set_random_seed(make_seed(N, V, C, M))
    ensemble = generate_fake_ensemble(N, V, C, M)
    server = EnsembleServer(ensemble)
    correlation = server.latent_correlation()
    print(correlation)
    assert np.all(0 <= correlation)
    assert np.all(correlation <= 1)
    assert np.allclose(correlation, correlation.T)
    for v in range(V):
        assert correlation[v, :].argmax() == v
        assert correlation[:, v].argmax() == v

def test_latent_perplexity(N, V, C, M):
    set_random_seed(make_seed(N, V, C, M))
    model = generate_fake_model(N, V, C, M)
    config = TINY_CONFIG.copy()
    config['model_num_clusters'] = M
    model['config'] = config
    server = TreeCatServer(model)
    perplexity = server.latent_perplexity()
    print(perplexity)
    assert perplexity.shape == (V, )
    assert np.all(1 <= perplexity)
    assert np.all(perplexity <= M)

def test_latent_correlation(N, V, C, M):
    set_random_seed(make_seed(N, V, C, M))
    model = generate_fake_model(N, V, C, M)
    config = TINY_CONFIG.copy()
    config['model_num_clusters'] = M
    model['config'] = config
    server = TreeCatServer(model)
    correlation = server.latent_correlation()
    print(correlation)
    assert np.all(0 <= correlation)
    assert np.all(correlation <= 1)
    assert np.allclose(correlation, correlation.T)
    for v in range(V):
        assert correlation[v, :].argmax() == v
        assert correlation[:, v].argmax() == v

def test_observed_perplexity(N, V, C, M):
    set_random_seed(make_seed(N, V, C, M))
    model = generate_fake_model(N, V, C, M)
    config = TINY_CONFIG.copy()
    config['model_num_clusters'] = M
    model['config'] = config
    server = TreeCatServer(model)
    for count in [1, 2, 3]:
        if count > 1 and C > 2:
            continue  # NotImplementedError.
        counts = count
        perplexity = server.observed_perplexity(counts)
        print(perplexity)
        assert perplexity.shape == (V, )
        assert np.all(1 <= perplexity)
        assert np.all(perplexity <= count * C)

def test_recover_structure(V, C):
    set_random_seed(V + C * 10)
    N = 200
    M = 2 * C
    K = V * (V - 1) // 2
    tree_prior = np.zeros(K, np.float32)
    tree = generate_tree(num_cols=V)
    table = generate_clean_dataset(tree, num_rows=N, num_cats=C)['table']
    config = make_config(model_num_clusters=M)
    model = train_model(table, tree_prior, config)

    # Compute three types of edges.
    expected_edges = tree.get_edges()
    optimal_edges = estimate_tree(tree.complete_grid, model['edge_logits'])
    actual_edges = model['tree'].get_edges()

    # Print debugging information.
    feature_names = [str(v) for v in range(V)]
    root = '0'
    readable_data = np.zeros([N, V], np.int8)
    for v in range(V):
        beg, end = table.ragged_index[v:v + 2]
        readable_data[:, v] = table.data[:, beg:end].argmax(axis=1)
    with np_printoptions(precision=2, threshold=100, edgeitems=5):
        print('Expected:')
        print(print_tree(expected_edges, feature_names, root))
        print('Optimal:')
        print(print_tree(optimal_edges, feature_names, root))
        print('Actual:')
        print(print_tree(actual_edges, feature_names, root))
        print('Correlation:')
        print(np.corrcoef(readable_data.T))
        print('Edge logits:')
        print(triangular_to_square(tree.complete_grid, model['edge_logits']))
        print('Data:')
        print(readable_data)
        print('Feature Sufficient Statistics:')
        print(model['suffstats']['feat_ss'])
        print('Edge Sufficient Statistics:')
        print(model['suffstats']['edge_ss'])

    # Check agreement.
    assert actual_edges == optimal_edges, 'Error in sample_tree'
    assert actual_edges == expected_edges, 'Error in likelihood'

def test_make_annealing_schedule():
    set_random_seed(0)
    num_rows = 10
    init_epochs = 10
    sample_tree_rate = 3
    schedule = make_annealing_schedule(num_rows, init_epochs,
                                       sample_tree_rate)
    assigned_rows = 0
    for step, (action, row_id) in enumerate(schedule):
        assert step < 1000
        assert action in ['add_row', 'remove_row', 'sample_tree']
        if action == 'sample_tree':
            assert row_id is None
        else:
            assert 0 <= row_id < num_rows
            if action == 'add_row':
                assigned_rows += 1
            elif action == 'remove_row':
                assigned_rows -= 1
    assert assigned_rows == num_rows

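# Hedged sketch: tally the actions emitted by a schedule. The net number of
# added rows equals num_rows, matching the bookkeeping in the test above:
def example_annealing_schedule():
    from collections import Counter
    set_random_seed(0)
    tally = Counter(
        action for action, _ in make_annealing_schedule(10, 10, 3))
    assert tally['add_row'] - tally['remove_row'] == 10
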
def test_assignment_sampler_gof(N, V, C, M):
    config = make_config(model_num_clusters=M)
    K = V * (V - 1) // 2
    dataset = generate_dataset(num_rows=N, num_cols=V, num_cats=C)
    table = dataset['table']
    tree_prior = np.exp(np.random.random(K), dtype=np.float32)
    trainer = TreeCatTrainer(table, tree_prior, config)
    print('Data:')
    print(dataset['table'].data)

    # Add all rows.
    set_random_seed(1)
    for row_id in range(N):
        trainer.add_row(row_id)

    # Collect samples.
    num_samples = 500 * M**(N * V)
    counts = {}
    logprobs = {}
    for _ in range(num_samples):
        for row_id in range(N):
            # This is a single-site Gibbs sampler.
            trainer.remove_row(row_id)
            trainer.add_row(row_id)
        key = hash_assignments(trainer._assignments)
        if key in counts:
            counts[key] += 1
        else:
            counts[key] = 1
            logprobs[key] = trainer.logprob()
    assert len(counts) == M**(N * V)

    # Check accuracy using Pearson's chi-squared test.
    keys = sorted(counts.keys())
    counts = np.array([counts[k] for k in keys], dtype=np.int32)
    probs = np.exp(np.array([logprobs[k] for k in keys]))
    probs /= probs.sum()
    print('Actual\tExpected\tAssignment')
    for count, prob, key in zip(counts, probs, keys):
        print('{:}\t{:0.1f}\t{}'.format(count, prob * num_samples, key))
    gof = multinomial_goodness_of_fit(probs, counts, num_samples, plot=True)
    assert 1e-2 < gof

def test_quantize_from_probs2(size, resolution):
    set_random_seed(make_seed(size, resolution))
    probs = np.exp(np.random.random(size)).astype(np.float32)
    probs2 = probs.reshape((1, size))
    quantized = quantize_from_probs2(probs2, resolution)
    assert quantized.shape == probs2.shape
    assert quantized.dtype == np.int8
    assert np.all(quantized.sum(axis=1) == resolution)

    # Check that quantized result is closer to target than any other value.
    quantized = quantized.reshape((size, ))
    target = resolution * probs / probs.sum()
    distance = np.abs(quantized - target).sum()
    # Enumerate all integer vectors summing to resolution; this needs
    # combinations_with_replacement, since a bin may receive multiple units.
    for combo in itertools.combinations_with_replacement(
            range(size), resolution):
        other = np.zeros(size, np.int8)
        for i in combo:
            other[i] += 1
        assert other.sum() == resolution
        other_distance = np.abs(other - target).sum()
        assert distance <= other_distance

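# Worked instance of the optimality property checked above: for probs
# proportional to [0.5, 0.3, 0.2] at resolution 4 the target is
# [2.0, 1.2, 0.8], and the L1-closest integer vector summing to 4 is
# [2, 1, 1]. A brute-force check, independent of quantize_from_probs2:
def example_quantization_target():
    target = np.array([2.0, 1.2, 0.8])
    best = min(
        (np.bincount(combo, minlength=3) for combo in
         itertools.combinations_with_replacement(range(3), 4)),
        key=lambda other: np.abs(other - target).sum())
    assert list(best) == [2, 1, 1]
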
def test_sample_tree_gof(num_edges):
    set_random_seed(num_edges)
    E = num_edges
    V = 1 + E
    grid = make_complete_graph(V)
    K = grid.shape[1]
    edge_logits = np.random.random([K])
    edge_probs = np.exp(edge_logits)
    edge_probs_dict = {(v1, v2): edge_probs[k] for k, v1, v2 in grid.T}

    # Generate many samples via MCMC.
    num_samples = 30 * NUM_SPANNING_TREES[V]
    counts = defaultdict(lambda: 0)
    edges = [(v, v + 1) for v in range(V - 1)]
    for _ in range(num_samples):
        edges = sample_tree(grid, edge_logits, edges)
        counts[tuple(edges)] += 1
    assert len(counts) == NUM_SPANNING_TREES[V]

    # Check accuracy using Pearson's chi-squared test.
    keys = list(counts.keys())
    counts = np.array([counts[key] for key in keys])
    probs = np.array(
        [np.prod([edge_probs_dict[edge] for edge in key]) for key in keys])
    probs /= probs.sum()

    # Possibly truncate.
    T = 100
    truncated = False
    if len(counts) > T:
        counts = counts[:T]
        probs = probs[:T]
        truncated = True

    gof = multinomial_goodness_of_fit(
        probs, counts, num_samples, plot=True, truncated=truncated)
    assert 1e-2 < gof

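# Sanity anchor for NUM_SPANNING_TREES, assuming it is indexed by vertex
# count: by Cayley's formula, the complete graph on V vertices has
# V**(V - 2) spanning trees.
def example_cayley_formula():
    for V in range(2, 6):
        assert NUM_SPANNING_TREES[V] == V**(V - 2)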