Example #1
def generate_dataset(num_rows, num_cols, num_cats=4, rate=1.0):
    """Generate a random dataset.

    Returns:
        A dataset dict with fields 'schema' and 'table'.
    """
    set_random_seed(0)
    N = num_rows
    V = num_cols
    K = V * (V - 1) // 2  # Number of edges in the complete graph on V vertices.
    ragged_index = np.arange(0, num_cats * (V + 1), num_cats, np.int32)
    ragged_index.flags.writeable = False
    data = np.zeros((N, V * num_cats), np.int8)
    for v in range(V):
        beg, end = ragged_index[v:v + 2]
        column = data[:, beg:end]
        probs = np.random.dirichlet(np.zeros(num_cats) + 0.5)
        for n in range(N):
            count = np.random.poisson(rate)
            column[n, :] = np.random.multinomial(count, probs)
    data.flags.writeable = False
    feature_types = [TY_MULTINOMIAL] * V
    table = Table(feature_types, ragged_index, data)
    dataset = {
        'schema': {
            'ragged_index': ragged_index,
            'tree_prior': np.zeros(K, np.float32),
        },
        'table': table,
    }
    return dataset
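
A minimal usage sketch (hypothetical, assuming generate_dataset and its helpers above are importable): the shapes follow directly from the code, with data of shape (N, V * num_cats) and a tree prior over the K = V * (V - 1) // 2 candidate edges.

dataset = generate_dataset(num_rows=100, num_cols=8, num_cats=4)
table = dataset['table']
assert table.data.shape == (100, 8 * 4)
assert dataset['schema']['tree_prior'].shape == (8 * 7 // 2,)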
Example #2
def test_estimate_tree(num_edges):
    set_random_seed(0)
    E = num_edges
    V = 1 + E
    grid = make_complete_graph(V)
    K = grid.shape[1]
    edge_logits = np.random.random([K]) - 0.5
    edges = estimate_tree(grid, edge_logits)

    # Check size.
    assert len(edges) == E
    for v in range(V):
        assert any(v in edge for edge in edges)

    # Check optimality.
    edges = tuple(edges)
    if V < len(TREE_GENERATORS):
        all_trees = get_spanning_trees(V)
        assert edges in all_trees
        all_trees = list(all_trees)
        logits = []
        for tree in all_trees:
            logits.append(
                sum(edge_logits[find_complete_edge(u, v)] for (u, v) in tree))
        expected = all_trees[np.argmax(logits)]
        assert edges == expected
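
The exhaustive optimality check above is only feasible for tiny V: by Cayley's formula the complete graph on V vertices has V**(V - 2) labeled spanning trees (presumably what the NUM_SPANNING_TREES table in Example #18 tabulates), so enumeration blows up quickly.

for V in range(2, 7):
    # Cayley's formula: spanning trees of the complete graph K_V.
    print(V, V ** (V - 2))  # 1, 3, 16, 125, 1296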
Example #3
def split_data(ragged_index, num_rows, num_parts, partid):
    """Split a dataset into training + holdout for n-fold crossvalidation.

    This splits a dataset into num_parts disjoint parts by randomly holding out
    cells. Note that whereas supervised crossvalidation typically holds out
    entire rows, our unsupervised crossvalidation is intended to evaluate a
    model of the full joint distribution.

    Args:
        ragged_index: A [V+1]-shaped numpy array of indices into the ragged
            data array, where V is the number of features.
        num_rows: An integer, the number of rows in the dataset.
        num_parts: An integer, the number of folds in n-fold crossvalidation.
        partid: An integer in [0, num_parts).

    Returns:
        A [N,R]-shaped mask where True means held-out and False means training.
        Here N = num_rows and R = ragged_index[-1].
    """
    set_random_seed(0)
    assert 0 <= partid < num_parts
    N = num_rows
    V = ragged_index.shape[0] - 1
    R = ragged_index[-1]
    dense_mask = (partid == np.random.randint(num_parts, size=(N, V)))
    ragged_mask = make_ragged_mask(ragged_index, dense_mask.T).T
    assert ragged_mask.shape == (N, R)
    return ragged_mask
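
Because split_data reseeds with set_random_seed(0) on every call, each call draws the same randint matrix, so the num_parts masks partition the cells: every cell is held out in exactly one fold. A minimal sketch of that invariant, assuming the helpers above:

import numpy as np

num_rows, num_parts = 20, 5
ragged_index = np.arange(0, 4 * 11, 4, np.int32)  # 10 features, 4 categories each
masks = [split_data(ragged_index, num_rows, num_parts, p)
         for p in range(num_parts)]
total = sum(m.astype(np.int32) for m in masks)
assert (total == 1).all()  # Each cell lands in exactly one fold.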
Example #4
def test_server_conditional_gof(N, V, C, M):
    set_random_seed(make_seed(N, V, C, M, 1))
    model = generate_fake_model(N, V, C, M)
    config = TINY_CONFIG.copy()
    config['model_num_clusters'] = M
    model['config'] = config
    server = TreeCatServer(model)
    validate_gof(N, V, C, M, server, conditional=True)
Example #5
def test_ensemble_latent_perplexity(N, V, C, M):
    set_random_seed(make_seed(N, V, C, M))
    ensemble = generate_fake_ensemble(N, V, C, M)
    server = EnsembleServer(ensemble)

    perplexity = server.latent_perplexity()
    print(perplexity)
    assert perplexity.shape == (V, )
    assert np.all(1 <= perplexity)
    assert np.all(perplexity <= M)
Example #6
    def train(self):
        """Train a model using subsample-annealed MCMC.

        Returns:
            A trained model as a dictionary with keys:
                config: A global config dict.
                tree: A TreeStructure instance with the learned latent
                    structure.
                edge_logits: A [K]-shaped array of all edge logits.
        """
        logger.info('TreeTrainer.train')
        set_random_seed(self._config['seed'])
        init_epochs = self._config['learning_init_epochs']
        full_epochs = self._config['learning_full_epochs']
        sample_tree_rate = self._config['learning_sample_tree_rate']
        num_rows = self._num_rows

        # Initialize using subsample annealing.
        assert len(self._added_rows) == 0
        schedule = make_annealing_schedule(num_rows, init_epochs,
                                           sample_tree_rate)
        for action, row_id in schedule:
            if action == 'add_row':
                self.add_row(row_id)
            elif action == 'remove_row':
                self.remove_row(row_id)
            elif action == 'sample_tree':
                edges, edge_logits = self.sample_tree()
                self.set_edges(edges)
            else:
                raise ValueError(action)

        # Run full Gibbs scans.
        assert len(self._added_rows) == num_rows
        for step in range(full_epochs):
            edges, edge_logits = self.sample_tree()
            self.set_edges(edges)
            for row_id in range(num_rows):
                self.remove_row(row_id)
                self.add_row(row_id)

        # Compute optimal tree.
        assert len(self._added_rows) == num_rows
        edges, edge_logits = self.estimate_tree()
        if self._config['learning_estimate_tree']:
            self.set_edges(edges)

        self._tree.gc()

        return {
            'config': self._config,
            'tree': self._tree,
            'edge_logits': edge_logits,
        }
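
A hypothetical end-to-end sketch tying this method to the other examples here (TreeCatTrainer and TINY_CONFIG appear in Examples #11 and #16; their exact signatures and config requirements are assumed, not guaranteed):

dataset = generate_dataset(num_rows=50, num_cols=5)
tree_prior = dataset['schema']['tree_prior']
trainer = TreeCatTrainer(dataset['table'], tree_prior, TINY_CONFIG)
model = trainer.train()
print(model['tree'].get_edges())   # Learned latent tree structure.
print(model['edge_logits'].shape)  # One logit per candidate edge, K in all.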
Example #7
def test_sample_from_probs_gof(size):
    set_random_seed(size)
    probs = np.exp(2 * np.random.random(size)).astype(np.float32)
    counts = np.zeros(size, dtype=np.int32)
    num_samples = 2000 * size
    for _ in range(num_samples):
        counts[sample_from_probs(probs)] += 1
    probs /= probs.sum()  # Normalize afterwards.
    print(counts)
    print(probs * num_samples)
    gof = multinomial_goodness_of_fit(probs, counts, num_samples, plot=True)
    assert 1e-2 < gof
Example #8
def generate_fake_ensemble(num_rows, num_cols, num_cats, num_components):
    dataset = generate_dataset(num_rows, num_cols, num_cats)
    ensemble = []
    config = make_config(model_num_clusters=num_components, seed=0)
    for sub_seed in range(3):
        sub_config = config.copy()
        sub_config['seed'] += sub_seed
        set_random_seed(sub_config['seed'])
        model = generate_fake_model(num_rows, num_cols, num_cats,
                                    num_components, dataset)
        model['config'] = sub_config
        ensemble.append(model)
    return ensemble
Example #9
def test_sample_from_probs2_gof(size):
    set_random_seed(size)
    probs = np.exp(2 * np.random.random(size)).astype(np.float32)
    num_samples = 2000 * size
    probs2 = np.tile(probs, (num_samples, 1))
    samples = sample_from_probs2(probs2)
    probs /= probs.sum()  # Normalize afterwards.
    counts = np.bincount(samples, minlength=size)
    print(counts)
    print(probs * num_samples)
    gof = multinomial_goodness_of_fit(probs, counts, num_samples, plot=True)
    assert 1e-2 < gof
Example #10
def test_ensemble_latent_correlation(N, V, C, M):
    set_random_seed(make_seed(N, V, C, M))
    ensemble = generate_fake_ensemble(N, V, C, M)
    server = EnsembleServer(ensemble)

    correlation = server.latent_correlation()
    print(correlation)
    assert np.all(0 <= correlation)
    assert np.all(correlation <= 1)
    assert np.allclose(correlation, correlation.T)
    for v in range(V):
        assert correlation[v, :].argmax() == v
        assert correlation[:, v].argmax() == v
Example #11
def test_latent_perplexity(N, V, C, M):
    set_random_seed(make_seed(N, V, C, M))
    model = generate_fake_model(N, V, C, M)
    config = TINY_CONFIG.copy()
    config['model_num_clusters'] = M
    model['config'] = config
    server = TreeCatServer(model)

    perplexity = server.latent_perplexity()
    print(perplexity)
    assert perplexity.shape == (V, )
    assert np.all(1 <= perplexity)
    assert np.all(perplexity <= M)
Example #12
def test_latent_correlation(N, V, C, M):
    set_random_seed(make_seed(N, V, C, M))
    model = generate_fake_model(N, V, C, M)
    config = TINY_CONFIG.copy()
    config['model_num_clusters'] = M
    model['config'] = config
    server = TreeCatServer(model)

    correlation = server.latent_correlation()
    print(correlation)
    assert np.all(0 <= correlation)
    assert np.all(correlation <= 1)
    assert np.allclose(correlation, correlation.T)
    for v in range(V):
        assert correlation[v, :].argmax() == v
        assert correlation[:, v].argmax() == v
Example #13
def test_observed_perplexity(N, V, C, M):
    set_random_seed(make_seed(N, V, C, M))
    model = generate_fake_model(N, V, C, M)
    config = TINY_CONFIG.copy()
    config['model_num_clusters'] = M
    model['config'] = config
    server = TreeCatServer(model)

    for count in [1, 2, 3]:
        if count > 1 and C > 2:
            continue  # NotImplementedError.
        counts = count
        perplexity = server.observed_perplexity(counts)
        print(perplexity)
        assert perplexity.shape == (V, )
        assert np.all(1 <= perplexity)
        assert np.all(perplexity <= count * C)
Example #14
def test_recover_structure(V, C):
    set_random_seed(V + C * 10)
    N = 200
    M = 2 * C
    K = V * (V - 1) // 2
    tree_prior = np.zeros(K, np.float32)
    tree = generate_tree(num_cols=V)
    table = generate_clean_dataset(tree, num_rows=N, num_cats=C)['table']
    config = make_config(model_num_clusters=M)
    model = train_model(table, tree_prior, config)

    # Compute three types of edges.
    expected_edges = tree.get_edges()
    optimal_edges = estimate_tree(tree.complete_grid, model['edge_logits'])
    actual_edges = model['tree'].get_edges()

    # Print debugging information.
    feature_names = [str(v) for v in range(V)]
    root = '0'
    readable_data = np.zeros([N, V], np.int8)
    for v in range(V):
        beg, end = table.ragged_index[v:v + 2]
        readable_data[:, v] = table.data[:, beg:end].argmax(axis=1)
    with np_printoptions(precision=2, threshold=100, edgeitems=5):
        print('Expected:')
        print(print_tree(expected_edges, feature_names, root))
        print('Optimal:')
        print(print_tree(optimal_edges, feature_names, root))
        print('Actual:')
        print(print_tree(actual_edges, feature_names, root))
        print('Correlation:')
        print(np.corrcoef(readable_data.T))
        print('Edge logits:')
        print(triangular_to_square(tree.complete_grid, model['edge_logits']))
        print('Data:')
        print(readable_data)
        print('Feature Sufficient Statistics:')
        print(model['suffstats']['feat_ss'])
        print('Edge Sufficient Statistics:')
        print(model['suffstats']['edge_ss'])

    # Check agreement.
    assert actual_edges == optimal_edges, 'Error in sample_tree'
    assert actual_edges == expected_edges, 'Error in likelihood'
Example #15
def test_make_annealing_schedule():
    set_random_seed(0)
    num_rows = 10
    init_epochs = 10
    sample_tree_rate = 3
    schedule = make_annealing_schedule(num_rows, init_epochs, sample_tree_rate)
    assigned_rows = 0
    for step, (action, row_id) in enumerate(schedule):
        assert step < 1000
        assert action in ['add_row', 'remove_row', 'sample_tree']
        if action == 'sample_tree':
            assert row_id is None
        else:
            assert 0 <= row_id < num_rows
            if action == 'add_row':
                assigned_rows += 1
            elif action == 'remove_row':
                assigned_rows -= 1
    assert assigned_rows == num_rows
Example #16
def test_assignment_sampler_gof(N, V, C, M):
    config = make_config(model_num_clusters=M)
    K = V * (V - 1) // 2
    dataset = generate_dataset(num_rows=N, num_cols=V, num_cats=C)
    table = dataset['table']
    tree_prior = np.exp(np.random.random(K), dtype=np.float32)
    trainer = TreeCatTrainer(table, tree_prior, config)
    print('Data:')
    print(dataset['table'].data)

    # Add all rows.
    set_random_seed(1)
    for row_id in range(N):
        trainer.add_row(row_id)

    # Collect samples.
    num_samples = 500 * M**(N * V)
    counts = {}
    logprobs = {}
    for _ in range(num_samples):
        for row_id in range(N):
            # This is a single-site Gibbs sampler.
            trainer.remove_row(row_id)
            trainer.add_row(row_id)
        key = hash_assignments(trainer._assignments)
        if key in counts:
            counts[key] += 1
        else:
            counts[key] = 1
            logprobs[key] = trainer.logprob()
    assert len(counts) == M**(N * V)

    # Check accuracy using Pearson's chi-squared test.
    keys = sorted(counts.keys())
    counts = np.array([counts[k] for k in keys], dtype=np.int32)
    probs = np.exp(np.array([logprobs[k] for k in keys]))
    probs /= probs.sum()
    print('Actual\tExpected\tAssignment')
    for count, prob, key in zip(counts, probs, keys):
        print('{:}\t{:0.1f}\t{}'.format(count, prob * num_samples, key))
    gof = multinomial_goodness_of_fit(probs, counts, num_samples, plot=True)
    assert 1e-2 < gof
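
For a sense of scale: with the smallest settings M = 2 and N = V = 2, the sampler must visit all M**(N * V) = 2**4 = 16 assignments, and num_samples = 500 * 16 = 8000 full Gibbs sweeps; both grow exponentially in N * V, so this style of exact goodness-of-fit test is only practical at tiny sizes.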
Example #17
def test_quantize_from_probs2(size, resolution):
    set_random_seed(make_seed(size, resolution))
    probs = np.exp(np.random.random(size)).astype(np.float32)
    probs2 = probs.reshape((1, size))
    quantized = quantize_from_probs2(probs2, resolution)
    assert quantized.shape == probs2.shape
    assert quantized.dtype == np.int8
    assert np.all(quantized.sum(axis=1) == resolution)

    # Check that quantized result is closer to target than any other value.
    quantized = quantized.reshape((size, ))
    target = resolution * probs / probs.sum()
    distance = np.abs(quantized - target).sum()
    for combo in itertools.combinations_with_replacement(range(size), resolution):
        other = np.zeros(size, np.int8)
        for i in combo:
            other[i] += 1
        assert other.sum() == resolution
        other_distance = np.abs(other - target).sum()
        assert distance <= other_distance
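
To see what the brute-force loop enumerates: each multiset of resolution indices drawn from range(size) corresponds to one nonnegative integer vector summing to resolution, giving C(size + resolution - 1, resolution) candidates in total. A small standalone illustration:

import itertools

size, resolution = 3, 2
for combo in itertools.combinations_with_replacement(range(size), resolution):
    print(combo)  # (0, 0) (0, 1) (0, 2) (1, 1) (1, 2) (2, 2) -- C(4, 2) = 6 in all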
Example #18
def test_sample_tree_gof(num_edges):
    set_random_seed(num_edges)
    E = num_edges
    V = 1 + E
    grid = make_complete_graph(V)
    K = grid.shape[1]
    edge_logits = np.random.random([K])
    edge_probs = np.exp(edge_logits)
    edge_probs_dict = {(v1, v2): edge_probs[k] for k, v1, v2 in grid.T}

    # Generate many samples via MCMC.
    num_samples = 30 * NUM_SPANNING_TREES[V]
    counts = defaultdict(int)
    edges = [(v, v + 1) for v in range(V - 1)]
    for _ in range(num_samples):
        edges = sample_tree(grid, edge_logits, edges)
        counts[tuple(edges)] += 1
    assert len(counts) == NUM_SPANNING_TREES[V]

    # Check accuracy using Pearson's chi-squared test.
    keys = list(counts.keys())
    counts = np.array([counts[key] for key in keys])
    probs = np.array(
        [np.prod([edge_probs_dict[edge] for edge in key]) for key in keys])
    probs /= probs.sum()

    # Possibly truncate to keep the chi-squared test and plot tractable.
    T = 100
    truncated = False
    if len(counts) > T:
        counts = counts[:T]
        probs = probs[:T]
        truncated = True

    gof = multinomial_goodness_of_fit(probs,
                                      counts,
                                      num_samples,
                                      plot=True,
                                      truncated=truncated)
    assert 1e-2 < gof