Example #1
def train_files(dataset_path, config_path):
    """INTERNAL Train from pickled dataset, config."""
    import numpy as np
    from treecat.training import train_ensemble
    dataset = pickle_load(dataset_path)
    table = dataset['table']
    V = table.num_cols
    K = V * (V - 1) // 2  # Number of possible undirected edges among V features.
    tree_prior = np.zeros(K, dtype=np.float32)  # Uniform prior over tree edges.
    config = pickle_load(config_path)
    train_ensemble(table, tree_prior, config)
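
Here K = V * (V - 1) // 2 is the number of possible undirected edges between the V feature columns, so the zero vector appears to encode a uniform prior over candidate tree edges. A minimal sketch of that correspondence (the edge_index helper below is hypothetical, not part of treecat):

import numpy as np

V = 4                    # Four feature columns.
K = V * (V - 1) // 2     # 6 possible undirected edges: (0,1), (0,2), ..., (2,3).
tree_prior = np.zeros(K, dtype=np.float32)  # One prior entry per candidate edge.
assert tree_prior.shape == (K,)

# Hypothetical flat indexing of edge (u, v) with u < v, matching K's
# triangular count.
def edge_index(u, v, V):
    assert 0 <= u < v < V
    return u * V - u * (u + 1) // 2 + (v - u - 1)

assert edge_index(2, 3, V) == K - 1  # The last edge maps to the last entry.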
Example #2
def serve_files(model_path, config_path, num_samples):
    """INTERNAL Serve from pickled model, config."""
    from treecat.serving import TreeCatServer
    import numpy as np
    model = pickle_load(model_path)
    config = pickle_load(config_path)
    model['config'] = config  # Override the config stored with the model.
    server = TreeCatServer(model)
    # Exercise the main server methods end-to-end; results are discarded.
    counts = np.ones(model['tree'].num_vertices, np.int8)
    samples = server.sample(int(num_samples), counts)
    server.logprob(samples)
    server.median(counts, samples)
    server.latent_correlation()
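
The counts vector has one entry per vertex of the model's latent tree, so the all-ones vector presumably requests one observation of every feature in each sampled row. Invoking the harness is then a single call (file names hypothetical):

serve_files('model.pkz', 'config.pkz', num_samples=100)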
Example #3
def process_eval_task(task):
    (dataset_path, config, models_dir) = task

    # Load a server with the trained model.
    model_path = os.path.join(models_dir,
                              'model.{}.pkz'.format(serialize_config(config)))
    try:
        model = pickle_load(model_path)
    except (OSError, EOFError):
        # The model file is missing or corrupt; report only the config.
        return {'config': config}
    print('Eval {}'.format(os.path.basename(model_path)))
    server = TreeCatServer(model)

    # Split data for crossvalidation.
    num_parts = config['model_ensemble_size']
    partid = config['seed']
    assert 0 <= partid < num_parts
    dataset = pickle_load(dataset_path)
    table = dataset['table']
    ragged_index = table.ragged_index
    data = table.data
    mask = split_data(ragged_index, table.num_rows, num_parts, partid)
    training_data = data.copy()
    training_data[mask] = 0
    validation_data = data.copy()
    validation_data[~mask] = 0

    # Compute posterior predictive log probability of held-out data.
    logprob = np.mean(server.logprob(data) - server.logprob(training_data))

    # Compute L1 loss on observed validation data.
    N, R = data.shape
    V = len(ragged_index) - 1
    obs_counts = count_observations(ragged_index, data)
    assert obs_counts.shape == (N, V)
    max_counts = obs_counts.max(axis=0)
    median = server.median(max_counts, training_data)
    observed = (obs_counts == max_counts[np.newaxis, :])
    observed = make_ragged_mask(ragged_index, observed.T).T
    relevant = observed & mask
    validation_data[~relevant] = 0
    median[~relevant] = 0
    l1_loss = 0.5 * np.abs(median - validation_data).sum()
    l1_loss /= relevant.sum() + 0.1  # Smoothed to avoid division by zero.

    return {
        'config': config,
        'logprob': logprob,
        'l1_loss': l1_loss,
        'profiling_stats': model.get('profiling_stats', {}),
    }
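
Two details above are worth spelling out. First, server.logprob(data) - server.logprob(training_data) is, per row, log p(full row) - log p(observed part), i.e. the conditional log probability of the held-out cells given the training cells. Second, the mask makes the training and validation copies exactly complementary. A toy sketch of the masking (the mask values here are hypothetical; split_data's real output depends on ragged_index):

import numpy as np

data = np.array([[1, 2], [3, 4]], dtype=np.int8)
mask = np.array([[True, False], [False, True]])  # Hypothetical held-out cells.

training_data = data.copy()
training_data[mask] = 0        # Hide held-out cells from training.
validation_data = data.copy()
validation_data[~mask] = 0     # Keep only held-out cells for validation.

# Every cell lands in exactly one copy.
assert np.array_equal(training_data + validation_data, data)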
Example #4
def train(dataset_in, ensemble_out, **options):
    """Train a TreeCat ensemble model on imported data."""
    from treecat.training import train_ensemble
    dataset = pickle_load(dataset_in)
    table = dataset['table']
    tree_prior = dataset['schema']['tree_prior']
    config = make_config(**options)
    ensemble = train_ensemble(table, tree_prior, config)
    pickle_dump(ensemble, ensemble_out)
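
A sketch of a typical call, with hypothetical file names; the keyword options are forwarded to make_config, and both option names below appear elsewhere in these examples (learning_init_epochs in Example #7, model_ensemble_size in Examples #3 and #6):

train('dataset.pkz', 'ensemble.pkz',
      model_ensemble_size=8,
      learning_init_epochs=5)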
Example #5
def serve_model(dataset, model):
    """Create a server object from the given dataset and model.

    Args:
        dataset: Either the filename of a pickled dataset or an already loaded
            dataset.
        model: Either the filename of a pickled TreeCat model or ensemble, or
            an already loaded model or ensemble.

    Returns:
        A DataServer object.
    """
    if isinstance(dataset, str):
        dataset = pickle_load(dataset)
    if isinstance(model, str):
        model = pickle_load(model)
    if isinstance(model, dict):
        model = [model]  # Wrap a single model as a one-model ensemble.
    return DataServer(dataset, model)
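
A sketch of the two call patterns the docstring allows, with hypothetical file names:

# From pickled files:
server = serve_model('dataset.pkz', 'ensemble.pkz')

# From already-loaded objects; a single model dict is
# wrapped as a one-model ensemble automatically:
dataset = pickle_load('dataset.pkz')
model = pickle_load('model.pkz')
server = serve_model(dataset, model)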
Example #6
def train_task(dataset_path, model_path, config_str):
    """INTERNAL Train a single model."""
    print('Train {}'.format(os.path.basename(model_path)))
    config = deserialize_config(config_str)
    dataset = pickle_load(dataset_path)
    tree_prior = dataset['schema']['tree_prior']

    # Split data for crossvalidation.
    num_parts = config['model_ensemble_size']
    partid = config['seed']
    assert 0 <= partid < num_parts
    table = dataset['table']
    ragged_index = table.ragged_index
    mask = split_data(ragged_index, table.num_rows, num_parts, partid)
    training_data = table.data.copy()
    training_data[mask] = 0  # Hide the held-out cells from training.
    training_table = Table(table.feature_types, ragged_index, training_data)

    # Train a model.
    model = train_model(training_table, tree_prior, config)
    model['profiling_stats'] = get_profiling_stats()
    pickle_dump(model, model_path)
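
Each call trains one ensemble member, with config['seed'] doubling as the crossvalidation part id. A hypothetical driver would loop over seeds, reusing the path scheme and serialize_config from Example #3:

import os

def train_ensemble_members(dataset_path, models_dir, config):
    # Hypothetical driver: train every member of one configuration.
    for seed in range(config['model_ensemble_size']):
        config['seed'] = seed  # The seed doubles as the crossvalidation part id.
        config_str = serialize_config(config)
        model_path = os.path.join(models_dir,
                                  'model.{}.pkz'.format(config_str))
        train_task(dataset_path, model_path, config_str)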
Example #7
def generate_model_file(num_rows, num_cols, num_cats=4, rate=1.0):
    """Generate a random model.

    Returns:
        The path to a gzipped pickled model.
    """
    path = os.path.join(
        DATA, '{}-{}-{}-{:0.1f}.model.pkz'.format(num_rows, num_cols, num_cats,
                                                  rate))
    if os.path.exists(path):
        return path  # Reuse the cached model.
    print('Generating {}'.format(path))
    if not os.path.exists(DATA):
        os.makedirs(DATA)
    dataset_path = generate_dataset_file(num_rows, num_cols, num_cats, rate)
    dataset = pickle_load(dataset_path)
    table = dataset['table']
    V = num_cols
    K = V * (V - 1) // 2  # Number of possible undirected edges among V features.
    tree_prior = np.zeros(K, dtype=np.float32)  # Uniform prior over tree edges.
    config = make_config(learning_init_epochs=5)
    model = train_model(table, tree_prior, config)
    pickle_dump(model, path)
    return path
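
Because the path encodes all four parameters, the function acts as a cache: the first call trains and pickles a model, and later calls with the same arguments return immediately. A small usage sketch:

path = generate_model_file(100, 8)           # Trains a model on first call.
assert generate_model_file(100, 8) == path   # Subsequent calls hit the cache.
model = pickle_load(path)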
Example #8
def test_pickle(data, ext):
    with tempdir() as dirname:
        filename = os.path.join(dirname, 'test.{}'.format(ext))
        pickle_dump(data, filename)
        actual = pickle_load(filename)
        assert_equal(actual, data)
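
A test like this is presumably driven by pytest parametrization over data values and serialization formats; a hedged sketch (the parameter sets are assumptions, though the .pkz extension for gzipped pickles appears in Example #7):

import pytest

# Hypothetical parameter sets; decorating test_pickle this way runs it
# once per (data, ext) combination.
@pytest.mark.parametrize('ext', ['pkl', 'pkz'])
@pytest.mark.parametrize('data', [None, {'key': [1, 2]}, 'text'])
def test_pickle(data, ext):
    with tempdir() as dirname:
        filename = os.path.join(dirname, 'test.{}'.format(ext))
        pickle_dump(data, filename)
        actual = pickle_load(filename)
        assert_equal(actual, data)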