def train_files(dataset_path, config_path):
    """INTERNAL Train from pickled dataset, config."""
    from treecat.training import train_ensemble
    dataset = pickle_load(dataset_path)
    config = pickle_load(config_path)
    table = dataset['table']
    num_cols = table.num_cols
    # K = V*(V-1)/2 is the number of possible undirected tree edges.
    num_edges = num_cols * (num_cols - 1) // 2
    # Use a flat (all-zeros) prior over tree edges.
    flat_prior = np.zeros(num_edges, dtype=np.float32)
    train_ensemble(table, flat_prior, config)
def serve_files(model_path, config_path, num_samples):
    """INTERNAL Serve from pickled model, config."""
    from treecat.serving import TreeCatServer
    import numpy as np
    model = pickle_load(model_path)
    model['config'] = pickle_load(config_path)
    server = TreeCatServer(model)
    # Request one observation per tree vertex.
    counts = np.ones(model['tree'].num_vertices, np.int8)
    samples = server.sample(int(num_samples), counts)
    # Exercise the remaining server entry points; results are discarded.
    server.logprob(samples)
    server.median(counts, samples)
    server.latent_correlation()
def process_eval_task(task): (dataset_path, config, models_dir) = task # Load a server with the trained model. model_path = os.path.join(models_dir, 'model.{}.pkz'.format(serialize_config(config))) try: model = pickle_load(model_path) except (OSError, EOFError): return {'config': config} print('Eval {}'.format(os.path.basename(model_path))) server = TreeCatServer(model) # Split data for crossvalidation. num_parts = config['model_ensemble_size'] partid = config['seed'] assert 0 <= partid < num_parts dataset = pickle_load(dataset_path) table = dataset['table'] ragged_index = table.ragged_index data = table.data mask = split_data(ragged_index, table.num_rows, num_parts, partid) training_data = data.copy() training_data[mask] = 0 validation_data = data.copy() validation_data[~mask] = 0 # Compute posterior predictive log probability of held-out data. logprob = np.mean(server.logprob(data) - server.logprob(training_data)) # Compute L1 loss on observed validation data. N, R = data.shape V = len(ragged_index) - 1 obs_counts = count_observations(ragged_index, data) assert obs_counts.shape == (N, V) max_counts = obs_counts.max(axis=0) median = server.median(max_counts, training_data) observed = (obs_counts == max_counts[np.newaxis, :]) observed = make_ragged_mask(ragged_index, observed.T).T relevant = observed & mask validation_data[~relevant] = 0 median[~relevant] = 0 l1_loss = 0.5 * np.abs(median - validation_data).sum() l1_loss /= relevant.sum() + 0.1 return { 'config': config, 'logprob': logprob, 'l1_loss': l1_loss, 'profiling_stats': model.get('profiling_stats', {}), }
def train(dataset_in, ensemble_out, **options):
    """Train a TreeCat ensemble model on imported data."""
    from treecat.training import train_ensemble
    dataset = pickle_load(dataset_in)
    config = make_config(**options)
    ensemble = train_ensemble(
        dataset['table'], dataset['schema']['tree_prior'], config)
    pickle_dump(ensemble, ensemble_out)
def serve_model(dataset, model):
    """Create a server object from the given dataset and model.

    Args:
        dataset: Either the filename of a pickled dataset or an already
            loaded dataset.
        model: Either the filename of a pickled TreeCat model or ensemble,
            or an already loaded model or ensemble.

    Returns:
        A DataServer object.
    """
    # Filenames are loaded; in-memory objects pass through unchanged.
    if isinstance(dataset, str):
        dataset = pickle_load(dataset)
    if isinstance(model, str):
        model = pickle_load(model)
    # A single model dict is wrapped as a one-element ensemble.
    ensemble = [model] if isinstance(model, dict) else model
    return DataServer(dataset, ensemble)
def train_task(dataset_path, model_path, config_str):
    """INTERNAL Train a single model."""
    print('Train {}'.format(os.path.basename(model_path)))
    config = deserialize_config(config_str)
    dataset = pickle_load(dataset_path)
    tree_prior = dataset['schema']['tree_prior']
    table = dataset['table']
    # Hold out one crossvalidation part, selected by this model's seed.
    num_parts = config['model_ensemble_size']
    partid = config['seed']
    assert 0 <= partid < num_parts
    ragged_index = table.ragged_index
    heldout = split_data(ragged_index, table.num_rows, num_parts, partid)
    training_data = table.data.copy()
    training_data[heldout] = 0
    training_table = Table(table.feature_types, ragged_index, training_data)
    # Train on the remaining data and persist with profiling info.
    model = train_model(training_table, tree_prior, config)
    model['profiling_stats'] = get_profiling_stats()
    pickle_dump(model, model_path)
def generate_model_file(num_rows, num_cols, num_cats=4, rate=1.0):
    """Generate a random model.

    Returns:
        The path to a gzipped pickled model.
    """
    basename = '{}-{}-{}-{:0.1f}.model.pkz'.format(
        num_rows, num_cols, num_cats, rate)
    path = os.path.join(DATA, basename)
    V = num_cols
    K = V * (V - 1) // 2
    if os.path.exists(path):
        # Reuse the cached model from an earlier run.
        return path
    print('Generating {}'.format(path))
    if not os.path.exists(DATA):
        os.makedirs(DATA)
    dataset_path = generate_dataset_file(num_rows, num_cols, num_cats, rate)
    dataset = pickle_load(dataset_path)
    # Train with a flat edge prior and a short schedule.
    flat_prior = np.zeros(K, dtype=np.float32)
    config = make_config(learning_init_epochs=5)
    model = train_model(dataset['table'], flat_prior, config)
    pickle_dump(model, path)
    return path
def test_pickle(data, ext):
    # Round-trip data through a temporary file and verify equality.
    with tempdir() as dirname:
        path = os.path.join(dirname, 'test.{}'.format(ext))
        pickle_dump(data, path)
        roundtripped = pickle_load(path)
        assert_equal(roundtripped, data)