def _test_dataset(args):
    dim, feature_type, density, infer_kinds, debug, hyper_prior = args
    object_count, feature_count = dim
    with tempdir(cleanup_on_error=(not debug)):
        seed_all(SEED)
        config_name = os.path.abspath('config.pb')
        model_base_name = 'model.pb'
        model_name = os.path.abspath(model_base_name)
        rows_name = os.path.abspath('rows.pbs')

        models = generate_model(feature_count, feature_type, hyper_prior)
        model, fixed_hyper_models = models
        dump_model(model, model_name)
        fixed_model_names = []
        for i, fm in enumerate(fixed_hyper_models):
            fixed_model_base = 'fixed-{}-{}'.format(i, model_base_name)
            fixed_model_name = os.path.abspath(fixed_model_base)
            fixed_model_names.append(fixed_model_name)
            dump_model(fm, fixed_model_name)
        if hyper_prior is None:
            assert len(fixed_model_names) == 0

        rows = generate_rows(
            object_count,
            feature_count,
            feature_type,
            density)
        dump_rows(rows, rows_name)

        infer_cats = (object_count > 1)
        infer_hypers = (hyper_prior is not None)

        if infer_kinds:
            sample_count = 10 * LATENT_SIZES[object_count][feature_count]
            iterations = 32
        else:
            sample_count = 10 * LATENT_SIZES[object_count][1]
            iterations = 0

        config = {
            'posterior_enum': {
                'sample_count': sample_count,
                'sample_skip': 10,
            },
            'kernels': {
                'hyper': {
                    'run': infer_hypers,
                    'parallel': False,
                },
                'kind': {
                    'iterations': iterations,
                    'row_queue_capacity': 0,
                    'score_parallel': False,
                },
            },
        }
        loom.config.config_dump(config, config_name)

        casename = '{}-{}-{}-{}-{}{}{}'.format(
            object_count,
            feature_count,
            feature_type,
            density,
            ('C' if infer_cats else ''),
            ('K' if infer_kinds else ''),
            ('H' if infer_hypers else ''))
        # LOG('Run', casename)
        error = _test_dataset_config(
            casename,
            object_count,
            feature_count,
            config_name,
            model_name,
            fixed_model_names,
            rows_name,
            config,
            debug)
        return [] if error is None else [error]
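# A minimal usage sketch (illustrative only): the tuple layout matches the
# unpacking at the top of _test_dataset, but the particular feature type,
# density, and dimensions below are assumptions, not values taken from this
# suite; the chosen dim must index into LATENT_SIZES.
def _example_test_dataset():
    dim = (2, 2)  # (object_count, feature_count)
    args = (dim, 'nich', 0.5, False, False, None)  # no kind/hyper inference
    errors = _test_dataset(args)
    assert errors == [], errors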
def _test_modify_schema(modify, name, schema, rows_csv, **unused):
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR) as store:
        with mock.patch('loom.store.STORE', new=store):
            modified_schema = os.path.join(store, 'schema.json')
            data = json_load(schema)
            data = modify(data)
            json_dump(data, modified_schema)
            loom.tasks.ingest(name, modified_schema, rows_csv, debug=True)
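# Sketch of a `modify` callable as consumed above: it receives the
# JSON-decoded schema and returns an edited copy. Dropping a field is an
# assumed example edit, not a case from this suite; it presumes the schema
# is a flat mapping from feature name to type.
def _example_drop_first_field(data):
    data = dict(data)
    data.pop(sorted(data)[0])
    return data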
def test_dump_rows():
    for feature_type in FEATURE_TYPES:
        table = generate_rows(10, 10, feature_type, 0.5)
        with tempdir():
            rows_name = os.path.abspath('rows.pbs')
            dump_rows(table, rows_name)
            message = loom.schema_pb2.Row()
            for string in protobuf_stream_load(rows_name):
                message.ParseFromString(string)
def _test_modify_csv(modify, name, schema, encoding, rows, **unused):
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR) as store:
        with mock.patch('loom.store.STORE', new=store):
            rows_dir = os.path.join(store, 'rows_csv')
            loom.format.export_rows(encoding, rows, rows_dir)
            rows_csv = os.path.join(rows_dir, os.listdir(rows_dir)[0])
            data = csv_load(rows_csv)
            data = modify(data)
            csv_dump(data, rows_csv)
            loom.tasks.ingest(name, schema, rows_csv, debug=True)
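# Sketch of a `modify` callable for the CSV variant above: `data` is the
# table loaded by csv_load. Truncating the body is an assumed example edit,
# and the header-row-first layout is an assumption for illustration.
def _example_truncate_rows(data):
    header, body = data[0], data[1:]
    return [header] + body[:len(body) // 2]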
def generate_samples(casename, dataset, debug):
    root = os.getcwd()
    with tempdir(cleanup_on_error=(not debug)):
        results = {
            'schema': os.path.abspath('schema.json'),
            'schema_row': os.path.abspath('schema_row.pb'),
            'tares': os.path.abspath('tares.pbs'),
            'diffs': os.path.abspath('diffs.pbs'),
            'samples': os.path.abspath('samples.pbs.gz'),
        }
        os.chdir(root)
        run_posterior_enum(casename, dataset, results, debug)
        for sample in load_samples(results['samples']):
            yield sample
def generate(
        feature_type='mixed',
        row_count=1000,
        feature_count=100,
        density=0.5,
        rows_out='rows.pbs.gz',
        model_out='model.pb.gz',
        groups_out=None,
        assign_out=None,
        init_out=None,
        debug=False,
        profile=None):
    '''
    Generate a synthetic dataset.
    '''
    root = os.getcwd()
    rows_out = os.path.abspath(rows_out)
    model_out = os.path.abspath(model_out)
    if groups_out is not None:
        groups_out = os.path.abspath(groups_out)
    if assign_out is not None:
        assign_out = os.path.abspath(assign_out)
    if init_out is not None:
        init_out = os.path.abspath(init_out)
    features = generate_features(feature_count, feature_type)
    model = generate_model(features)
    with tempdir(cleanup_on_error=(not debug)):
        if init_out is None:
            init_out = os.path.abspath('init.pb.gz')
        with open_compressed(init_out, 'wb') as f:
            f.write(model.SerializeToString())
        config = {'generate': {'row_count': row_count, 'density': density}}
        config_in = os.path.abspath('config.pb.gz')
        loom.config.config_dump(config, config_in)
        os.chdir(root)
        loom.runner.generate(
            config_in=config_in,
            model_in=init_out,
            rows_out=rows_out,
            model_out=model_out,
            groups_out=groups_out,
            assign_out=assign_out,
            debug=debug,
            profile=profile)
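# Usage sketch (sizes and output paths are illustrative, not defaults used
# elsewhere in this module): generate a small mixed-type dataset, writing
# the rows and model into the current directory.
def _example_generate_small():
    generate(
        feature_type='mixed',
        row_count=100,
        feature_count=10,
        density=0.5,
        rows_out='example_rows.pbs.gz',
        model_out='example_model.pb.gz')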
def generate_samples(model_name, rows_name, config_name, debug):
    with tempdir(cleanup_on_error=(not debug)):
        samples_name = os.path.abspath('samples.pbs.gz')
        with chdir(CWD):
            loom.runner.posterior_enum(
                config_name,
                model_name,
                rows_name,
                samples_name,
                debug=debug)
        message = loom.schema_pb2.PosteriorEnum.Sample()
        for string in protobuf_stream_load(samples_name):
            message.ParseFromString(string)
            sample = parse_sample(message)
            score = float(message.score)
            yield sample, score
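# Sketch: materializing the (sample, score) stream above and log-sum-exp
# normalizing the scores into log posterior weights. The helper name is an
# assumption for illustration; it only relies on the generator yielding
# (sample, score) pairs as defined above.
import math

def _example_collect_scores(model_name, rows_name, config_name):
    pairs = list(generate_samples(model_name, rows_name, config_name, debug=False))
    max_score = max(score for _, score in pairs)
    total = max_score + math.log(
        sum(math.exp(score - max_score) for _, score in pairs))
    return [(sample, score - total) for sample, score in pairs]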
def generate(
        feature_type='mixed',
        row_count=1000,
        feature_count=100,
        density=0.5,
        init_out='init.pb.gz',
        rows_out='rows.pbs.gz',
        model_out='model.pb.gz',
        groups_out='groups',
        debug=False,
        profile=None):
    '''
    Generate a synthetic dataset.
    '''
    root = os.path.abspath(os.path.curdir)
    init_out = os.path.abspath(init_out)
    rows_out = os.path.abspath(rows_out)
    model_out = os.path.abspath(model_out)
    groups_out = os.path.abspath(groups_out)
    model = generate_model(row_count, feature_count, feature_type, density)
    # SerializeToString() returns bytes, so the file must be opened in
    # binary mode ('wb', not 'w').
    with open_compressed(init_out, 'wb') as f:
        f.write(model.SerializeToString())
    with tempdir(cleanup_on_error=(not debug)):
        config = {'generate': {'row_count': row_count, 'density': density}}
        config_in = os.path.abspath('config.pb.gz')
        loom.config.config_dump(config, config_in)
        os.chdir(root)
        loom.runner.generate(
            config_in=config_in,
            model_in=init_out,
            rows_out=rows_out,
            model_out=model_out,
            groups_out=groups_out,
            debug=debug,
            profile=profile)
def test_missing_rows_error(name, schema, **unused):
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR) as store:
        with mock.patch('loom.store.STORE', new=store):
            rows_csv = os.path.join(store, 'missing.rows_csv')
            loom.tasks.ingest(name, schema, rows_csv, debug=True)