def test_import_rows(encoding, rows, rows_csv, **unused):
    '''Importing CSV rows must preserve the protobuf-stream row count.'''
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        imported = os.path.abspath('rows.pbs.gz')
        loom.format.import_rows(
            encoding_in=encoding,
            rows_csv_in=rows_csv,
            rows_out=imported)
        assert_found(imported)
        # Compare row counts of the reference stream and the imported stream.
        expected_count = sum(1 for _ in protobuf_stream_load(rows))
        actual_count = sum(1 for _ in protobuf_stream_load(imported))
        assert_equal(actual_count, expected_count)
def test_import_rows(encoding, rows, rows_csv, **unused):
    '''Check that CSV import yields as many rows as the reference stream.'''
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        rows_out = os.path.abspath('rows.pbs.gz')
        loom.format.import_rows(
            encoding_in=encoding, rows_csv_in=rows_csv, rows_out=rows_out)
        assert_found(rows_out)
        count_of = lambda path: sum(1 for _ in protobuf_stream_load(path))
        assert_equal(count_of(rows_out), count_of(rows))
def crossvalidate_one(
        seed, test_count, train_count, inputs, results, extra_passes, debug):
    '''
    Run one crossvalidation fold: split rows by seed, train a model on the
    train split, and return the mean score of the held-out test split.

    Side effects: writes train/test streams, config, model, groups, and a
    scores.json.gz file under results['root'].
    '''
    LOG('running seed {}:'.format(seed))
    results['train'] = os.path.join(results['root'], 'train', 'diffs.pbs.gz')
    results['test'] = os.path.join(results['root'], 'test', 'rows.pbs.gz')
    results['scores'] = os.path.join(results['root'], 'scores.json.gz')
    config = {
        'seed': seed,
        'schedule': {'extra_passes': extra_passes},
    }
    loom.config.config_dump(config, results['samples'][0]['config'])
    # Seed numpy so the train/test split is reproducible per fold.
    numpy.random.seed(seed)
    split = [True] * train_count + [False] * test_count
    numpy.random.shuffle(split)
    # The same shuffled split selects train rows from the diffs stream and
    # test rows from the raw rows stream; both streams are assumed to be in
    # the same row order.
    diffs_in = protobuf_stream_load(inputs['ingest']['diffs'])
    protobuf_stream_dump(
        (row for s, row in izip(split, diffs_in) if s),
        results['train'])
    rows_in = protobuf_stream_load(inputs['ingest']['rows'])
    protobuf_stream_dump(
        (row for s, row in izip(split, rows_in) if not s),
        results['test'])
    LOG(' shuffle')
    loom.runner.shuffle(
        rows_in=results['train'],
        rows_out=results['samples'][0]['shuffled'],
        seed=seed,
        debug=debug)
    LOG(' init')
    loom.generate.generate_init(
        encoding_in=inputs['ingest']['encoding'],
        model_out=results['samples'][0]['init'],
        seed=seed)
    LOG(' infer')
    loom.runner.infer(
        config_in=results['samples'][0]['config'],
        rows_in=results['samples'][0]['shuffled'],
        tares_in=inputs['ingest']['tares'],
        model_in=results['samples'][0]['init'],
        model_out=results['samples'][0]['model'],
        groups_out=results['samples'][0]['groups'],
        debug=debug)
    LOG(' query')
    # Score each held-out row against the trained sample.
    rows = loom.query.load_data_rows(results['test'])
    loom.config.config_dump({}, results['query']['config'])
    with loom.query.get_server(results['root'], debug=debug) as query:
        scores = [query.score(row) for row in rows]
    json_dump(scores, results['scores'])
    LOG(' done\n')
    return numpy.mean(scores)
def test_infer(name, tares, shuffled, init, **unused):
    '''
    Run inference under each config in CONFIGS and sanity-check outputs:
    kind count is preserved when kind structure is fixed, every row gets an
    assignment, and no group exceeds the row count.
    '''
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        row_count = sum(1 for _ in protobuf_stream_load(shuffled))
        # Read the initial model to learn how many kinds it defines.
        with open_compressed(init) as f:
            message = CrossCat()
            message.ParseFromString(f.read())
        kind_count = len(message.kinds)
        for config in CONFIGS:
            loom.config.fill_in_defaults(config)
            schedule = config['schedule']
            print 'config: {}'.format(config)
            # With no extra passes or no kind iterations, the kind structure
            # cannot change during inference.
            greedy = (schedule['extra_passes'] == 0)
            kind_iters = config['kernels']['kind']['iterations']
            kind_structure_is_fixed = greedy or kind_iters == 0
            with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
                config_in = os.path.abspath('config.pb.gz')
                model_out = os.path.abspath('model.pb.gz')
                groups_out = os.path.abspath('groups')
                assign_out = os.path.abspath('assign.pbs.gz')
                log_out = os.path.abspath('log.pbs.gz')
                os.mkdir(groups_out)
                loom.config.config_dump(config, config_in)
                loom.runner.infer(
                    config_in=config_in,
                    rows_in=shuffled,
                    tares_in=tares,
                    model_in=init,
                    model_out=model_out,
                    groups_out=groups_out,
                    assign_out=assign_out,
                    log_out=log_out,
                    debug=True,)
                if kind_structure_is_fixed:
                    # One groups file per kind.
                    assert_equal(len(os.listdir(groups_out)), kind_count)
                group_counts = get_group_counts(groups_out)
                # Every input row must receive exactly one assignment.
                assign_count = sum(
                    1 for _ in protobuf_stream_load(assign_out))
                assert_equal(assign_count, row_count)
                print 'row_count: {}'.format(row_count)
                print 'group_counts: {}'.format(
                    ' '.join(map(str, group_counts)))
                for group_count in group_counts:
                    assert_true(
                        group_count <= row_count,
                        'groups are all singletons')
def test_infer(name, tares, shuffled, init, **unused):
    '''
    Run inference under each config in CONFIGS and sanity-check outputs:
    kind count is preserved when kind structure is fixed, every row gets an
    assignment, and no group exceeds the row count.
    '''
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        row_count = sum(1 for _ in protobuf_stream_load(shuffled))
        # Read the initial model to learn how many kinds it defines.
        with open_compressed(init) as f:
            message = CrossCat()
            message.ParseFromString(f.read())
        kind_count = len(message.kinds)
        for config in CONFIGS:
            loom.config.fill_in_defaults(config)
            schedule = config['schedule']
            print 'config: {}'.format(config)
            # With no extra passes or no kind iterations, the kind structure
            # cannot change during inference.
            greedy = (schedule['extra_passes'] == 0)
            kind_iters = config['kernels']['kind']['iterations']
            kind_structure_is_fixed = greedy or kind_iters == 0
            with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
                config_in = os.path.abspath('config.pb.gz')
                model_out = os.path.abspath('model.pb.gz')
                groups_out = os.path.abspath('groups')
                assign_out = os.path.abspath('assign.pbs.gz')
                log_out = os.path.abspath('log.pbs.gz')
                os.mkdir(groups_out)
                loom.config.config_dump(config, config_in)
                loom.runner.infer(
                    config_in=config_in,
                    rows_in=shuffled,
                    tares_in=tares,
                    model_in=init,
                    model_out=model_out,
                    groups_out=groups_out,
                    assign_out=assign_out,
                    log_out=log_out,
                    debug=True,
                )
                if kind_structure_is_fixed:
                    # One groups file per kind.
                    assert_equal(len(os.listdir(groups_out)), kind_count)
                group_counts = get_group_counts(groups_out)
                # Every input row must receive exactly one assignment.
                assign_count = sum(
                    1 for _ in protobuf_stream_load(assign_out))
                assert_equal(assign_count, row_count)
                print 'row_count: {}'.format(row_count)
                print 'group_counts: {}'.format(
                    ' '.join(map(str, group_counts)))
                for group_count in group_counts:
                    assert_true(group_count <= row_count,
                                'groups are all singletons')
def load_rows(filename):
    '''Parse a protobuf stream file into a list of Row messages.'''
    parsed = []
    for serialized in protobuf_stream_load(filename):
        message = Row()
        message.ParseFromString(serialized)
        parsed.append(message)
    return parsed
def load_samples(filename):
    '''Yield (sample, score) pairs from a PosteriorEnum.Sample stream.'''
    # NOTE: a single message object is reused for every record; parse_sample
    # is expected to extract what it needs before the next iteration.
    message = loom.schema_pb2.PosteriorEnum.Sample()
    for serialized in protobuf_stream_load(filename):
        message.ParseFromString(serialized)
        yield parse_sample(message), float(message.score)
def batch_predict(
        config_in, model_in, groups_in, requests, debug=False, profile=None):
    '''Serialize query requests, run the loom query engine, and return the
    parsed responses in request order.'''
    root = os.path.abspath(os.path.curdir)
    with tempdir(cleanup_on_error=(not debug)):
        requests_in = os.path.abspath('requests.pbs.gz')
        responses_out = os.path.abspath('responses.pbs.gz')
        serialized = (request.SerializeToString() for request in requests)
        protobuf_stream_dump(serialized, requests_in)
        # Restore the original working directory before running the query
        # (tempdir presumably chdir'd into the scratch dir — paths above are
        # absolute, so this is safe either way).
        os.chdir(root)
        loom.runner.query(
            config_in=config_in,
            model_in=model_in,
            groups_in=groups_in,
            requests_in=requests_in,
            responses_out=responses_out,
            debug=debug,
            profile=profile)
        return [
            parse_response(response)
            for response in protobuf_stream_load(responses_out)
        ]
def run_posterior_enum(casename, dataset, results, debug, sparsify=True):
    '''Enumerate posterior samples, optionally sparsifying rows first.'''
    if not sparsify:
        # Fast path: enumerate directly over the raw rows.
        loom.runner.posterior_enum(
            config_in=dataset['config'],
            rows_in=dataset['rows'],
            model_in=dataset['model'],
            samples_out=results['samples'],
            debug=debug)
        return
    # Sparsify path: derive a schema, find tare rows, then diff rows
    # against the tares before enumerating.
    loom.format.make_schema(
        model_in=dataset['model'],
        schema_out=results['schema'])
    loom.format.make_schema_row(
        schema_in=results['schema'],
        schema_row_out=results['schema_row'])
    loom.runner.tare(
        schema_row_in=results['schema_row'],
        rows_in=dataset['rows'],
        tares_out=results['tares'],
        debug=debug)
    tare_count = sum(1 for _ in protobuf_stream_load(results['tares']))
    if casename is not None and tare_count:
        LOG('Info', casename, 'found {} tare rows'.format(tare_count))
    loom.runner.sparsify(
        schema_row_in=results['schema_row'],
        tares_in=results['tares'],
        rows_in=dataset['rows'],
        rows_out=results['diffs'],
        debug=debug)
    loom.runner.posterior_enum(
        config_in=dataset['config'],
        rows_in=results['diffs'],
        tares_in=results['tares'],
        model_in=dataset['model'],
        samples_out=results['samples'],
        debug=debug)
def pretty_print(filename, message_type='guess'):
    '''
    Print text/json/protobuf messages from a raw/gz/bz2 file.
    '''
    # The protocol is the last filename extension after stripping a
    # compression suffix, e.g. 'foo.pbs.gz' -> 'pbs'.
    parts = os.path.basename(filename).split('.')
    if parts[-1] in ['gz', 'bz2']:
        parts.pop()
    protocol = parts[-1]
    if protocol == 'json':
        data = json_load(filename)
        print json.dumps(data, sort_keys=True, indent=4)
    elif protocol == 'pb':
        # Single protobuf message.
        message = get_message(filename, message_type)
        with open_compressed(filename) as f:
            message.ParseFromString(f.read())
        print message
    elif protocol == 'pbs':
        # Protobuf stream: reuse one message object per record.
        message = get_message(filename, message_type)
        for string in protobuf_stream_load(filename):
            message.ParseFromString(string)
            print message
    elif protocol == 'pickle':
        data = pickle_load(filename)
        print repr(data)
    else:
        # Fallback: treat as plain text; trailing comma avoids doubling
        # newlines already present in the file.
        with open_compressed(filename) as f:
            for line in f:
                print line,
def test_posterior_enum(name, tares, diffs, init, **unused):
    '''posterior_enum must emit exactly the configured number of samples.'''
    config = {
        'posterior_enum': {
            'sample_count': 7,
        },
        'kernels': {
            'kind': {
                'row_queue_capacity': 0,
                'score_parallel': False,
            },
        },
    }
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        config_path = os.path.abspath('config.pb.gz')
        loom.config.config_dump(config, config_path)
        assert_found(config_path)
        samples_path = os.path.abspath('samples.pbs.gz')
        loom.runner.posterior_enum(
            config_in=config_path,
            model_in=init,
            tares_in=tares,
            rows_in=diffs,
            samples_out=samples_path,
            debug=True)
        assert_found(samples_path)
        sample_count = sum(1 for _ in protobuf_stream_load(samples_path))
        assert_equal(sample_count, config['posterior_enum']['sample_count'])
def test_posterior_enum(rows, model, **unused):
    '''posterior_enum must emit exactly the configured number of samples.'''
    config = {
        'posterior_enum': {
            'sample_count': 7,
        },
        'kernels': {
            'kind': {
                'row_queue_capacity': 0,
                'score_parallel': False,
            },
        },
    }
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        config_path = os.path.abspath('config.pb.gz')
        loom.config.config_dump(config, config_path)
        assert_true(os.path.exists(config_path))
        samples_path = os.path.abspath('samples.pbs.gz')
        loom.runner.posterior_enum(
            config_in=config_path,
            model_in=model,
            rows_in=rows,
            samples_out=samples_path,
            debug=True)
        assert_true(os.path.exists(samples_path))
        sample_count = sum(1 for _ in protobuf_stream_load(samples_path))
        assert_equal(sample_count, config['posterior_enum']['sample_count'])
def run_posterior_enum(casename, dataset, results, debug, sparsify=True):
    '''Run posterior enumeration, optionally via the sparsified pipeline.'''
    if not sparsify:
        loom.runner.posterior_enum(
            config_in=dataset['config'],
            rows_in=dataset['rows'],
            model_in=dataset['model'],
            samples_out=results['samples'],
            debug=debug)
        return
    # Build schema + schema row, detect tare rows, sparsify, then enumerate.
    loom.format.make_schema(
        model_in=dataset['model'], schema_out=results['schema'])
    loom.format.make_schema_row(
        schema_in=results['schema'], schema_row_out=results['schema_row'])
    loom.runner.tare(
        schema_row_in=results['schema_row'],
        rows_in=dataset['rows'],
        tares_out=results['tares'],
        debug=debug)
    tare_count = sum(1 for _ in protobuf_stream_load(results['tares']))
    if casename is not None and tare_count:
        LOG('Info', casename, 'found {} tare rows'.format(tare_count))
    loom.runner.sparsify(
        schema_row_in=results['schema_row'],
        tares_in=results['tares'],
        rows_in=dataset['rows'],
        rows_out=results['diffs'],
        debug=debug)
    loom.runner.posterior_enum(
        config_in=dataset['config'],
        rows_in=results['diffs'],
        tares_in=results['tares'],
        model_in=dataset['model'],
        samples_out=results['samples'],
        debug=debug)
def ingest(name, schema=None, rows_csv=None, id_field=None, debug=False):
    '''
    Ingest dataset with optional json config.

    Arguments:
        name            A unique identifier for ingest + inference
        schema          Json schema file, e.g., {"feature1": "nich"}
        rows_csv        File or directory of csv files or csv.gz files
        id_field        Column name of id field in input csv
        debug           Whether to run debug versions of C++ code

    Environment variables:
        LOOM_THREADS    Number of concurrent ingest tasks
        LOOM_VERBOSITY  Verbosity level
    '''
    paths = loom.store.get_paths(name)
    # Default to the store's canonical file locations when not given.
    if schema is None:
        schema = paths['ingest']['schema']
    if rows_csv is None:
        rows_csv = paths['ingest']['rows_csv']
    if not os.path.exists(schema):
        raise LoomError('Missing schema file: {}'.format(schema))
    if not os.path.exists(rows_csv):
        raise LoomError('Missing rows_csv file: {}'.format(rows_csv))
    # Record the loom version that produced this ingest.
    with open_compressed(paths['ingest']['version'], 'w') as f:
        f.write(loom.__version__)
    LOG('making schema row')
    loom.format.make_schema_row(
        schema_in=schema,
        schema_row_out=paths['ingest']['schema_row'])
    LOG('making encoding')
    loom.format.make_encoding(
        schema_in=schema,
        rows_in=rows_csv,
        encoding_out=paths['ingest']['encoding'])
    LOG('importing rows')
    loom.format.import_rows(
        encoding_in=paths['ingest']['encoding'],
        rows_csv_in=rows_csv,
        rows_out=paths['ingest']['rows'])
    LOG('importing rowids')
    loom.format.import_rowids(
        rows_csv_in=rows_csv,
        rowids_out=paths['ingest']['rowids'],
        id_field=id_field)
    LOG('making tare rows')
    loom.runner.tare(
        schema_row_in=paths['ingest']['schema_row'],
        rows_in=paths['ingest']['rows'],
        tares_out=paths['ingest']['tares'],
        debug=debug)
    tare_count = sum(1 for _ in protobuf_stream_load(paths['ingest']['tares']))
    LOG('sparsifying rows WRT {} tare rows'.format(tare_count))
    loom.runner.sparsify(
        schema_row_in=paths['ingest']['schema_row'],
        tares_in=paths['ingest']['tares'],
        rows_in=paths['ingest']['rows'],
        rows_out=paths['ingest']['diffs'],
        debug=debug)
    # Write an empty query config so the query server can start later.
    loom.config.config_dump({}, paths['query']['config'])
def test_dump_rows():
    '''Dump generated rows and verify each record parses back as a Row.'''
    for feature_type in FEATURE_TYPES:
        table = generate_rows(10, 10, feature_type, 0.5)
        with tempdir():
            dumped = os.path.abspath('rows.pbs')
            dump_rows(table, dumped)
            message = loom.schema_pb2.Row()
            # ParseFromString raises if any record is malformed.
            for serialized in protobuf_stream_load(dumped):
                message.ParseFromString(serialized)
def ingest(name, schema='schema.json', rows_csv='rows.csv.gz', debug=False):
    '''
    Ingest dataset with optional json config.

    Arguments:
        name            A unique identifier for ingest + inference
        schema          Json schema file, e.g., {"feature1": "nich"}
        rows_csv        File or directory of csv files
        debug           Whether to run debug versions of C++ code

    Environment variables:
        LOOM_THREADS    Number of concurrent ingest tasks
        LOOM_VERBOSITY  Verbosity level
    '''
    if not os.path.exists(schema):
        raise IOError('Missing schema file: {}'.format(schema))
    if not os.path.exists(rows_csv):
        raise IOError('Missing rows_csv file: {}'.format(rows_csv))
    paths = loom.store.get_paths(name)
    # Record the loom version that produced this ingest.
    with open_compressed(paths['ingest']['version'], 'w') as f:
        f.write(loom.__version__)
    LOG('making schema row')
    loom.format.make_schema_row(
        schema_in=schema,
        schema_row_out=paths['ingest']['schema_row'])
    LOG('making encoding')
    loom.format.make_encoding(
        schema_in=schema,
        rows_in=rows_csv,
        encoding_out=paths['ingest']['encoding'])
    LOG('importing rows')
    loom.format.import_rows(
        encoding_in=paths['ingest']['encoding'],
        rows_csv_in=rows_csv,
        rows_out=paths['ingest']['rows'])
    LOG('making tare rows')
    loom.runner.tare(
        schema_row_in=paths['ingest']['schema_row'],
        rows_in=paths['ingest']['rows'],
        tares_out=paths['ingest']['tares'],
        debug=debug)
    tare_count = sum(1 for _ in protobuf_stream_load(paths['ingest']['tares']))
    LOG('sparsifying rows WRT {} tare rows'.format(tare_count))
    loom.runner.sparsify(
        schema_row_in=paths['ingest']['schema_row'],
        tares_in=paths['ingest']['tares'],
        rows_in=paths['ingest']['rows'],
        rows_out=paths['ingest']['diffs'],
        debug=debug)
def test_group_pandas(root, rows_csv, rows, **unused): row_count = sum(1 for _ in protobuf_stream_load(rows)) with loom.preql.get_server(root, debug=True) as preql: feature_names = preql.feature_names for feature in feature_names[:10]: result_string = preql.group(feature) result_df = pandas.read_csv(StringIO(result_string), index_col=0) print 'result_df =' print result_df assert_equal(result_df.ndim, 2) assert_equal(result_df.shape[0], row_count) assert_equal(result_df.shape[1], 2)
def get_group_counts(groups_out):
    '''Return the number of groups stored in each kind file under
    groups_out, asserting that at least one kind file exists.'''
    group_counts = []
    for filename in os.listdir(groups_out):
        path = os.path.join(groups_out, filename)
        count = 0
        for serialized in protobuf_stream_load(path):
            # Parse each record so malformed messages raise, not just count.
            group = ProductModel.Group()
            group.ParseFromString(serialized)
            count += 1
        group_counts.append(count)
    assert group_counts, 'no groups found'
    return group_counts
def generate_samples(model_name, rows_name, config_name, debug):
    '''Run posterior_enum in a scratch dir and yield (sample, score) pairs.'''
    with tempdir(cleanup_on_error=(not debug)):
        samples_name = os.path.abspath('samples.pbs.gz')
        with chdir(CWD):
            loom.runner.posterior_enum(
                config_name,
                model_name,
                rows_name,
                samples_name,
                debug=debug)
        # One message object is reused for every record in the stream.
        message = loom.schema_pb2.PosteriorEnum.Sample()
        for serialized in protobuf_stream_load(samples_name):
            message.ParseFromString(serialized)
            yield parse_sample(message), float(message.score)
def test_posterior_enum(name, tares, diffs, init, **unused):
    '''posterior_enum must emit exactly the configured number of samples.'''
    config = {
        "posterior_enum": {"sample_count": 7},
        "kernels": {"kind": {"row_queue_capacity": 0, "score_parallel": False}},
    }
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        config_path = os.path.abspath("config.pb.gz")
        loom.config.config_dump(config, config_path)
        assert_found(config_path)
        samples_path = os.path.abspath("samples.pbs.gz")
        loom.runner.posterior_enum(
            config_in=config_path,
            model_in=init,
            tares_in=tares,
            rows_in=diffs,
            samples_out=samples_path,
            debug=True)
        assert_found(samples_path)
        sample_count = sum(1 for _ in protobuf_stream_load(samples_path))
        assert_equal(sample_count, config["posterior_enum"]["sample_count"])
def crossvalidate(
        name=None,
        sample_count=10,
        portion=0.9,
        extra_passes=loom.config.DEFAULTS['schedule']['extra_passes'],
        debug=False):
    '''
    Randomly split dataset; train models; score held-out data.

    Arguments:
        name          Dataset name in the loom store
        sample_count  Number of crossvalidation folds (seeds)
        portion       Fraction of rows used for training, in (0, 1)
        extra_passes  Inference schedule passed to each fold
        debug         Whether to run debug versions of C++ code
    '''
    assert 0 < portion and portion < 1, portion
    assert sample_count > 0, sample_count
    loom.store.require(name, [
        'ingest.encoding',
        'ingest.tares',
        'ingest.diffs',
    ])
    inputs = loom.store.get_paths(name)
    row_count = sum(1 for _ in protobuf_stream_load(inputs['ingest']['diffs']))
    assert row_count > 1, 'too few rows to crossvalidate: {}'.format(row_count)
    # Clamp so both the train and test splits are nonempty.
    train_count = max(1, min(row_count - 1, int(round(portion * row_count))))
    test_count = row_count - train_count
    assert 1 <= train_count and 1 <= test_count
    mean_scores = []
    for seed in xrange(sample_count):
        # Each fold gets its own subdirectory keyed by seed.
        results = loom.store.get_paths(
            os.path.join(name, 'crossvalidate/{}'.format(seed)))
        mean = crossvalidate_one(
            seed,
            test_count,
            train_count,
            inputs,
            results,
            extra_passes,
            debug)
        mean_scores.append(mean)
    results = loom.store.get_paths(os.path.join(name, 'crossvalidate'))
    results['scores'] = os.path.join(results['root'], 'scores.json.gz')
    json_dump(mean_scores, results['scores'])
    print 'score = {} +- {}'.format(
        numpy.mean(mean_scores), numpy.std(mean_scores))
def infer(
        name=None,
        extra_passes=loom.config.DEFAULTS['schedule']['extra_passes'],
        parallel=True,
        debug=False,
        profile='time'):
    '''
    Run inference on a dataset, or list available datasets.

    Arguments:
        name          Dataset name in the loom store
        extra_passes  Inference schedule; must be positive
        parallel      Whether to run parallel kernels
        debug         Whether to run debug versions of C++ code
        profile       Profiler to wrap the inference run with
    '''
    assert extra_passes > 0, 'cannot initialize with extra_passes = 0'
    loom.store.require(name, ['samples.0.init', 'samples.0.shuffled'])
    inputs, results = get_paths(name, 'infer')
    config = {'schedule': {'extra_passes': extra_passes}}
    if not parallel:
        loom.config.fill_in_sequential(config)
    loom.config.config_dump(config, results['samples'][0]['config'])
    loom.runner.infer(
        config_in=results['samples'][0]['config'],
        rows_in=inputs['samples'][0]['shuffled'],
        tares_in=inputs['ingest']['tares'],
        model_in=inputs['samples'][0]['init'],
        model_out=results['samples'][0]['model'],
        groups_out=results['samples'][0]['groups'],
        log_out=results['samples'][0]['infer_log'],
        debug=debug,
        profile=profile)
    # Publish outputs back to the store.
    loom.store.provide(name, results, [
        'samples.0.config',
        'samples.0.model',
        'samples.0.groups',
    ])
    groups = results['samples'][0]['groups']
    assert os.listdir(groups), 'no groups were written'
    # Report how many groups each kind file contains.
    group_counts = []
    for f in os.listdir(groups):
        group_count = 0
        for _ in protobuf_stream_load(os.path.join(groups, f)):
            group_count += 1
        group_counts.append(group_count)
    print 'group_counts: {}'.format(' '.join(map(str, group_counts)))
def infer(
        name=None,
        extra_passes=loom.config.DEFAULTS['schedule']['extra_passes'],
        debug=False,
        profile='time'):
    '''
    Run inference on a dataset, or list available datasets.

    Arguments:
        name          Dataset name; None lists available datasets and exits
        extra_passes  Inference schedule; must be positive
        debug         Whether to run debug versions of C++ code
        profile       Profiler to wrap the inference run with
    '''
    if name is None:
        list_options_and_exit(ROWS)
    init = INIT.format(name)
    rows = ROWS.format(name)
    assert os.path.exists(init), 'First load dataset'
    assert os.path.exists(rows), 'First load dataset'
    assert extra_passes > 0, 'cannot initialize with extra_passes = 0'
    destin = os.path.join(RESULTS, name)
    mkdir_p(destin)
    groups_out = os.path.join(destin, 'groups')
    mkdir_p(groups_out)
    config = {'schedule': {'extra_passes': extra_passes}}
    config_in = os.path.join(destin, 'config.pb.gz')
    loom.config.config_dump(config, config_in)
    loom.runner.infer(
        config_in=config_in,
        rows_in=rows,
        model_in=init,
        groups_out=groups_out,
        debug=debug,
        profile=profile)
    assert os.listdir(groups_out), 'no groups were written'
    # Report how many groups each kind file contains.
    group_counts = []
    for f in os.listdir(groups_out):
        group_count = 0
        for _ in protobuf_stream_load(os.path.join(groups_out, f)):
            group_count += 1
        group_counts.append(group_count)
    print 'group_counts: {}'.format(' '.join(map(str, group_counts)))
def export_rows(encoding_in, rows_in, rows_csv_out, chunk_size=1000000):
    '''
    Export rows from gzipped-protobuf-stream to directory-of-gzipped-csv-files.

    Arguments:
        encoding_in   Json file of feature encoders
        rows_in       Gzipped protobuf stream of rows
        rows_csv_out  Output directory (created; destroyed first if present)
        chunk_size    Max rows per output csv.gz file; must be positive

    Raises:
        LoomError if rows_csv_out is the working directory, looks like a
        file rather than a directory, or chunk_size is not positive.
    '''
    rows_csv_out = os.path.abspath(rows_csv_out)
    if rows_csv_out == os.getcwd():
        raise LoomError('Cannot export_rows to working directory')
    for ext in ['.csv', '.gz', '.bz2']:
        if rows_csv_out.endswith(ext):
            # BUG FIX: the format string was missing its '{}' placeholder,
            # so the offending path was silently dropped from the message.
            raise LoomError(
                'Expected rows_csv_out to be a dirname, actual {}'.format(
                    rows_csv_out))
    if not (chunk_size > 0):
        raise LoomError(
            'Invalid chunk_size {}, must be positive'.format(chunk_size))
    encoders = json_load(encoding_in)
    fields = [loom.schema.MODEL_TO_DATATYPE[e['model']] for e in encoders]
    decoders = [load_decoder(e) for e in encoders]
    header = ['_id'] + [e['name'] for e in encoders]
    # Start from a clean output directory.
    if os.path.exists(rows_csv_out):
        shutil.rmtree(rows_csv_out)
    os.makedirs(rows_csv_out)
    row_count = sum(1 for _ in protobuf_stream_load(rows_in))
    rows = loom.cFormat.row_stream_load(rows_in)
    # Ceiling division (Python 2 integer '/').
    chunk_count = (row_count + chunk_size - 1) / chunk_size
    chunks = sorted(
        os.path.join(rows_csv_out, 'rows.{}.csv.gz'.format(i))
        for i in xrange(chunk_count)
    )
    with ExitStack() as stack:
        with_ = stack.enter_context
        writers = [with_(csv_writer(f)) for f in chunks]
        for writer in writers:
            writer.writerow(header)
        # Round-robin rows across chunk files.
        for row, writer in izip(rows, cycle(writers)):
            data = row.iter_data()
            schema = izip(data['observed'], fields, decoders)
            csv_row = [row.id]
            for observed, field, decode in schema:
                # Unobserved cells export as empty strings.
                csv_row.append(decode(data[field].next()) if observed else '')
            writer.writerow(csv_row)
def crossvalidate(
        name=None,
        sample_count=10,
        portion=0.9,
        extra_passes=loom.config.DEFAULTS['schedule']['extra_passes'],
        debug=False):
    '''
    Randomly split dataset; train models; score held-out data.

    Arguments:
        name          Dataset name in the loom store
        sample_count  Number of crossvalidation folds (seeds)
        portion       Fraction of rows used for training, in (0, 1)
        extra_passes  Inference schedule passed to each fold
        debug         Whether to run debug versions of C++ code
    '''
    assert 0 < portion and portion < 1, portion
    assert sample_count > 0, sample_count
    loom.store.require(name, [
        'ingest.encoding',
        'ingest.tares',
        'ingest.diffs',
    ])
    inputs = loom.store.get_paths(name)
    row_count = sum(1 for _ in protobuf_stream_load(inputs['ingest']['diffs']))
    assert row_count > 1, 'too few rows to crossvalidate: {}'.format(row_count)
    # Clamp so both the train and test splits are nonempty.
    train_count = max(1, min(row_count - 1, int(round(portion * row_count))))
    test_count = row_count - train_count
    assert 1 <= train_count and 1 <= test_count
    mean_scores = []
    for seed in xrange(sample_count):
        # Each fold gets its own subdirectory keyed by seed.
        results = loom.store.get_paths(
            os.path.join(name, 'crossvalidate/{}'.format(seed)))
        mean = crossvalidate_one(
            seed,
            test_count,
            train_count,
            inputs,
            results,
            extra_passes,
            debug)
        mean_scores.append(mean)
    results = loom.store.get_paths(os.path.join(name, 'crossvalidate'))
    results['scores'] = os.path.join(results['root'], 'scores.json.gz')
    json_dump(mean_scores, results['scores'])
    print 'score = {} +- {}'.format(
        numpy.mean(mean_scores),
        numpy.std(mean_scores))
def run_posterior_enum(casename, dataset, results, debug, sparsify=True):
    '''Enumerate posterior samples; the sparsify path first builds a schema,
    detects tare rows, and diffs the rows against the tares.'''
    if not sparsify:
        loom.runner.posterior_enum(
            config_in=dataset['config'],
            rows_in=dataset['rows'],
            model_in=dataset['model'],
            samples_out=results['samples'],
            debug=debug)
        return
    loom.format.make_schema(
        model_in=dataset['model'], schema_out=results['schema'])
    loom.format.make_schema_row(
        schema_in=results['schema'], schema_row_out=results['schema_row'])
    loom.runner.tare(
        schema_row_in=results['schema_row'],
        rows_in=dataset['rows'],
        tares_out=results['tares'],
        debug=debug)
    tare_count = sum(1 for _ in protobuf_stream_load(results['tares']))
    if casename is not None and tare_count:
        LOG('Info', casename, 'found {} tare rows'.format(tare_count))
    loom.runner.sparsify(
        schema_row_in=results['schema_row'],
        tares_in=results['tares'],
        rows_in=dataset['rows'],
        rows_out=results['diffs'],
        debug=debug)
    loom.runner.posterior_enum(
        config_in=dataset['config'],
        rows_in=results['diffs'],
        tares_in=results['tares'],
        model_in=dataset['model'],
        samples_out=results['samples'],
        debug=debug)
def load_rows_raw(filename):
    '''Return the raw serialized row strings from a protobuf stream file,
    without parsing them into messages.'''
    return list(protobuf_stream_load(filename))
def crossvalidate_one(
        seed, test_count, train_count, inputs, results, extra_passes, debug):
    '''
    Run one crossvalidation fold: split rows by seed, train a model on the
    train split, and return the mean score of the held-out test split.

    Side effects: writes train/test streams, config, model, groups, and a
    scores.json.gz file under results['root'].
    '''
    LOG('running seed {}:'.format(seed))
    results['train'] = os.path.join(
        results['root'], 'train', 'diffs.pbs.gz')
    results['test'] = os.path.join(results['root'], 'test', 'rows.pbs.gz')
    results['scores'] = os.path.join(results['root'], 'scores.json.gz')
    config = {
        'seed': seed,
        'schedule': {'extra_passes': extra_passes},
    }
    loom.config.config_dump(config, results['samples'][0]['config'])
    # Seed numpy so the train/test split is reproducible per fold.
    numpy.random.seed(seed)
    split = [True] * train_count + [False] * test_count
    numpy.random.shuffle(split)
    # The same shuffled split selects train rows from the diffs stream and
    # test rows from the raw rows stream; both streams are assumed to be in
    # the same row order.
    diffs_in = protobuf_stream_load(inputs['ingest']['diffs'])
    protobuf_stream_dump(
        (row for s, row in izip(split, diffs_in) if s),
        results['train'])
    rows_in = protobuf_stream_load(inputs['ingest']['rows'])
    protobuf_stream_dump(
        (row for s, row in izip(split, rows_in) if not s),
        results['test'])
    LOG(' shuffle')
    loom.runner.shuffle(
        rows_in=results['train'],
        rows_out=results['samples'][0]['shuffled'],
        seed=seed,
        debug=debug)
    LOG(' init')
    loom.generate.generate_init(
        encoding_in=inputs['ingest']['encoding'],
        model_out=results['samples'][0]['init'],
        seed=seed)
    LOG(' infer')
    loom.runner.infer(
        config_in=results['samples'][0]['config'],
        rows_in=results['samples'][0]['shuffled'],
        tares_in=inputs['ingest']['tares'],
        model_in=results['samples'][0]['init'],
        model_out=results['samples'][0]['model'],
        groups_out=results['samples'][0]['groups'],
        debug=debug)
    LOG(' query')
    # Score each held-out row against the trained sample.
    rows = loom.query.load_data_rows(results['test'])
    loom.config.config_dump({}, results['query']['config'])
    with loom.query.get_server(results['root'], debug=debug) as query:
        scores = [query.score(row) for row in rows]
    json_dump(scores, results['scores'])
    LOG(' done\n')
    return numpy.mean(scores)