# This section collects test and helper functions excerpted from several
# loom modules. Module-level constants (CLEANUP_ON_ERROR, CONFIGS, CONFIG,
# EXAMPLES, SEED, COUNT, named_pairs) and helpers such as json_load,
# load_encoder, load_rows_csv, load_rows_raw, get_example_requests,
# get_response, parse_response, assert_close, _check_predictions, and
# _check_marginal_samples_match_scores are defined in the original test
# modules; the imports below are a plausible common subset, not verbatim.
import os
import csv
import shutil
import numpy
from itertools import izip
from nose.tools import (
    assert_almost_equal,
    assert_equal,
    assert_false,
    assert_list_equal,
    assert_not_equal,
    assert_true,
)
from distributions import fileutil, io
from distributions.fileutil import tempdir
from distributions.io.stream import (
    open_compressed,
    protobuf_stream_dump,
    protobuf_stream_load,
)
from loom.schema_pb2 import CrossCat, Query
import loom.config
import loom.format
import loom.generate
import loom.preql
import loom.query
import loom.runner
import loom.tasks
import loom.util


def test_infer(name, tares, shuffled, init, **unused):
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        row_count = sum(1 for _ in protobuf_stream_load(shuffled))
        with open_compressed(init) as f:
            message = CrossCat()
            message.ParseFromString(f.read())
        kind_count = len(message.kinds)
        for config in CONFIGS:
            loom.config.fill_in_defaults(config)
            schedule = config['schedule']
            print 'config: {}'.format(config)
            greedy = (schedule['extra_passes'] == 0)
            kind_iters = config['kernels']['kind']['iterations']
            kind_structure_is_fixed = greedy or kind_iters == 0
            with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
                config_in = os.path.abspath('config.pb.gz')
                model_out = os.path.abspath('model.pb.gz')
                groups_out = os.path.abspath('groups')
                assign_out = os.path.abspath('assign.pbs.gz')
                log_out = os.path.abspath('log.pbs.gz')
                os.mkdir(groups_out)
                loom.config.config_dump(config, config_in)
                loom.runner.infer(
                    config_in=config_in,
                    rows_in=shuffled,
                    tares_in=tares,
                    model_in=init,
                    model_out=model_out,
                    groups_out=groups_out,
                    assign_out=assign_out,
                    log_out=log_out,
                    debug=True)
                if kind_structure_is_fixed:
                    assert_equal(len(os.listdir(groups_out)), kind_count)
                group_counts = get_group_counts(groups_out)
                assign_count = sum(
                    1 for _ in protobuf_stream_load(assign_out))
                assert_equal(assign_count, row_count)
                print 'row_count: {}'.format(row_count)
                print 'group_counts: {}'.format(
                    ' '.join(map(str, group_counts)))
                for group_count in group_counts:
                    assert_true(
                        group_count <= row_count,
                        'groups are all singletons')
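
# Several tests here rely on a get_group_counts helper that is not shown
# in this excerpt. Below is a minimal sketch, assuming groups_out contains
# one protobuf stream per kind and that a kind's group count is the number
# of messages in its stream (consistent with the singleton check above,
# where group_count == row_count when every group is a singleton). The
# real helper in loom's test utilities may differ.
def get_group_counts(groups_out):
    group_counts = []
    for f in sorted(os.listdir(groups_out)):
        groups = os.path.join(groups_out, f)
        group_counts.append(sum(1 for _ in protobuf_stream_load(groups)))
    return group_counts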
def test_generate(model, **unused):
    for row_count in [0, 1, 100]:
        for density in [0.0, 0.5, 1.0]:
            with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
                config_in = os.path.abspath('config.pb.gz')
                config = {
                    'generate': {
                        'row_count': row_count,
                        'density': density,
                    },
                }
                loom.config.config_dump(config, config_in)
                assert_found(config_in)
                rows_out = os.path.abspath('rows.pbs.gz')
                model_out = os.path.abspath('model.pb.gz')
                groups_out = os.path.abspath('groups')
                loom.runner.generate(
                    config_in=config_in,
                    model_in=model,
                    rows_out=rows_out,
                    model_out=model_out,
                    groups_out=groups_out,
                    debug=True)
                assert_found(rows_out, model_out, groups_out)
                group_counts = get_group_counts(groups_out)
                print 'group_counts: {}'.format(
                    ' '.join(map(str, group_counts)))


def test_posterior_enum(name, tares, diffs, init, **unused):
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        config_in = os.path.abspath('config.pb.gz')
        config = {
            'posterior_enum': {
                'sample_count': 7,
            },
            'kernels': {
                'kind': {
                    'row_queue_capacity': 0,
                    'score_parallel': False,
                },
            },
        }
        loom.config.config_dump(config, config_in)
        assert_found(config_in)
        samples_out = os.path.abspath('samples.pbs.gz')
        loom.runner.posterior_enum(
            config_in=config_in,
            model_in=init,
            tares_in=tares,
            rows_in=diffs,
            samples_out=samples_out,
            debug=True)
        assert_found(samples_out)
        actual_count = sum(1 for _ in protobuf_stream_load(samples_out))
        assert_equal(actual_count, config['posterior_enum']['sample_count'])


def test_generate(init, **unused):
    for row_count in [0, 1, 100]:
        for density in [0.0, 0.5, 1.0]:
            with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
                config_in = os.path.abspath('config.pb.gz')
                config = {
                    'generate': {
                        'row_count': row_count,
                        'density': density,
                    },
                }
                loom.config.config_dump(config, config_in)
                assert_true(os.path.exists(config_in))
                rows_out = os.path.abspath('rows.pbs.gz')
                model_out = os.path.abspath('model.pb.gz')
                groups_out = os.path.abspath('groups')
                loom.runner.generate(
                    config_in=config_in,
                    model_in=init,
                    rows_out=rows_out,
                    model_out=model_out,
                    groups_out=groups_out,
                    debug=True)
                assert_true(os.path.exists(rows_out))
                assert_true(os.path.exists(model_out))
                assert_true(os.path.exists(groups_out))
                group_counts = get_group_counts(groups_out)
                print 'group_counts: {}'.format(
                    ' '.join(map(str, group_counts)))
def test_generate_init(encoding, **unused):
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        init_out = os.path.abspath('init.pb.gz')
        loom.generate.generate_init(
            encoding_in=encoding,
            model_out=init_out)
        assert_found(init_out)
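
# assert_found is another helper from the test utilities that this excerpt
# does not define. Judging by the variant of test_generate above that
# spells the same checks out with assert_true(os.path.exists(...)), a
# minimal sketch would be:
def assert_found(*filenames):
    for filename in filenames:
        assert_true(
            os.path.exists(filename),
            'file not found: {}'.format(filename))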
def generate(
        feature_type='mixed',
        rows=10000,
        cols=100,
        density=0.5,
        debug=False,
        profile='time'):
    '''
    Generate a synthetic dataset.
    '''
    root = os.path.abspath(os.path.curdir)
    with tempdir(cleanup_on_error=(not debug)):
        init_out = os.path.abspath('init.pb.gz')
        rows_out = os.path.abspath('rows.pbs.gz')
        model_out = os.path.abspath('model.pb.gz')
        groups_out = os.path.abspath('groups')
        os.chdir(root)
        loom.generate.generate(
            row_count=rows,
            feature_count=cols,
            feature_type=feature_type,
            density=density,
            init_out=init_out,
            rows_out=rows_out,
            model_out=model_out,
            groups_out=groups_out,
            debug=debug,
            profile=profile)
        print 'model file is {} bytes'.format(os.path.getsize(model_out))
        print 'rows file is {} bytes'.format(os.path.getsize(rows_out))


def test_make_schema(model, **unused):
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        schema_out = os.path.abspath('schema.json.gz')
        loom.format.make_schema(
            model_in=model,
            schema_out=schema_out)
        assert_found(schema_out)


def test_tare(rows, schema_row, **unused):
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        tares = os.path.abspath('tares.pbs.gz')
        loom.runner.tare(
            schema_row_in=schema_row,
            rows_in=rows,
            tares_out=tares)
        assert_found(tares)


def test_make_fake_encoding(schema, model, **unused):
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        encoding_out = os.path.abspath('encoding.json.gz')
        loom.format.make_fake_encoding(
            schema_in=schema,
            model_in=model,
            encoding_out=encoding_out)
        assert_found(encoding_out)
def batch_predict(
        config_in,
        model_in,
        groups_in,
        requests,
        debug=False,
        profile=None):
    root = os.path.abspath(os.path.curdir)
    with tempdir(cleanup_on_error=(not debug)):
        requests_in = os.path.abspath('requests.pbs.gz')
        responses_out = os.path.abspath('responses.pbs.gz')
        protobuf_stream_dump(
            (q.SerializeToString() for q in requests),
            requests_in)
        os.chdir(root)
        loom.runner.query(
            config_in=config_in,
            model_in=model_in,
            groups_in=groups_in,
            requests_in=requests_in,
            responses_out=responses_out,
            debug=debug,
            profile=profile)
        return map(parse_response, protobuf_stream_load(responses_out))
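
# An illustrative use of batch_predict. The Query.Request fields follow
# the usage in test_server below (req.id, req.sample.data.observed); the
# literal values and the string id are made up for illustration and may
# not match loom's actual schema types.
def example_batch_predict(config_in, model_in, groups_in):
    request = Query.Request()
    request.id = 'example-0'  # id type assumed; tests only compare ids
    request.sample.data.observed[:] = [True, False, True]  # made-up mask
    return batch_predict(
        config_in=config_in,
        model_in=model_in,
        groups_in=groups_in,
        requests=[request])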
def _import_dir(import_file, args):
    rows_csv_in, file_out, id_offset, id_stride, misc = args
    assert os.path.isdir(rows_csv_in)
    parts_in = sorted(
        os.path.abspath(os.path.join(rows_csv_in, f))
        for f in os.listdir(rows_csv_in)
    )
    part_count = len(parts_in)
    assert part_count > 0, 'no files in {}'.format(rows_csv_in)
    parts_out = []
    tasks = []
    for i, part_in in enumerate(parts_in):
        part_out = 'part.{}.{}'.format(i, os.path.basename(file_out))
        offset = id_offset + id_stride * i
        stride = id_stride * part_count
        parts_out.append(part_out)
        tasks.append((part_in, part_out, offset, stride, misc))
    with tempdir():
        loom.util.parallel_map(import_file, tasks)
        # It is safe to use open instead of open_compressed even for
        # .gz files; see http://stackoverflow.com/questions/8005114
        with open(file_out, 'wb') as whole:
            for part_out in parts_out:
                with open(part_out, 'rb') as part:
                    shutil.copyfileobj(part, whole)
                os.remove(part_out)
def test_predict(root, rows_csv, encoding, **unused):
    COUNT = 10
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        with loom.query.get_server(root, debug=True) as query_server:
            result_out = 'predictions_out.csv'
            rows_in = os.listdir(rows_csv)[0]
            rows_in = os.path.join(rows_csv, rows_in)
            encoders = json_load(encoding)
            name_to_encoder = {e['name']: load_encoder(e) for e in encoders}
            preql = loom.preql.PreQL(query_server, encoding)
            preql.predict(rows_in, COUNT, result_out, id_offset=False)
            with open_compressed(rows_in, 'rb') as fin:
                with open(result_out, 'r') as fout:
                    in_reader = csv.reader(fin)
                    out_reader = csv.reader(fout)
                    fnames = in_reader.next()
                    out_reader.next()
                    for in_row in in_reader:
                        for i in range(COUNT):
                            out_row = out_reader.next()
                            bundle = zip(fnames, in_row, out_row)
                            for name, in_val, out_val in bundle:
                                encode = name_to_encoder[name]
                                observed = bool(in_val.strip())
                                if observed:
                                    assert_almost_equal(
                                        encode(in_val),
                                        encode(out_val))
                                else:
                                    assert_true(bool(out_val.strip()))
def _import_rows_dir(encoding_in, rows_csv_in, rows_out, id_offset, id_stride):
    assert os.path.isdir(rows_csv_in)
    files_in = sorted(
        os.path.abspath(os.path.join(rows_csv_in, f))
        for f in os.listdir(rows_csv_in)
    )
    file_count = len(files_in)
    assert file_count > 0, 'no files in {}'.format(rows_csv_in)
    assert file_count < 1e6, 'too many files in {}'.format(rows_csv_in)
    files_out = []
    tasks = []
    for i, file_in in enumerate(files_in):
        file_out = 'part_{:06d}.{}'.format(i, os.path.basename(rows_out))
        offset = id_offset + id_stride * i
        stride = id_stride * file_count
        files_out.append(file_out)
        tasks.append((encoding_in, file_in, file_out, offset, stride))
    rows_out = os.path.abspath(rows_out)
    with tempdir():
        loom.util.parallel_map(_import_rows_file, tasks)
        # It is safe to use open instead of open_compressed even for
        # .gz files; see http://stackoverflow.com/questions/8005114
        with open(rows_out, 'wb') as whole:
            for file_out in files_out:
                with open(file_out, 'rb') as part:
                    shutil.copyfileobj(part, whole)
                os.remove(file_out)
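
# The offset/stride arithmetic in the two importers above interleaves row
# ids across file parts rather than numbering each part contiguously:
# part i yields ids offset + stride * n. A small demo (names hypothetical,
# for illustration only) with id_offset=0, id_stride=1, file_count=3
# prints part 0 -> 0, 3, 6, ...; part 1 -> 1, 4, 7, ...; part 2 -> 2, 5, 8, ...
def _demo_id_interleaving(id_offset=0, id_stride=1, file_count=3):
    for i in range(file_count):
        offset = id_offset + id_stride * i
        stride = id_stride * file_count
        print 'part {}: ids {}'.format(
            i, [offset + stride * n for n in range(4)])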
def test_posterior_enum(rows, model, **unused):
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        config_in = os.path.abspath('config.pb.gz')
        config = {
            'posterior_enum': {
                'sample_count': 7,
            },
            'kernels': {
                'kind': {
                    'row_queue_capacity': 0,
                    'score_parallel': False,
                },
            },
        }
        loom.config.config_dump(config, config_in)
        assert_true(os.path.exists(config_in))
        samples_out = os.path.abspath('samples.pbs.gz')
        loom.runner.posterior_enum(
            config_in=config_in,
            model_in=model,
            rows_in=rows,
            samples_out=samples_out,
            debug=True)
        assert_true(os.path.exists(samples_out))
        actual_count = sum(1 for _ in protobuf_stream_load(samples_out))
        assert_equal(actual_count, config['posterior_enum']['sample_count'])
def test_sparsify(rows, schema_row, **unused):
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        tares = os.path.abspath('tares.pbs.gz')
        diffs = os.path.abspath('diffs.pbs.gz')
        loom.runner.tare(
            schema_row_in=schema_row,
            rows_in=rows,
            tares_out=tares)
        assert_found(tares)
        loom.runner.sparsify(
            schema_row_in=schema_row,
            tares_in=tares,
            rows_in=rows,
            rows_out=diffs,
            debug=True)
        assert_found(diffs)
def test_samples_match_scores(root, rows, **unused):
    rows = load_rows(rows)
    rows = rows[::len(rows) / 5]
    with tempdir():
        loom.config.config_dump({'seed': SEED}, 'config.pb.gz')
        with loom.query.get_server(root, 'config.pb.gz', debug=True) as server:
            for row in rows:
                _check_marginal_samples_match_scores(server, row, 0)
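
# load_rows is used by several tests but not defined in this excerpt.
# A minimal sketch, assuming rows are serialized loom.schema_pb2.Row
# messages in a protobuf stream (the tests below access row.id and
# row.diff); the real helper may differ.
def load_rows(filename):
    from loom.schema_pb2 import Row
    rows = []
    for string in protobuf_stream_load(filename):
        row = Row()
        row.ParseFromString(string)
        rows.append(row)
    return rows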
def test_group_runs(root, schema, encoding, **unused):
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        with loom.preql.get_server(root, encoding, debug=True) as preql:
            test_columns = json_load(schema).keys()[:10]
            for column in test_columns:
                groupings_csv = 'group.{}.csv'.format(column)
                preql.group(column, result_out=groupings_csv)
                print open(groupings_csv).read()
def test_transforms():
    name = 'test_transforms.test_transforms'
    with tempdir() as temp:
        schema_csv = os.path.join(temp, 'schema.csv')
        rows_csv = os.path.join(temp, 'rows.csv.gz')
        generate_example(schema_csv, rows_csv)
        loom.tasks.transform(name, schema_csv, rows_csv)
        loom.tasks.ingest(name)
        loom.tasks.infer(name, sample_count=1)
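
# test_transforms exercises the high-level loom.tasks pipeline end to end:
# transform the raw csv, ingest it into loom's binary formats, then run
# inference. A hedged sketch of how a caller might afterwards query the
# trained samples, assuming loom.tasks.query yields a PreQL server as the
# other tests suggest:
def example_query(name):
    with loom.tasks.query(name) as preql:
        preql.relate(preql.feature_names, result_out='related.csv')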
def _test_dump_load(dump, load, filetype):
    for example in EXAMPLES:
        print example
        with fileutil.tempdir() as d, fileutil.chdir(d):
            expected = example
            filename = 'test.json' + filetype
            dump(expected, filename)
            actual = list(load(filename))
            assert_equal(actual, expected)


def _test_protobuf_stream(filetype):
    filename = 'test.stream' + filetype
    expected = ['asdf', '', 'asdfasdfasdf', 'a', 's', '', '', '', 'd', 'f']
    with fileutil.tempdir():
        print 'dumping'
        io.stream.protobuf_stream_dump(expected, filename)
        print 'loading'
        actual = list(io.stream.protobuf_stream_load(filename))
        assert_equal(actual, expected)


def test_shuffle(rows, **unused):
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        seed = 12345
        rows_out = os.path.abspath('rows_out.pbs.gz')
        loom.runner.shuffle(
            rows_in=rows,
            rows_out=rows_out,
            seed=seed)
        assert_true(os.path.exists(rows_out))


def test_shuffle(diffs, **unused):
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        seed = 12345
        rows_out = os.path.abspath('shuffled.pbs.gz')
        loom.runner.shuffle(
            rows_in=diffs,
            rows_out=rows_out,
            seed=seed)
        assert_found(rows_out)


def test_relate(root, encoding, **unused):
    with loom.query.get_server(root, debug=True) as query_server:
        with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
            result_out = 'related_out.csv'
            preql = loom.preql.PreQL(query_server, encoding)
            preql.relate(preql.feature_names, result_out, sample_count=10)
            with open(result_out, 'r') as f:
                reader = csv.reader(f)
                for row in reader:
                    pass
def test_import_rows(encoding, rows, rows_csv, **unused):
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        rows_pbs = os.path.abspath('rows.pbs.gz')
        loom.format.import_rows(
            encoding_in=encoding,
            rows_csv_in=rows_csv,
            rows_out=rows_pbs)
        assert_found(rows_pbs)
        expected_count = sum(1 for _ in protobuf_stream_load(rows))
        actual_count = sum(1 for _ in protobuf_stream_load(rows_pbs))
        assert_equal(actual_count, expected_count)


def test_predict(root, rows_csv, encoding, **unused):
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        with loom.preql.get_server(root, debug=True) as preql:
            result_out = 'predictions_out.csv'
            rows_in = os.listdir(rows_csv)[0]
            rows_in = os.path.join(rows_csv, rows_in)
            preql.predict(rows_in, COUNT, result_out, id_offset=True)
            print 'DEBUG', open_compressed(rows_in).read()
            print 'DEBUG', open_compressed(result_out).read()
            _check_predictions(rows_in, result_out, encoding)


def test_seed(root, model, rows, **unused):
    requests = get_example_requests(model, rows, 'mixed')
    with tempdir():
        loom.config.config_dump({'seed': 0}, 'config.pb.gz')
        with loom.query.ProtobufServer(root, config='config.pb.gz') as server:
            responses1 = [get_response(server, req) for req in requests]
    with tempdir():
        loom.config.config_dump({'seed': 0}, 'config.pb.gz')
        with loom.query.ProtobufServer(root, config='config.pb.gz') as server:
            responses2 = [get_response(server, req) for req in requests]
    with tempdir():
        loom.config.config_dump({'seed': 10}, 'config.pb.gz')
        with loom.query.ProtobufServer(root, config='config.pb.gz') as server:
            responses3 = [get_response(server, req) for req in requests]
    assert_equal(responses1, responses2)
    assert_not_equal(responses1, responses3)


def _test_pair(dump, load, filetype):
    dump, load = named_pairs[dump, load]
    for example in EXAMPLES:
        print example
        with fileutil.tempdir():
            expected = example
            filename = 'test.json' + filetype
            dump(expected, filename)
            actual = list(load(filename))
            assert_equal(actual, expected)
def test_make_encoding(schema, rows_csv, **unused):
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        encoding = os.path.abspath('encoding.json.gz')
        rows = os.path.abspath('rows.pbs.gz')
        loom.format.make_encoding(
            schema_in=schema,
            rows_in=rows_csv,
            encoding_out=encoding)
        assert_found(encoding)
        loom.format.import_rows(
            encoding_in=encoding,
            rows_csv_in=rows_csv,
            rows_out=rows)
        assert_found(rows)
def test_similar_runs(root, rows_csv, **unused):
    rows = load_rows_csv(rows_csv)
    header = rows.pop(0)
    try:
        id_pos = header.index('_id')
    except ValueError:
        id_pos = None
    rows = rows[0:10]
    # Only strip the id column when one is present; row.pop(None) raises.
    if id_pos is not None:
        for row in rows:
            row.pop(id_pos)
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        with loom.preql.get_server(root, debug=True) as preql:
            search_csv = 'search.csv'
            preql.similar(rows, result_out=search_csv)
def test_server(model, groups, **unused):
    requests = get_example_requests(model)
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        config_in = os.path.abspath('config.pb.gz')
        loom.config.config_dump(CONFIG, config_in)
        kwargs = {
            'config_in': config_in,
            'model_in': model,
            'groups_in': groups,
            'debug': True,
        }
        with loom.query.serve(**kwargs) as server:
            responses = [
                server.call_protobuf(request)
                for request in requests
            ]
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        config_in = os.path.abspath('config.pb.gz')
        loom.config.config_dump(CONFIG, config_in)
        kwargs = {
            'config_in': config_in,
            'model_in': model,
            'groups_in': groups,
            'debug': True,
        }
        with loom.query.serve(**kwargs) as server:
            for request in requests:
                req = Query.Request()
                req.id = request.id
                req.score.data.observed[:] = request.sample.data.observed[:]
                res = server.call_protobuf(req)
                assert_equal(req.id, res.id)
                assert_false(hasattr(req, 'error'))
                assert_true(isinstance(res.score.score, float))
    for request, response in izip(requests, responses):
        assert_equal(request.id, response.id)
        assert_false(hasattr(request, 'error'))
        assert_equal(len(response.sample.samples), 1)
def test_search_runs(root, rows_csv, **unused):
    rows = load_rows_csv(rows_csv)
    header = rows.pop(0)
    try:
        id_pos = header.index('_id')
    except ValueError:
        id_pos = None
    rows = rows[0:10]
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        with loom.preql.get_server(root, debug=True) as preql:
            for i, row in enumerate(rows):
                # Only strip the id column when one is present;
                # row.pop(None) raises.
                if id_pos is not None:
                    row.pop(id_pos)
                search_csv = 'search.{}.csv'.format(i)
                preql.search(row, result_out=search_csv)
                open(search_csv).read()
def test_one_to_one(rows, **unused):
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        seed = 12345
        rows_out = os.path.abspath('rows_out.pbs.gz')
        loom.runner.shuffle(rows_in=rows, rows_out=rows_out, seed=seed)
        assert_found(rows_out)
        original = load_rows(rows)
        shuffled = load_rows(rows_out)
        assert_equal(len(shuffled), len(original))
        assert_not_equal(shuffled, original)
        actual = sorted(shuffled, key=lambda row: row.id)
        expected = sorted(original, key=lambda row: row.id)
        assert_list_equal(expected, actual)


def test_batch_predict(model, groups, **unused):
    requests = get_example_requests(model)
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        config_in = os.path.abspath('config.pb.gz')
        loom.config.config_dump(CONFIG, config_in)
        responses = loom.query.batch_predict(
            config_in=config_in,
            model_in=model,
            groups_in=groups,
            requests=requests,
            debug=True)
    assert_equal(len(responses), len(requests))
    for request, response in izip(requests, responses):
        assert_equal(request.id, response.id)
        assert_false(hasattr(request, 'error'))
        assert_equal(len(response.sample.samples), 1)


def _test_generate(feature_type):
    root = os.getcwd()
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        init_out = os.path.abspath('init.pb.gz')
        rows_out = os.path.abspath('rows.pbs.gz')
        model_out = os.path.abspath('model.pb.gz')
        groups_out = os.path.abspath('groups')
        os.chdir(root)
        loom.generate.generate(
            feature_type=feature_type,
            row_count=100,
            feature_count=100,
            density=0.5,
            init_out=init_out,
            rows_out=rows_out,
            model_out=model_out,
            groups_out=groups_out,
            debug=True,
            profile=None)


def test_relate(root, **unused):
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        with loom.preql.get_server(root, debug=True) as preql:
            result_out = 'related_out.csv'
            preql.relate(preql.feature_names, result_out, sample_count=10)
            with open(result_out, 'r') as f:
                reader = csv.reader(f)
                header = reader.next()
                columns = header[1:]
                assert_equal(columns, preql.feature_names)
                zmatrix = numpy.zeros((len(columns), len(columns)))
                for i, row in enumerate(reader):
                    column = row.pop(0)
                    assert_equal(column, preql.feature_names[i])
                    for j, score in enumerate(row):
                        score = float(score)
                        zmatrix[i][j] = score
            assert_close(zmatrix, zmatrix.T)


def test_chunking(rows, **unused):
    targets = [10.0 ** i for i in xrange(6)]
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        seed = 12345
        rows_out = os.path.abspath('rows.out.{}.pbs.gz')
        for i, target in enumerate(targets):
            loom.runner.shuffle(
                rows_in=rows,
                rows_out=rows_out.format(i),
                seed=seed,
                target_mem_bytes=target)
        results = [
            load_rows_raw(rows_out.format(i))
            for i in xrange(len(targets))
        ]
        for i, actual in enumerate(results):
            for expected in results[:i]:
                assert_list_equal(actual, expected)


def test_export_rows(encoding, rows, **unused):
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        rows_csv = os.path.abspath('rows_csv')
        rows_pbs = os.path.abspath('rows.pbs.gz')
        loom.format.export_rows(
            encoding_in=encoding,
            rows_in=rows,
            rows_csv_out=rows_csv,
            chunk_size=51)
        assert_found(rows_csv)
        assert_found(os.path.join(rows_csv, 'rows.0.csv.gz'))
        loom.format.import_rows(
            encoding_in=encoding,
            rows_csv_in=rows_csv,
            rows_out=rows_pbs)
        assert_found(rows_pbs)
        expected = load_rows(rows)
        actual = load_rows(rows_pbs)
        assert_equal(len(actual), len(expected))
        actual.sort(key=lambda row: row.id)
        expected.sort(key=lambda row: row.id)
        expected_data = [row.diff for row in expected]
        actual_data = [row.diff for row in actual]
        assert_close(actual_data, expected_data)