def transform_rows(schema_in, transforms_in, rows_in, rows_out, id_field=None):
    transforms = pickle_load(transforms_in)
    if not transforms:
        # No transforms to apply; pass rows through unchanged.
        cp_ns(rows_in, rows_out)
    else:
        transform = TransformSequence(transforms)
        transformed_header = sorted(json_load(schema_in).iterkeys())
        if id_field is not None:
            assert id_field not in transformed_header
            transformed_header = [id_field] + transformed_header
        tasks = []
        if os.path.isdir(rows_in):
            # rows_in is a directory of shards; transform each shard
            # as a separate parallel task.
            loom.util.mkdir_p(rows_out)
            for f in os.listdir(rows_in):
                tasks.append((
                    transform,
                    transformed_header,
                    os.path.join(rows_in, f),
                    os.path.join(rows_out, f),
                ))
        else:
            tasks.append((transform, transformed_header, rows_in, rows_out))
        parallel_map(_transform_rows, tasks)
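
# A usage sketch, not part of the library: the paths below are
# hypothetical; only the keyword signature and the file-vs-directory
# dispatch on rows_in come from transform_rows above.
def _example_transform_rows():
    transform_rows(
        schema_in='ingest/schema.json.gz',
        transforms_in='ingest/transforms.pickle.gz',
        rows_in='ingest/rows_csv',
        rows_out='ingest/transformed_rows_csv',
        id_field='_id')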
def make_fake_consensus(paths, debug=False):
    for key, destin in loom.store.iter_paths('samples.0', paths['consensus']):
        source = loom.store.get_path(paths, key)
        cp_ns(source, destin)
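
# A usage sketch, assuming the surrounding loom.store API: get_paths
# builds the nested path dict (exactly as generate_one below does), and
# make_fake_consensus then promotes sample 0's artifacts into the
# consensus slot instead of computing a real cross-sample consensus.
# The dataset name is hypothetical.
def _example_fake_consensus():
    paths = loom.store.get_paths('my_dataset', sample_count=4)
    make_fake_consensus(paths)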
def generate_one((name, sample_count, force, debug)):
    paths = loom.store.get_paths(name, sample_count=sample_count)
    # Skip regeneration if every artifact exists and was written by the
    # current loom version.
    if not force and all(os.path.exists(f) for f in paths.itervalues()):
        with open_compressed(paths['ingest']['version']) as f:
            version = f.read().strip()
        if version == loom.__version__:
            return
    print 'generating', name
    mkdir_p(paths['root'])
    with open_compressed(paths['ingest']['version'], 'w') as f:
        f.write(loom.__version__)
    config = CONFIGS[name]
    # Export csv in roughly eight chunks of at least ten rows each.
    chunk_size = max(10, (config['row_count'] + 7) / 8)
    loom.transforms.make_fake_transforms(
        transforms_out=paths['ingest']['transforms'])
    # Generate synthetic rows plus a first sample to derive formats from.
    loom.generate.generate(
        init_out=paths['samples'][0]['init'],
        rows_out=paths['ingest']['rows'],
        model_out=paths['samples'][0]['model'],
        groups_out=paths['samples'][0]['groups'],
        assign_out=paths['samples'][0]['assign'],
        **config)
    loom.format.make_schema(
        model_in=paths['samples'][0]['model'],
        schema_out=paths['ingest']['schema'])
    loom.format.make_fake_encoding(
        schema_in=paths['ingest']['schema'],
        model_in=paths['samples'][0]['model'],
        encoding_out=paths['ingest']['encoding'])
    loom.format.make_schema_row(
        schema_in=paths['ingest']['schema'],
        schema_row_out=paths['ingest']['schema_row'])
    loom.runner.tare(
        schema_row_in=paths['ingest']['schema_row'],
        rows_in=paths['ingest']['rows'],
        tares_out=paths['ingest']['tares'],
        debug=debug)
    loom.runner.sparsify(
        schema_row_in=paths['ingest']['schema_row'],
        tares_in=paths['ingest']['tares'],
        rows_in=paths['ingest']['rows'],
        rows_out=paths['ingest']['diffs'],
        debug=debug)
    loom.format.export_rows(
        encoding_in=paths['ingest']['encoding'],
        rows_in=paths['ingest']['rows'],
        rows_csv_out=paths['ingest']['rows_csv'],
        chunk_size=chunk_size)
    loom.format.import_rowids(
        rows_csv_in=paths['ingest']['rows_csv'],
        rowids_out=paths['ingest']['rowids'],
        id_field='_id')
    protobuf_stream_dump([], paths['query']['query_log'])
    loom.config.config_dump({}, paths['query']['config'])
    # Per-sample artifacts: config, init model, shuffled rows, empty log.
    for seed, sample in enumerate(paths['samples']):
        loom.config.config_dump({'seed': seed}, sample['config'])
        loom.generate.generate_init(
            encoding_in=paths['ingest']['encoding'],
            model_out=sample['init'],
            seed=seed)
        loom.runner.shuffle(
            rows_in=paths['ingest']['diffs'],
            rows_out=sample['shuffled'],
            seed=seed,
            debug=debug)
        protobuf_stream_dump([], sample['infer_log'])
    # Derive the remaining samples from sample 0, either by copying
    # (debug shortcut) or by running the mix kernel.
    sample0 = paths['samples'][0]
    for seed, sample in enumerate(paths['samples'][1:]):
        if LOOM_DEBUG_MIX:
            cp_ns(sample0['model'], sample['model'])
            cp_ns(sample0['groups'], sample['groups'])
            cp_ns(sample0['assign'], sample['assign'])
        else:
            loom.runner.mix(
                config_in=sample['config'],
                rows_in=paths['ingest']['rows'],
                model_in=sample0['model'],
                groups_in=sample0['groups'],
                assign_in=sample0['assign'],
                model_out=sample['model'],
                groups_out=sample['groups'],
                assign_out=sample['assign'],
                debug=debug)
    loom.consensus.make_fake_consensus(
        paths=paths,
        debug=debug)
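
# A hypothetical driver, shown for illustration only: regenerate every
# configured dataset by mapping generate_one over CONFIGS, reusing the
# parallel_map helper that transform_rows above already depends on.
def _example_generate_all(sample_count=2, force=False, debug=False):
    tasks = [
        (name, sample_count, force, debug)
        for name in sorted(CONFIGS.iterkeys())
    ]
    parallel_map(generate_one, tasks)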