def infer_checkpoint(name=None, period_sec=0, debug=False, profile='time'):
    '''
    Run inference from checkpoint, or list available checkpoints.
    '''
    if name is None:
        list_options_and_exit(CHECKPOINTS)
    rows_path = ROWS.format(name)
    model_path = MODEL.format(name)
    checkpoint_path = CHECKPOINTS.format(name)
    assert os.path.exists(rows_path), 'First load dataset'
    assert os.path.exists(model_path), 'First load dataset'
    assert os.path.exists(checkpoint_path), 'First load checkpoint'
    results_dir = os.path.join(RESULTS, name)
    mkdir_p(results_dir)
    # dump an inference schedule that checkpoints every period_sec seconds
    config_in = os.path.join(results_dir, 'config.pb.gz')
    loom.config.config_dump(
        {'schedule': {'checkpoint_period_sec': period_sec}},
        config_in)
    # resume from the saved checkpoint's model/groups/assign '_in' files
    options = {'debug': debug, 'profile': profile}
    options.update(checkpoint_files(checkpoint_path, '_in'))
    loom.runner.infer(config_in=config_in, rows_in=rows_path, **options)
def test(sample_count=2, force=True, debug=False):
    '''
    Generate small synthetic datasets for testing.
    '''
    mkdir_p(loom.store.STORE)
    # schedule the most expensive configs first for better load balancing
    ordered = sorted(TEST_CONFIGS, key=lambda c: -get_cost(CONFIGS[c]))
    tasks = [(name, sample_count, force, debug) for name in ordered]
    parallel_map(generate_one, tasks)
def test(sample_count=2, force=True, debug=False):
    '''
    Generate small synthetic datasets for testing.
    '''
    mkdir_p(loom.store.STORE)
    # most costly configs first, so parallel workers stay busy
    configs = sorted(TEST_CONFIGS, key=lambda name: -get_cost(CONFIGS[name]))
    parallel_map(
        generate_one,
        [(name, sample_count, force, debug) for name in configs])
def load_one(name): dataset = os.path.join(DATASETS, name) mkdir_p(dataset) init_out = INIT.format(name) rows_out = ROWS.format(name) model_out = MODEL.format(name) groups_out = GROUPS.format(name) if not all(os.path.exists(f) for f in [rows_out, model_out, groups_out]): print 'generating', name config = CONFIGS[name] loom.generate.generate( init_out=init_out, rows_out=rows_out, model_out=model_out, groups_out=groups_out, **config)
def infer( name=None, extra_passes=loom.config.DEFAULTS['schedule']['extra_passes'], debug=False, profile='time'): ''' Run inference on a dataset, or list available datasets. ''' if name is None: list_options_and_exit(ROWS) init = INIT.format(name) rows = ROWS.format(name) assert os.path.exists(init), 'First load dataset' assert os.path.exists(rows), 'First load dataset' assert extra_passes > 0, 'cannot initialize with extra_passes = 0' destin = os.path.join(RESULTS, name) mkdir_p(destin) groups_out = os.path.join(destin, 'groups') mkdir_p(groups_out) config = {'schedule': {'extra_passes': extra_passes}} config_in = os.path.join(destin, 'config.pb.gz') loom.config.config_dump(config, config_in) loom.runner.infer( config_in=config_in, rows_in=rows, model_in=init, groups_out=groups_out, debug=debug, profile=profile) assert os.listdir(groups_out), 'no groups were written' group_counts = [] for f in os.listdir(groups_out): group_count = 0 for _ in protobuf_stream_load(os.path.join(groups_out, f)): group_count += 1 group_counts.append(group_count) print 'group_counts: {}'.format(' '.join(map(str, group_counts)))
def shuffle(name=None, debug=False, profile='time'):
    '''
    Shuffle dataset for inference.
    '''
    if name is None:
        list_options_and_exit(ROWS)
    source = ROWS.format(name)
    assert os.path.exists(source), 'First load dataset'
    results_dir = os.path.join(RESULTS, name)
    mkdir_p(results_dir)
    shuffled = os.path.join(results_dir, 'rows.pbs.gz')
    loom.runner.shuffle(
        rows_in=source,
        rows_out=shuffled,
        debug=debug,
        profile=profile)
    assert os.path.exists(shuffled)
def download(s3_url=S3_URL): ''' Download dataset from S3 and load into loom.benchmark jig. ''' import boto bucket, path = s3_split(s3_url) conn = boto.connect_s3().get_bucket(bucket) keys = [ key.name for key in conn.list(path) if re.match(r'.*\d\d\d\.csv\.gz$', key.name) ] assert keys, 'nothing to download' files = [os.path.join(ROWS_CSV, os.path.basename(key)) for key in keys] tasks = [(bucket, source, destin) for source, destin in izip(keys, files) if not os.path.exists(destin)] if tasks: print 'starting download of {} files'.format(len(tasks)) mkdir_p(ROWS_CSV) parallel_map(s3_get, tasks) print 'finished download of {} files'.format(len(keys))
def download(s3_url=S3_URL): ''' Download dataset from S3 and load into loom.benchmark jig. ''' import boto bucket, path = s3_split(s3_url) conn = boto.connect_s3().get_bucket(bucket) keys = [ key.name for key in conn.list(path) if re.match(r'.*\d\d\d\.csv\.gz$', key.name) ] assert keys, 'nothing to download' files = [os.path.join(ROWS_CSV, os.path.basename(key)) for key in keys] tasks = [ (bucket, source, destin) for source, destin in izip(keys, files) if not os.path.exists(destin) ] if tasks: print 'starting download of {} files'.format(len(tasks)) mkdir_p(ROWS_CSV) parallel_map(s3_get, tasks) print 'finished download of {} files'.format(len(keys))
def load_checkpoint(name=None, period_sec=5, debug=False):
    '''
    Grab last full checkpoint for profiling, or list available datasets.
    '''
    loom.store.require(name, ['samples.0.init', 'samples.0.shuffled'])
    inputs, results = get_paths(name, 'checkpoints')
    # start from a clean results directory; all step dirs live under it
    rm_rf(results['root'])
    mkdir_p(results['root'])
    with chdir(results['root']):
        # schedule a checkpoint every period_sec seconds of inference
        config = {'schedule': {'checkpoint_period_sec': period_sec}}
        loom.config.config_dump(config, results['samples'][0]['config'])
        # run first iteration
        step = 0
        mkdir_p(str(step))
        kwargs = checkpoint_files(step, '_out')
        print 'running checkpoint {}, tardis_iter 0'.format(step)
        loom.runner.infer(
            config_in=results['samples'][0]['config'],
            rows_in=inputs['samples'][0]['shuffled'],
            tares_in=inputs['ingest']['tares'],
            model_in=inputs['samples'][0]['init'],
            log_out=results['samples'][0]['infer_log'],
            debug=debug,
            **kwargs)
        checkpoint = _load_checkpoint(step)
        # find penultimate checkpoint
        # each pass resumes from step - 1 and writes step, keeping a
        # sliding window of step dirs (anything older than step - 3 is
        # deleted) until the runner marks the checkpoint finished
        while not checkpoint.finished:
            rm_rf(str(step - 3))
            step += 1
            print 'running checkpoint {}, tardis_iter {}'.format(
                step, checkpoint.tardis_iter)
            kwargs = checkpoint_files(step - 1, '_in')
            mkdir_p(str(step))
            kwargs.update(checkpoint_files(step, '_out'))
            loom.runner.infer(
                config_in=results['samples'][0]['config'],
                rows_in=inputs['samples'][0]['shuffled'],
                tares_in=inputs['ingest']['tares'],
                log_out=results['samples'][0]['infer_log'],
                debug=debug,
                **kwargs)
            checkpoint = _load_checkpoint(step)
        print 'final checkpoint {}, tardis_iter {}'.format(
            step, checkpoint.tardis_iter)
        # the last FULL checkpoint is two steps back from the finished one
        last_full = str(step - 2)
        assert os.path.exists(last_full), 'too few checkpoints'
        # NOTE(review): this re-loads checkpoint `step` (already loaded
        # above), so the 'saving' message reports the final step's
        # tardis_iter, not last_full's — looks like it should be
        # _load_checkpoint(last_full); confirm before changing
        checkpoint = _load_checkpoint(step)
        print 'saving checkpoint {}, tardis_iter {}'.format(
            last_full, checkpoint.tardis_iter)
        # promote last_full's files to the results root, then drop all
        # remaining numbered step directories
        for f in checkpoint_files(last_full).values():
            shutil.move(f, results['root'])
        for f in glob.glob(os.path.join(results['root'], '[0-9]*/')):
            shutil.rmtree(f)
def generate_one((name, sample_count, force, debug)):
    # Build one complete synthetic dataset (rows, model, encodings,
    # per-sample inits/shuffles) for the named config, skipping work when a
    # matching-version dataset already exists and force is False.
    # Takes a single tuple argument so it can be used with parallel_map.
    paths = loom.store.get_paths(name, sample_count=sample_count)
    if not force and all(os.path.exists(f) for f in paths.itervalues()):
        # dataset is only considered fresh if it was generated by this
        # exact loom version
        with open_compressed(paths['ingest']['version']) as f:
            version = f.read().strip()
        if version == loom.__version__:
            return
    print 'generating', name
    mkdir_p(paths['root'])
    # stamp the dataset with the generating version first
    with open_compressed(paths['ingest']['version'], 'w') as f:
        f.write(loom.__version__)
    config = CONFIGS[name]
    # split csv export into ~8 chunks, but never fewer than 10 rows each
    chunk_size = max(10, (config['row_count'] + 7) / 8)
    loom.transforms.make_fake_transforms(
        transforms_out=paths['ingest']['transforms'])
    # generate rows plus sample 0's init/model/groups/assign in one shot
    loom.generate.generate(
        init_out=paths['samples'][0]['init'],
        rows_out=paths['ingest']['rows'],
        model_out=paths['samples'][0]['model'],
        groups_out=paths['samples'][0]['groups'],
        assign_out=paths['samples'][0]['assign'],
        **config)
    # derive schema and encoding artifacts from the generated model
    loom.format.make_schema(
        model_in=paths['samples'][0]['model'],
        schema_out=paths['ingest']['schema'])
    loom.format.make_fake_encoding(
        schema_in=paths['ingest']['schema'],
        model_in=paths['samples'][0]['model'],
        encoding_out=paths['ingest']['encoding'])
    loom.format.make_schema_row(
        schema_in=paths['ingest']['schema'],
        schema_row_out=paths['ingest']['schema_row'])
    # tare + sparsify: compute common-row tares, then store rows as diffs
    loom.runner.tare(
        schema_row_in=paths['ingest']['schema_row'],
        rows_in=paths['ingest']['rows'],
        tares_out=paths['ingest']['tares'],
        debug=debug)
    loom.runner.sparsify(
        schema_row_in=paths['ingest']['schema_row'],
        tares_in=paths['ingest']['tares'],
        rows_in=paths['ingest']['rows'],
        rows_out=paths['ingest']['diffs'],
        debug=debug)
    loom.format.export_rows(
        encoding_in=paths['ingest']['encoding'],
        rows_in=paths['ingest']['rows'],
        rows_csv_out=paths['ingest']['rows_csv'],
        chunk_size=chunk_size)
    loom.format.import_rowids(
        rows_csv_in=paths['ingest']['rows_csv'],
        rowids_out=paths['ingest']['rowids'],
        id_field='_id')
    # empty query log / default query config placeholders
    protobuf_stream_dump([], paths['query']['query_log'])
    loom.config.config_dump({}, paths['query']['config'])
    # per-sample setup: seeded config, init model, shuffled diffs, empty log
    for seed, sample in enumerate(paths['samples']):
        loom.config.config_dump({'seed': seed}, sample['config'])
        loom.generate.generate_init(
            encoding_in=paths['ingest']['encoding'],
            model_out=sample['init'],
            seed=seed)
        loom.runner.shuffle(
            rows_in=paths['ingest']['diffs'],
            rows_out=sample['shuffled'],
            seed=seed,
            debug=debug)
        protobuf_stream_dump([], sample['infer_log'])
    # derive samples 1..N-1 from sample 0, either by plain copy
    # (LOOM_DEBUG_MIX) or by mixing sample 0's state
    sample0 = paths['samples'][0]
    for seed, sample in enumerate(paths['samples'][1:]):
        if LOOM_DEBUG_MIX:
            cp_ns(sample0['model'], sample['model'])
            cp_ns(sample0['groups'], sample['groups'])
            cp_ns(sample0['assign'], sample['assign'])
        else:
            loom.runner.mix(
                config_in=sample['config'],
                rows_in=paths['ingest']['rows'],
                model_in=sample0['model'],
                groups_in=sample0['groups'],
                assign_in=sample0['assign'],
                model_out=sample['model'],
                groups_out=sample['groups'],
                assign_out=sample['assign'],
                debug=debug)
    loom.consensus.make_fake_consensus(
        paths=paths,
        debug=debug)
def load_checkpoint(name=None, period_sec=5, debug=False): ''' Grab last full checkpoint for profiling, or list available datasets. ''' if name is None: list_options_and_exit(MODEL) rows = ROWS.format(name) model = MODEL.format(name) assert os.path.exists(model), 'First load dataset' assert os.path.exists(rows), 'First load dataset' destin = CHECKPOINTS.format(name) rm_rf(destin) mkdir_p(os.path.dirname(destin)) def load_checkpoint(name): checkpoint = loom.schema_pb2.Checkpoint() with open_compressed(checkpoint_files(name)['checkpoint']) as f: checkpoint.ParseFromString(f.read()) return checkpoint with tempdir(cleanup_on_error=(not debug)): config = {'schedule': {'checkpoint_period_sec': period_sec}} config_in = os.path.abspath('config.pb.gz') loom.config.config_dump(config, config_in) # run first iteration step = 0 mkdir_p(str(step)) kwargs = checkpoint_files(str(step), '_out') print 'running checkpoint {}, tardis_iter 0'.format(step) loom.runner.infer( config_in=config_in, rows_in=rows, model_in=model, debug=debug, **kwargs) checkpoint = load_checkpoint(step) # find penultimate checkpoint while not checkpoint.finished: rm_rf(str(step - 3)) step += 1 print 'running checkpoint {}, tarids_iter {}'.format( step, checkpoint.tardis_iter) kwargs = checkpoint_files(step - 1, '_in') mkdir_p(str(step)) kwargs.update(checkpoint_files(step, '_out')) loom.runner.infer( config_in=config_in, rows_in=rows, debug=debug, **kwargs) checkpoint = load_checkpoint(step) print 'final checkpoint {}, tardis_iter {}'.format( step, checkpoint.tardis_iter) last_full = str(step - 2) assert os.path.exists(last_full), 'too few checkpoints' checkpoint = load_checkpoint(step) print 'saving checkpoint {}, tardis_iter {}'.format( last_full, checkpoint.tardis_iter) shutil.move(last_full, destin)