def init(sample_count=1, force=False, debug=False):
    '''
    Generate synthetic datasets for testing and benchmarking.
    '''
    # Most expensive configs are queued first so parallel workers stay busy.
    by_cost = sorted(CONFIGS.keys(), key=lambda c: get_cost(CONFIGS[c]), reverse=True)
    tasks = []
    for name in by_cost:
        tasks.append((name, sample_count, force, debug))
    parallel_map(generate_one, tasks)
def init(sample_count=1, force=False, debug=False):
    '''
    Generate synthetic datasets for testing and benchmarking.
    '''
    # Negated key sorts costliest configs to the front of the work queue.
    ordered = sorted(CONFIGS.keys(), key=lambda name: -get_cost(CONFIGS[name]))
    parallel_map(
        generate_one,
        [(name, sample_count, force, debug) for name in ordered],
    )
def test(sample_count=2, force=True, debug=False):
    '''
    Generate small synthetic datasets for testing.
    '''
    mkdir_p(loom.store.STORE)
    # Run the costliest test configs first to maximize worker utilization.
    ordered = sorted(TEST_CONFIGS, key=lambda name: get_cost(CONFIGS[name]), reverse=True)
    tasks = [(name, sample_count, force, debug) for name in ordered]
    parallel_map(generate_one, tasks)
def test(sample_count=2, force=True, debug=False):
    '''
    Generate small synthetic datasets for testing.
    '''
    mkdir_p(loom.store.STORE)
    # Descending cost order: expensive configs dispatched first.
    ordered = sorted(TEST_CONFIGS, key=lambda c: -get_cost(CONFIGS[c]))
    work = []
    for name in ordered:
        work.append((name, sample_count, force, debug))
    parallel_map(generate_one, work)
def infer(name, sample_count=10, config=None, debug=False):
    '''
    Infer samples in parallel.

    Arguments:
        name            A unique identifier for ingest + inference
        sample_count    The number of samples to draw, typically 10-100
        config          An optional json config file, e.g.,
                        {"schedule": {"extra_passes": 500.0}}
        debug           Whether to run debug versions of C++ code

    Environment variables:
        LOOM_THREADS    Number of concurrent inference tasks
        LOOM_VERBOSITY  Verbosity level

    Raises:
        LoomError       If sample_count < 1.
    '''
    # Validate with an explicit raise, not assert: assertions are stripped
    # under `python -O`, and the sibling infer() variant in this file
    # already raises LoomError for this exact check.
    if sample_count < 1:
        raise LoomError('Too few samples: {}'.format(sample_count))
    parallel_map(_infer_one, [
        (name, seed, config, debug)
        for seed in xrange(sample_count)
    ])
def infer(name, sample_count=DEFAULTS['sample_count'], config=None, debug=False):
    '''
    Infer samples in parallel.

    Arguments:
        name            A unique identifier for ingest + inference
        sample_count    The number of samples to draw, typically 10-100
        config          An optional json config file, e.g.,
                        {"schedule": {"extra_passes": 500.0}}
        debug           Whether to run debug versions of C++ code

    Environment variables:
        LOOM_THREADS    Number of concurrent inference tasks
        LOOM_VERBOSITY  Verbosity level

    Raises:
        LoomError       If sample_count < 1.
    '''
    if not sample_count >= 1:
        raise LoomError('Too few samples: {}'.format(sample_count))
    # One task per seed; each draws an independent sample.
    tasks = [(name, seed, config, debug) for seed in xrange(sample_count)]
    parallel_map(_infer_one, tasks)
def download(s3_url=S3_URL): ''' Download dataset from S3 and load into loom.benchmark jig. ''' import boto bucket, path = s3_split(s3_url) conn = boto.connect_s3().get_bucket(bucket) keys = [ key.name for key in conn.list(path) if re.match(r'.*\d\d\d\.csv\.gz$', key.name) ] assert keys, 'nothing to download' files = [os.path.join(ROWS_CSV, os.path.basename(key)) for key in keys] tasks = [(bucket, source, destin) for source, destin in izip(keys, files) if not os.path.exists(destin)] if tasks: print 'starting download of {} files'.format(len(tasks)) mkdir_p(ROWS_CSV) parallel_map(s3_get, tasks) print 'finished download of {} files'.format(len(keys))
def transform_rows(schema_in, transforms_in, rows_in, rows_out, id_field=None):
    '''
    Apply the pickled transform sequence from transforms_in to rows_in,
    writing results to rows_out. With no transforms, rows are copied
    through unchanged. rows_in may be a single file or a directory of
    files, processed in parallel.
    '''
    transforms = pickle_load(transforms_in)
    if not transforms:
        # Nothing to apply; pass the rows through untouched.
        cp_ns(rows_in, rows_out)
        return
    transform = TransformSequence(transforms)
    header = sorted(json_load(schema_in).iterkeys())
    if id_field is not None:
        assert id_field not in header
        header = [id_field] + header
    if os.path.isdir(rows_in):
        loom.util.mkdir_p(rows_out)
        # One task per file, preserving file names in the output dir.
        tasks = [
            (transform, header, os.path.join(rows_in, f), os.path.join(rows_out, f))
            for f in os.listdir(rows_in)
        ]
    else:
        tasks = [(transform, header, rows_in, rows_out)]
    parallel_map(_transform_rows, tasks)
def download(s3_url=S3_URL): ''' Download dataset from S3 and load into loom.benchmark jig. ''' import boto bucket, path = s3_split(s3_url) conn = boto.connect_s3().get_bucket(bucket) keys = [ key.name for key in conn.list(path) if re.match(r'.*\d\d\d\.csv\.gz$', key.name) ] assert keys, 'nothing to download' files = [os.path.join(ROWS_CSV, os.path.basename(key)) for key in keys] tasks = [ (bucket, source, destin) for source, destin in izip(keys, files) if not os.path.exists(destin) ] if tasks: print 'starting download of {} files'.format(len(tasks)) mkdir_p(ROWS_CSV) parallel_map(s3_get, tasks) print 'finished download of {} files'.format(len(keys))
def init():
    '''
    Generate synthetic datasets for testing and benchmarking.
    '''
    # Load the most expensive configs first so workers stay saturated.
    ordered = sorted(CONFIGS.keys(), key=lambda c: get_cost(CONFIGS[c]), reverse=True)
    parallel_map(load_one, ordered)