示例#1
0
def transform_rows(schema_in, transforms_in, rows_in, rows_out, id_field=None):
    """Run the stored transforms over rows_in and write the result to rows_out.

    When the object loaded from transforms_in is falsy, rows_in is simply
    handed to cp_ns(rows_in, rows_out) and nothing else happens.  Otherwise a
    TransformSequence is built from the transforms and _transform_rows is
    dispatched through parallel_map: once per directory entry when rows_in is
    a directory, or once for rows_in itself when it is not.

    Args:
        schema_in: Passed to json_load; the sorted keys of the loaded schema
            become the transformed header.
        transforms_in: Passed to pickle_load to obtain the transforms.
        rows_in: Input path; either a single file or a directory whose
            entries are each processed as a separate task.
        rows_out: Output path.  Created with loom.util.mkdir_p when rows_in
            is a directory, and each entry is written under the same name.
        id_field: Optional field name prepended to the header.

    Raises:
        AssertionError: If id_field is already one of the schema keys.
    """
    transforms = pickle_load(transforms_in)
    if not transforms:
        cp_ns(rows_in, rows_out)
        return

    transform = TransformSequence(transforms)
    # Iterating a mapping yields its keys on both Python 2 and Python 3,
    # whereas dict.iterkeys() only exists on Python 2.
    # (assumes json_load returns a dict, as the former .iterkeys() call did)
    transformed_header = sorted(json_load(schema_in))
    if id_field is not None:
        assert id_field not in transformed_header, (
            'id_field %r is already a schema field' % (id_field,))
        transformed_header = [id_field] + transformed_header

    if os.path.isdir(rows_in):
        loom.util.mkdir_p(rows_out)
        # One task per directory entry, mirrored under the same file name.
        tasks = [
            (
                transform,
                transformed_header,
                os.path.join(rows_in, entry),
                os.path.join(rows_out, entry),
            )
            for entry in os.listdir(rows_in)
        ]
    else:
        tasks = [(transform, transformed_header, rows_in, rows_out)]
    parallel_map(_transform_rows, tasks)
示例#2
0
def transform_rows(schema_in, transforms_in, rows_in, rows_out, id_field=None):
    """Run the stored transforms over rows_in and write the result to rows_out.

    When the object loaded from transforms_in is falsy, rows_in is simply
    handed to cp_ns(rows_in, rows_out) and nothing else happens.  Otherwise a
    TransformSequence is built from the transforms and _transform_rows is
    dispatched through parallel_map: once per directory entry when rows_in is
    a directory, or once for rows_in itself when it is not.

    Args:
        schema_in: Passed to json_load; the sorted keys of the loaded schema
            become the transformed header.
        transforms_in: Passed to pickle_load to obtain the transforms.
        rows_in: Input path; either a single file or a directory whose
            entries are each processed as a separate task.
        rows_out: Output path.  Created with loom.util.mkdir_p when rows_in
            is a directory, and each entry is written under the same name.
        id_field: Optional field name prepended to the header.

    Raises:
        AssertionError: If id_field is already one of the schema keys.
    """
    transforms = pickle_load(transforms_in)
    if not transforms:
        cp_ns(rows_in, rows_out)
        return

    transform = TransformSequence(transforms)
    # Iterating a mapping yields its keys on both Python 2 and Python 3,
    # whereas dict.iterkeys() only exists on Python 2.
    # (assumes json_load returns a dict, as the former .iterkeys() call did)
    transformed_header = sorted(json_load(schema_in))
    if id_field is not None:
        assert id_field not in transformed_header, (
            'id_field %r is already a schema field' % (id_field,))
        transformed_header = [id_field] + transformed_header

    if os.path.isdir(rows_in):
        loom.util.mkdir_p(rows_out)
        # One task per directory entry, mirrored under the same file name.
        tasks = [
            (
                transform,
                transformed_header,
                os.path.join(rows_in, entry),
                os.path.join(rows_out, entry),
            )
            for entry in os.listdir(rows_in)
        ]
    else:
        tasks = [(transform, transformed_header, rows_in, rows_out)]
    parallel_map(_transform_rows, tasks)
示例#3
0
def make_fake_consensus(paths, debug=False):
    """Populate the consensus paths from the corresponding 'samples.0' paths.

    For every (key, destination) pair produced by
    loom.store.iter_paths("samples.0", paths["consensus"]), the source path
    is looked up with loom.store.get_path(paths, key) and the pair is handed
    to cp_ns(source, destination).

    Args:
        paths: Path structure understood by loom.store; must contain a
            "consensus" entry.
        debug: Accepted for interface compatibility; not used in the body.
    """
    consensus_paths = paths["consensus"]
    for name, target in loom.store.iter_paths("samples.0", consensus_paths):
        cp_ns(loom.store.get_path(paths, name), target)
示例#4
0
def make_fake_consensus(paths, debug=False):
    """Populate the consensus paths from the corresponding 'samples.0' paths.

    For every (key, destination) pair produced by
    loom.store.iter_paths('samples.0', paths['consensus']), the source path
    is looked up with loom.store.get_path(paths, key) and the pair is handed
    to cp_ns(source, destination).

    Args:
        paths: Path structure understood by loom.store; must contain a
            'consensus' entry.
        debug: Accepted for interface compatibility; not used in the body.
    """
    consensus_paths = paths['consensus']
    for name, target in loom.store.iter_paths('samples.0', consensus_paths):
        cp_ns(loom.store.get_path(paths, name), target)
示例#5
0
文件: datasets.py 项目: jostheim/loom
def generate_one(args):
    """Generate all files of one named dataset unless they are up to date.

    The work is skipped when force is false, the existence check over the
    values of paths passes, and the stored version string equals
    loom.__version__.  Otherwise the version file is rewritten and the
    pipeline runs in order: fake transforms, generated rows/model/groups/
    assignments for sample 0, schema, fake encoding, schema row, tare,
    sparsify, CSV export, row-id import, empty query log and query config,
    then per-sample config/init/shuffle/empty infer log, then model, groups
    and assignments for the remaining samples, and finally the fake
    consensus.

    Args:
        args: A single (name, sample_count, force, debug) tuple, passed
            positionally.  name selects the entry of CONFIGS and is given
            to loom.store.get_paths together with sample_count; force
            disables the up-to-date shortcut; debug is forwarded to the
            loom.runner calls and to make_fake_consensus.

    Returns:
        None.
    """
    # Explicit unpacking replaces Python-2-only tuple parameters (PEP 3113);
    # callers still pass exactly one positional tuple.
    name, sample_count, force, debug = args
    paths = loom.store.get_paths(name, sample_count=sample_count)
    ingest = paths['ingest']
    samples = paths['samples']

    # NOTE(review): paths is indexed below as a nested structure
    # (paths['ingest'][...], paths['samples'][0][...]), so some of these
    # values are containers rather than path strings.  Passing those to
    # os.path.exists looks suspicious -- confirm what get_paths returns.
    if not force and all(os.path.exists(p) for p in paths.values()):
        with open_compressed(ingest['version']) as f:
            version = f.read().strip()
        if version == loom.__version__:
            return

    # Single pre-formatted argument prints identically on Python 2 and 3.
    print('generating %s' % (name,))
    mkdir_p(paths['root'])
    with open_compressed(ingest['version'], 'w') as f:
        f.write(loom.__version__)
    config = CONFIGS[name]
    # ceil(row_count / 8), but never below 10.  Floor division keeps the
    # result an integer on Python 3 as well (assumes row_count is an int).
    chunk_size = max(10, (config['row_count'] + 7) // 8)

    loom.transforms.make_fake_transforms(
        transforms_out=ingest['transforms'])
    sample0 = samples[0]
    loom.generate.generate(
        init_out=sample0['init'],
        rows_out=ingest['rows'],
        model_out=sample0['model'],
        groups_out=sample0['groups'],
        assign_out=sample0['assign'],
        **config)
    loom.format.make_schema(
        model_in=sample0['model'],
        schema_out=ingest['schema'])
    loom.format.make_fake_encoding(
        schema_in=ingest['schema'],
        model_in=sample0['model'],
        encoding_out=ingest['encoding'])
    loom.format.make_schema_row(
        schema_in=ingest['schema'],
        schema_row_out=ingest['schema_row'])
    loom.runner.tare(
        schema_row_in=ingest['schema_row'],
        rows_in=ingest['rows'],
        tares_out=ingest['tares'],
        debug=debug)
    loom.runner.sparsify(
        schema_row_in=ingest['schema_row'],
        tares_in=ingest['tares'],
        rows_in=ingest['rows'],
        rows_out=ingest['diffs'],
        debug=debug)
    loom.format.export_rows(
        encoding_in=ingest['encoding'],
        rows_in=ingest['rows'],
        rows_csv_out=ingest['rows_csv'],
        chunk_size=chunk_size)
    loom.format.import_rowids(
        rows_csv_in=ingest['rows_csv'],
        rowids_out=ingest['rowids'],
        id_field='_id')
    protobuf_stream_dump([], paths['query']['query_log'])
    loom.config.config_dump({}, paths['query']['config'])

    # Every sample gets its own config, init model and shuffled rows, using
    # the sample index as the seed.
    for seed, sample in enumerate(samples):
        loom.config.config_dump({'seed': seed}, sample['config'])
        loom.generate.generate_init(
            encoding_in=ingest['encoding'],
            model_out=sample['init'],
            seed=seed)
        loom.runner.shuffle(
            rows_in=ingest['diffs'],
            rows_out=sample['shuffled'],
            seed=seed,
            debug=debug)
        protobuf_stream_dump([], sample['infer_log'])

    # Samples after the first derive their model/groups/assign files from
    # sample 0: copied via cp_ns when LOOM_DEBUG_MIX is set, otherwise
    # produced by loom.runner.mix.
    for sample in samples[1:]:
        if LOOM_DEBUG_MIX:
            cp_ns(sample0['model'], sample['model'])
            cp_ns(sample0['groups'], sample['groups'])
            cp_ns(sample0['assign'], sample['assign'])
        else:
            loom.runner.mix(
                config_in=sample['config'],
                rows_in=ingest['rows'],
                model_in=sample0['model'],
                groups_in=sample0['groups'],
                assign_in=sample0['assign'],
                model_out=sample['model'],
                groups_out=sample['groups'],
                assign_out=sample['assign'],
                debug=debug)
    loom.consensus.make_fake_consensus(
        paths=paths,
        debug=debug)
示例#6
0
def generate_one(args):
    """Generate all files of one named dataset unless they are up to date.

    The work is skipped when force is false, the existence check over the
    values of paths passes, and the stored version string equals
    loom.__version__.  Otherwise the version file is rewritten and the
    pipeline runs in order: fake transforms, generated rows/model/groups/
    assignments for sample 0, schema, fake encoding, schema row, tare,
    sparsify, CSV export, row-id import, empty query log and query config,
    then per-sample config/init/shuffle/empty infer log, then model, groups
    and assignments for the remaining samples, and finally the fake
    consensus.

    Args:
        args: A single (name, sample_count, force, debug) tuple, passed
            positionally.  name selects the entry of CONFIGS and is given
            to loom.store.get_paths together with sample_count; force
            disables the up-to-date shortcut; debug is forwarded to the
            loom.runner calls and to make_fake_consensus.

    Returns:
        None.
    """
    # Explicit unpacking replaces Python-2-only tuple parameters (PEP 3113);
    # callers still pass exactly one positional tuple.
    name, sample_count, force, debug = args
    paths = loom.store.get_paths(name, sample_count=sample_count)
    ingest = paths['ingest']
    samples = paths['samples']

    # NOTE(review): paths is indexed below as a nested structure
    # (paths['ingest'][...], paths['samples'][0][...]), so some of these
    # values are containers rather than path strings.  Passing those to
    # os.path.exists looks suspicious -- confirm what get_paths returns.
    if not force and all(os.path.exists(p) for p in paths.values()):
        with open_compressed(ingest['version']) as f:
            version = f.read().strip()
        if version == loom.__version__:
            return

    # Single pre-formatted argument prints identically on Python 2 and 3.
    print('generating %s' % (name,))
    mkdir_p(paths['root'])
    with open_compressed(ingest['version'], 'w') as f:
        f.write(loom.__version__)
    config = CONFIGS[name]
    # ceil(row_count / 8), but never below 10.  Floor division keeps the
    # result an integer on Python 3 as well (assumes row_count is an int).
    chunk_size = max(10, (config['row_count'] + 7) // 8)

    loom.transforms.make_fake_transforms(
        transforms_out=ingest['transforms'])
    sample0 = samples[0]
    loom.generate.generate(
        init_out=sample0['init'],
        rows_out=ingest['rows'],
        model_out=sample0['model'],
        groups_out=sample0['groups'],
        assign_out=sample0['assign'],
        **config)
    loom.format.make_schema(
        model_in=sample0['model'],
        schema_out=ingest['schema'])
    loom.format.make_fake_encoding(
        schema_in=ingest['schema'],
        model_in=sample0['model'],
        encoding_out=ingest['encoding'])
    loom.format.make_schema_row(
        schema_in=ingest['schema'],
        schema_row_out=ingest['schema_row'])
    loom.runner.tare(
        schema_row_in=ingest['schema_row'],
        rows_in=ingest['rows'],
        tares_out=ingest['tares'],
        debug=debug)
    loom.runner.sparsify(
        schema_row_in=ingest['schema_row'],
        tares_in=ingest['tares'],
        rows_in=ingest['rows'],
        rows_out=ingest['diffs'],
        debug=debug)
    loom.format.export_rows(
        encoding_in=ingest['encoding'],
        rows_in=ingest['rows'],
        rows_csv_out=ingest['rows_csv'],
        chunk_size=chunk_size)
    loom.format.import_rowids(
        rows_csv_in=ingest['rows_csv'],
        rowids_out=ingest['rowids'],
        id_field='_id')
    protobuf_stream_dump([], paths['query']['query_log'])
    loom.config.config_dump({}, paths['query']['config'])

    # Every sample gets its own config, init model and shuffled rows, using
    # the sample index as the seed.
    for seed, sample in enumerate(samples):
        loom.config.config_dump({'seed': seed}, sample['config'])
        loom.generate.generate_init(
            encoding_in=ingest['encoding'],
            model_out=sample['init'],
            seed=seed)
        loom.runner.shuffle(
            rows_in=ingest['diffs'],
            rows_out=sample['shuffled'],
            seed=seed,
            debug=debug)
        protobuf_stream_dump([], sample['infer_log'])

    # Samples after the first derive their model/groups/assign files from
    # sample 0: copied via cp_ns when LOOM_DEBUG_MIX is set, otherwise
    # produced by loom.runner.mix.
    for sample in samples[1:]:
        if LOOM_DEBUG_MIX:
            cp_ns(sample0['model'], sample['model'])
            cp_ns(sample0['groups'], sample['groups'])
            cp_ns(sample0['assign'], sample['assign'])
        else:
            loom.runner.mix(
                config_in=sample['config'],
                rows_in=ingest['rows'],
                model_in=sample0['model'],
                groups_in=sample0['groups'],
                assign_in=sample0['assign'],
                model_out=sample['model'],
                groups_out=sample['groups'],
                assign_out=sample['assign'],
                debug=debug)
    loom.consensus.make_fake_consensus(
        paths=paths,
        debug=debug)