Exemplo n.º 1
0
def _test_dataset(args):
    """Build one synthetic dataset in a temp dir and run its test case.

    ``args`` packs (dim, feature_type, density, infer_kinds, debug,
    hyper_prior) into one tuple so the function can be mapped over a pool.
    Returns [] on success, or a one-element list holding the error.
    """
    dim, feature_type, density, infer_kinds, debug, hyper_prior = args
    object_count, feature_count = dim
    with tempdir(cleanup_on_error=(not debug)):
        seed_all(SEED)

        # All artifacts are written inside the temp dir (the current cwd).
        config_name = os.path.abspath("config.pb")
        model_base_name = "model.pb"
        model_name = os.path.abspath(model_base_name)
        rows_name = os.path.abspath("rows.pbs")

        model, fixed_hyper_models = generate_model(
            feature_count, feature_type, hyper_prior
        )
        dump_model(model, model_name)
        fixed_model_names = []
        for index, fixed_model in enumerate(fixed_hyper_models):
            fixed_path = os.path.abspath(
                "fixed-{}-{}".format(index, model_base_name)
            )
            fixed_model_names.append(fixed_path)
            dump_model(fixed_model, fixed_path)
        if hyper_prior is None:
            # Without a hyper prior there should be no fixed-hyper models.
            assert len(fixed_model_names) == 0

        dump_rows(
            generate_rows(object_count, feature_count, feature_type, density),
            rows_name,
        )

        infer_cats = object_count > 1
        infer_hypers = hyper_prior is not None

        # Kind inference needs more samples and nonzero kernel iterations.
        if infer_kinds:
            sample_count = 10 * LATENT_SIZES[object_count][feature_count]
            iterations = 32
        else:
            sample_count = 10 * LATENT_SIZES[object_count][1]
            iterations = 0

        config = {
            "posterior_enum": {"sample_count": sample_count, "sample_skip": 10},
            "kernels": {
                "hyper": {"run": infer_hypers, "parallel": False},
                "kind": {
                    "iterations": iterations,
                    "row_queue_capacity": 0,
                    "score_parallel": False,
                },
            },
        }
        loom.config.config_dump(config, config_name)

        # Case label encodes the parameters plus C/K/H inference flags.
        casename = "{}-{}-{}-{}-{}{}{}".format(
            object_count,
            feature_count,
            feature_type,
            density,
            "C" if infer_cats else "",
            "K" if infer_kinds else "",
            "H" if infer_hypers else "",
        )
        # LOG('Run', casename)
        error = _test_dataset_config(
            casename,
            object_count,
            feature_count,
            config_name,
            model_name,
            fixed_model_names,
            rows_name,
            config,
            debug,
        )
        return [] if error is None else [error]
Exemplo n.º 2
0
def _test_modify_schema(modify, name, schema, rows_csv, **unused):
    """Apply ``modify`` to the schema's JSON contents, then ingest it."""
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR) as store:
        with mock.patch('loom.store.STORE', new=store):
            patched_schema = os.path.join(store, 'schema.json')
            json_dump(modify(json_load(schema)), patched_schema)
            loom.tasks.ingest(name, patched_schema, rows_csv, debug=True)
Exemplo n.º 3
0
def test_dump_rows():
    """Dump generated rows and verify each record parses as a Row proto."""
    for feature_type in FEATURE_TYPES:
        table = generate_rows(10, 10, feature_type, 0.5)
        with tempdir():
            target = os.path.abspath('rows.pbs')
            dump_rows(table, target)
            row = loom.schema_pb2.Row()
            for serialized in protobuf_stream_load(target):
                row.ParseFromString(serialized)
Exemplo n.º 4
0
def test_dump_rows():
    """Each serialized record written by dump_rows must parse as a Row."""
    for feature_type in FEATURE_TYPES:
        generated = generate_rows(10, 10, feature_type, 0.5)
        with tempdir():
            path = os.path.abspath('rows.pbs')
            dump_rows(generated, path)
            parsed = loom.schema_pb2.Row()
            for blob in protobuf_stream_load(path):
                parsed.ParseFromString(blob)
Exemplo n.º 5
0
def _test_modify_csv(modify, name, schema, encoding, rows, **unused):
    """Export rows to CSV, apply ``modify`` to the data, then re-ingest."""
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR) as store:
        with mock.patch('loom.store.STORE', new=store):
            rows_dir = os.path.join(store, 'rows_csv')
            loom.format.export_rows(encoding, rows, rows_dir)
            first_file = os.listdir(rows_dir)[0]
            rows_csv = os.path.join(rows_dir, first_file)
            csv_dump(modify(csv_load(rows_csv)), rows_csv)
            loom.tasks.ingest(name, schema, rows_csv, debug=True)
Exemplo n.º 6
0
def generate_samples(casename, dataset, debug):
    """Run posterior enumeration in a temp dir and yield its samples."""
    root = os.getcwd()
    with tempdir(cleanup_on_error=(not debug)):
        filenames = {
            'schema': 'schema.json',
            'schema_row': 'schema_row.pb',
            'tares': 'tares.pbs',
            'diffs': 'diffs.pbs',
            'samples': 'samples.pbs.gz',
        }
        results = {
            key: os.path.abspath(name) for key, name in filenames.items()
        }
        # Restore the original cwd before running the enumeration.
        os.chdir(root)
        run_posterior_enum(casename, dataset, results, debug)
        for sample in load_samples(results['samples']):
            yield sample
Exemplo n.º 7
0
def generate_samples(casename, dataset, debug):
    """Yield posterior-enum samples for ``dataset``, using a temp workspace."""
    root = os.getcwd()
    with tempdir(cleanup_on_error=(not debug)):
        abspath = os.path.abspath
        results = dict(
            schema=abspath('schema.json'),
            schema_row=abspath('schema_row.pb'),
            tares=abspath('tares.pbs'),
            diffs=abspath('diffs.pbs'),
            samples=abspath('samples.pbs.gz'),
        )
        # tempdir() changed cwd; go back before running.
        os.chdir(root)
        run_posterior_enum(casename, dataset, results, debug)
        for sample in load_samples(results['samples']):
            yield sample
Exemplo n.º 8
0
def generate_samples(casename, dataset, debug):
    """Enumerate posterior samples for ``dataset`` and yield them lazily."""
    root = os.getcwd()
    with tempdir(cleanup_on_error=(not debug)):
        results = {}
        for key, filename in [
                ("schema", "schema.json"),
                ("schema_row", "schema_row.pb"),
                ("tares", "tares.pbs"),
                ("diffs", "diffs.pbs"),
                ("samples", "samples.pbs.gz")]:
            results[key] = os.path.abspath(filename)
        # Paths are now absolute, so it is safe to leave the temp dir.
        os.chdir(root)
        run_posterior_enum(casename, dataset, results, debug)
        for sample in load_samples(results["samples"]):
            yield sample
Exemplo n.º 9
0
def generate(
        feature_type='mixed',
        row_count=1000,
        feature_count=100,
        density=0.5,
        rows_out='rows.pbs.gz',
        model_out='model.pb.gz',
        groups_out=None,
        assign_out=None,
        init_out=None,
        debug=False,
        profile=None):
    '''
    Generate a synthetic dataset.
    '''
    root = os.getcwd()
    # Resolve all output paths before tempdir() changes the cwd.
    rows_out = os.path.abspath(rows_out)
    model_out = os.path.abspath(model_out)
    groups_out = None if groups_out is None else os.path.abspath(groups_out)
    assign_out = None if assign_out is None else os.path.abspath(assign_out)
    init_out = None if init_out is None else os.path.abspath(init_out)

    model = generate_model(generate_features(feature_count, feature_type))

    with tempdir(cleanup_on_error=(not debug)):
        if init_out is None:
            # Default the init model into the (possibly temporary) workspace.
            init_out = os.path.abspath('init.pb.gz')
        with open_compressed(init_out, 'wb') as f:
            f.write(model.SerializeToString())

        config_in = os.path.abspath('config.pb.gz')
        loom.config.config_dump(
            {'generate': {'row_count': row_count, 'density': density}},
            config_in)

        # Return to the original cwd so relative caller paths stay valid.
        os.chdir(root)
        loom.runner.generate(
            config_in=config_in,
            model_in=init_out,
            rows_out=rows_out,
            model_out=model_out,
            groups_out=groups_out,
            assign_out=assign_out,
            debug=debug,
            profile=profile)
Exemplo n.º 10
0
def generate_samples(model_name, rows_name, config_name, debug):
    """Run posterior enumeration and yield (sample, score) pairs."""
    with tempdir(cleanup_on_error=(not debug)):
        samples_name = os.path.abspath('samples.pbs.gz')
        with chdir(CWD):
            loom.runner.posterior_enum(
                config_name,
                model_name,
                rows_name,
                samples_name,
                debug=debug)
        # Reuse one message object while parsing the sample stream.
        message = loom.schema_pb2.PosteriorEnum.Sample()
        for serialized in protobuf_stream_load(samples_name):
            message.ParseFromString(serialized)
            yield parse_sample(message), float(message.score)
Exemplo n.º 11
0
def generate(feature_type='mixed',
             row_count=1000,
             feature_count=100,
             density=0.5,
             rows_out='rows.pbs.gz',
             model_out='model.pb.gz',
             groups_out=None,
             assign_out=None,
             init_out=None,
             debug=False,
             profile=None):
    '''
    Generate a synthetic dataset.
    '''
    _abs = os.path.abspath
    root = os.getcwd()
    # Make output paths absolute before the cwd changes inside tempdir().
    rows_out = _abs(rows_out)
    model_out = _abs(model_out)
    groups_out = _abs(groups_out) if groups_out is not None else None
    assign_out = _abs(assign_out) if assign_out is not None else None
    init_out = _abs(init_out) if init_out is not None else None

    model = generate_model(generate_features(feature_count, feature_type))

    with tempdir(cleanup_on_error=(not debug)):
        if init_out is None:
            init_out = _abs('init.pb.gz')
        # Seed the runner with the freshly generated model.
        with open_compressed(init_out, 'wb') as f:
            f.write(model.SerializeToString())

        config = {'generate': {'row_count': row_count, 'density': density}}
        config_in = _abs('config.pb.gz')
        loom.config.config_dump(config, config_in)

        # Restore the caller's cwd before invoking the runner.
        os.chdir(root)
        loom.runner.generate(config_in=config_in,
                             model_in=init_out,
                             rows_out=rows_out,
                             model_out=model_out,
                             groups_out=groups_out,
                             assign_out=assign_out,
                             debug=debug,
                             profile=profile)
Exemplo n.º 12
0
def generate(
        feature_type='mixed',
        row_count=1000,
        feature_count=100,
        density=0.5,
        init_out='init.pb.gz',
        rows_out='rows.pbs.gz',
        model_out='model.pb.gz',
        groups_out='groups',
        debug=False,
        profile=None):
    '''
    Generate a synthetic dataset.

    Writes the generated model to init_out, then runs loom.runner.generate
    to produce rows_out, model_out, and groups_out.
    '''
    root = os.path.abspath(os.path.curdir)
    # Resolve output paths before tempdir() changes the cwd.
    init_out = os.path.abspath(init_out)
    rows_out = os.path.abspath(rows_out)
    model_out = os.path.abspath(model_out)
    groups_out = os.path.abspath(groups_out)

    model = generate_model(row_count, feature_count, feature_type, density)
    # BUG FIX: SerializeToString() returns bytes, so the file must be opened
    # in binary mode ('wb', not 'w') — text mode corrupts the protobuf
    # output (and raises TypeError on Python 3). The sibling generate()
    # implementations in this file already use 'wb'.
    with open_compressed(init_out, 'wb') as f:
        f.write(model.SerializeToString())

    with tempdir(cleanup_on_error=(not debug)):
        config = {'generate': {'row_count': row_count, 'density': density}}
        config_in = os.path.abspath('config.pb.gz')
        loom.config.config_dump(config, config_in)

        # Return to the original cwd so the runner resolves paths as the
        # caller expects.
        os.chdir(root)
        loom.runner.generate(
            config_in=config_in,
            model_in=init_out,
            rows_out=rows_out,
            model_out=model_out,
            groups_out=groups_out,
            debug=debug,
            profile=profile)
Exemplo n.º 13
0
def _test_dataset(args):
    """Run one posterior-enum test case in a fresh temp dir.

    ``args`` is a single tuple (dim, feature_type, density, infer_kinds,
    debug, hyper_prior) so this function can be mapped over a worker pool.
    Returns ``[]`` on success or a one-element list holding the error.
    """
    dim, feature_type, density, infer_kinds, debug, hyper_prior = args
    object_count, feature_count = dim
    with tempdir(cleanup_on_error=(not debug)):
        # Seed for reproducible synthetic data per case.
        seed_all(SEED)

        # All artifacts are created in the temp dir (the current cwd).
        config_name = os.path.abspath('config.pb')
        model_base_name = 'model.pb'
        model_name = os.path.abspath(model_base_name)
        rows_name = os.path.abspath('rows.pbs')

        models = generate_model(feature_count, feature_type, hyper_prior)
        model, fixed_hyper_models = models
        dump_model(model, model_name)
        fixed_model_names = []
        for i, fm in enumerate(fixed_hyper_models):
            fixed_model_base = 'fixed-{}-{}'.format(i, model_base_name)
            fixed_model_name = os.path.abspath(fixed_model_base)
            fixed_model_names.append(fixed_model_name)
            dump_model(fm, fixed_model_name)
        if hyper_prior is None:
            # No hyper prior implies no fixed-hyper models were generated.
            assert len(fixed_model_names) == 0

        rows = generate_rows(object_count, feature_count, feature_type,
                             density)
        dump_rows(rows, rows_name)

        infer_cats = (object_count > 1)
        infer_hypers = (hyper_prior is not None)

        # Kind inference needs more samples and nonzero kernel iterations.
        if infer_kinds:
            sample_count = 10 * LATENT_SIZES[object_count][feature_count]
            iterations = 32
        else:
            sample_count = 10 * LATENT_SIZES[object_count][1]
            iterations = 0

        config = {
            'posterior_enum': {
                'sample_count': sample_count,
                'sample_skip': 10,
            },
            'kernels': {
                'hyper': {
                    'run': infer_hypers,
                    'parallel': False,
                },
                'kind': {
                    'iterations': iterations,
                    'row_queue_capacity': 0,
                    'score_parallel': False,
                },
            },
        }
        loom.config.config_dump(config, config_name)

        # Case label encodes the parameters plus C/K/H inference flags.
        casename = '{}-{}-{}-{}-{}{}{}'.format(object_count, feature_count,
                                               feature_type, density,
                                               ('C' if infer_cats else ''),
                                               ('K' if infer_kinds else ''),
                                               ('H' if infer_hypers else ''))
        # LOG('Run', casename)
        error = _test_dataset_config(casename, object_count, feature_count,
                                     config_name, model_name,
                                     fixed_model_names, rows_name, config,
                                     debug)
        return [] if error is None else [error]
Exemplo n.º 14
0
def test_missing_rows_error(name, schema, **unused):
    """Ingest with a nonexistent rows_csv path to exercise error handling."""
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR) as store:
        with mock.patch('loom.store.STORE', new=store):
            missing = os.path.join(store, 'missing.rows_csv')
            loom.tasks.ingest(name, schema, missing, debug=True)
Exemplo n.º 15
0
def _test_dataset(args):
    """Generate a synthetic model + rows in a temp dir and test one case.

    ``args`` is a single tuple (dim, feature_type, density, infer_kinds,
    debug, hyper_prior) so the function is usable with pool.map().
    Returns ``[]`` on success or ``[error]`` on failure.
    """
    dim, feature_type, density, infer_kinds, debug, hyper_prior = args
    object_count, feature_count = dim
    with tempdir(cleanup_on_error=(not debug)):
        # Deterministic generation so each case is reproducible.
        seed_all(SEED)

        # Artifact paths resolve inside the temp dir (the current cwd).
        config_name = os.path.abspath('config.pb')
        model_base_name = 'model.pb'
        model_name = os.path.abspath(model_base_name)
        rows_name = os.path.abspath('rows.pbs')

        models = generate_model(feature_count, feature_type, hyper_prior)
        model, fixed_hyper_models = models
        dump_model(model, model_name)
        fixed_model_names = []
        for i, fm in enumerate(fixed_hyper_models):
            fixed_model_base = 'fixed-{}-{}'.format(i, model_base_name)
            fixed_model_name = os.path.abspath(fixed_model_base)
            fixed_model_names.append(fixed_model_name)
            dump_model(fm, fixed_model_name)
        if hyper_prior is None:
            # Without a hyper prior there must be no fixed-hyper models.
            assert len(fixed_model_names) == 0

        rows = generate_rows(
            object_count,
            feature_count,
            feature_type,
            density)
        dump_rows(rows, rows_name)

        infer_cats = (object_count > 1)
        infer_hypers = (hyper_prior is not None)

        # Kind inference requires more samples and nonzero iterations.
        if infer_kinds:
            sample_count = 10 * LATENT_SIZES[object_count][feature_count]
            iterations = 32
        else:
            sample_count = 10 * LATENT_SIZES[object_count][1]
            iterations = 0

        config = {
            'posterior_enum': {
                'sample_count': sample_count,
                'sample_skip': 10,
            },
            'kernels': {
                'hyper': {
                    'run': infer_hypers,
                    'parallel': False,
                },
                'kind': {
                    'iterations': iterations,
                    'row_queue_capacity': 0,
                    'score_parallel': False,
                },
            },
        }
        loom.config.config_dump(config, config_name)

        # Case label encodes the parameters plus C/K/H inference flags.
        casename = '{}-{}-{}-{}-{}{}{}'.format(
            object_count,
            feature_count,
            feature_type,
            density,
            ('C' if infer_cats else ''),
            ('K' if infer_kinds else ''),
            ('H' if infer_hypers else ''))
        # LOG('Run', casename)
        error = _test_dataset_config(
            casename,
            object_count,
            feature_count,
            config_name,
            model_name,
            fixed_model_names,
            rows_name,
            config,
            debug)
        return [] if error is None else [error]