Example #1
def test_import_rows(encoding, rows, rows_csv, **unused):
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        rows_pbs = os.path.abspath('rows.pbs.gz')
        loom.format.import_rows(encoding_in=encoding,
                                rows_csv_in=rows_csv,
                                rows_out=rows_pbs)
        assert_found(rows_pbs)
        expected_count = sum(1 for _ in protobuf_stream_load(rows))
        actual_count = sum(1 for _ in protobuf_stream_load(rows_pbs))
        assert_equal(actual_count, expected_count)
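
All of these examples revolve around the same pair of stream helpers: protobuf_stream_load(filename) yields the raw serialized protobuf messages stored in a .pbs / .pbs.gz stream file, and protobuf_stream_dump(strings, filename) writes such a stream back out (see Example #8). The sketch below shows that round trip in isolation; it assumes the helpers are imported from the distributions.io.stream module of loom's distributions dependency and reuses the Row message from loom.schema_pb2.

# Minimal round-trip sketch (assumed import path for the stream helpers).
from distributions.io.stream import protobuf_stream_dump, protobuf_stream_load
from loom.schema_pb2 import Row


def count_rows(filename):
    # Each item yielded by protobuf_stream_load is one serialized message,
    # so counting rows never requires parsing them (the idiom used above).
    return sum(1 for _ in protobuf_stream_load(filename))


def copy_rows(rows_in, rows_out):
    # protobuf_stream_dump takes an iterable of serialized strings, so a
    # stream can be copied (or filtered) without parsing, as in Example #3.
    protobuf_stream_dump(protobuf_stream_load(rows_in), rows_out)


def parse_rows(filename):
    # Parsing requires knowing the message type; the loaders further down
    # use Row, PosteriorEnum.Sample, or ProductModel.Group as appropriate.
    for string in protobuf_stream_load(filename):
        row = Row()
        row.ParseFromString(string)
        yield row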
Example #2
def test_import_rows(encoding, rows, rows_csv, **unused):
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        rows_pbs = os.path.abspath('rows.pbs.gz')
        loom.format.import_rows(
            encoding_in=encoding,
            rows_csv_in=rows_csv,
            rows_out=rows_pbs)
        assert_found(rows_pbs)
        expected_count = sum(1 for _ in protobuf_stream_load(rows))
        actual_count = sum(1 for _ in protobuf_stream_load(rows_pbs))
        assert_equal(actual_count, expected_count)
Example #3
def crossvalidate_one(seed, test_count, train_count, inputs, results,
                      extra_passes, debug):
    LOG('running seed {}:'.format(seed))
    results['train'] = os.path.join(results['root'], 'train', 'diffs.pbs.gz')
    results['test'] = os.path.join(results['root'], 'test', 'rows.pbs.gz')
    results['scores'] = os.path.join(results['root'], 'scores.json.gz')

    config = {
        'seed': seed,
        'schedule': {
            'extra_passes': extra_passes
        },
    }
    loom.config.config_dump(config, results['samples'][0]['config'])

    numpy.random.seed(seed)
    split = [True] * train_count + [False] * test_count
    numpy.random.shuffle(split)
    diffs_in = protobuf_stream_load(inputs['ingest']['diffs'])
    protobuf_stream_dump((row for s, row in izip(split, diffs_in) if s),
                         results['train'])
    rows_in = protobuf_stream_load(inputs['ingest']['rows'])
    protobuf_stream_dump((row for s, row in izip(split, rows_in) if not s),
                         results['test'])

    LOG(' shuffle')
    loom.runner.shuffle(rows_in=results['train'],
                        rows_out=results['samples'][0]['shuffled'],
                        seed=seed,
                        debug=debug)
    LOG(' init')
    loom.generate.generate_init(encoding_in=inputs['ingest']['encoding'],
                                model_out=results['samples'][0]['init'],
                                seed=seed)
    LOG(' infer')
    loom.runner.infer(config_in=results['samples'][0]['config'],
                      rows_in=results['samples'][0]['shuffled'],
                      tares_in=inputs['ingest']['tares'],
                      model_in=results['samples'][0]['init'],
                      model_out=results['samples'][0]['model'],
                      groups_out=results['samples'][0]['groups'],
                      debug=debug)
    LOG(' query')
    rows = loom.query.load_data_rows(results['test'])
    loom.config.config_dump({}, results['query']['config'])
    with loom.query.get_server(results['root'], debug=debug) as query:
        scores = [query.score(row) for row in rows]

    json_dump(scores, results['scores'])
    LOG(' done\n')
    return numpy.mean(scores)
Example #4
def test_infer(name, tares, shuffled, init, **unused):
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        row_count = sum(1 for _ in protobuf_stream_load(shuffled))
        with open_compressed(init) as f:
            message = CrossCat()
            message.ParseFromString(f.read())
        kind_count = len(message.kinds)

        for config in CONFIGS:
            loom.config.fill_in_defaults(config)
            schedule = config['schedule']
            print 'config: {}'.format(config)

            greedy = (schedule['extra_passes'] == 0)
            kind_iters = config['kernels']['kind']['iterations']
            kind_structure_is_fixed = greedy or kind_iters == 0

            with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
                config_in = os.path.abspath('config.pb.gz')
                model_out = os.path.abspath('model.pb.gz')
                groups_out = os.path.abspath('groups')
                assign_out = os.path.abspath('assign.pbs.gz')
                log_out = os.path.abspath('log.pbs.gz')
                os.mkdir(groups_out)
                loom.config.config_dump(config, config_in)
                loom.runner.infer(
                    config_in=config_in,
                    rows_in=shuffled,
                    tares_in=tares,
                    model_in=init,
                    model_out=model_out,
                    groups_out=groups_out,
                    assign_out=assign_out,
                    log_out=log_out,
                    debug=True,)

                if kind_structure_is_fixed:
                    assert_equal(len(os.listdir(groups_out)), kind_count)

                group_counts = get_group_counts(groups_out)

                assign_count = sum(1 for _ in protobuf_stream_load(assign_out))
                assert_equal(assign_count, row_count)

            print 'row_count: {}'.format(row_count)
            print 'group_counts: {}'.format(' '.join(map(str, group_counts)))
            for group_count in group_counts:
                assert_true(
                    group_count <= row_count,
                    'groups are all singletons')
Example #5
def test_infer(name, tares, shuffled, init, **unused):
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        row_count = sum(1 for _ in protobuf_stream_load(shuffled))
        with open_compressed(init) as f:
            message = CrossCat()
            message.ParseFromString(f.read())
        kind_count = len(message.kinds)

        for config in CONFIGS:
            loom.config.fill_in_defaults(config)
            schedule = config['schedule']
            print 'config: {}'.format(config)

            greedy = (schedule['extra_passes'] == 0)
            kind_iters = config['kernels']['kind']['iterations']
            kind_structure_is_fixed = greedy or kind_iters == 0

            with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
                config_in = os.path.abspath('config.pb.gz')
                model_out = os.path.abspath('model.pb.gz')
                groups_out = os.path.abspath('groups')
                assign_out = os.path.abspath('assign.pbs.gz')
                log_out = os.path.abspath('log.pbs.gz')
                os.mkdir(groups_out)
                loom.config.config_dump(config, config_in)
                loom.runner.infer(
                    config_in=config_in,
                    rows_in=shuffled,
                    tares_in=tares,
                    model_in=init,
                    model_out=model_out,
                    groups_out=groups_out,
                    assign_out=assign_out,
                    log_out=log_out,
                    debug=True,
                )

                if kind_structure_is_fixed:
                    assert_equal(len(os.listdir(groups_out)), kind_count)

                group_counts = get_group_counts(groups_out)

                assign_count = sum(1 for _ in protobuf_stream_load(assign_out))
                assert_equal(assign_count, row_count)

            print 'row_count: {}'.format(row_count)
            print 'group_counts: {}'.format(' '.join(map(str, group_counts)))
            for group_count in group_counts:
                assert_true(group_count <= row_count,
                            'groups are all singletons')
Example #6
def load_rows(filename):
    rows = []
    for string in protobuf_stream_load(filename):
        row = Row()
        row.ParseFromString(string)
        rows.append(row)
    return rows
Example #7
def load_samples(filename):
    message = loom.schema_pb2.PosteriorEnum.Sample()
    for string in protobuf_stream_load(filename):
        message.ParseFromString(string)
        sample = parse_sample(message)
        score = float(message.score)
        yield sample, score
Example #8
def batch_predict(
        config_in,
        model_in,
        groups_in,
        requests,
        debug=False,
        profile=None):
    root = os.path.abspath(os.path.curdir)
    with tempdir(cleanup_on_error=(not debug)):
        requests_in = os.path.abspath('requests.pbs.gz')
        responses_out = os.path.abspath('responses.pbs.gz')
        protobuf_stream_dump(
            (q.SerializeToString() for q in requests),
            requests_in)

        os.chdir(root)
        loom.runner.query(
            config_in=config_in,
            model_in=model_in,
            groups_in=groups_in,
            requests_in=requests_in,
            responses_out=responses_out,
            debug=debug,
            profile=profile)

        return map(parse_response, protobuf_stream_load(responses_out))
Example #9
def run_posterior_enum(casename, dataset, results, debug, sparsify=True):
    if not sparsify:
        loom.runner.posterior_enum(
            config_in=dataset["config"],
            rows_in=dataset["rows"],
            model_in=dataset["model"],
            samples_out=results["samples"],
            debug=debug,
        )
    else:
        loom.format.make_schema(model_in=dataset["model"], schema_out=results["schema"])
        loom.format.make_schema_row(schema_in=results["schema"], schema_row_out=results["schema_row"])
        loom.runner.tare(
            schema_row_in=results["schema_row"], rows_in=dataset["rows"], tares_out=results["tares"], debug=debug
        )
        tare_count = sum(1 for _ in protobuf_stream_load(results["tares"]))
        if casename is not None and tare_count:
            LOG("Info", casename, "found {} tare rows".format(tare_count))
        loom.runner.sparsify(
            schema_row_in=results["schema_row"],
            tares_in=results["tares"],
            rows_in=dataset["rows"],
            rows_out=results["diffs"],
            debug=debug,
        )
        loom.runner.posterior_enum(
            config_in=dataset["config"],
            rows_in=results["diffs"],
            tares_in=results["tares"],
            model_in=dataset["model"],
            samples_out=results["samples"],
            debug=debug,
        )
Example #10
def load_samples(filename):
    message = loom.schema_pb2.PosteriorEnum.Sample()
    for string in protobuf_stream_load(filename):
        message.ParseFromString(string)
        sample = parse_sample(message)
        score = float(message.score)
        yield sample, score
Example #11
def pretty_print(filename, message_type='guess'):
    '''
    Print text/json/protobuf messages from a raw/gz/bz2 file.
    '''
    parts = os.path.basename(filename).split('.')
    if parts[-1] in ['gz', 'bz2']:
        parts.pop()
    protocol = parts[-1]
    if protocol == 'json':
        data = json_load(filename)
        print json.dumps(data, sort_keys=True, indent=4)
    elif protocol == 'pb':
        message = get_message(filename, message_type)
        with open_compressed(filename) as f:
            message.ParseFromString(f.read())
            print message
    elif protocol == 'pbs':
        message = get_message(filename, message_type)
        for string in protobuf_stream_load(filename):
            message.ParseFromString(string)
            print message
    elif protocol == 'pickle':
        data = pickle_load(filename)
        print repr(data)
    else:
        with open_compressed(filename) as f:
            for line in f:
                print line,
Example #12
def load_rows(filename):
    rows = []
    for string in protobuf_stream_load(filename):
        row = Row()
        row.ParseFromString(string)
        rows.append(row)
    return rows
Example #13
def test_posterior_enum(name, tares, diffs, init, **unused):
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        config_in = os.path.abspath('config.pb.gz')
        config = {
            'posterior_enum': {
                'sample_count': 7,
            },
            'kernels': {
                'kind': {
                    'row_queue_capacity': 0,
                    'score_parallel': False,
                },
            },
        }
        loom.config.config_dump(config, config_in)
        assert_found(config_in)

        samples_out = os.path.abspath('samples.pbs.gz')
        loom.runner.posterior_enum(config_in=config_in,
                                   model_in=init,
                                   tares_in=tares,
                                   rows_in=diffs,
                                   samples_out=samples_out,
                                   debug=True)
        assert_found(samples_out)
        actual_count = sum(1 for _ in protobuf_stream_load(samples_out))
        assert_equal(actual_count, config['posterior_enum']['sample_count'])
Example #14
def test_posterior_enum(rows, model, **unused):
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        config_in = os.path.abspath('config.pb.gz')
        config = {
            'posterior_enum': {
                'sample_count': 7,
            },
            'kernels': {
                'kind': {
                    'row_queue_capacity': 0,
                    'score_parallel': False,
                },
            },
        }
        loom.config.config_dump(config, config_in)
        assert_true(os.path.exists(config_in))

        samples_out = os.path.abspath('samples.pbs.gz')
        loom.runner.posterior_enum(
            config_in=config_in,
            model_in=model,
            rows_in=rows,
            samples_out=samples_out,
            debug=True)
        assert_true(os.path.exists(samples_out))
        actual_count = sum(1 for _ in protobuf_stream_load(samples_out))
        assert_equal(actual_count, config['posterior_enum']['sample_count'])
Example #15
def run_posterior_enum(casename, dataset, results, debug, sparsify=True):
    if not sparsify:
        loom.runner.posterior_enum(config_in=dataset['config'],
                                   rows_in=dataset['rows'],
                                   model_in=dataset['model'],
                                   samples_out=results['samples'],
                                   debug=debug)
    else:
        loom.format.make_schema(model_in=dataset['model'],
                                schema_out=results['schema'])
        loom.format.make_schema_row(schema_in=results['schema'],
                                    schema_row_out=results['schema_row'])
        loom.runner.tare(schema_row_in=results['schema_row'],
                         rows_in=dataset['rows'],
                         tares_out=results['tares'],
                         debug=debug)
        tare_count = sum(1 for _ in protobuf_stream_load(results['tares']))
        if casename is not None and tare_count:
            LOG('Info', casename, 'found {} tare rows'.format(tare_count))
        loom.runner.sparsify(schema_row_in=results['schema_row'],
                             tares_in=results['tares'],
                             rows_in=dataset['rows'],
                             rows_out=results['diffs'],
                             debug=debug)
        loom.runner.posterior_enum(config_in=dataset['config'],
                                   rows_in=results['diffs'],
                                   tares_in=results['tares'],
                                   model_in=dataset['model'],
                                   samples_out=results['samples'],
                                   debug=debug)
Example #16
def ingest(name, schema=None, rows_csv=None, id_field=None, debug=False):
    '''
    Ingest dataset with optional json config.
    Arguments:
        name            A unique identifier for ingest + inference
        schema          Json schema file, e.g., {"feature1": "nich"}
        rows_csv        File or directory of csv files or csv.gz files
        id_field        Column name of id field in input csv
        debug           Whether to run debug versions of C++ code
    Environment variables:
        LOOM_THREADS    Number of concurrent ingest tasks
        LOOM_VERBOSITY  Verbosity level
    '''
    paths = loom.store.get_paths(name)
    if schema is None:
        schema = paths['ingest']['schema']
    if rows_csv is None:
        rows_csv = paths['ingest']['rows_csv']
    if not os.path.exists(schema):
        raise LoomError('Missing schema file: {}'.format(schema))
    if not os.path.exists(rows_csv):
        raise LoomError('Missing rows_csv file: {}'.format(rows_csv))

    with open_compressed(paths['ingest']['version'], 'w') as f:
        f.write(loom.__version__)

    LOG('making schema row')
    loom.format.make_schema_row(schema_in=schema,
                                schema_row_out=paths['ingest']['schema_row'])

    LOG('making encoding')
    loom.format.make_encoding(schema_in=schema,
                              rows_in=rows_csv,
                              encoding_out=paths['ingest']['encoding'])

    LOG('importing rows')
    loom.format.import_rows(encoding_in=paths['ingest']['encoding'],
                            rows_csv_in=rows_csv,
                            rows_out=paths['ingest']['rows'])

    LOG('importing rowids')
    loom.format.import_rowids(rows_csv_in=rows_csv,
                              rowids_out=paths['ingest']['rowids'],
                              id_field=id_field)

    LOG('making tare rows')
    loom.runner.tare(schema_row_in=paths['ingest']['schema_row'],
                     rows_in=paths['ingest']['rows'],
                     tares_out=paths['ingest']['tares'],
                     debug=debug)

    tare_count = sum(1 for _ in protobuf_stream_load(paths['ingest']['tares']))
    LOG('sparsifying rows WRT {} tare rows'.format(tare_count))
    loom.runner.sparsify(schema_row_in=paths['ingest']['schema_row'],
                         tares_in=paths['ingest']['tares'],
                         rows_in=paths['ingest']['rows'],
                         rows_out=paths['ingest']['diffs'],
                         debug=debug)
    loom.config.config_dump({}, paths['query']['config'])
Example #17
def test_dump_rows():
    for feature_type in FEATURE_TYPES:
        table = generate_rows(10, 10, feature_type, 0.5)
        with tempdir():
            rows_name = os.path.abspath('rows.pbs')
            dump_rows(table, rows_name)
            message = loom.schema_pb2.Row()
            for string in protobuf_stream_load(rows_name):
                message.ParseFromString(string)
Example #18
def test_dump_rows():
    for feature_type in FEATURE_TYPES:
        table = generate_rows(10, 10, feature_type, 0.5)
        with tempdir():
            rows_name = os.path.abspath('rows.pbs')
            dump_rows(table, rows_name)
            message = loom.schema_pb2.Row()
            for string in protobuf_stream_load(rows_name):
                message.ParseFromString(string)
Example #19
def ingest(name, schema='schema.json', rows_csv='rows.csv.gz', debug=False):
    '''
    Ingest dataset with optional json config.
    Arguments:
        name            A unique identifier for ingest + inference
        schema          Json schema file, e.g., {"feature1": "nich"}
        rows_csv        File or directory of csv files
        debug           Whether to run debug versions of C++ code
    Environment variables:
        LOOM_THREADS    Number of concurrent ingest tasks
        LOOM_VERBOSITY  Verbosity level
    '''
    if not os.path.exists(schema):
        raise IOError('Missing schema file: {}'.format(schema))
    if not os.path.exists(rows_csv):
        raise IOError('Missing rows_csv file: {}'.format(rows_csv))

    paths = loom.store.get_paths(name)
    with open_compressed(paths['ingest']['version'], 'w') as f:
        f.write(loom.__version__)

    LOG('making schema row')
    loom.format.make_schema_row(
        schema_in=schema,
        schema_row_out=paths['ingest']['schema_row'])

    LOG('making encoding')
    loom.format.make_encoding(
        schema_in=schema,
        rows_in=rows_csv,
        encoding_out=paths['ingest']['encoding'])

    LOG('importing rows')
    loom.format.import_rows(
        encoding_in=paths['ingest']['encoding'],
        rows_csv_in=rows_csv,
        rows_out=paths['ingest']['rows'])

    LOG('making tare rows')
    loom.runner.tare(
        schema_row_in=paths['ingest']['schema_row'],
        rows_in=paths['ingest']['rows'],
        tares_out=paths['ingest']['tares'],
        debug=debug)

    tare_count = sum(1 for _ in protobuf_stream_load(paths['ingest']['tares']))
    LOG('sparsifying rows WRT {} tare rows'.format(tare_count))
    loom.runner.sparsify(
        schema_row_in=paths['ingest']['schema_row'],
        tares_in=paths['ingest']['tares'],
        rows_in=paths['ingest']['rows'],
        rows_out=paths['ingest']['diffs'],
        debug=debug)
Example #20
def test_group_pandas(root, rows_csv, rows, **unused):
    row_count = sum(1 for _ in protobuf_stream_load(rows))
    with loom.preql.get_server(root, debug=True) as preql:
        feature_names = preql.feature_names
        for feature in feature_names[:10]:
            result_string = preql.group(feature)
            result_df = pandas.read_csv(StringIO(result_string), index_col=0)
            print 'result_df ='
            print result_df
            assert_equal(result_df.ndim, 2)
            assert_equal(result_df.shape[0], row_count)
            assert_equal(result_df.shape[1], 2)
Example #21
def test_group_pandas(root, rows_csv, rows, **unused):
    row_count = sum(1 for _ in protobuf_stream_load(rows))
    with loom.preql.get_server(root, debug=True) as preql:
        feature_names = preql.feature_names
        for feature in feature_names[:10]:
            result_string = preql.group(feature)
            result_df = pandas.read_csv(StringIO(result_string), index_col=0)
            print 'result_df ='
            print result_df
            assert_equal(result_df.ndim, 2)
            assert_equal(result_df.shape[0], row_count)
            assert_equal(result_df.shape[1], 2)
Example #22
def get_group_counts(groups_out):
    group_counts = []
    for f in os.listdir(groups_out):
        group_count = 0
        groups = os.path.join(groups_out, f)
        for string in protobuf_stream_load(groups):
            group = ProductModel.Group()
            group.ParseFromString(string)
            group_count += 1
        group_counts.append(group_count)
    assert group_counts, 'no groups found'
    return group_counts
Example #23
def get_group_counts(groups_out):
    group_counts = []
    for f in os.listdir(groups_out):
        group_count = 0
        groups = os.path.join(groups_out, f)
        for string in protobuf_stream_load(groups):
            group = ProductModel.Group()
            group.ParseFromString(string)
            group_count += 1
        group_counts.append(group_count)
    assert group_counts, 'no groups found'
    return group_counts
Example #24
def generate_samples(model_name, rows_name, config_name, debug):
    with tempdir(cleanup_on_error=(not debug)):
        samples_name = os.path.abspath('samples.pbs.gz')
        with chdir(CWD):
            loom.runner.posterior_enum(
                config_name,
                model_name,
                rows_name,
                samples_name,
                debug=debug)
        message = loom.schema_pb2.PosteriorEnum.Sample()
        for string in protobuf_stream_load(samples_name):
            message.ParseFromString(string)
            sample = parse_sample(message)
            score = float(message.score)
            yield sample, score
Example #25
def test_posterior_enum(name, tares, diffs, init, **unused):
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        config_in = os.path.abspath("config.pb.gz")
        config = {
            "posterior_enum": {"sample_count": 7},
            "kernels": {"kind": {"row_queue_capacity": 0, "score_parallel": False}},
        }
        loom.config.config_dump(config, config_in)
        assert_found(config_in)

        samples_out = os.path.abspath("samples.pbs.gz")
        loom.runner.posterior_enum(
            config_in=config_in, model_in=init, tares_in=tares, rows_in=diffs, samples_out=samples_out, debug=True
        )
        assert_found(samples_out)
        actual_count = sum(1 for _ in protobuf_stream_load(samples_out))
        assert_equal(actual_count, config["posterior_enum"]["sample_count"])
Example #26
def crossvalidate(
        name=None,
        sample_count=10,
        portion=0.9,
        extra_passes=loom.config.DEFAULTS['schedule']['extra_passes'],
        debug=False):
    '''
    Randomly split dataset; train models; score held-out data.
    '''
    assert 0 < portion and portion < 1, portion
    assert sample_count > 0, sample_count
    loom.store.require(name, [
        'ingest.encoding',
        'ingest.tares',
        'ingest.diffs',
    ])
    inputs = loom.store.get_paths(name)

    row_count = sum(1 for _ in protobuf_stream_load(inputs['ingest']['diffs']))
    assert row_count > 1, 'too few rows to crossvalidate: {}'.format(row_count)
    train_count = max(1, min(row_count - 1, int(round(portion * row_count))))
    test_count = row_count - train_count
    assert 1 <= train_count and 1 <= test_count

    mean_scores = []
    for seed in xrange(sample_count):
        results = loom.store.get_paths(
            os.path.join(name, 'crossvalidate/{}'.format(seed)))
        mean = crossvalidate_one(
            seed,
            test_count,
            train_count,
            inputs,
            results,
            extra_passes,
            debug)
        mean_scores.append(mean)

    results = loom.store.get_paths(os.path.join(name, 'crossvalidate'))
    results['scores'] = os.path.join(results['root'], 'scores.json.gz')
    json_dump(mean_scores, results['scores'])
    print 'score = {} +- {}'.format(
        numpy.mean(mean_scores),
        numpy.std(mean_scores))
Example #27
def infer(
        name=None,
        extra_passes=loom.config.DEFAULTS['schedule']['extra_passes'],
        parallel=True,
        debug=False,
        profile='time'):
    '''
    Run inference on a dataset, or list available datasets.
    '''
    assert extra_passes > 0, 'cannot initialize with extra_passes = 0'
    loom.store.require(name, ['samples.0.init', 'samples.0.shuffled'])
    inputs, results = get_paths(name, 'infer')

    config = {'schedule': {'extra_passes': extra_passes}}
    if not parallel:
        loom.config.fill_in_sequential(config)
    loom.config.config_dump(config, results['samples'][0]['config'])

    loom.runner.infer(
        config_in=results['samples'][0]['config'],
        rows_in=inputs['samples'][0]['shuffled'],
        tares_in=inputs['ingest']['tares'],
        model_in=inputs['samples'][0]['init'],
        model_out=results['samples'][0]['model'],
        groups_out=results['samples'][0]['groups'],
        log_out=results['samples'][0]['infer_log'],
        debug=debug,
        profile=profile)

    loom.store.provide(name, results, [
        'samples.0.config',
        'samples.0.model',
        'samples.0.groups',
    ])

    groups = results['samples'][0]['groups']
    assert os.listdir(groups), 'no groups were written'
    group_counts = []
    for f in os.listdir(groups):
        group_count = 0
        for _ in protobuf_stream_load(os.path.join(groups, f)):
            group_count += 1
        group_counts.append(group_count)
    print 'group_counts: {}'.format(' '.join(map(str, group_counts)))
Example #28
def infer(
        name=None,
        extra_passes=loom.config.DEFAULTS['schedule']['extra_passes'],
        debug=False,
        profile='time'):
    '''
    Run inference on a dataset, or list available datasets.
    '''
    if name is None:
        list_options_and_exit(ROWS)

    init = INIT.format(name)
    rows = ROWS.format(name)
    assert os.path.exists(init), 'First load dataset'
    assert os.path.exists(rows), 'First load dataset'
    assert extra_passes > 0, 'cannot initialize with extra_passes = 0'

    destin = os.path.join(RESULTS, name)
    mkdir_p(destin)
    groups_out = os.path.join(destin, 'groups')
    mkdir_p(groups_out)

    config = {'schedule': {'extra_passes': extra_passes}}
    config_in = os.path.join(destin, 'config.pb.gz')
    loom.config.config_dump(config, config_in)

    loom.runner.infer(
        config_in=config_in,
        rows_in=rows,
        model_in=init,
        groups_out=groups_out,
        debug=debug,
        profile=profile)

    assert os.listdir(groups_out), 'no groups were written'
    group_counts = []
    for f in os.listdir(groups_out):
        group_count = 0
        for _ in protobuf_stream_load(os.path.join(groups_out, f)):
            group_count += 1
        group_counts.append(group_count)
    print 'group_counts: {}'.format(' '.join(map(str, group_counts)))
Example #29
def export_rows(encoding_in, rows_in, rows_csv_out, chunk_size=1000000):
    '''
    Export rows from gzipped-protobuf-stream to directory-of-gzipped-csv-files.
    '''
    rows_csv_out = os.path.abspath(rows_csv_out)
    if rows_csv_out == os.getcwd():
        raise LoomError('Cannot export_rows to working directory')
    for ext in ['.csv', '.gz', '.bz2']:
        if rows_csv_out.endswith(ext):
            raise LoomError(
                'Expected rows_csv_out to be a dirname, actual {}'.format(
                    rows_csv_out))
    if not (chunk_size > 0):
        raise LoomError('Invalid chunk_size {}, must be positive'.format(
            chunk_size))
    encoders = json_load(encoding_in)
    fields = [loom.schema.MODEL_TO_DATATYPE[e['model']] for e in encoders]
    decoders = [load_decoder(e) for e in encoders]
    header = ['_id'] + [e['name'] for e in encoders]
    if os.path.exists(rows_csv_out):
        shutil.rmtree(rows_csv_out)
    os.makedirs(rows_csv_out)
    row_count = sum(1 for _ in protobuf_stream_load(rows_in))
    rows = loom.cFormat.row_stream_load(rows_in)
    chunk_count = (row_count + chunk_size - 1) / chunk_size
    chunks = sorted(
        os.path.join(rows_csv_out, 'rows.{}.csv.gz'.format(i))
        for i in xrange(chunk_count)
    )
    with ExitStack() as stack:
        with_ = stack.enter_context
        writers = [with_(csv_writer(f)) for f in chunks]
        for writer in writers:
            writer.writerow(header)
        for row, writer in izip(rows, cycle(writers)):
            data = row.iter_data()
            schema = izip(data['observed'], fields, decoders)
            csv_row = [row.id]
            for observed, field, decode in schema:
                csv_row.append(decode(data[field].next()) if observed else '')
            writer.writerow(csv_row)
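
Example #29 deals rows out to a fixed set of chunk files by pairing the row stream with cycle(writers), so the chunks end up interleaved round-robin rather than split into contiguous blocks. Below is a self-contained sketch of just that pattern, using plain csv files and hypothetical file names instead of loom's gzipped csv_writer helper.

import csv
from itertools import cycle


def write_round_robin(rows, filenames, header):
    # Open one writer per chunk file up front (export_rows above does the
    # same via ExitStack), then deal rows out to the writers in turn.
    files = [open(name, 'w') for name in filenames]
    try:
        writers = [csv.writer(f) for f in files]
        for writer in writers:
            writer.writerow(header)
        for row, writer in zip(rows, cycle(writers)):
            writer.writerow(row)
    finally:
        for f in files:
            f.close()


# Hypothetical usage: seven rows dealt across three chunk files.
write_round_robin(
    rows=[[i, 'value_{}'.format(i)] for i in range(7)],
    filenames=['rows.0.csv', 'rows.1.csv', 'rows.2.csv'],
    header=['_id', 'feature'])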
Example #30
def crossvalidate(
        name=None,
        sample_count=10,
        portion=0.9,
        extra_passes=loom.config.DEFAULTS['schedule']['extra_passes'],
        debug=False):
    '''
    Randomly split dataset; train models; score held-out data.
    '''
    assert 0 < portion and portion < 1, portion
    assert sample_count > 0, sample_count
    loom.store.require(name, [
        'ingest.encoding',
        'ingest.tares',
        'ingest.diffs',
    ])
    inputs = loom.store.get_paths(name)

    row_count = sum(1 for _ in protobuf_stream_load(inputs['ingest']['diffs']))
    assert row_count > 1, 'too few rows to crossvalidate: {}'.format(row_count)
    train_count = max(1, min(row_count - 1, int(round(portion * row_count))))
    test_count = row_count - train_count
    assert 1 <= train_count and 1 <= test_count

    mean_scores = []
    for seed in xrange(sample_count):
        results = loom.store.get_paths(
            os.path.join(name, 'crossvalidate/{}'.format(seed)))
        mean = crossvalidate_one(seed, test_count, train_count, inputs,
                                 results, extra_passes, debug)
        mean_scores.append(mean)

    results = loom.store.get_paths(os.path.join(name, 'crossvalidate'))
    results['scores'] = os.path.join(results['root'], 'scores.json.gz')
    json_dump(mean_scores, results['scores'])
    print 'score = {} +- {}'.format(numpy.mean(mean_scores),
                                    numpy.std(mean_scores))
Example #31
def run_posterior_enum(casename, dataset, results, debug, sparsify=True):
    if not sparsify:
        loom.runner.posterior_enum(
            config_in=dataset['config'],
            rows_in=dataset['rows'],
            model_in=dataset['model'],
            samples_out=results['samples'],
            debug=debug)
    else:
        loom.format.make_schema(
            model_in=dataset['model'],
            schema_out=results['schema'])
        loom.format.make_schema_row(
            schema_in=results['schema'],
            schema_row_out=results['schema_row'])
        loom.runner.tare(
            schema_row_in=results['schema_row'],
            rows_in=dataset['rows'],
            tares_out=results['tares'],
            debug=debug)
        tare_count = sum(1 for _ in protobuf_stream_load(results['tares']))
        if casename is not None and tare_count:
            LOG('Info', casename, 'found {} tare rows'.format(tare_count))
        loom.runner.sparsify(
            schema_row_in=results['schema_row'],
            tares_in=results['tares'],
            rows_in=dataset['rows'],
            rows_out=results['diffs'],
            debug=debug)
        loom.runner.posterior_enum(
            config_in=dataset['config'],
            rows_in=results['diffs'],
            tares_in=results['tares'],
            model_in=dataset['model'],
            samples_out=results['samples'],
            debug=debug)
Example #32
def load_rows_raw(filename):
    return list(protobuf_stream_load(filename))
Example #33
def load_rows_raw(filename):
    return list(protobuf_stream_load(filename))
Example #34
def crossvalidate_one(
        seed,
        test_count,
        train_count,
        inputs,
        results,
        extra_passes,
        debug):
    LOG('running seed {}:'.format(seed))
    results['train'] = os.path.join(
        results['root'],
        'train',
        'diffs.pbs.gz')
    results['test'] = os.path.join(results['root'], 'test', 'rows.pbs.gz')
    results['scores'] = os.path.join(results['root'], 'scores.json.gz')

    config = {
        'seed': seed,
        'schedule': {'extra_passes': extra_passes},
    }
    loom.config.config_dump(config, results['samples'][0]['config'])

    numpy.random.seed(seed)
    split = [True] * train_count + [False] * test_count
    numpy.random.shuffle(split)
    diffs_in = protobuf_stream_load(inputs['ingest']['diffs'])
    protobuf_stream_dump(
        (row for s, row in izip(split, diffs_in) if s),
        results['train'])
    rows_in = protobuf_stream_load(inputs['ingest']['rows'])
    protobuf_stream_dump(
        (row for s, row in izip(split, rows_in) if not s),
        results['test'])

    LOG(' shuffle')
    loom.runner.shuffle(
        rows_in=results['train'],
        rows_out=results['samples'][0]['shuffled'],
        seed=seed,
        debug=debug)
    LOG(' init')
    loom.generate.generate_init(
        encoding_in=inputs['ingest']['encoding'],
        model_out=results['samples'][0]['init'],
        seed=seed)
    LOG(' infer')
    loom.runner.infer(
        config_in=results['samples'][0]['config'],
        rows_in=results['samples'][0]['shuffled'],
        tares_in=inputs['ingest']['tares'],
        model_in=results['samples'][0]['init'],
        model_out=results['samples'][0]['model'],
        groups_out=results['samples'][0]['groups'],
        debug=debug)
    LOG(' query')
    rows = loom.query.load_data_rows(results['test'])
    loom.config.config_dump({}, results['query']['config'])
    with loom.query.get_server(results['root'], debug=debug) as query:
        scores = [query.score(row) for row in rows]

    json_dump(scores, results['scores'])
    LOG(' done\n')
    return numpy.mean(scores)