Example #1
File: format.py Project: jostheim/loom
def _make_encoder_builders_file((schema_in, rows_in)):
    assert os.path.isfile(rows_in)
    schema = json_load(schema_in)
    with csv_reader(rows_in) as reader:
        header = reader.next()
        builders = []
        seen = set()
        for name in header:
            if name in schema:
                if name in seen:
                    raise LoomError('Repeated column {} in csv file {}'.format(
                        name, rows_in))
                seen.add(name)
                model = schema[name]
                Builder = ENCODER_BUILDERS[model]
                builder = Builder(name, model)
            else:
                builder = None
            builders.append(builder)
        if all(builder is None for builder in builders):
            raise LoomError(
                'Csv file has no known features; '
                'try adding a header to {}'.format(rows_in))
        missing_features = sorted(set(schema) - seen)
        if missing_features:
            raise LoomError('\n  '.join(
                ['Csv file is missing features:'] + missing_features))
        for row in reader:
            for value, builder in izip(row, builders):
                if builder is not None:
                    value = value.strip()
                    if value:
                        builder.add_value(value)
    return [b for b in builders if b is not None]
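A minimal usage sketch (hypothetical paths; assumes loom's format internals are importable). The single-tuple signature suggests this function is meant to be mapped over (schema, csv) pairs in parallel, though the snippet does not confirm that:

builders = _make_encoder_builders_file(('schema.json', 'rows.part0.csv'))
print '%d known features' % len(builders)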
Example #2
File: format.py Project: edwardt/loom
def _import_rows_file(args):
    encoding_in, rows_csv_in, rows_out, id_offset, id_stride = args
    assert os.path.isfile(rows_csv_in)
    encoders = json_load(encoding_in)
    message = loom.cFormat.Row()
    add_field = {
        'booleans': message.add_booleans,
        'counts': message.add_counts,
        'reals': message.add_reals,
    }
    with open_compressed(rows_csv_in, 'rb') as f:
        reader = csv.reader(f)
        feature_names = list(reader.next())
        name_to_pos = {name: i for i, name in enumerate(feature_names)}
        schema = []
        for encoder in encoders:
            pos = name_to_pos.get(encoder['name'])
            add = add_field[loom.schema.MODEL_TO_DATATYPE[encoder['model']]]
            encode = load_encoder(encoder)
            schema.append((pos, add, encode))

        def rows():
            for i, row in enumerate(reader):
                message.id = id_offset + id_stride * i
                for pos, add, encode in schema:
                    value = None if pos is None else row[pos].strip()
                    observed = bool(value)
                    message.add_observed(observed)
                    if observed:
                        add(encode(value))
                yield message
                message.Clear()

        loom.cFormat.row_stream_dump(rows(), rows_out)
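The id arithmetic above assigns row i the id id_offset + id_stride * i. A plausible reading is that when a csv is split into N shards, shard k is imported with id_offset=k and id_stride=N so ids interleave without collision; that interpretation is an assumption. The arithmetic itself is easy to check standalone:

id_offset, id_stride = 2, 4  # hypothetical: shard 2 of 4 shards
print [id_offset + id_stride * i for i in range(3)]  # prints [2, 6, 10]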
Example #3
def related(
        name=None,
        sample_count=loom.preql.SAMPLE_COUNT,
        debug=False,
        profile='time'):
    '''
    Run related query.
    '''
    loom.store.require(name, [
        'ingest.schema',
        'ingest.encoding',
        'samples.0.config',
        'samples.0.model',
        'samples.0.groups',
    ])
    inputs, results = get_paths(name, 'related')
    loom.config.config_dump({}, inputs['query']['config'])
    root = inputs['root']
    encoding = inputs['ingest']['encoding']
    features = sorted(json_load(inputs['ingest']['schema']).keys())

    print 'starting server'
    with loom.preql.get_server(root, encoding, debug, profile) as preql:
        print 'querying {} features'.format(len(features))
        preql.relate(features, sample_count=sample_count)
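A hedged invocation sketch; 'taxi' is a stand-in for a dataset name that has already been ingested and sampled (the loom.store.require call above demands those files exist):

related('taxi')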
Example #4
def test_metis():

    if os.path.exists(METIS_ARGS_TEMPFILE):
        print 'Loading metis args from %s' % METIS_ARGS_TEMPFILE
        args = json_load(METIS_ARGS_TEMPFILE)

    else:
        print 'Using simple metis args'
        args = {
            'nparts': 2,
            'adjacency': [[0, 2, 3], [1, 2], [0, 1, 2], [0, 3]],
            'eweights': [1073741824, 429496736, 357913952, 1073741824,
                         536870912, 429496736, 536870912, 1073741824,
                         357913952, 1073741824],
        }

    assert len(args['eweights']) == sum(map(len, args['adjacency']))

    print 'Running unweighted metis...'
    unweighted = dict(args)
    del unweighted['eweights']
    edge_cut, partition = pymetis.part_graph(**unweighted)
    print 'Finished unweighted metis'

    print 'Running metis...'
    edge_cut, partition = pymetis.part_graph(**args)
    print 'Finished metis'
Example #5
def test_predict(root, rows_csv, encoding, **unused):
    COUNT = 10
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        with loom.query.get_server(root, debug=True) as query_server:
            result_out = 'predictions_out.csv'
            rows_in = os.listdir(rows_csv)[0]
            rows_in = os.path.join(rows_csv, rows_in)
            encoders = json_load(encoding)
            name_to_encoder = {e['name']: load_encoder(e) for e in encoders}
            preql = loom.preql.PreQL(query_server, encoding)
            preql.predict(rows_in, COUNT, result_out, id_offset=False)
            with open_compressed(rows_in, 'rb') as fin:
                with open(result_out, 'r') as fout:
                    in_reader = csv.reader(fin)
                    out_reader = csv.reader(fout)
                    fnames = in_reader.next()
                    out_reader.next()
                    for in_row in in_reader:
                        for i in range(COUNT):
                            out_row = out_reader.next()
                            bundle = zip(fnames, in_row, out_row)
                            for name, in_val, out_val in bundle:
                                encode = name_to_encoder[name]
                                observed = bool(in_val.strip())
                                if observed:
                                    assert_almost_equal(
                                        encode(in_val),
                                        encode(out_val))
                                else:
                                    assert_true(bool(out_val.strip()))
Example #6
File: datasets.py Project: jostheim/loom
def load(name, schema, rows_csv):
    '''
    Load a csv dataset for testing and benchmarking.
    '''
    assert os.path.exists(schema)
    assert schema.endswith('.json')
    assert os.path.exists(rows_csv)
    if os.path.isfile(rows_csv):
        assert rows_csv.endswith('.csv') or rows_csv.endswith('.csv.gz')
    else:
        assert os.path.isdir(rows_csv)
    paths = loom.store.get_paths(name)
    assert not os.path.exists(paths['root']), 'dataset already loaded'
    json_dump(json_load(schema), paths['ingest']['schema'])
    loom.format.make_schema_row(
        schema_in=paths['ingest']['schema'],
        schema_row_out=paths['ingest']['schema_row'])
    if os.path.isdir(rows_csv):
        os.symlink(rows_csv, paths['ingest']['rows_csv'])
    else:
        os.makedirs(paths['ingest']['rows_csv'])
        os.symlink(
            rows_csv,
            os.path.join(
                paths['ingest']['rows_csv'],
                os.path.basename(rows_csv)))
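A usage sketch with hypothetical paths; per the asserts, schema must be a .json file and rows_csv either a .csv/.csv.gz file or a directory of such files:

load('my-test-dataset', 'schema.json', 'rows.csv.gz')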
Example #8
File: tasks.py Project: jostheim/loom
def make_consensus(name, config=None, debug=False):
    '''
    Combine samples into a single consensus sample.
    Arguments:
        name            A unique identifier for consensus
        config          An optional json config file
                            currently doesn't do anything but will be used to
                            support e.g. cluster coarseness in the future
        debug           Whether to run debug versions of C++ code
    Environment variables:
        LOOM_VERBOSITY  Verbosity level
    '''
    paths = loom.store.get_paths(name)
    LOG('making config')
    if config is None:
        config = {}
    elif isinstance(config, basestring):
        if not os.path.exists(config):
            raise LoomError('Missing config file: {}'.format(config))
        config = json_load(config)
    else:
        config = copy.deepcopy(config)
    loom.config.config_dump(config, paths['samples'][0]['config'])

    LOG('finding consensus')
    loom.consensus.make_consensus(paths=paths, debug=debug)
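The config branches above accept None, a json file path, or a dict. A sketch of all three forms (names and contents are hypothetical):

make_consensus('my-dataset')                        # empty default config
make_consensus('my-dataset', config='config.json')  # loaded from file
make_consensus('my-dataset', config={'seed': 0})    # deep-copied dict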
Example #9
def _check_predictions(rows_in, result_out, encoding):
    encoders = json_load(encoding)
    name_to_encoder = {e['name']: load_encoder(e) for e in encoders}
    with open_compressed(rows_in, 'rb') as fin:
        with open(result_out, 'r') as fout:
            in_reader = csv.reader(fin)
            out_reader = csv.reader(fout)
            fnames = in_reader.next()
            out_reader.next()
            for in_row in in_reader:
                for i in range(COUNT):
                    out_row = out_reader.next()
                    bundle = zip(fnames, in_row, out_row)
                    for name, in_val, out_val in bundle:
                        if name == '_id':
                            assert_equal(in_val, out_val)
                            continue
                        encode = name_to_encoder[name]
                        observed = bool(in_val.strip())
                        if observed:
                            assert_almost_equal(
                                encode(in_val),
                                encode(out_val))
                        else:
                            assert_true(bool(out_val.strip()))
Example #12
File: util.py Project: manderle01/loom
def pretty_print(filename, message_type='guess'):
    '''
    Print text/json/protobuf messages from a raw/gz/bz2 file.
    '''
    parts = os.path.basename(filename).split('.')
    if parts[-1] in ['gz', 'bz2']:
        parts.pop()
    protocol = parts[-1]
    if protocol == 'json':
        data = json_load(filename)
        print json.dumps(data, sort_keys=True, indent=4)
    elif protocol == 'pb':
        message = get_message(filename, message_type)
        with open_compressed(filename) as f:
            message.ParseFromString(f.read())
            print message
    elif protocol == 'pbs':
        message = get_message(filename, message_type)
        for string in protobuf_stream_load(filename):
            message.ParseFromString(string)
            print message
    elif protocol == 'pickle':
        data = pickle_load(filename)
        print repr(data)
    else:
        with open_compressed(filename) as f:
            for line in f:
                print line,
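A usage sketch with hypothetical filenames; the protocol is chosen from the extension after any .gz/.bz2 suffix is stripped:

pretty_print('ingest/encoding.json.gz')   # pretty-printed json
pretty_print('samples/0/model.pb.gz')     # protobuf, message type guessed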
Example #13
File: format.py Project: jostheim/loom
def make_fake_encoding(schema_in, model_in, encoding_out):
    '''
    Make a fake encoding from json schema + model.
    Assume that feature names in schema correspond to featureids in model
    e.g. schema was generated from loom.format.make_schema
    '''
    schema = json_load(schema_in)
    fields = []
    builders = []
    name_to_builder = {}
    for name, model in sorted(schema.iteritems()):
        fields.append(loom.schema.MODEL_TO_DATATYPE[model])
        Builder = FAKE_ENCODER_BUILDERS[model]
        builder = Builder(name, model)
        builders.append(builder)
        name_to_builder[name] = builder

    cross_cat = loom.schema_pb2.CrossCat()
    with open_compressed(model_in, 'rb') as f:
        cross_cat.ParseFromString(f.read())
    for kind in cross_cat.kinds:
        featureid = iter(kind.featureids)
        for model in loom.schema.MODELS.iterkeys():
            for shared in getattr(kind.product_model, model):
                feature_name = '{:06d}'.format(featureid.next())
                assert feature_name in schema
                if model == 'dd':
                    for i in range(len(shared.alphas)):
                        name_to_builder[feature_name].add_value(str(i))
                elif model == 'dpd':
                    for val in shared.values:
                        name_to_builder[feature_name].add_value(str(val))
    encoders = [b.build() for b in builders]
    ensure_fake_encoders_are_sorted(encoders)
    json_dump(encoders, encoding_out)
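The featureid-to-name correspondence the docstring assumes is plain zero-padding to six digits, so feature 12 in the model must appear in the schema as:

assert '{:06d}'.format(12) == '000012'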
Example #15
def test_group_runs(root, schema, encoding, **unused):
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        with loom.preql.get_server(root, encoding, debug=True) as preql:
            test_columns = json_load(schema).keys()[:10]
            for column in test_columns:
                groupings_csv = 'group.{}.csv'.format(column)
                preql.group(column, result_out=groupings_csv)
                print open(groupings_csv).read()
Example #17
def _test_modify_schema(modify, name, schema, rows_csv, **unused):
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR) as store:
        with mock.patch('loom.store.STORE', new=store):
            modified_schema = os.path.join(store, 'schema.json')
            data = json_load(schema)
            data = modify(data)
            json_dump(data, modified_schema)
            loom.tasks.ingest(name, modified_schema, rows_csv, debug=True)
Example #18
File: generate.py Project: jostheim/loom
def generate_init(encoding_in, model_out, seed=0):
    '''
    Generate an initial model for inference.
    '''
    numpy.random.seed(seed)
    encoders = json_load(encoding_in)
    features = import_features(encoders)
    cross_cat = generate_model(features)
    with open_compressed(model_out, 'wb') as f:
        f.write(cross_cat.SerializeToString())
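A usage sketch; the paths are hypothetical but follow the store layout seen in the infer_one examples on this page:

generate_init(
    encoding_in='ingest/encoding.json.gz',
    model_out='samples/0/init.pb.gz',
    seed=0)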
Example #19
def test_relate_pandas(root, rows_csv, schema, **unused):
    feature_count = len(json_load(schema))
    with loom.preql.get_server(root, debug=True) as preql:
        result_string = preql.relate(preql.feature_names)
        result_df = pandas.read_csv(StringIO(result_string), index_col=0)
        print 'result_df ='
        print result_df
        assert_equal(result_df.ndim, 2)
        assert_equal(result_df.shape[0], feature_count)
        assert_equal(result_df.shape[1], feature_count)
Example #22
def infer_one(name, seed=0, config=None, debug=False):
    '''
    Infer a single sample.
    Arguments:
        name            A unique identifier for ingest + inference
        seed            The seed, i.e., sample number, typically 0-9
        config          An optional json config file, e.g.,
                            {"schedule": {"extra_passes": 500.0}}
        debug           Whether to run debug versions of C++ code
    Environment variables:
        LOOM_VERBOSITY  Verbosity level
    '''
    paths = loom.store.get_paths(name, sample_count=(1 + seed))
    sample = paths['samples'][seed]

    LOG('making config')
    if config is None:
        config = {}
    elif isinstance(config, basestring):
        if not os.path.exists(config):
            raise LoomError('Missing config file: {}'.format(config))
        config = json_load(config)
    else:
        config = copy.deepcopy(config)
    if 'seed' not in config:
        config['seed'] = seed
    loom.config.config_dump(config, sample['config'])

    LOG('generating init')
    loom.generate.generate_init(
        encoding_in=paths['ingest']['encoding'],
        model_out=sample['init'],
        seed=seed)

    LOG('shuffling rows')
    loom.runner.shuffle(
        rows_in=paths['ingest']['diffs'],
        rows_out=sample['shuffled'],
        seed=seed,
        debug=debug)

    LOG('inferring, watch {}'.format(sample['infer_log']))
    loom.runner.infer(
        config_in=sample['config'],
        rows_in=sample['shuffled'],
        tares_in=paths['ingest']['tares'],
        model_in=sample['init'],
        model_out=sample['model'],
        groups_out=sample['groups'],
        assign_out=sample['assign'],
        log_out=sample['infer_log'],
        debug=debug)
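A usage sketch; the dataset name is hypothetical and the config form is taken from the docstring above:

infer_one(
    'my-dataset',
    seed=0,
    config={'schedule': {'extra_passes': 500.0}},
    debug=False)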
Example #23
def _retrieve_featureid_to_cgpm(path):
    """Returns a dict mapping loom's 0-based featureid to cgpm.outputs."""
    # Loom orders features alphabetically based on statistical types:
    # i.e. 'bb' < 'dd' < 'nich'. The ordering is stored in
    # `ingest/encoding.json.gz`.
    encoding_in = os.path.join(path, 'ingest', 'encoding.json.gz')
    features = json_load(encoding_in)

    def colname_to_output(cname):
        # Convert dummy column name from 'c00012' to the integer 12.
        return int(cname.replace('c', ''))

    return {i: colname_to_output(f['name']) for i, f in enumerate(features)}
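A worked example of the mapping, using the function's own logic (no loom required); the feature names here are hypothetical:

features = [{'name': 'c00007'}, {'name': 'c00002'}]
print {i: int(f['name'].replace('c', '')) for i, f in enumerate(features)}
# prints {0: 7, 1: 2}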
Example #24
File: format.py Project: edwardt/loom
def make_schema_row(schema_in, schema_row_out):
    '''
    Convert json schema to protobuf schema row.
    '''
    schema = json_load(schema_in)
    value = loom.schema_pb2.ProductValue()
    value.observed.sparsity = loom.schema_pb2.ProductValue.Observed.DENSE
    for model in schema.itervalues():
        field = loom.schema.MODEL_TO_DATATYPE[model]
        value.observed.dense.append(True)
        getattr(value, field).append(EXAMPLE_VALUES[field])
    with open_compressed(schema_row_out, 'wb') as f:
        f.write(value.SerializeToString())
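A sketch of a schema input; the model names 'bb', 'dd', and 'nich' appear in Example #23's comments, but the feature names and file paths here are hypothetical:

import json
schema = {'is_member': 'bb', 'color': 'dd', 'height': 'nich'}
with open('schema.json', 'w') as f:
    json.dump(schema, f)
make_schema_row('schema.json', 'schema_row.pb.gz')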
Example #25
File: tasks.py Project: jostheim/loom
def infer_one(name, seed=0, config=None, debug=False):
    '''
    Infer a single sample.
    Arguments:
        name            A unique identifier for ingest + inference
        seed            The seed, i.e., sample number, typically 0-9
        config          An optional json config file, e.g.,
                            {"schedule": {"extra_passes": 500.0}}
        debug           Whether to run debug versions of C++ code
    Environment variables:
        LOOM_VERBOSITY  Verbosity level
    '''
    paths = loom.store.get_paths(name, sample_count=(1 + seed))
    sample = paths['samples'][seed]

    LOG('making config')
    if config is None:
        config = {}
    elif isinstance(config, basestring):
        if not os.path.exists(config):
            raise LoomError('Missing config file: {}'.format(config))
        config = json_load(config)
    else:
        config = copy.deepcopy(config)
    config['seed'] = seed
    loom.config.config_dump(config, sample['config'])

    LOG('generating init')
    loom.generate.generate_init(
        encoding_in=paths['ingest']['encoding'],
        model_out=sample['init'],
        seed=seed)

    LOG('shuffling rows')
    loom.runner.shuffle(
        rows_in=paths['ingest']['diffs'],
        rows_out=sample['shuffled'],
        seed=seed,
        debug=debug)

    LOG('inferring, watch {}'.format(sample['infer_log']))
    loom.runner.infer(
        config_in=sample['config'],
        rows_in=sample['shuffled'],
        tares_in=paths['ingest']['tares'],
        model_in=sample['init'],
        model_out=sample['model'],
        groups_out=sample['groups'],
        assign_out=sample['assign'],
        log_out=sample['infer_log'],
        debug=debug)
Example #26
File: test_query.py Project: fritzo/loom
def test_tiled_entropy(root, schema, **unused):
    feature_count = len(json_load(schema))
    feature_sets = [frozenset([i]) for i in xrange(feature_count)]
    kwargs = {
        'row_sets': feature_sets,
        'col_sets': feature_sets,
        'sample_count': 10
    }
    with loom.query.get_server(root, debug=True) as server:
        expected = set(server.entropy(**kwargs))
        for tile_size in xrange(1, 1 + feature_count):
            print 'tile_size = {}'.format(tile_size)
            actual = set(server.entropy(tile_size=tile_size, **kwargs))
            assert_set_equal(expected, actual)
Example #29
File: format.py Project: jostheim/loom
def make_schema_row(schema_in, schema_row_out):
    '''
    Convert json schema to protobuf schema row.
    '''
    schema = json_load(schema_in)
    if not schema:
        raise LoomError('Schema is empty: {}'.format(schema_in))
    value = loom.schema_pb2.ProductValue()
    value.observed.sparsity = loom.schema_pb2.ProductValue.Observed.DENSE
    for model in schema.itervalues():
        try:
            field = loom.schema.MODEL_TO_DATATYPE[model]
        except KeyError:
            raise LoomError('Unknown model {} in schema {}'.format(
                model, schema_in))
        value.observed.dense.append(True)
        getattr(value, field).append(EXAMPLE_VALUES[field])
    with open_compressed(schema_row_out, 'wb') as f:
        f.write(value.SerializeToString())
Example #30
def test_predict_pandas(root, rows_csv, schema, **unused):
    feature_count = len(json_load(schema))
    with loom.preql.get_server(root, debug=True) as preql:
        rows_filename = os.path.join(rows_csv, os.listdir(rows_csv)[0])
        with open_compressed(rows_filename) as f:
            rows_df = pandas.read_csv(f,
                                      converters=preql.converters,
                                      index_col='_id')
        print 'rows_df ='
        print rows_df
        row_count = rows_df.shape[0]
        assert_equal(rows_df.shape[1], feature_count)
        rows_io = StringIO(rows_df.to_csv())
        result_string = preql.predict(rows_io, COUNT, id_offset=True)
        result_df = pandas.read_csv(StringIO(result_string), index_col=False)
        print 'result_df ='
        print result_df
        assert_equal(result_df.ndim, 2)
        assert_equal(result_df.shape[0], row_count * COUNT)
        assert_equal(result_df.shape[1], 1 + feature_count)
Example #31
def export_rows(encoding_in, rows_in, rows_csv_out, chunk_size=1000000):
    '''
    Export rows from gzipped-protobuf-stream to directory-of-gzipped-csv-files.
    '''
    rows_csv_out = os.path.abspath(rows_csv_out)
    if rows_csv_out == os.getcwd():
        raise LoomError('Cannot export_rows to working directory')
    for ext in ['.csv', '.gz', '.bz2']:
        if rows_csv_out.endswith(ext):
            raise LoomError(
                'Expected rows_csv_out to be a dirname, actual {}'.format(
                    rows_csv_out))
    if not (chunk_size > 0):
        raise LoomError('Invalid chunk_size {}, must be positive'.format(
            chunk_size))
    encoders = json_load(encoding_in)
    fields = [loom.schema.MODEL_TO_DATATYPE[e['model']] for e in encoders]
    decoders = [load_decoder(e) for e in encoders]
    header = ['_id'] + [e['name'] for e in encoders]
    if os.path.exists(rows_csv_out):
        shutil.rmtree(rows_csv_out)
    os.makedirs(rows_csv_out)
    row_count = sum(1 for _ in protobuf_stream_load(rows_in))
    rows = loom.cFormat.row_stream_load(rows_in)
    chunk_count = (row_count + chunk_size - 1) / chunk_size
    chunks = sorted(
        os.path.join(rows_csv_out, 'rows.{}.csv.gz'.format(i))
        for i in xrange(chunk_count)
    )
    with ExitStack() as stack:
        with_ = stack.enter_context
        writers = [with_(csv_writer(f)) for f in chunks]
        for writer in writers:
            writer.writerow(header)
        for row, writer in izip(rows, cycle(writers)):
            data = row.iter_data()
            schema = izip(data['observed'], fields, decoders)
            csv_row = [row.id]
            for observed, field, decode in schema:
                csv_row.append(decode(data[field].next()) if observed else '')
            writer.writerow(csv_row)
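A usage sketch with hypothetical paths; note the checks above require rows_csv_out to be a directory name, not a .csv/.gz/.bz2 file:

export_rows(
    encoding_in='ingest/encoding.json.gz',
    rows_in='ingest/rows.pbs.gz',
    rows_csv_out='export/rows_csv',
    chunk_size=500000)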
Example #32
File: format.py Project: edwardt/loom
def make_fake_encoding(schema_in, rows_in, encoding_out):
    '''
    Make a fake encoding from json schema + protobuf rows.
    '''
    schema = json_load(schema_in)
    fields = []
    builders = []
    for name, model in sorted(schema.iteritems()):
        fields.append(loom.schema.MODEL_TO_DATATYPE[model])
        Builder = FAKE_ENCODER_BUILDERS[model]
        builder = Builder(name, model)
        builders.append(builder)
    for row in loom.cFormat.row_stream_load(rows_in):
        data = row.iter_data()
        observeds = data['observed']
        for observed, field, builder in izip(observeds, fields, builders):
            if observed:
                builder.add_value(str(data[field].next()))
    encoders = [builder.build() for builder in builders]
    ensure_fake_encoders_are_sorted(encoders)
    json_dump(encoders, encoding_out)
Example #34
File: format.py Project: edwardt/loom
def _make_encoder_builders_file((schema_in, rows_in)):
    assert os.path.isfile(rows_in)
    schema = json_load(schema_in)
    with open_compressed(rows_in, 'rb') as f:
        reader = csv.reader(f)
        header = reader.next()
        builders = []
        for name in header:
            if name in schema:
                model = schema[name]
                Builder = ENCODER_BUILDERS[model]
                builder = Builder(name, model)
            else:
                builder = None
            builders.append(builder)
        for row in reader:
            for value, builder in izip(row, builders):
                if builder is not None:
                    value = value.strip()
                    if value:
                        builder.add_value(value)
    return [b for b in builders if b is not None]
Example #35
File: format.py Project: edwardt/loom
def export_rows(encoding_in, rows_in, rows_csv_out, chunk_size=1000000):
    '''
    Export rows from protobuf stream to csv.
    '''
    for ext in ['.csv', '.gz', '.bz2']:
        assert not rows_csv_out.endswith(ext),\
            'rows_csv_out should be a dirname'
    assert chunk_size > 0
    encoders = json_load(encoding_in)
    fields = [loom.schema.MODEL_TO_DATATYPE[e['model']] for e in encoders]
    decoders = [load_decoder(e) for e in encoders]
    header = [e['name'] for e in encoders]
    if os.path.exists(rows_csv_out):
        shutil.rmtree(rows_csv_out)
    os.makedirs(rows_csv_out)
    rows = loom.cFormat.row_stream_load(rows_in)
    try:
        empty = None
        for i in xrange(MAX_CHUNK_COUNT):
            file_out = os.path.join(
                rows_csv_out,
                'rows_{:06d}.csv.gz'.format(i))
            with open_compressed(file_out, 'wb') as f:
                writer = csv.writer(f)
                writer.writerow(header)
                empty = file_out
                for j in xrange(chunk_size):
                    data = rows.next().iter_data()
                    schema = izip(data['observed'], fields, decoders)
                    row = [
                        decode(data[field].next()) if observed else ''
                        for observed, field, decode in schema
                    ]
                    writer.writerow(row)
                    empty = None
    except StopIteration:
        if empty:
            os.remove(empty)
Example #36
def transform_rows(schema_in, transforms_in, rows_in, rows_out, id_field=None):
    transforms = pickle_load(transforms_in)
    if not transforms:
        cp_ns(rows_in, rows_out)
    else:
        transform = TransformSequence(transforms)
        transformed_header = sorted(json_load(schema_in).iterkeys())
        if id_field is not None:
            assert id_field not in transformed_header
            transformed_header = [id_field] + transformed_header
        tasks = []
        if os.path.isdir(rows_in):
            loom.util.mkdir_p(rows_out)
            for f in os.listdir(rows_in):
                tasks.append((
                    transform,
                    transformed_header,
                    os.path.join(rows_in, f),
                    os.path.join(rows_out, f),
                ))
        else:
            tasks.append((transform, transformed_header, rows_in, rows_out))
        parallel_map(_transform_rows, tasks)
Example #38
File: format.py Project: jostheim/loom
def _import_rows_file(args):
    rows_csv_in, rows_out, id_offset, id_stride, encoding_in = args
    assert os.path.isfile(rows_csv_in)
    encoders = json_load(encoding_in)
    message = loom.cFormat.Row()
    add_field = {
        'booleans': message.add_booleans,
        'counts': message.add_counts,
        'reals': message.add_reals,
    }
    with csv_reader(rows_csv_in) as reader:
        feature_names = list(reader.next())
        header_length = len(feature_names)
        name_to_pos = {name: i for i, name in enumerate(feature_names)}
        schema = []
        for encoder in encoders:
            pos = name_to_pos.get(encoder['name'])
            add = add_field[loom.schema.MODEL_TO_DATATYPE[encoder['model']]]
            encode = load_encoder(encoder)
            schema.append((pos, add, encode))

        def rows():
            for i, row in enumerate(reader):
                if len(row) != header_length:
                    raise LoomError('row {} has wrong length {}:\n{}'.format(
                        i, len(row), row))
                message.id = id_offset + id_stride * i
                for pos, add, encode in schema:
                    value = None if pos is None else row[pos].strip()
                    observed = bool(value)
                    message.add_observed(observed)
                    if observed:
                        add(encode(value))
                yield message
                message.Clear()

        loom.cFormat.row_stream_dump(rows(), rows_out)
Example #41
File: preql.py Project: adammendoza/loom
    def __init__(self, query_server, encoding=None, debug=False):
        self._paths = loom.store.get_paths(query_server.root)
        if encoding is None:
            encoding = self._paths['ingest']['encoding']
        self._query_server = query_server
        self._encoders = json_load(encoding)
        transforms = self._paths['ingest']['transforms']
        self._transform = loom.transforms.load_transforms(transforms)
        self._feature_names = [e['name'] for e in self._encoders]
        self._feature_set = frozenset(self._feature_names)
        self._name_to_pos = {
            name: i
            for i, name in enumerate(self._feature_names)
        }
        self._name_to_decode = {
            e['name']: load_decoder(e)
            for e in self._encoders
        }
        self._name_to_encode = {
            e['name']: load_encoder(e)
            for e in self._encoders
        }
        self._rowid_map = None
        self._debug = debug
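A sketch combining this constructor with calls seen elsewhere on this page (Examples #3, #15, #19); the root path is hypothetical, and passing no encoding falls back to the store's encoding as shown above:

with loom.query.get_server('/path/to/store/root', debug=True) as query_server:
    preql = PreQL(query_server)
    print preql.relate(preql.feature_names)
    preql.group('some_column', result_out='group.some_column.csv')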
Example #42
File: preql.py Project: edwardt/loom
    def __init__(self, query_server, encoding, debug=False):
        self.query_server = query_server
        self.encoders = json_load(encoding)
        self.feature_names = [e['name'] for e in self.encoders]
        self.debug = debug