Example #1
def _make_encoder_builders_file((schema_in, rows_in)):
    assert os.path.isfile(rows_in)
    schema = json_load(schema_in)
    with csv_reader(rows_in) as reader:
        header = reader.next()
        builders = []
        seen = set()
        for name in header:
            if name in schema:
                if name in seen:
                    raise LoomError('Repeated column {} in csv file {}'.format(
                        name, rows_in))
                seen.add(name)
                model = schema[name]
                Builder = ENCODER_BUILDERS[model]
                builder = Builder(name, model)
            else:
                builder = None
            builders.append(builder)
        if all(builder is None for builder in builders):
            raise LoomError(
                'Csv file has no known features;'
                ' try adding a header to {}'.format(rows_in))
        missing_features = sorted(set(schema) - seen)
        if missing_features:
            raise LoomError('\n  '.join(
                ['Csv file is missing features:'] + missing_features))
        for row in reader:
            for value, builder in izip(row, builders):
                if builder is not None:
                    value = value.strip()
                    if value:
                        builder.add_value(value)
    return [b for b in builders if b is not None]
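
The doubled parentheses in the signature above are Python 2 tuple-parameter unpacking: the function accepts a single `(schema_in, rows_in)` pair, which is the calling convention map-style helpers expect. A minimal driver sketch, assuming a `parallel_map` helper like the one `transform_rows` uses in Example #26, and a hypothetical list of csv shards:

def make_all_encoder_builders(schema_in, rows_csv_files):
    # One single-argument task per shard, matching the tuple signature.
    tasks = [(schema_in, rows_in) for rows_in in rows_csv_files]
    results = parallel_map(_make_encoder_builders_file, tasks)
    # Flatten the per-file builder lists into one list.
    return [builder for builders in results for builder in builders]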
Example #2
def _import_rows_file(args):
    encoding_in, rows_csv_in, rows_out, id_offset, id_stride = args
    assert os.path.isfile(rows_csv_in)
    encoders = json_load(encoding_in)
    message = loom.cFormat.Row()
    add_field = {
        'booleans': message.add_booleans,
        'counts': message.add_counts,
        'reals': message.add_reals,
    }
    with open_compressed(rows_csv_in, 'rb') as f:
        reader = csv.reader(f)
        feature_names = list(reader.next())
        name_to_pos = {name: i for i, name in enumerate(feature_names)}
        schema = []
        for encoder in encoders:
            pos = name_to_pos.get(encoder['name'])
            add = add_field[loom.schema.MODEL_TO_DATATYPE[encoder['model']]]
            encode = load_encoder(encoder)
            schema.append((pos, add, encode))

        def rows():
            for i, row in enumerate(reader):
                message.id = id_offset + id_stride * i
                for pos, add, encode in schema:
                    value = None if pos is None else row[pos].strip()
                    observed = bool(value)
                    message.add_observed(observed)
                    if observed:
                        add(encode(value))
                yield message
                message.Clear()

        loom.cFormat.row_stream_dump(rows(), rows_out)
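
The `id_offset`/`id_stride` pair keeps row ids collision-free when several csv shards are imported concurrently: shard `k` of `n` emits ids `k, k + n, k + 2n, ...`. A sketch of how the `args` tuples might be built, with hypothetical file names:

shards = ['rows.0.csv.gz', 'rows.1.csv.gz', 'rows.2.csv.gz']
args_list = [
    # (encoding_in, rows_csv_in, rows_out, id_offset, id_stride)
    ('encoding.json.gz', shard, 'rows.{}.pbs.gz'.format(i), i, len(shards))
    for i, shard in enumerate(shards)
]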
Example #3
def related(
        name=None,
        sample_count=loom.preql.SAMPLE_COUNT,
        debug=False,
        profile='time'):
    '''
    Run related query.
    '''
    loom.store.require(name, [
        'ingest.schema',
        'ingest.encoding',
        'samples.0.config',
        'samples.0.model',
        'samples.0.groups',
    ])
    inputs, results = get_paths(name, 'related')
    loom.config.config_dump({}, inputs['query']['config'])
    root = inputs['root']
    encoding = inputs['ingest']['encoding']
    features = sorted(json_load(inputs['ingest']['schema']).keys())

    print 'starting server'
    with loom.preql.get_server(root, encoding, debug, profile) as preql:
        print 'querying {} features'.format(len(features))
        preql.relate(features, sample_count=sample_count)
Example #4
def test_metis():

    if os.path.exists(METIS_ARGS_TEMPFILE):
        print 'Loading metis args from %s' % METIS_ARGS_TEMPFILE
        args = json_load(METIS_ARGS_TEMPFILE)

    else:
        print 'Using simple metis args'
        args = {
            'nparts': 2,
            'adjacency': [[0, 2, 3], [1, 2], [0, 1, 2], [0, 3]],
            'eweights': [1073741824, 429496736, 357913952, 1073741824,
                         536870912, 429496736, 536870912, 1073741824,
                         357913952, 1073741824],
        }

    assert len(args['eweights']) == sum(map(len, args['adjacency']))

    print 'Running unweighted metis...'
    unweighted = dict(args)
    del unweighted['eweights']
    edge_cut, partition = pymetis.part_graph(**unweighted)
    print 'Finished unweighted metis'

    print 'Running metis...'
    edge_cut, partition = pymetis.part_graph(**args)
    print 'Finished metis'
Example #5
def test_predict(root, rows_csv, encoding, **unused):
    COUNT = 10
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        with loom.query.get_server(root, debug=True) as query_server:
            result_out = 'predictions_out.csv'
            rows_in = os.listdir(rows_csv)[0]
            rows_in = os.path.join(rows_csv, rows_in)
            encoders = json_load(encoding)
            name_to_encoder = {e['name']: load_encoder(e) for e in encoders}
            preql = loom.preql.PreQL(query_server, encoding)
            preql.predict(rows_in, COUNT, result_out, id_offset=False)
            with open_compressed(rows_in, 'rb') as fin:
                with open(result_out, 'r') as fout:
                    in_reader = csv.reader(fin)
                    out_reader = csv.reader(fout)
                    fnames = in_reader.next()
                    out_reader.next()
                    for in_row in in_reader:
                        for i in range(COUNT):
                            out_row = out_reader.next()
                            bundle = zip(fnames, in_row, out_row)
                            for name, in_val, out_val in bundle:
                                encode = name_to_encoder[name]
                                observed = bool(in_val.strip())
                                if observed:
                                    assert_almost_equal(
                                        encode(in_val),
                                        encode(out_val))
                                else:
                                    assert_true(bool(out_val.strip()))
Example #6
def load(name, schema, rows_csv):
    '''
    Load a csv dataset for testing and benchmarking.
    '''
    assert os.path.exists(schema)
    assert schema.endswith('.json')
    assert os.path.exists(rows_csv)
    if os.path.isfile(rows_csv):
        assert rows_csv.endswith('.csv') or rows_csv.endswith('.csv.gz')
    else:
        assert os.path.isdir(rows_csv)
    paths = loom.store.get_paths(name)
    assert not os.path.exists(paths['root']), 'dataset already loaded'
    json_dump(json_load(schema), paths['ingest']['schema'])
    loom.format.make_schema_row(
        schema_in=paths['ingest']['schema'],
        schema_row_out=paths['ingest']['schema_row'])
    if os.path.isdir(rows_csv):
        os.symlink(rows_csv, paths['ingest']['rows_csv'])
    else:
        os.makedirs(paths['ingest']['rows_csv'])
        os.symlink(
            rows_csv,
            os.path.join(
                paths['ingest']['rows_csv'],
                os.path.basename(rows_csv)))
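
A usage sketch with placeholder paths. Because `load` symlinks `rows_csv` into the store rather than copying it, absolute paths are the safer choice (a relative symlink target would be resolved relative to the link's directory):

import os

load(
    'my-dataset',
    schema=os.path.abspath('schema.json'),
    rows_csv=os.path.abspath('rows.csv.gz'))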
Example #7
def make_consensus(name, config=None, debug=False):
    '''
    Combine samples into a single consensus sample.
    Arguments:
        name            A unique identifier for consensus
        config          An optional json config file
                            currently doesn't do anything but will be used to
                            support e.g. cluster coarseness in the future
        debug           Whether to run debug versions of C++ code
    Environment variables:
        LOOM_VERBOSITY  Verbosity level
    '''
    paths = loom.store.get_paths(name)
    LOG('making config')
    if config is None:
        config = {}
    elif isinstance(config, basestring):
        if not os.path.exists(config):
            raise LoomError('Missing config file: {}'.format(config))
        config = json_load(config)
    else:
        config = copy.deepcopy(config)
    loom.config.config_dump(config, paths['samples'][0]['config'])

    LOG('finding consensus')
    loom.consensus.make_consensus(paths=paths, debug=debug)
Example #8
def _check_predictions(rows_in, result_out, encoding):
    encoders = json_load(encoding)
    name_to_encoder = {e['name']: load_encoder(e) for e in encoders}
    with open_compressed(rows_in, 'rb') as fin:
        with open(result_out, 'r') as fout:
            in_reader = csv.reader(fin)
            out_reader = csv.reader(fout)
            fnames = in_reader.next()
            out_reader.next()
            for in_row in in_reader:
                for i in range(COUNT):
                    out_row = out_reader.next()
                    bundle = zip(fnames, in_row, out_row)
                    for name, in_val, out_val in bundle:
                        if name == '_id':
                            assert_equal(in_val, out_val)
                            continue
                        encode = name_to_encoder[name]
                        observed = bool(in_val.strip())
                        if observed:
                            assert_almost_equal(
                                encode(in_val),
                                encode(out_val))
                        else:
                            assert_true(bool(out_val.strip()))
Example #9
def make_fake_encoding(schema_in, model_in, encoding_out):
    '''
    Make a fake encoding from json schema + model.
    Assume that feature names in schema correspond to featureids in model
    e.g. schema was generated from loom.format.make_schema
    '''
    schema = json_load(schema_in)
    fields = []
    builders = []
    name_to_builder = {}
    for name, model in sorted(schema.iteritems()):
        fields.append(loom.schema.MODEL_TO_DATATYPE[model])
        Builder = FAKE_ENCODER_BUILDERS[model]
        builder = Builder(name, model)
        builders.append(builder)
        name_to_builder[name] = builder

    cross_cat = loom.schema_pb2.CrossCat()
    with open_compressed(model_in, 'rb') as f:
        cross_cat.ParseFromString(f.read())
    for kind in cross_cat.kinds:
        featureid = iter(kind.featureids)
        for model in loom.schema.MODELS.iterkeys():
            for shared in getattr(kind.product_model, model):
                feature_name = '{:06d}'.format(featureid.next())
                assert feature_name in schema
                if model == 'dd':
                    for i in range(len(shared.alphas)):
                        name_to_builder[feature_name].add_value(str(i))
                elif model == 'dpd':
                    for val in shared.values:
                        name_to_builder[feature_name].add_value(str(val))
    encoders = [b.build() for b in builders]
    ensure_fake_encoders_are_sorted(encoders)
    json_dump(encoders, encoding_out)
Example #10
def pretty_print(filename, message_type='guess'):
    '''
    Print text/json/protobuf messages from a raw/gz/bz2 file.
    '''
    parts = os.path.basename(filename).split('.')
    if parts[-1] in ['gz', 'bz2']:
        parts.pop()
    protocol = parts[-1]
    if protocol == 'json':
        data = json_load(filename)
        print json.dumps(data, sort_keys=True, indent=4)
    elif protocol == 'pb':
        message = get_message(filename, message_type)
        with open_compressed(filename) as f:
            message.ParseFromString(f.read())
            print message
    elif protocol == 'pbs':
        message = get_message(filename, message_type)
        for string in protobuf_stream_load(filename):
            message.ParseFromString(string)
            print message
    elif protocol == 'pickle':
        data = pickle_load(filename)
        print repr(data)
    else:
        with open_compressed(filename) as f:
            for line in f:
                print line,
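
The protocol is inferred from whatever extension remains once a trailing `.gz`/`.bz2` is stripped. Hypothetical invocations with placeholder paths; the `'CrossCat'` and `'Row'` message-type names are assumptions about what `get_message` accepts:

pretty_print('ingest/encoding.json.gz')            # json, pretty-printed
pretty_print('samples/0/model.pb.gz', 'CrossCat')  # single protobuf message
pretty_print('ingest/rows.pbs.gz', 'Row')          # protobuf message stream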
Example #11
def test_group_runs(root, schema, encoding, **unused):
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        with loom.preql.get_server(root, encoding, debug=True) as preql:
            test_columns = json_load(schema).keys()[:10]
            for column in test_columns:
                groupings_csv = 'group.{}.csv'.format(column)
                preql.group(column, result_out=groupings_csv)
                print open(groupings_csv).read()
Example #12
def _test_modify_schema(modify, name, schema, rows_csv, **unused):
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR) as store:
        with mock.patch('loom.store.STORE', new=store):
            modified_schema = os.path.join(store, 'schema.json')
            data = json_load(schema)
            data = modify(data)
            json_dump(data, modified_schema)
            loom.tasks.ingest(name, modified_schema, rows_csv, debug=True)
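
A hypothetical `modify` callback for the helper above; it receives the schema as a dict mapping feature names to model names and must return the (possibly modified) schema to ingest:

def drop_first_feature(data):
    # Copy, then drop the alphabetically first feature.
    data = dict(data)
    del data[sorted(data.iterkeys())[0]]
    return data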
Example #13
def generate_init(encoding_in, model_out, seed=0):
    '''
    Generate an initial model for inference.
    '''
    numpy.random.seed(seed)
    encoders = json_load(encoding_in)
    features = import_features(encoders)
    cross_cat = generate_model(features)
    with open_compressed(model_out, 'wb') as f:
        f.write(cross_cat.SerializeToString())
Example #14
def test_relate_pandas(root, rows_csv, schema, **unused):
    feature_count = len(json_load(schema))
    with loom.preql.get_server(root, debug=True) as preql:
        result_string = preql.relate(preql.feature_names)
        result_df = pandas.read_csv(StringIO(result_string), index_col=0)
        print 'result_df ='
        print result_df
        assert_equal(result_df.ndim, 2)
        assert_equal(result_df.shape[0], feature_count)
        assert_equal(result_df.shape[1], feature_count)
Example #15
def infer_one(name, seed=0, config=None, debug=False):
    '''
    Infer a single sample.
    Arguments:
        name            A unique identifier for ingest + inference
        seed            The seed, i.e., the sample number, typically 0-9
        config          An optional json config file, e.g.,
                            {"schedule": {"extra_passes": 500.0}}
        debug           Whether to run debug versions of C++ code
    Environment variables:
        LOOM_VERBOSITY  Verbosity level
    '''
    paths = loom.store.get_paths(name, sample_count=(1 + seed))
    sample = paths['samples'][seed]

    LOG('making config')
    if config is None:
        config = {}
    elif isinstance(config, basestring):
        if not os.path.exists(config):
            raise LoomError('Missing config file: {}'.format(config))
        config = json_load(config)
    else:
        config = copy.deepcopy(config)
    if 'seed' not in config:
        config['seed'] = seed
    loom.config.config_dump(config, sample['config'])

    LOG('generating init')
    loom.generate.generate_init(
        encoding_in=paths['ingest']['encoding'],
        model_out=sample['init'],
        seed=seed)

    LOG('shuffling rows')
    loom.runner.shuffle(
        rows_in=paths['ingest']['diffs'],
        rows_out=sample['shuffled'],
        seed=seed,
        debug=debug)

    LOG('inferring, watch {}'.format(sample['infer_log']))
    loom.runner.infer(
        config_in=sample['config'],
        rows_in=sample['shuffled'],
        tares_in=paths['ingest']['tares'],
        model_in=sample['init'],
        model_out=sample['model'],
        groups_out=sample['groups'],
        assign_out=sample['assign'],
        log_out=sample['infer_log'],
        debug=debug)
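
Because the seed is only written into `config` when absent, a caller can pin the shuffle/init seed independently of the sample number by passing an explicit `'seed'` entry. A usage sketch with a placeholder dataset name, reusing the config shape from the docstring:

infer_one(
    'my-dataset',
    seed=3,
    config={'schedule': {'extra_passes': 500.0}})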
Example #16
def _retrieve_featureid_to_cgpm(path):
    """Returns a dict mapping loom's 0-based featureid to cgpm.outputs."""
    # Loom orders features alphabetically based on statistical types:
    # i.e. 'bb' < 'dd' < 'nich'. The ordering is stored in
    # `ingest/encoding.json.gz`.
    encoding_in = os.path.join(path, 'ingest', 'encoding.json.gz')
    features = json_load(encoding_in)

    def colname_to_output(cname):
        # Convert dummy column name from 'c00012' to the integer 12.
        return int(cname.replace('c', ''))

    return {i: colname_to_output(f['name']) for i, f in enumerate(features)}
Example #17
def make_schema_row(schema_in, schema_row_out):
    '''
    Convert json schema to protobuf schema row.
    '''
    schema = json_load(schema_in)
    value = loom.schema_pb2.ProductValue()
    value.observed.sparsity = loom.schema_pb2.ProductValue.Observed.DENSE
    for model in schema.itervalues():
        field = loom.schema.MODEL_TO_DATATYPE[model]
        value.observed.dense.append(True)
        getattr(value, field).append(EXAMPLE_VALUES[field])
    with open_compressed(schema_row_out, 'wb') as f:
        f.write(value.SerializeToString())
Example #18
def infer_one(name, seed=0, config=None, debug=False):
    '''
    Infer a single sample.
    Arguments:
        name            A unique identifier for ingest + inference
        seed            The seed, i.e., the sample number, typically 0-9
        config          An optional json config file, e.g.,
                            {"schedule": {"extra_passes": 500.0}}
        debug           Whether to run debug versions of C++ code
    Environment variables:
        LOOM_VERBOSITY  Verbosity level
    '''
    paths = loom.store.get_paths(name, sample_count=(1 + seed))
    sample = paths['samples'][seed]

    LOG('making config')
    if config is None:
        config = {}
    elif isinstance(config, basestring):
        if not os.path.exists(config):
            raise LoomError('Missing config file: {}'.format(config))
        config = json_load(config)
    else:
        config = copy.deepcopy(config)
    config['seed'] = seed
    loom.config.config_dump(config, sample['config'])

    LOG('generating init')
    loom.generate.generate_init(
        encoding_in=paths['ingest']['encoding'],
        model_out=sample['init'],
        seed=seed)

    LOG('shuffling rows')
    loom.runner.shuffle(
        rows_in=paths['ingest']['diffs'],
        rows_out=sample['shuffled'],
        seed=seed,
        debug=debug)

    LOG('inferring, watch {}'.format(sample['infer_log']))
    loom.runner.infer(
        config_in=sample['config'],
        rows_in=sample['shuffled'],
        tares_in=paths['ingest']['tares'],
        model_in=sample['init'],
        model_out=sample['model'],
        groups_out=sample['groups'],
        assign_out=sample['assign'],
        log_out=sample['infer_log'],
        debug=debug)
Example #19
def test_tiled_entropy(root, schema, **unused):
    feature_count = len(json_load(schema))
    feature_sets = [frozenset([i]) for i in xrange(feature_count)]
    kwargs = {
        'row_sets': feature_sets,
        'col_sets': feature_sets,
        'sample_count': 10
    }
    with loom.query.get_server(root, debug=True) as server:
        expected = set(server.entropy(**kwargs))
        for tile_size in xrange(1, 1 + feature_count):
            print 'tile_size = {}'.format(tile_size)
            actual = set(server.entropy(tile_size=tile_size, **kwargs))
            assert_set_equal(expected, actual)
Example #20
def make_schema_row(schema_in, schema_row_out):
    '''
    Convert json schema to protobuf schema row.
    '''
    schema = json_load(schema_in)
    if not schema:
        raise LoomError('Schema is empty: {}'.format(schema_in))
    value = loom.schema_pb2.ProductValue()
    value.observed.sparsity = loom.schema_pb2.ProductValue.Observed.DENSE
    for model in schema.itervalues():
        try:
            field = loom.schema.MODEL_TO_DATATYPE[model]
        except KeyError:
            raise LoomError('Unknown model {} in schema {}'.format(
                model, schema_in))
        value.observed.dense.append(True)
        getattr(value, field).append(EXAMPLE_VALUES[field])
    with open_compressed(schema_row_out, 'wb') as f:
        f.write(value.SerializeToString())
Example #21
def test_predict_pandas(root, rows_csv, schema, **unused):
    feature_count = len(json_load(schema))
    with loom.preql.get_server(root, debug=True) as preql:
        rows_filename = os.path.join(rows_csv, os.listdir(rows_csv)[0])
        with open_compressed(rows_filename) as f:
            rows_df = pandas.read_csv(f,
                                      converters=preql.converters,
                                      index_col='_id')
        print 'rows_df ='
        print rows_df
        row_count = rows_df.shape[0]
        assert_equal(rows_df.shape[1], feature_count)
        rows_io = StringIO(rows_df.to_csv())
        result_string = preql.predict(rows_io, COUNT, id_offset=True)
        result_df = pandas.read_csv(StringIO(result_string), index_col=False)
        print 'result_df ='
        print result_df
        assert_equal(result_df.ndim, 2)
        assert_equal(result_df.shape[0], row_count * COUNT)
        assert_equal(result_df.shape[1], 1 + feature_count)
Example #22
def export_rows(encoding_in, rows_in, rows_csv_out, chunk_size=1000000):
    '''
    Export rows from gzipped-protobuf-stream to directory-of-gzipped-csv-files.
    '''
    rows_csv_out = os.path.abspath(rows_csv_out)
    if rows_csv_out == os.getcwd():
        raise LoomError('Cannot export_rows to working directory')
    for ext in ['.csv', '.gz', '.bz2']:
        if rows_csv_out.endswith(ext):
            raise LoomError(
                'Expected rows_csv_out to be a dirname, actual: {}'.format(
                    rows_csv_out))
    if not (chunk_size > 0):
        raise LoomError('Invalid chunk_size {}, must be positive'.format(
            chunk_size))
    encoders = json_load(encoding_in)
    fields = [loom.schema.MODEL_TO_DATATYPE[e['model']] for e in encoders]
    decoders = [load_decoder(e) for e in encoders]
    header = ['_id'] + [e['name'] for e in encoders]
    if os.path.exists(rows_csv_out):
        shutil.rmtree(rows_csv_out)
    os.makedirs(rows_csv_out)
    row_count = sum(1 for _ in protobuf_stream_load(rows_in))
    rows = loom.cFormat.row_stream_load(rows_in)
    chunk_count = (row_count + chunk_size - 1) / chunk_size
    chunks = sorted(
        os.path.join(rows_csv_out, 'rows.{}.csv.gz'.format(i))
        for i in xrange(chunk_count)
    )
    with ExitStack() as stack:
        with_ = stack.enter_context
        writers = [with_(csv_writer(f)) for f in chunks]
        for writer in writers:
            writer.writerow(header)
        for row, writer in izip(rows, cycle(writers)):
            data = row.iter_data()
            schema = izip(data['observed'], fields, decoders)
            csv_row = [row.id]
            for observed, field, decode in schema:
                csv_row.append(decode(data[field].next()) if observed else '')
            writer.writerow(csv_row)
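
Note that `izip(rows, cycle(writers))` deals rows round-robin, so chunk file `i` receives rows `i, i + chunk_count, i + 2 * chunk_count, ...` rather than a contiguous block (contrast the sequential chunking in Example #25 below). A self-contained sketch of the dealing pattern:

from itertools import cycle, izip

chunks = [[], [], []]
for row, chunk in izip(range(7), cycle(chunks)):
    chunk.append(row)
assert chunks == [[0, 3, 6], [1, 4], [2, 5]]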
Example #23
def make_fake_encoding(schema_in, rows_in, encoding_out):
    '''
    Make a fake encoding from json schema + protobuf rows.
    '''
    schema = json_load(schema_in)
    fields = []
    builders = []
    for name, model in sorted(schema.iteritems()):
        fields.append(loom.schema.MODEL_TO_DATATYPE[model])
        Builder = FAKE_ENCODER_BUILDERS[model]
        builder = Builder(name, model)
        builders.append(builder)
    for row in loom.cFormat.row_stream_load(rows_in):
        data = row.iter_data()
        observeds = data['observed']
        for observed, field, builder in izip(observeds, fields, builders):
            if observed:
                builder.add_value(str(data[field].next()))
    encoders = [builder.build() for builder in builders]
    ensure_fake_encoders_are_sorted(encoders)
    json_dump(encoders, encoding_out)
Example #24
def _make_encoder_builders_file((schema_in, rows_in)):
    assert os.path.isfile(rows_in)
    schema = json_load(schema_in)
    with open_compressed(rows_in, 'rb') as f:
        reader = csv.reader(f)
        header = reader.next()
        builders = []
        for name in header:
            if name in schema:
                model = schema[name]
                Builder = ENCODER_BUILDERS[model]
                builder = Builder(name, model)
            else:
                builder = None
            builders.append(builder)
        for row in reader:
            for value, builder in izip(row, builders):
                if builder is not None:
                    value = value.strip()
                    if value:
                        builder.add_value(value)
    return [b for b in builders if b is not None]
Example #25
def export_rows(encoding_in, rows_in, rows_csv_out, chunk_size=1000000):
    '''
    Export rows from protobuf stream to csv.
    '''
    for ext in ['.csv', '.gz', '.bz2']:
        assert not rows_csv_out.endswith(ext),\
            'rows_csv_out should be a dirname'
    assert chunk_size > 0
    encoders = json_load(encoding_in)
    fields = [loom.schema.MODEL_TO_DATATYPE[e['model']] for e in encoders]
    decoders = [load_decoder(e) for e in encoders]
    header = [e['name'] for e in encoders]
    if os.path.exists(rows_csv_out):
        shutil.rmtree(rows_csv_out)
    os.makedirs(rows_csv_out)
    rows = loom.cFormat.row_stream_load(rows_in)
    try:
        empty = None
        for i in xrange(MAX_CHUNK_COUNT):
            file_out = os.path.join(
                rows_csv_out,
                'rows_{:06d}.csv.gz'.format(i))
            with open_compressed(file_out, 'wb') as f:
                writer = csv.writer(f)
                writer.writerow(header)
                empty = file_out
                for j in xrange(chunk_size):
                    data = rows.next().iter_data()
                    schema = izip(data['observed'], fields, decoders)
                    row = [
                        decode(data[field].next()) if observed else ''
                        for observed, field, decode in schema
                    ]
                    writer.writerow(row)
                    empty = None
    except StopIteration:
        if empty:
            os.remove(empty)
Example #26
def transform_rows(schema_in, transforms_in, rows_in, rows_out, id_field=None):
    transforms = pickle_load(transforms_in)
    if not transforms:
        cp_ns(rows_in, rows_out)
    else:
        transform = TransformSequence(transforms)
        transformed_header = sorted(json_load(schema_in).iterkeys())
        if id_field is not None:
            assert id_field not in transformed_header
            transformed_header = [id_field] + transformed_header
        tasks = []
        if os.path.isdir(rows_in):
            loom.util.mkdir_p(rows_out)
            for f in os.listdir(rows_in):
                tasks.append((
                    transform,
                    transformed_header,
                    os.path.join(rows_in, f),
                    os.path.join(rows_out, f),
                ))
        else:
            tasks.append((transform, transformed_header, rows_in, rows_out))
        parallel_map(_transform_rows, tasks)
Example #27
def _import_rows_file(args):
    rows_csv_in, rows_out, id_offset, id_stride, encoding_in = args
    assert os.path.isfile(rows_csv_in)
    encoders = json_load(encoding_in)
    message = loom.cFormat.Row()
    add_field = {
        'booleans': message.add_booleans,
        'counts': message.add_counts,
        'reals': message.add_reals,
    }
    with csv_reader(rows_csv_in) as reader:
        feature_names = list(reader.next())
        header_length = len(feature_names)
        name_to_pos = {name: i for i, name in enumerate(feature_names)}
        schema = []
        for encoder in encoders:
            pos = name_to_pos.get(encoder['name'])
            add = add_field[loom.schema.MODEL_TO_DATATYPE[encoder['model']]]
            encode = load_encoder(encoder)
            schema.append((pos, add, encode))

        def rows():
            for i, row in enumerate(reader):
                if len(row) != header_length:
                    raise LoomError('row {} has wrong length {}:\n{}'.format(
                        i, len(row), row))
                message.id = id_offset + id_stride * i
                for pos, add, encode in schema:
                    value = None if pos is None else row[pos].strip()
                    observed = bool(value)
                    message.add_observed(observed)
                    if observed:
                        add(encode(value))
                yield message
                message.Clear()

        loom.cFormat.row_stream_dump(rows(), rows_out)
Example #28
def __init__(self, query_server, encoding=None, debug=False):
    self._paths = loom.store.get_paths(query_server.root)
    if encoding is None:
        encoding = self._paths['ingest']['encoding']
    self._query_server = query_server
    self._encoders = json_load(encoding)
    transforms = self._paths['ingest']['transforms']
    self._transform = loom.transforms.load_transforms(transforms)
    self._feature_names = [e['name'] for e in self._encoders]
    self._feature_set = frozenset(self._feature_names)
    self._name_to_pos = {
        name: i
        for i, name in enumerate(self._feature_names)
    }
    self._name_to_decode = {
        e['name']: load_decoder(e)
        for e in self._encoders
    }
    self._name_to_encode = {
        e['name']: load_encoder(e)
        for e in self._encoders
    }
    self._rowid_map = None
    self._debug = debug
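
A hedged construction sketch; `root` is a placeholder store path. When `encoding` is omitted, the constructor above falls back to the store's `ingest/encoding` file:

with loom.query.get_server(root, debug=True) as query_server:
    preql = loom.preql.PreQL(query_server)
    print preql.feature_names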
Example #29
def __init__(self, query_server, encoding, debug=False):
    self.query_server = query_server
    self.encoders = json_load(encoding)
    self.feature_names = [e['name'] for e in self.encoders]
    self.debug = debug