def _check_predictions(rows_in, result_out, encoding):
    '''Validate predictions in result_out against the input rows in rows_in.

    Expects COUNT (module-level) output rows per input row. For each
    feature: '_id' columns must match exactly; observed (non-empty) input
    cells must encode to (almost) the same value as the corresponding
    output cell; unobserved cells must at least be filled in with a
    non-empty prediction.
    '''
    encoders = json_load(encoding)
    name_to_encoder = {e['name']: load_encoder(e) for e in encoders}
    with open_compressed(rows_in, 'rb') as fin:
        with open(result_out, 'r') as fout:
            in_reader = csv.reader(fin)
            out_reader = csv.reader(fout)
            fnames = in_reader.next()
            out_reader.next()  # skip the output header row
            for in_row in in_reader:
                for _ in range(COUNT):
                    out_row = out_reader.next()
                    for name, in_val, out_val in zip(fnames, in_row, out_row):
                        if name == '_id':
                            assert_equal(in_val, out_val)
                        elif in_val.strip():
                            # observed cell: must survive an encode round trip
                            encode = name_to_encoder[name]
                            assert_almost_equal(
                                encode(in_val),
                                encode(out_val))
                        else:
                            # unobserved cell: prediction must be non-empty
                            assert_true(bool(out_val.strip()))
def test_predict(root, rows_csv, encoding, **unused):
    '''End-to-end test of PreQL.predict.

    Runs prediction on the first csv in rows_csv, then checks that each
    input row produced COUNT output rows in which observed cells encode
    to (almost) the same value and unobserved cells are filled in.
    '''
    COUNT = 10
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        with loom.query.get_server(root, debug=True) as query_server:
            result_out = 'predictions_out.csv'
            rows_in = os.path.join(rows_csv, os.listdir(rows_csv)[0])
            encoders = json_load(encoding)
            name_to_encoder = {e['name']: load_encoder(e) for e in encoders}
            preql = loom.preql.PreQL(query_server, encoding)
            preql.predict(rows_in, COUNT, result_out, id_offset=False)
            with open_compressed(rows_in, 'rb') as fin:
                with open(result_out, 'r') as fout:
                    in_reader = csv.reader(fin)
                    out_reader = csv.reader(fout)
                    fnames = in_reader.next()
                    out_reader.next()  # skip the output header row
                    for in_row in in_reader:
                        for _ in range(COUNT):
                            out_row = out_reader.next()
                            triples = zip(fnames, in_row, out_row)
                            for name, in_val, out_val in triples:
                                encode = name_to_encoder[name]
                                if in_val.strip():
                                    # observed: encode round trip must agree
                                    assert_almost_equal(
                                        encode(in_val),
                                        encode(out_val))
                                else:
                                    # unobserved: must be filled in
                                    assert_true(bool(out_val.strip()))
def predict(self, rows_csv, count, result_out, id_offset=True):
    '''Draw `count` samples for the unobserved cells of each input row.

    For every row in rows_csv, empty cells are treated as unobserved and
    sampled conditioned on the observed (non-empty) cells; each sample is
    written as one row of result_out.

    Parameters:
        rows_csv - path to a (possibly compressed) csv file of input rows
        count - number of samples to draw per input row
        result_out - path of the (possibly compressed) csv file to write
        id_offset - if True, pop the leading cell of each row as a row id
            and copy it onto every sample row
    '''
    with open_compressed(rows_csv, 'rb') as fin:
        with open_compressed(result_out, 'w') as fout:
            reader = csv.reader(fin)
            writer = csv.writer(fout)
            feature_names = list(reader.next())
            writer.writerow(feature_names)
            name_to_pos = {name: i for i, name in enumerate(feature_names)}
            # schema pairs each encoder with its column position (None if
            # the feature is absent from this csv's header)
            pos_to_decode = {}
            schema = []
            for encoder in self.encoders:
                pos = name_to_pos.get(encoder['name'])
                encode = load_encoder(encoder)
                decode = load_decoder(encoder)
                if pos is not None:
                    pos_to_decode[pos] = decode
                schema.append((pos, encode))
            for row in reader:
                conditioning_row = []
                to_sample = []
                if id_offset:
                    row_id = row.pop(0)
                for pos, encode in schema:
                    value = None if pos is None else row[pos].strip()
                    observed = bool(value)
                    to_sample.append(not observed)
                    if observed:
                        conditioning_row.append(encode(value))
                    else:
                        conditioning_row.append(None)
                samples = self.query_server.sample(
                    to_sample,
                    conditioning_row,
                    count)
                for sample in samples:
                    out_row = [row_id] if id_offset else []
                    for name in feature_names:
                        pos = name_to_pos[name]
                        # NOTE(review): the decoder is looked up but never
                        # applied to sample[pos]; samples appear to arrive
                        # already decoded. Confirm against
                        # query_server.sample; the lookup is kept so a
                        # header column without a decoder still fails loudly.
                        decode = pos_to_decode[pos]
                        out_row.append(sample[pos])
                    writer.writerow(out_row)
def __init__(self, query_server, encoding=None, debug=False):
    '''Build per-feature encoder/decoder tables for a query server.

    If `encoding` is None, the encoding path recorded under the store's
    ingest paths for query_server.root is used instead.
    '''
    paths = loom.store.get_paths(query_server.root)
    self._paths = paths
    if encoding is None:
        encoding = paths['ingest']['encoding']
    self._query_server = query_server
    encoders = json_load(encoding)
    self._encoders = encoders
    self._transform = loom.transforms.load_transforms(
        paths['ingest']['transforms'])
    names = [encoder['name'] for encoder in encoders]
    self._feature_names = names
    self._feature_set = frozenset(names)
    self._name_to_pos = dict(
        (name, pos) for pos, name in enumerate(names))
    self._name_to_decode = dict(
        (encoder['name'], load_decoder(encoder)) for encoder in encoders)
    self._name_to_encode = dict(
        (encoder['name'], load_encoder(encoder)) for encoder in encoders)
    # lazily-built mapping of external row ids; populated elsewhere
    self._rowid_map = None
    self._debug = debug
'text': ['This is a text feature.', 'Hello World!'], 'tags': ['', 'big_data machine_learning platform'], } for fluent_type, values in EXAMPLE_VALUES.items(): EXAMPLE_VALUES['optional_{}'.format(fluent_type)] = [''] + values EXAMPLE_VALUES['id'] = ['any unique string can serve as an id'] FLUENT_TO_BASIC = { 'boolean': 'bb', 'categorical': 'dd', 'unbounded_categorical': 'dpd', 'count': 'gp', 'real': 'nich', } encode_bool = load_encoder({'model': 'bb'}) decode_bool = load_decoder({'model': 'bb'}) def get_row_dict(header, row): '''By convention, empty strings are omitted from the result dict.''' return {key: value for key, value in izip(header, row) if value} class TransformSequence(object): def __init__(self, transforms): self.transforms = transforms def forward_set(self, feature_set): result = set(feature_set) for t in self.transforms: