def synthesis(prepare_res, params):
    """Exercise the DatasetWriter entry points that are used from synthesis.

    prepare_res is a pair whose first element is a dataset writer; the
    second element is not used here.  params.slices is the number of
    slices to loop over.

    Writes slice 0 of the passed writer, one line each through the three
    split-write functions of "synthesis_split", the slice number into every
    slice of "synthesis_manual", and one line per slice into "nonetest".
    """
    passed_writer, _unused = prepare_res
    # Using set_slice on a dataset that was written in analysis is not
    # actually supported, but since it currently works (as long as that
    # particular slice wasn't written in analysis) let's test it.
    passed_writer.set_slice(0)
    first_values = {}
    for key, values in test_data.data.items():
        first_values[key] = values[0]
    passed_writer.write(**first_values)

    # Columns are added after construction here, and each of the three
    # split-write variants gets one line.
    split_writer = DatasetWriter(name="synthesis_split", hashlabel="a")
    for colname, coltype in (("a", "int32"), ("b", "unicode")):
        split_writer.add(colname, coltype)
    write_positional = split_writer.get_split_write()
    write_positional(1, "a")
    write_as_list = split_writer.get_split_write_list()
    write_as_list([2, "b"])
    write_as_dict = split_writer.get_split_write_dict()
    write_as_dict({"a": 3, "b": "c"})

    manual_writer = DatasetWriter(
        name="synthesis_manual",
        columns={"sliceno": "int32"},
    )
    nonetest_writer = DatasetWriter(
        name="nonetest",
        columns={typename: typename for typename in test_data.data},
    )
    # The same line goes into every slice: None in every column except
    # those listed in test_data.not_none_capable, which get their first
    # test value instead.
    none_row = {}
    for key, values in test_data.data.items():
        if key in test_data.not_none_capable:
            none_row[key] = values[0]
        else:
            none_row[key] = None
    for sliceno in range(params.slices):
        manual_writer.set_slice(sliceno)
        manual_writer.write(sliceno)
        nonetest_writer.set_slice(sliceno)
        nonetest_writer.write(**none_row)
def analysis(sliceno, prepare_res):
    """Write this slice's lines to the default, "named" and passed writers.

    sliceno is the slice being processed.  prepare_res is a pair of
    (dataset writer, number); the number is written to the default
    writer's first column through write_list.
    """
    default_writer = DatasetWriter()
    named_writer = DatasetWriter(name="named")
    passed_writer, num = prepare_res

    # One line by keyword arguments, one as a list.
    default_writer.write(a=sliceno, b="a")
    default_writer.write_list([num, str(sliceno)])

    # Day of month follows the slice number, clamped to 31 so the dates
    # stay valid however many slices there are.
    day = min(sliceno + 1, 31)
    named_writer.write(True, date(1536, 12, day))
    named_writer.write_dict({"c": False, "d": date(2236, 5, day)})

    # slice 0 is written in synthesis
    if sliceno <= 0 or sliceno >= test_data.value_cnt:
        return
    row = {}
    for key, values in test_data.data.items():
        row[key] = values[sliceno]
    passed_writer.write_dict(row)
def _verify(name, types, data, coltype, want, default, want_fail, kw):
    """Type a freshly written dataset as each of types and check the result.

    The values in data are written to slice 0 of a new dataset called name,
    in a column "data" of type coltype.  A second column "extra" (bytes)
    alternates between b'1' and b'skip', starting with b'1'.

    For every type in types a dataset_type subjob is built on that dataset:
      * want is either a callable check(got, fromstr), a list of expected
        values, or something indexed by type name giving such a list.
      * default is used as the default for "data" unless it is no_default.
      * want_fail means the build is expected to raise JobError.
      * kw holds extra options merged into the subjob options.

    Unless the options already contain filter_bad (or want is callable),
    the typing is repeated with filter_bad set and "extra" typed as
    int32_10, and the result is compared to every other expected value.
    """
    if callable(want):
        check = want
    else:
        def check(got, fromstr, filtered=False):
            # typ is the loop variable further down, looked up when called.
            expected = want[typ] if not isinstance(want, list) else want
            if filtered:
                expected = expected[::2]
            assert got == expected, 'Expected %r, got %r from %s.' % (
                expected, got, fromstr,
            )

    writer = DatasetWriter(
        name=name,
        columns={'data': coltype, 'extra': 'bytes'},
    )
    writer.set_slice(0)
    for index, value in enumerate(data):
        # Odd lines get an "extra" value that can not be typed as a number.
        marker = b'skip' if index % 2 else b'1'
        writer.write(value, marker)
    # The remaining slices are selected in turn but get no lines.
    for sliceno in range(1, g.SLICES):
        writer.set_slice(sliceno)
    bytes_ds = writer.finish()

    for typ in types:
        opts = {'column2type': {'data': typ}}
        opts.update(kw)
        if default is not no_default:
            opts['defaults'] = {'data': default}
        try:
            jid = subjobs.build(
                'dataset_type',
                datasets={'source': bytes_ds},
                options=opts,
            )
        except JobError:
            if not want_fail:
                raise Exception('Typing %r as %s failed.' % (bytes_ds, typ,))
            continue
        assert not want_fail, "Typing %r as %s should have failed, but didn't (%s)." % (
            bytes_ds, typ, jid)
        typed_ds = Dataset(jid)
        got = list(typed_ds.iterate(0, 'data'))
        description = '%s (typed as %s from %r)' % (typed_ds, typ, bytes_ds,)
        check(got, description)
        if not ('filter_bad' in opts or callable(want)):
            # Same typing again, but now the b'skip' lines fail to type as
            # int32_10 and should take their whole line with them.
            opts['filter_bad'] = True
            opts['column2type']['extra'] = 'int32_10'
            jid = subjobs.build(
                'dataset_type',
                datasets={'source': bytes_ds},
                options=opts,
            )
            typed_ds = Dataset(jid)
            got = list(typed_ds.iterate(0, 'data'))
            description = '%s (typed as %s from %r with every other line skipped from filter_bad)' % (
                typed_ds, typ, bytes_ds,
            )
            check(got, description, True)
        used_type(typ)
def test_filter_bad_across_types():
    """Check that a bad value in any typed column filters the whole line.

    A source dataset is written to slice 0 with one column per tested type
    and then typed twice with filter_bad=True: first without defaults, then
    with defaults for number:int, float64 and json, which makes three more
    lines survive.  Each time the int32_10, bytes, json and unicode:utf-8
    columns of the typed dataset are compared to the expected lines.
    """
    columns = {
        'bytes': 'bytes',
        'float64': 'bytes',
        'int32_10': 'ascii',
        'json': 'unicode',
        'number:int': 'unicode',
        'unicode:utf-8': 'bytes',
    }
    # all_good, *values
    # Make sure all those types (except bytes) can filter other lines,
    # and be filtered by other lines. And that several filtering values
    # is not a problem (line 11).
    data = [
        (True, b'first', b'1.1', '1', '"a"', '001', b'ett',),
        (True, b'second', b'2.2', '2', '"b"', '02', b'tv\xc3\xa5',),
        (True, b'third', b'3.3', '3', '["c"]', '3.0', b'tre',),
        (False, b'fourth', b'4.4', '4', '"d"', '4.4', b'fyra',),  # number:int bad
        (False, b'fifth', b'5.5', '-', '"e"', '5', b'fem',),  # int32_10 bad
        (False, b'sixth', b'6.b', '6', '"f"', '6', b'sex',),  # float64 bad
        (False, b'seventh', b'7.7', '7', '{"g"}', '7', b'sju',),  # json bad
        (False, b'eigth', b'8.8', '8', '"h"', '8', b'\xa5\xc3tta',),  # unicode:utf-8 bad
        (True, b'ninth', b'9.9', '9', '"i"', '9', b'nio',),
        (True, b'tenth', b'10', '10', '"j"', '10', b'tio',),
        (False, b'eleventh', b'11a', '1-', '"k",', '1,', b'elva',),  # float64, int32_10 and number:int bad
        (True, b'twelfth', b'12', '12', '"l"', '12', b'tolv',),
    ]

    def wanted(v, json_text=None):
        # The (int32_10, bytes, json, unicode:utf-8) values expected back
        # for source line v.  json_text overrides the json source text, for
        # lines where a default replaces it.
        if json_text is None:
            json_text = v[4]
        return (
            int(v[3]),
            v[1],
            json.loads(json_text),
            v[6].decode('utf-8'),
        )

    dw = DatasetWriter(name="filter bad across types", columns=columns)
    dw.set_slice(0)
    want = []
    for v in data:
        if v[0]:
            want.append(wanted(v))
        dw.write(*v[1:])
    # The remaining slices are selected in turn but get no lines.
    for sliceno in range(1, g.SLICES):
        dw.set_slice(sliceno)
    source_ds = dw.finish()

    # These defaults make more lines "ok" for the second lap: the lines
    # with a bad number:int, float64 or json value.  The json line comes
    # back with the default in place of its unparseable value.
    rescue_defaults = {'number:int': '0', 'float64': '0', 'json': '"replacement"'}
    # adding them out of order, int32_10 sorts correctly.
    want_with_defaults = sorted(want + [
        wanted(data[3]),
        wanted(data[5]),
        wanted(data[6], rescue_defaults['json']),
    ])

    # Once with just filter_bad, once with some defaults too.
    laps = (
        ({}, want),
        (rescue_defaults, want_with_defaults),
    )
    for defaults, expected in laps:
        jid = subjobs.build(
            'dataset_type',
            datasets=dict(source=source_ds),
            options=dict(
                column2type={t: t for t in columns},
                filter_bad=True,
                defaults=defaults,
            ),
        )
        typed_ds = Dataset(jid)
        got = list(
            typed_ds.iterate(0, ['int32_10', 'bytes', 'json', 'unicode:utf-8']))
        assert got == expected, "Expected %r, got %r from %s (from %r%s)" % (
            expected, got, typed_ds, source_ds,
            ' with defaults' if defaults else '')