def synthesis(prepare_res, slices, job):
    dw_passed, _ = prepare_res
    # Using set_slice on a dataset that was written in analysis is not
    # actually supported, but since it currently works (as long as that
    # particular slice wasn't written in analysis) let's test it.
    dw_passed.set_slice(0)
    dw_passed.write(**{k: v[0] for k, v in test_data.data.items()})
    dw_synthesis_split = DatasetWriter(name="synthesis_split", hashlabel="a")
    dw_synthesis_split.add("a", "int32")
    dw_synthesis_split.add("b", "unicode")
    dw_synthesis_split.get_split_write()(1, "a")
    dw_synthesis_split.get_split_write_list()([2, "b"])
    dw_synthesis_split.get_split_write_dict()({"a": 3, "b": "c"})
    dw_synthesis_manual = job.datasetwriter(name="synthesis_manual", columns={"sliceno": "int32"})
    dw_nonetest = job.datasetwriter(name="nonetest", columns={t: t for t in test_data.data})
    for sliceno in range(slices):
        dw_synthesis_manual.set_slice(sliceno)
        dw_synthesis_manual.write(sliceno)
        dw_nonetest.set_slice(sliceno)
        dw_nonetest.write(**{
            k: v[0] if k in test_data.not_none_capable else None
            for k, v in test_data.data.items()
        })
def synthesis(params):
    ds = write(data)
    for colname in data[0]:
        verify(params.slices, data, ds, hashlabel=colname)
    # ok, all the hashing stuff works out, let's test the chaining options.
    bonus_ds = write(bonus_data, name="bonus", previous=ds)
    # no chaining options - full chain
    verify(params.slices, data + bonus_data, bonus_ds, hashlabel="date")
    # just the bonus ds
    verify(params.slices, bonus_data, bonus_ds, hashlabel="date", length=1)
    # built as a chain
    verify(params.slices, data + bonus_data, bonus_ds, hashlabel="date", as_chain=True)
    # normal chaining
    a = verify(params.slices, data, ds, hashlabel="date")
    b = verify(params.slices, data + bonus_data, bonus_ds, hashlabel="date", previous=a)
    assert b.chain() == [a, b], "chain of %s is not [%s, %s] as expected" % (b, a, b)
    # as_chain sparseness
    dw = DatasetWriter(columns=columns, name="empty")
    dw.get_split_write()
    ds = verify(params.slices, [], dw.finish(), hashlabel="date", as_chain=True)
    assert len(ds.chain()) == 1, ds + ": dataset_hashpart on empty dataset with as_chain=True did not produce a single dataset"
    # two populated slices with the same data, should end up in two datasets.
    dw = DatasetWriter(columns=columns, name="0 and 2")
    dw.set_slice(0)
    dw.write_dict(data[0])
    dw.set_slice(1)
    dw.set_slice(2)
    dw.write_dict(data[0])
    for s in range(3, params.slices):
        dw.set_slice(s)
    ds = verify(params.slices, [data[0]], dw.finish(), hashlabel="date", as_chain=True)
    got_slices = len(ds.chain())
    assert got_slices == 2, "%s (built with as_chain=True) has %d datasets in chain, expected 2." % (ds, got_slices)
def synthesis(job):
    dw = DatasetWriter(name='empty', columns={'v': 'ascii'})
    dw.get_split_write()
    empty_ds = dw.finish()
    assert empty_ds.min('non-existent column') is empty_ds.max('non-existent column') is None, 'Dataset.min/max() broken for non-existent columns'
    for typ, groups in tests.items():
        t_ds = subjobs.build('dataset_type', column2type={'v': typ}, source=empty_ds).dataset()
        minmax = (t_ds.columns['v'].min, t_ds.columns['v'].max)
        if minmax != (None, None):
            raise Exception('Typing empty dataset as %s did not give minmax == None, gave %r' % (typ, minmax,))
        all_names = list(chain.from_iterable(groupdata[group].keys() for group in groups))
        # just 1 and 2, so we don't make way too many
        for num_groups in (1, 2,):
            for names in combinations(all_names, num_groups):
                ds, mn, mx = make_source(names)
                t_ds = subjobs.build('dataset_type', column2type={'v': typ}, source=ds).dataset()
                got_minmax = (t_ds.columns['v'].min, t_ds.columns['v'].max)
                want_minmax = (mn, mx)
                chk_minmax(got_minmax, want_minmax, 'Typing %s as %s gave wrong minmax: expected %r, got %r (in %s)' % (ds, typ, want_minmax, got_minmax, t_ds,))
                chk_minmax(got_minmax, (t_ds.min('v'), t_ds.max('v')), 'Dataset.min/max() broken on ' + t_ds)
                # verify writing the same data normally also gives the correct result
                dw = DatasetWriter(name='rewrite ' + t_ds, columns=t_ds.columns)
                write = dw.get_split_write()
                for v in t_ds.iterate(None, 'v'):
                    write(v)
                re_ds = dw.finish()
                got_minmax = (re_ds.columns['v'].min, re_ds.columns['v'].max)
                want_minmax = (mn, mx)
                chk_minmax(got_minmax, want_minmax, 'Rewriting %s gave the wrong minmax: expected %r, got %r (in %s)' % (t_ds, want_minmax, got_minmax, re_ds,))
    # make sure renaming doesn't mix anything up
    dw = DatasetWriter(name='rename', columns={'a': 'ascii', 'b': 'ascii'})
    write = dw.get_split_write()
    write('5', '3')
    write('7', 'oops')
    ds = dw.finish()
    t_ds = subjobs.build(
        'dataset_type',
        column2type=dict(num='number', int='int32_10'),
        defaults=dict(num='1', int='2'),
        rename=dict(a='num', b='int'),
        source=ds,
    ).dataset()
    for name, want_minmax in (
        ('num', (5, 7)),
        ('int', (2, 3)),
    ):
        got_minmax = (t_ds.columns[name].min, t_ds.columns[name].max)
        msg = 'Typing %s gave wrong minmax: expected %r, got %r (in %s)' % (ds, want_minmax, got_minmax, t_ds,)
        chk_minmax(got_minmax, want_minmax, msg)
def synthesis():
    dw_a = DatasetWriter(name='a', columns={'num': 'int32'})
    dw_b = DatasetWriter(name='b', columns={'num': 'int32'}, previous=dw_a)
    dw_c = DatasetWriter(name='c', columns={'num': 'int32'}, previous=dw_b)
    w = dw_a.get_split_write()
    w(3)
    w(2)
    w = dw_b.get_split_write()
    w(2)
    w(1)
    w = dw_c.get_split_write()
    w(0)
    a = dw_a.finish()
    b = dw_b.finish()
    c = dw_c.finish()
    opts = dict(
        sort_columns='num',
        sort_across_slices=True,
    )
    # sort as a chain
    jid = subjobs.build('dataset_sort', options=opts, datasets=dict(source=a, previous=None))
    assert list(Dataset(jid).iterate(None, 'num')) == [2, 3]
    sorted_a = jid
    jid = subjobs.build('dataset_sort', options=opts, datasets=dict(source=b, previous=jid))
    assert list(Dataset(jid).iterate_chain(None, 'num')) == [2, 3, 1, 2]
    jid = subjobs.build('dataset_sort', options=opts, datasets=dict(source=c, previous=jid))
    assert list(Dataset(jid).iterate_chain(None, 'num')) == [2, 3, 1, 2, 0]
    # sort all as a single dataset
    jid = subjobs.build('dataset_sort', options=opts, datasets=dict(source=c, previous=None))
    assert list(Dataset(jid).iterate_chain(None, 'num')) == [0, 1, 2, 2, 3]
    # merge b and c but not a
    jid = subjobs.build('dataset_sort', options=opts, datasets=dict(source=c, previous=sorted_a))
    # test with new style job.dataset
    assert list(jid.dataset().iterate(None, 'num')) == [0, 1, 2]
    assert list(jid.dataset().iterate_chain(None, 'num')) == [2, 3, 0, 1, 2]
def mkds(name, columns, data, **kw):
    columns = dict.fromkeys(columns, 'int32')
    dw = DatasetWriter(name=name, columns=columns, **kw)
    write = dw.get_split_write()
    for v in data:
        write(*v)
    return dw.finish()
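# A minimal usage sketch of the helper above (hypothetical name and values,
# not part of the test suite): every column is created as int32 and each row
# in data is unpacked into one write call.
# example_ds = mkds('example', ['x', 'y'], [(1, 2), (3, 4), (5, 6)])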
def synthesis(job):
    manual_chain = [Dataset(jobids.selfchain, name) for name in "abcdefgh"]
    manual_abf = [manual_chain[0], manual_chain[1], manual_chain[5]]
    # build a local abf chain
    prev = None
    for ix, ds in enumerate(manual_abf):
        name = "abf%d" % (ix,)
        prev = ds.link_to_here(name, override_previous=prev)
    manual_abf_data = list(Dataset.iterate_list(None, None, manual_abf))
    local_abf_data = list(Dataset(job, "abf2").iterate_chain(None, None))
    assert manual_abf_data == local_abf_data
    # disconnect h, verify there is no chain
    manual_chain[-1].link_to_here("alone", override_previous=None)
    assert len(Dataset(job, "alone").chain()) == 1
    # check that the original chain is unhurt
    assert manual_chain == manual_chain[-1].chain()
    # So far so good, now make a chain long enough to have a cache.
    prev = None
    ix = 0
    going = True
    while going:
        if prev and "cache" in prev._data:
            going = False
        name = "longchain%d" % (ix,)
        dw = DatasetWriter(name=name, previous=prev)
        dw.add("ix", "number")
        dw.get_split_write()(ix)
        prev = dw.finish()
        ix += 1
    # we now have a chain that goes one past the first cache point
    full_chain = Dataset(prev).chain()
    assert "cache" in full_chain[-2]._data  # just to check the above logic is correct
    assert "cache" not in full_chain[-1]._data  # just to be sure..
    full_chain[-2].link_to_here("nocache", override_previous=None)
    full_chain[-1].link_to_here("withcache", override_previous=full_chain[-3])
    assert "cache" not in Dataset(job, "nocache")._data
    assert "cache" in Dataset(job, "withcache")._data
    # And make sure they both get the right data too.
    assert list(Dataset(prev).iterate_chain(None, "ix")) == list(range(ix))
    assert list(Dataset(job, "nocache").iterate_chain(None, "ix")) == [ix - 2]
    assert list(Dataset(job, "withcache").iterate_chain(None, "ix")) == list(range(ix - 2)) + [ix - 1]
def make_source(names):
    names = sorted(names)
    dsname = '+'.join(names)
    if dsname not in sources:
        dw = DatasetWriter(name=dsname, columns={'v': 'ascii'})
        write = dw.get_split_write()
        for name in names:
            for value in data[name][0]:
                write(value)
        sources[dsname] = (
            dw.finish(),
            min(unnan(data[name][1] for name in names)),
            max(unnan(data[name][2] for name in names)),
        )
    return sources[dsname]
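# Note on assumptions: as used above (and by the minmax checks that unpack its
# return value as ds, mn, mx), the module-level data dict appears to map each
# name to a (values, min, max) tuple, and sources caches one finished dataset
# per name combination, e.g. (hypothetical) data = {'small': (['1', '2'], 1, 2)}.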
def test_column_discarding():
    dw = DatasetWriter(name='column discarding')
    dw.add('a', 'bytes')
    dw.add('b', 'bytes')
    dw.add('c', 'bytes')
    w = dw.get_split_write()
    w(b'a', b'b', b'c')
    source = dw.finish()
    # Discard b because it's not typed
    ac_implicit = subjobs.build(
        'dataset_type',
        source=source,
        column2type=dict(a='ascii', c='ascii'),
        discard_untyped=True,
    ).dataset()
    assert sorted(ac_implicit.columns) == ['a', 'c'], '%s: %r' % (ac_implicit, sorted(ac_implicit.columns),)
    assert list(ac_implicit.iterate(None)) == [('a', 'c',)], ac_implicit
    # Discard b explicitly
    ac_explicit = subjobs.build(
        'dataset_type',
        source=source,
        column2type=dict(a='ascii', c='ascii'),
        rename=dict(b=None),
    ).dataset()
    assert sorted(ac_explicit.columns) == ['a', 'c'], '%s: %r' % (ac_explicit, sorted(ac_explicit.columns),)
    assert list(ac_explicit.iterate(None)) == [('a', 'c',)], ac_explicit
    # Discard c by overwriting it with b. Keep untyped b.
    ac_bASc = subjobs.build(
        'dataset_type',
        source=source,
        column2type=dict(a='ascii', c='ascii'),
        rename=dict(b='c'),
    ).dataset()
    assert sorted(ac_bASc.columns) == ['a', 'b', 'c'], '%s: %r' % (ac_bASc, sorted(ac_bASc.columns),)
    assert list(ac_bASc.iterate(None)) == [('a', b'b', 'b',)], ac_bASc
    # Discard c by overwriting it with b. Also type b as a different type.
    abc_bASc = subjobs.build(
        'dataset_type',
        source=source,
        column2type=dict(a='ascii', b='strbool', c='ascii'),
        rename=dict(b='c'),
    ).dataset()
    assert sorted(abc_bASc.columns) == ['a', 'b', 'c'], '%s: %r' % (abc_bASc, sorted(abc_bASc.columns),)
    assert list(abc_bASc.iterate(None)) == [('a', True, 'b',)], abc_bASc
def test_rehash_with_empty_slices():
    dw = DatasetWriter(name='rehash with empty slices', hashlabel='a')
    dw.add('a', 'ascii')
    dw.add('b', 'ascii')
    w = dw.get_split_write()
    w('a', '42')
    w('42', 'b')
    source = dw.finish()
    hashfunc = typed_writer('int32').hash
    def verify_hashing(caption, want_values, **kw):
        ds = subjobs.build('dataset_type', source=source, column2type=dict(a='int32_10'), caption=caption, **kw).dataset()
        got_values = set()
        for sliceno in range(g.slices):
            for got in ds.iterate(sliceno):
                assert hashfunc(got[0]) % g.slices == sliceno
                assert got not in got_values
                got_values.add(got)
        assert want_values == got_values
    verify_hashing('with discard', {(42, 'b')}, filter_bad=True)
    # using defaults uses some different code paths
    verify_hashing('with default=0 (probably two slices)', {(0, '42'), (42, 'b')}, defaults=dict(a='0'))
    verify_hashing('with default=42 (one slice)', {(42, '42'), (42, 'b')}, defaults=dict(a='42'))
def prepare():
    dw = DatasetWriter(columns={"data": "ascii"})
    write = dw.get_split_write()
    write("foo")
    write("bar")