def analysis_lap(vars):
    if vars.rehashing:
        if vars.first_lap:
            # First lap when rehashing: type the hashlabel column and build the slicemap.
            out_fn = 'hashtmp.%d' % (vars.sliceno,)
            colname = vars.rev_rename.get(vars.dw.hashlabel, vars.dw.hashlabel)
            coltype = vars.column2type[options.rename.get(colname, colname)]
            vars.rehashing = False
            real_coltype = one_column(vars, colname, coltype, [out_fn], True)
            vars.rehashing = True
            assert vars.res_bad_count[colname] == [0]  # implicitly has a default
            vars.slicemap_fd = map_init(vars, 'slicemap%d' % (vars.sliceno,), 'slicemap_size')
            slicemap = mmap(vars.slicemap_fd, vars.slicemap_size)
            slicemap = Int16BytesWrapper(slicemap)
            hash = typed_writer(real_coltype).hash
            slices = vars.slices
            vars.hash_lines = hash_lines = [0] * slices
            for ix, value in enumerate(typed_reader(real_coltype)(out_fn)):
                dest_slice = hash(value) % slices
                slicemap[ix] = dest_slice
                hash_lines[dest_slice] += 1
            unlink(out_fn)
    for colname, coltype in vars.column2type.items():
        if vars.rehashing:
            out_fns = [vars.dw.column_filename(colname, sliceno=s) for s in range(vars.slices)]
        else:
            out_fns = [vars.dw.column_filename(colname)]
        one_column(vars, vars.rev_rename.get(colname, colname), coltype, out_fns)
    return vars.res_bad_count, vars.res_default_count, vars.res_minmax
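# Illustrative sketch (not part of the original module): the slicemap built by
# analysis_lap records, for each row index, which slice the hashlabel value
# hashes to. The destination is the same mapping the writers and the tests
# below rely on: typed_writer(<type>).hash(value) % slices. The helper name
# here is hypothetical and only restates that mapping.
def _example_dest_slice(value, coltype, slices):
    from accelerator.dsutil import typed_writer
    # Same hash as the dataset writers, so rows land in the slice readers expect.
    return typed_writer(coltype).hash(value) % slices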
def analysis(sliceno, params):
    assert list(datasets.source.iterate(sliceno, "a")) == [sliceno, 42]
    assert list(datasets.source.iterate(sliceno, "b")) == ["a", str(sliceno)]
    named = Dataset(datasets.source, "named")
    assert list(named.iterate(sliceno, "c")) == [True, False]
    assert list(named.iterate(sliceno, "d")) == [
        date(1536, 12, min(sliceno + 1, 31)),
        date(2236, 5, min(sliceno + 1, 31)),
    ]
    if sliceno < test_data.value_cnt:
        passed = Dataset(datasets.source, "passed")
        good = tuple(v[sliceno] for _, v in sorted(test_data.data.items()))
        assert list(passed.iterate(sliceno)) == [good]
        if version_info > (3, 6, 0):
            want_fold = (sliceno == 1)
            assert next(passed.iterate(sliceno, "datetime")).fold == want_fold
            assert next(passed.iterate(sliceno, "time")).fold == want_fold
    synthesis_split = Dataset(datasets.source, "synthesis_split")
    values = zip((1, 2, 3), "abc")
    hash = typed_writer("int32").hash
    good = [v for v in values if hash(v[0]) % params.slices == sliceno]
    assert list(synthesis_split.iterate(sliceno)) == good
    synthesis_manual = Dataset(datasets.source, "synthesis_manual")
    assert list(synthesis_manual.iterate(sliceno, "sliceno")) == [sliceno]
    nonetest = Dataset(datasets.source, "nonetest")
    good = tuple(
        v[0] if k in test_data.not_none_capable else None
        for k, v in sorted(test_data.data.items())
    )
    assert list(nonetest.iterate(sliceno)) == [good]
def hashfilter(typ, values, sliceno):
    from accelerator.g import slices
    if typ == 'json':
        return values[sliceno::slices]
    else:
        from accelerator.dsutil import typed_writer
        h = typed_writer(typ).hash
        return [v for v in values if h(v) % slices == sliceno]
def test_rehash_with_empty_slices():
    dw = DatasetWriter(name='rehash with empty slices', hashlabel='a')
    dw.add('a', 'ascii')
    dw.add('b', 'ascii')
    w = dw.get_split_write()
    w('a', '42')
    w('42', 'b')
    source = dw.finish()
    hashfunc = typed_writer('int32').hash
    def verify_hashing(caption, want_values, **kw):
        ds = subjobs.build(
            'dataset_type',
            source=source,
            column2type=dict(a='int32_10'),
            caption=caption,
            **kw
        ).dataset()
        got_values = set()
        for sliceno in range(g.slices):
            for got in ds.iterate(sliceno):
                assert hashfunc(got[0]) % g.slices == sliceno
                assert got not in got_values
                got_values.add(got)
        assert want_values == got_values
    verify_hashing('with discard', {(42, 'b')}, filter_bad=True)
    # using defaults uses some different code paths
    verify_hashing('with default=0 (probably two slices)', {(0, '42'), (42, 'b')}, defaults=dict(a='0'))
    verify_hashing('with default=42 (one slice)', {(42, '42'), (42, 'b')}, defaults=dict(a='42'))
def verify(slices, data, source, previous=None, **options):
    jid = subjobs.build(
        "dataset_hashpart",
        datasets=dict(source=source, previous=previous),
        options=options,
    )
    hl = options["hashlabel"]
    h = typed_writer(columns[hl][0]).hash
    ds = Dataset(jid)
    good = {row[hl]: row for row in data}
    names = list(source.columns)
    for slice in range(slices):
        for row in ds.iterate_chain(slice, names):
            row = dict(zip(names, row))
            assert h(row[hl]) % slices == slice, \
                "row %r is incorrectly in slice %d in %s" % (row, slice, ds)
            want = good[row[hl]]
            assert row == want, \
                '%s (rehashed from %s) did not contain the right data for "%s".\nWanted\n%r\ngot\n%r' % (
                    ds, source, hl, want, row)
    return ds
def synthesis(job, slices):
    # Test keeping untyped columns.
    dw = job.datasetwriter(name='a', columns={'a': 'unicode', 'b': ('bytes', True), 'c': ('ascii', True), 'd': ('number', True)})
    write = dw.get_split_write()
    write('A', None, None, None)
    write('a', b'b', 'c', 0)
    a = dw.finish()
    assert a.hashlabel == None
    typed_a = subjobs.build('dataset_type', options=dict(hashlabel='a', column2type={'a': 'ascii'}), datasets=dict(source=a)).dataset()
    assert typed_a.hashlabel == 'a'
    assert set(typed_a.iterate(None)) == {('A', None, None, None), ('a', b'b', 'c', 0)}, typed_a

    # Test hashing on a column not explicitly typed.
    dw = job.datasetwriter(name='b', columns={'a': 'unicode', 'b': 'ascii', 'c': 'bytes', 'd': 'unicode'}, previous=a)
    write = dw.get_split_write()
    write('A', 'B', b'C', '1')
    b = dw.finish()
    assert b.hashlabel == None
    typed_b = subjobs.build('dataset_type', options=dict(hashlabel='a', column2type={'b': 'ascii'}), datasets=dict(source=b)).dataset()
    assert typed_b.hashlabel == 'a'
    assert set(typed_b.iterate(None)) == {('a', 'b', b'c'), ('A', None, None), ('A', 'B', b'C')}, typed_b

    # Test renaming over the original hashlabel
    dw = job.datasetwriter(name='c', columns={'a': 'unicode', 'b': 'ascii', 'c': 'bytes', 'd': 'unicode'}, hashlabel='a')
    write = dw.get_split_write()
    write('\xe5', 'b', b'c', '0')
    c = dw.finish()
    assert c.hashlabel == 'a'
    typed_c = subjobs.build('dataset_type', options=dict(column2type={'a': 'ascii', 'd': 'number'}, rename={'c': 'a'}), datasets=dict(source=c)).dataset()
    assert typed_c.hashlabel == None
    assert list(typed_c.iterate(None)) == [('c', 'b', b'c', 0)], typed_c

    # Test using the original names but for different columns (keeping hashlabel under new name)
    dw = job.datasetwriter(name='d', columns={'a': 'unicode', 'b': 'ascii', 'c': 'bytes', 'd': 'unicode'}, hashlabel='a')
    write = dw.get_split_write()
    write('\xc5', 'B', b'C', '1')
    d = dw.finish()
    assert d.hashlabel == 'a'
    typed_d = subjobs.build(
        'dataset_type',
        options=dict(
            column2type={'a': 'bytes', 'b': 'ascii', 'c': 'int32_10', 'd': 'bytes'},
            rename={'b': 'a', 'c': 'b', 'd': 'c', 'a': 'd'},
        ),
        datasets=dict(source=d),
    ).dataset()
    assert typed_d.hashlabel == 'd'
    assert list(typed_d.iterate(None)) == [(b'B', 'C', 1, b'\xc3\x85')], typed_d

    # Test various types for hashing and discarding of bad lines.
    for hl in (None, 'a', 'b', 'c'):
        dw = job.datasetwriter(name='hashed on %s' % (hl,), columns={'a': 'unicode', 'b': 'unicode', 'c': 'unicode'}, hashlabel=hl)
        w = dw.get_split_write()
        for ix in range(1000):
            w(unicode(ix), '%d.%d' % (ix, ix % 5 == 0), ('{"a": %s}' if ix % 3 else '%d is bad') % (ix,))
        src_ds = dw.finish()
        assert src_ds.hashlabel == hl
        test(src_ds, dict(column2type={'a': 'int32_10', 'b': 'number:int'}, filter_bad=True), 800)
        test(src_ds, dict(column2type={'a': 'int64_10', 'b': 'number', 'c': 'json'}, filter_bad=True), 666)
        test(src_ds, dict(column2type={'a': 'floatint32ei', 'b': 'number:int', 'c': 'json'}, filter_bad=True), 533)
        test(src_ds, dict(column2type={'from_a': 'number', 'from_b': 'float64', 'from_c': 'ascii'}, rename=dict(a='from_a', b='from_b', c='from_c')), 1000)
        test(src_ds, dict(column2type={'c': 'bits32_16', 'a': 'float32', 'b': 'bytes'}, rename=dict(a='c', b='a', c='b')), 1000)

    # this doesn't test as many permutations, it's just to test more column types.
    dw = job.datasetwriter(name='more types')
    cols = {
        'floatbooli': cycle(['1.42 or so', '0 maybe', '1 (exactly)']),
        'datetime:%Y%m%d %H:%M': ['2019%02d%02d 17:%02d' % (t % 12 + 1, t % 28 + 1, t % 60) for t in range(1000)],
        'date:%Y%m%d': ['2019%02d%02d' % (t % 12 + 1, t % 28 + 1) for t in range(1000)],
        'time:%H:%M': ['%02d:%02d' % (t // 60, t % 60) for t in range(1000)],
        'timei:%H:%M': ['%02d:%02d%c' % (t // 60, t % 60, chr(t % 26 + 65)) for t in range(1000)],
    }
    gens = []
    for coltype, gen in cols.items():
        dw.add(coltype.split(':')[0], 'ascii')
        gens.append(iter(gen))
    dw.add('half', 'bytes')
    gens.append(cycle([b'1', b'no']))
    w = dw.get_split_write()
    for _ in range(1000):
        w(*map(next, gens))
    src_ds = dw.finish()
    assert src_ds.hashlabel == None
    column2type = {t.split(':')[0]: t for t in cols}
    for hl in column2type:
        hashed = subjobs.build('dataset_type', options=dict(column2type=column2type, hashlabel=hl), datasets=dict(source=src_ds)).dataset()
        assert hashed.hashlabel == hl
        unhashed = subjobs.build('dataset_type', options=dict(column2type=column2type), datasets=dict(source=src_ds)).dataset()
        assert unhashed.hashlabel == None
        rehashed = subjobs.build('dataset_hashpart', options=dict(hashlabel=hl), datasets=dict(source=unhashed)).dataset()
        assert rehashed.hashlabel == hl
        assert hashed.lines == rehashed.lines
        assert sum(hashed.lines) == 1000
        assert set(hashed.columns.keys()) == set(unhashed.columns.keys()) == set(rehashed.columns.keys())
        # and again with a bad column
        column2type['half'] = 'float32'
        hashed = subjobs.build('dataset_type', options=dict(column2type=column2type, hashlabel=hl, filter_bad=True), datasets=dict(source=src_ds)).dataset()
        assert hashed.hashlabel == hl
        unhashed = subjobs.build('dataset_type', options=dict(column2type=column2type, filter_bad=True), datasets=dict(source=src_ds)).dataset()
        assert unhashed.hashlabel == None
        rehashed = subjobs.build('dataset_hashpart', options=dict(hashlabel=hl), datasets=dict(source=unhashed)).dataset()
        assert rehashed.hashlabel == hl
        del column2type['half']
        assert hashed.lines == rehashed.lines
        assert sum(hashed.lines) == 500
        assert set(hashed.columns.keys()) == set(unhashed.columns.keys()) == set(rehashed.columns.keys())

    # test rehashing on a column we don't type, over all types.
    dw = job.datasetwriter(name='rehash all types', columns={
        '2type': ('ascii', True),
        'ascii': ('ascii', True),
        'bits32': ('bits32', False),
        'bits64': ('bits64', False),
        'bool': ('bool', True),
        'bytes': ('bytes', True),
        'date': ('date', True),
        'datetime': ('datetime', True),
        'float32': ('float32', True),
        'float64': ('float64', True),
        'int32': ('int32', True),
        'int64': ('int64', True),
        'json': ('json', True),
        'number': ('number', True),
        'time': ('time', True),
        'unicode': ('unicode', True),
    })
    write = dw.get_split_write()
    data = {
        '42': ('ascii string', 100, 1000, True, b'bytes string', date(2019, 12, 11), datetime(2019, 12, 11, 20, 7, 21), 1.5, 0.00000001, 99, -11, {"a": "b"}, 1e100, time(20, 7, 21), 'unicode string'),
        None: (None, 0, 0, None, None, None, None, None, None, None, None, None, None, None, None),
        '18': ('ASCII STRING', 111, 1111, False, b'BYTES STRING', date(1868, 1, 3), datetime(1868, 1, 3, 13, 14, 5), 2.5, -0.0000001, 67, -99, [42, ".."], 5e100, time(13, 14, 5), 'UNICODE STRING'),
    }
    write('42', *data['42'])
    write(None, *data[None])
    write('18', *data['18'])
    src_ds = dw.finish()
    data['None'] = data.pop(None)
    type2type = {
        'ascii': 'unicode:ascii',
        'bool': 'unicode:ascii',
        'date': 'unicode:ascii',
        'datetime': 'unicode:ascii',
        'time': 'unicode:ascii',
        'bits32': 'bits32_10',
        'bits64': 'bits64_10',
        'bytes': 'bytes',
        'float32': 'float32',
        'float64': 'float64',
        'int32': 'int32_10',
        'int64': 'int64_10',
        'number': 'number',
        'unicode': 'unicode:ascii',
    }
    for hl, typeas in sorted(type2type.items()):
        ds = subjobs.build('dataset_type', column2type={'2type': typeas}, hashlabel=hl, source=src_ds).dataset()
        seen = set()
        hl_hash = typed_writer(hl).hash
        for sliceno in range(slices):
            for line in ds.iterate(sliceno, None):
                key = line[0] or None
                if isinstance(key, float):
                    key = int(key)
                if isinstance(key, bytes):
                    key = key.decode('ascii')
                else:
                    key = unicode(key)
                assert data.get(key) == line[1:], \
                    "%s (hl %s) didn't have the right data for line %r" % (ds, hl, line[0])
                hv = line[sorted(src_ds.columns).index(hl)]
                assert hl_hash(hv) % slices == sliceno, \
                    "%s (hl %s) didn't hash %r correctly" % (ds, hl, hv)
                assert key not in seen, "%s (hl %s) repeated line %s" % (ds, hl, line[0])
                seen.add(key)
        assert seen == {'42', 'None', '18'}, "%s didn't have all lines (%r)" % (ds, seen)
def one_column(vars, colname, coltype, out_fns, for_hasher=False):
    if for_hasher:
        record_bad = skip_bad = False
    elif vars.first_lap:
        record_bad = options.filter_bad
        skip_bad = False
    else:
        record_bad = 0
        skip_bad = options.filter_bad
    minmax_fn = 'minmax%d' % (vars.sliceno,)
    fmt = fmt_b = None
    is_null_converter = False
    if coltype in dataset_type.convfuncs:
        shorttype = coltype
        _, cfunc, pyfunc = dataset_type.convfuncs[coltype]
    elif coltype.startswith('null_'):
        shorttype = coltype
        pyfunc = False
        cfunc = True
        is_null_converter = True
    else:
        shorttype, fmt = coltype.split(':', 1)
        _, cfunc, pyfunc = dataset_type.convfuncs[shorttype + ':*']
    if cfunc:
        cfunc = shorttype.replace(':', '_')
    if pyfunc:
        tmp = pyfunc(coltype)
        if callable(tmp):
            pyfunc = tmp
            cfunc = None
        else:
            pyfunc = None
            cfunc, fmt, fmt_b = tmp
    if coltype == 'number':
        cfunc = 'number'
    elif coltype == 'number:int':
        coltype = 'number'
        cfunc = 'number'
        fmt = "int"
    assert cfunc or pyfunc, coltype + " didn't have cfunc or pyfunc"
    coltype = shorttype
    in_fns = []
    offsets = []
    max_counts = []
    dest_colname = options.rename.get(colname, colname)
    for d in vars.chain:
        assert colname in d.columns, '%s not in %s' % (colname, d)
        if not d.lines[vars.sliceno]:
            continue
        if not is_null_converter:
            assert d.columns[colname].type in byteslike_types, \
                '%s has bad type in %s' % (colname, d)
        in_fns.append(d.column_filename(colname, vars.sliceno))
        if d.columns[colname].offsets:
            offsets.append(d.columns[colname].offsets[vars.sliceno])
            max_counts.append(d.lines[vars.sliceno])
        else:
            offsets.append(0)
            max_counts.append(-1)
    if cfunc:
        default_value = options.defaults.get(dest_colname, cstuff.NULL)
        if for_hasher and default_value is cstuff.NULL:
            if coltype.startswith('bits'):
                # No None-support.
                default_value = '0'
            else:
                default_value = None
        default_len = 0
        if default_value is None:
            default_value = cstuff.NULL
            default_value_is_None = True
        else:
            default_value_is_None = False
        if default_value != cstuff.NULL:
            if isinstance(default_value, unicode):
                default_value = default_value.encode("utf-8")
            default_len = len(default_value)
        c = getattr(cstuff.backend, 'convert_column_' + cfunc)
        if vars.rehashing:
            c_slices = vars.slices
        else:
            c_slices = 1
        bad_count = cstuff.mk_uint64(c_slices)
        default_count = cstuff.mk_uint64(c_slices)
        gzip_mode = "wb%d" % (options.compression,)
        if in_fns:
            assert len(out_fns) == c_slices + vars.save_bad
            res = c(*cstuff.bytesargs(
                in_fns, len(in_fns), out_fns, gzip_mode, minmax_fn,
                default_value, default_len, default_value_is_None,
                fmt, fmt_b, record_bad, skip_bad,
                vars.badmap_fd, vars.badmap_size, vars.save_bad,
                c_slices, vars.slicemap_fd, vars.slicemap_size,
                bad_count, default_count, offsets, max_counts))
            assert not res, 'Failed to convert ' + colname
        vars.res_bad_count[dest_colname] = list(bad_count)
        vars.res_default_count[dest_colname] = sum(default_count)
        coltype = coltype.split(':', 1)[0]
        if is_null_converter:
            real_coltype = vars.chain[0].columns[colname].type
            # Some lines may have been filtered out, so these minmax values
            # could be wrong. There's no easy/cheap way to fix that though,
            # and they will never be wrong in the bad direction.
            vars.res_minmax[dest_colname] = [vars.chain.min(colname), vars.chain.max(colname)]
        else:
            real_coltype = dataset_type.typerename.get(coltype, coltype)
            if exists(minmax_fn):
                with typed_reader(real_coltype)(minmax_fn) as it:
                    vars.res_minmax[dest_colname] = list(it)
                unlink(minmax_fn)
    else:
        # python func
        if for_hasher:
            raise Exception("Can't hash on column of type %s." % (coltype,))
        nodefault = object()
        if dest_colname in options.defaults:
            default_value = options.defaults[dest_colname]
            if default_value is not None:
                if isinstance(default_value, unicode):
                    default_value = default_value.encode('utf-8')
                default_value = pyfunc(default_value)
        else:
            default_value = nodefault
        if options.filter_bad:
            badmap = mmap(vars.badmap_fd, vars.badmap_size)
            if PY2:
                badmap = IntegerBytesWrapper(badmap)
        if vars.rehashing:
            slicemap = mmap(vars.slicemap_fd, vars.slicemap_size)
            slicemap = Int16BytesWrapper(slicemap)
            bad_count = [0] * vars.slices
        else:
            bad_count = [0]
            chosen_slice = 0
        default_count = 0
        dont_minmax_types = {'bytes', 'ascii', 'unicode', 'json', 'complex32', 'complex64'}
        real_coltype = dataset_type.typerename.get(coltype, coltype)
        do_minmax = real_coltype not in dont_minmax_types
        if vars.save_bad:
            bad_fh = typed_writer('bytes')(out_fns.pop(), none_support=True)
        fhs = [typed_writer(real_coltype)(fn) for fn in out_fns]
        if vars.save_bad:
            fhs.append(bad_fh)
        write = fhs[0].write
        col_min = col_max = None
        it = itertools.chain.from_iterable(
            d._column_iterator(vars.sliceno, colname, _type='bytes')
            for d in vars.chain
        )
        for ix, v in enumerate(it):
            if vars.rehashing:
                chosen_slice = slicemap[ix]
                write = fhs[chosen_slice].write
            if skip_bad:
                if badmap[ix // 8] & (1 << (ix % 8)):
                    bad_count[chosen_slice] += 1
                    if vars.save_bad:
                        bad_fh.write(v)
                    continue
            try:
                v = pyfunc(v)
            except ValueError:
                if default_value is not nodefault:
                    v = default_value
                    default_count += 1
                elif record_bad:
                    bad_count[chosen_slice] += 1
                    bv = badmap[ix // 8]
                    badmap[ix // 8] = bv | (1 << (ix % 8))
                    continue
                else:
                    raise Exception("Invalid value %r with no default in %s" % (v, colname))
            if do_minmax and not isinstance(v, NoneType):
                if col_min is None:
                    col_min = col_max = v
                if v < col_min:
                    col_min = v
                if v > col_max:
                    col_max = v
            write(v)
        for fh in fhs:
            fh.close()
        if vars.rehashing:
            slicemap.close()
        if options.filter_bad:
            badmap.close()
        vars.res_bad_count[dest_colname] = bad_count
        vars.res_default_count[dest_colname] = default_count
        vars.res_minmax[dest_colname] = [col_min, col_max]
    return real_coltype
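# Illustrative sketch (not part of the original module): the badmap used by
# one_column is a bitmap over row indices, shared between the two laps via
# mmap. Bit ix lives in byte ix // 8 at bit position ix % 8. The helper names
# below are hypothetical; they only restate that addressing.
def _example_mark_bad(badmap, ix):
    # What record_bad does when pyfunc raises ValueError and there is no default.
    badmap[ix // 8] |= 1 << (ix % 8)

def _example_is_bad(badmap, ix):
    # What skip_bad checks on the second lap before discarding the row.
    return bool(badmap[ix // 8] & (1 << (ix % 8)))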
def synthesis(job, slices):
    # All the representations we want to verify.
    values = [
        # 1 byte values
        [i, '=B', i + 128 + 5] for i in range(-5, 123)
    ] + [
        # 3 byte values
        [-6, '=bh', 2, -6],
        [123, '=bh', 2, 123],
        [-0x8000, '=bh', 2, -0x8000],
        [0x7fff, '=bh', 2, 0x7fff],
        # 5 byte values
        [-0x8001, '=bi', 4, -0x8001],
        [0x8000, '=bi', 4, 0x8000],
        [-0x80000000, '=bi', 4, -0x80000000],
        [0x7fffffff, '=bi', 4, 0x7fffffff],
        # 9 byte values
        [-0x80000001, '=bq', 8, -0x80000001],
        [0x80000000, '=bq', 8, 0x80000000],
        [-0x8000000000000000, '=bq', 8, -0x8000000000000000],
        [0x7fffffffffffffff, '=bq', 8, 0x7fffffffffffffff],
        # special values
        [None, '=b', 0],
        [0.1, '=bd', 1, 0.1],
    ]
    # Verify each value through a manual typed_writer.
    # Also write to a dataset, a csv and a value2bytes dict.
    value2bytes = {}
    dw = job.datasetwriter()
    dw.add('num', 'number', none_support=True)
    write = dw.get_split_write()
    with job.open('data.csv', 'wt') as csv_fh:
        csv_fh.write('num\n')
        for v in values:
            value = v[0]
            write(value)
            csv_fh.write('%s\n' % (value,))
            want_bytes = struct.pack(*v[1:])
            value2bytes[value] = want_bytes
            with typed_writer('number')('tmp', compression='gzip', none_support=True) as w:
                w.write(value)
            with gzip.open('tmp', 'rb') as fh:
                got_bytes = fh.read()
            assert want_bytes == got_bytes, "%r gave %r, wanted %r" % (value, got_bytes, want_bytes)
    # Make sure we get the same representation through a dataset.
    # Assumes that the column is merged (a single file for all slices).
    ds = dw.finish()
    just_values = set(v[0] for v in values)
    assert set(ds.iterate(None, 'num')) == just_values, "Dataset contains wrong values"
    want_bytes = b''.join(value2bytes[v] for v in ds.iterate(None, 'num'))
    with gzip.open(ds.column_filename('num'), 'rb') as fh:
        got_bytes = fh.read()
    assert want_bytes == got_bytes, "All individual encodings are right, but not in a dataset?"
    # csvimport and dataset_type the same thing,
    # verify we got the same bytes
    jid = build('csvimport', filename=job.filename('data.csv'))
    jid = build('dataset_type', source=jid, column2type={'num': 'number'}, defaults={'num': None})
    with gzip.open(jid.dataset().column_filename('num'), 'rb') as fh:
        got_bytes = fh.read()
    assert want_bytes == got_bytes, "csvimport + dataset_type (%s) gave different bytes" % (jid,)
def synthesis(prepare_res, params, job, slices):
    dws = prepare_res
    for dw in (dws.unhashed_split, dws.up_split):
        w = dw.get_split_write_list()
        for row in all_data:
            w(row)
    hl2ds = {None: [], "up": [], "down": []}
    all_ds = {}
    special_cases = {
        "up_datetime", "down_time",
        "up_date", "down_date",
        "unhashed_bytes", "up_ascii", "down_unicode",
    }
    for name, dw in dws.items():
        ds = dw.finish()
        all_ds[ds.name] = ds
        if ds.name not in special_cases:
            hl2ds[ds.hashlabel].append(ds)

    # Verify that the different ways of writing gave the same result
    for hashlabel in (None, "up", "down"):
        for sliceno in range(slices):
            data = [(ds, list(ds.iterate(sliceno))) for ds in hl2ds[hashlabel]]
            good = data[0][1]
            for name, got in data:
                assert got == good, "%s doesn't match %s in slice %d" % (data[0][0], name, sliceno)

    # Verify that both up and down hashed on the expected column
    hash = typed_writer("int32").hash
    for colname in ("up", "down"):
        ds = all_ds[colname + "_checked"]
        for sliceno in range(slices):
            for value in ds.iterate(sliceno, colname):
                assert hash(int(value)) % slices == sliceno, \
                    "Bad hashing on %s in slice %d" % (colname, sliceno)

    # Verify that up and down are not the same, to catch hashing
    # not actually hashing.
    for up_name, down_name in (
        ("up_checked", "down_checked"),
        ("up_datetime", "down_time"),
        ("up_date", "down_date"),
        ("up_ascii", "down_unicode"),
    ):
        up = cleanup(all_ds[up_name].iterate(None))
        down = cleanup(all_ds[down_name].iterate(None))
        assert up != down, \
            "Hashlabel did not change slice distribution (%s vs %s)" % (up_name, down_name)
        # And check that the data is still the same.
        assert sorted(up) == sorted(down) == all_data, \
            "Hashed datasets have wrong data (%s vs %s)" % (up_name, down_name)

    # Verify that rehashing works.
    # (Can't use sliceno None, because that won't rehash, and even if it did
    # the order wouldn't match. Order doesn't even match in the rehashed
    # individual slices.)
    def test_rehash(want_ds, chk_ds_lst):
        want_ds = all_ds[want_ds]
        for sliceno in range(slices):
            want = sorted(cleanup(want_ds.iterate(sliceno)))
            for chk_ds in chk_ds_lst:
                assert chk_ds.hashlabel != want_ds.hashlabel
                got = chk_ds.iterate(sliceno, hashlabel=want_ds.hashlabel, rehash=True)
                got = sorted(cleanup(got))
                assert want == got, "Rehashing is broken for %s (slice %d of %s)" % (
                    chk_ds.columns[want_ds.hashlabel].type, sliceno, chk_ds)
    test_rehash("up_checked", hl2ds[None] + hl2ds["down"])
    test_rehash("down_checked", hl2ds[None] + hl2ds["up"])
    test_rehash("up_datetime", [all_ds["down_time"]])
    test_rehash("down_time", [all_ds["up_datetime"]])
    test_rehash("down_date", [all_ds["up_date"]])
    test_rehash("up_ascii", [all_ds["unhashed_bytes"], all_ds["down_unicode"]])
    test_rehash("down_unicode", [all_ds["unhashed_bytes"], all_ds["up_ascii"]])

    # And finally verify that we are not allowed to specify the wrong hashlabel
    good = True
    try:
        all_ds["up_checked"].iterate(None, hashlabel="down")
        good = False
    except DatasetUsageError:
        pass
    try:
        all_ds["unhashed_manual"].iterate(None, hashlabel="down")
        good = False
    except DatasetUsageError:
        pass
    assert good, "Iteration allowed on the wrong hashlabel"

    # verify that non-integral floats hash the same in the five types that can have them
    # using + 0.5 is safe for the values we use, it can be exactly represented in 32 bit floats.
    float_data = [v + 0.5 for v, _ in all_data]
    float_ds_lst = []
    for typ in ("float32", "float64", "complex32", "complex64", "number"):
        dw = job.datasetwriter(name="floattest_" + typ, columns={"value": typ}, hashlabel="value")
        write = dw.get_split_write()
        for v in float_data:
            write(v)
        float_ds_lst.append(dw.finish())
    for sliceno in range(slices):
        values = [(ds, list(ds.iterate(sliceno, "value"))) for ds in float_ds_lst]
        want_ds, want = values.pop()
        for ds, got in values:
            assert got == want, "%s did not match %s in slice %d" % (ds, want_ds, sliceno)