def analysis(sliceno, prepare_res, params):
	dws = prepare_res
	dws.down_discarded.enable_hash_discard()
	dws.down_discarded_list.enable_hash_discard()
	dws.down_discarded_dict.enable_hash_discard()
	dws.up_datetime.enable_hash_discard()
	dws.down_time.enable_hash_discard()
	dws.up_ascii.enable_hash_discard()
	dws.down_unicode.enable_hash_discard()
	dws.up_date.enable_hash_discard()
	dws.down_date.enable_hash_discard()
	for ix, (up, down) in enumerate(all_data):
		if dws.up_checked.hashcheck(up):
			dws.up_checked.write(up, down)
		if dws.down_checked.hashcheck(down):
			dws.down_checked.write(up, down)
		if ix % params.slices == sliceno:
			dws.unhashed_manual.write(up, down)
			dws.unhashed_complex64.write(up, down)
			dws.unhashed_bytes.write(str(up).encode("ascii"), str(down).encode("ascii"))
		dws.down_discarded.write(up, down)
		dws.down_discarded_list.write_list([up, down])
		dws.down_discarded_dict.write_dict(dict(up=up, down=down))
		dt_up = datetime(1970, 1, 1, 0, 0, 0, up)
		dt_down = datetime(1970, 1, 1, 0, 0, 0, down)
		dws.up_datetime.write(dt_up, dt_down)
		dws.down_time.write(dt_up.time(), dt_down.time())
		dws.up_date.write(date.fromordinal(up + 1), date.fromordinal(down + 1))
		dws.down_date.write(date.fromordinal(up + 1), date.fromordinal(down + 1))
		dws.up_ascii.write(str(up), str(down))
		dws.down_unicode.write(unicode(up), unicode(down))
	# verify that we are not allowed to write in the wrong slice without enable_hash_discard
	if not dws.up_checked.hashcheck(0):
		good = True
		for fn, a in (
			("write", (0, 0,)),
			("write_list", ([0, 0],)),
			("write_dict", (dict(up=0, down=0),)),
		):
			try:
				getattr(dws.up_checked, fn)(*a)
				good = False
			except Exception:
				pass
			assert good, "%s allowed writing in wrong slice" % (fn,)

def show(lineno, items):
	if only_matching == 'part':
		items = [filter_item(unicode(item)) for item in items]
	if only_matching == 'columns':
		d = {k: v for k, v in zip(used_columns, items) if filter_item(unicode(v))}
	else:
		d = dict(zip(used_columns, items))
	if args.show_lineno:
		prefix['lineno'] = lineno
	if prefix:
		prefix['data'] = d
		d = prefix
	return dumps(d).encode('utf-8', 'surrogatepass') + b'\n'

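# Added note (hedged sketch, not from the original source): with JSON output each
# matching line is emitted as one JSON object per line. Based on the code above,
# the shape is {"<column>": <value>, ...} when no prefix fields are requested, and
# {"dataset": ..., "sliceno": ..., "lineno": N, "data": {"<column>": <value>, ...}}
# when the dataset/sliceno/lineno prefixes are enabled.
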
def analysis(sliceno, prepare_res):
	writers, columns, chain = prepare_res
	key_it = chain.iterate(sliceno, options.column)
	# we can't just use chain.iterate because of protections against changing types with copy_mode
	values_it = itertools.chain.from_iterable(
		ds.iterate(sliceno, columns, copy_mode=True, status_reporting=False)
		for ds in chain
	)
	for key, values in izip(key_it, values_it):
		writers[unicode(key)].write(*values)

def quote(s):
	"""Quote s unless it looks fine without"""
	s = unicode(s)
	r = repr(s)
	if PY2:
		# remove leading u
		r = r[1:]
	if s and len(s) + 2 == len(r) and not any(c.isspace() for c in s):
		return s
	else:
		return r

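# Added note (hedged, derived from quote() above, assuming Python 3 where PY2 is False):
#   quote('plain')      -> 'plain'        repr only adds the two quotes, so s is fine as-is
#   quote('has space')  -> "'has space'"  whitespace forces repr-quoting
#   quote('')           -> "''"           the empty string is always repr-quoted
#   quote(42)           -> '42'           non-strings are first converted with unicode()
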
def show():
	data = list(prefix)
	if args.show_lineno:
		data.append(unicode(lineno))
	if PY2:
		show_items = (v if isinstance(v, unicode) else str(v).decode('utf-8', 'replace') for v in items)
	else:
		show_items = map(str, items)
	show_items = list(show_items)
	lens = (len(item) for item in data + show_items)
	if highlight_matches:
		show_items = list(map(colour_item, show_items))
	if escape_item:
		lens_unesc = (len(item) for item in data + show_items)
		show_items = list(map(escape_item, show_items))
		lens_esc = (len(item) for item in data + show_items)
		lens = (l + esc - unesc for l, unesc, esc in zip(lens, lens_unesc, lens_esc))
	data.extend(show_items)
	return separate(data, lens).encode('utf-8', errors)

def show(lineno, items):
	data = list(prefix)
	if args.show_lineno:
		data.append(unicode(lineno))
	show_items = map(unicode, items)
	if only_matching:
		if only_matching == 'columns':
			show_items = (item if filter_item(item) else '' for item in show_items)
		else:
			show_items = map(filter_item, show_items)
	show_items = list(show_items)
	lens = (len(item) for item in data + show_items)
	if highlight_matches:
		show_items = list(map(colour_item, show_items))
	if escape_item:
		lens_unesc = (len(item) for item in data + show_items)
		show_items = list(map(escape_item, show_items))
		lens_esc = (len(item) for item in data + show_items)
		lens = (l + esc - unesc for l, unesc, esc in zip(lens, lens_unesc, lens_esc))
	data.extend(show_items)
	return separate(data, lens).encode('utf-8', errors) + b'\n'

def synthesis(job):
	# Test keeping untyped columns of bytes-like types.
	dw = job.datasetwriter(name='a', columns={'a': 'unicode', 'b': 'bytes', 'c': 'ascii', 'd': 'number'})
	write = dw.get_split_write()
	write('a', b'b', 'c', 0)
	a = dw.finish()
	assert a.hashlabel == None
	typed_a = subjobs.build('dataset_type', options=dict(hashlabel='a', column2type={'a': 'ascii'}), datasets=dict(source=a)).dataset()
	assert typed_a.hashlabel == 'a'
	assert list(typed_a.iterate(None)) == [('a', b'b', 'c')], typed_a

	# Test hashing on a column not explicitly typed.
	dw = job.datasetwriter(name='b', columns={'a': 'unicode', 'b': 'ascii', 'c': 'bytes', 'd': 'unicode'}, previous=a)
	write = dw.get_split_write()
	write('A', 'B', b'C', '1')
	b = dw.finish()
	assert b.hashlabel == None
	typed_b = subjobs.build('dataset_type', options=dict(hashlabel='a', column2type={'b': 'ascii'}), datasets=dict(source=b)).dataset()
	assert typed_b.hashlabel == 'a'
	assert set(typed_b.iterate(None)) == {('a', 'b'), ('A', 'B')}, typed_b

	# Test renaming over the original hashlabel
	dw = job.datasetwriter(name='c', columns={'a': 'unicode', 'b': 'ascii', 'c': 'bytes', 'd': 'unicode'}, hashlabel='a')
	write = dw.get_split_write()
	write('\xe5', 'b', b'c', '0')
	c = dw.finish()
	assert c.hashlabel == 'a'
	typed_c = subjobs.build('dataset_type', options=dict(column2type={'a': 'ascii', 'd': 'number'}, rename={'c': 'a'}), datasets=dict(source=c)).dataset()
	assert typed_c.hashlabel == None
	assert list(typed_c.iterate(None)) == [('c', 'b', b'c', 0)], typed_c

	# Test using the original names but for different columns (keeping hashlabel under new name)
	dw = job.datasetwriter(name='d', columns={'a': 'unicode', 'b': 'ascii', 'c': 'bytes', 'd': 'unicode'}, hashlabel='a')
	write = dw.get_split_write()
	write('\xc5', 'B', b'C', '1')
	d = dw.finish()
	assert d.hashlabel == 'a'
	typed_d = subjobs.build('dataset_type', options=dict(column2type={'a': 'bytes', 'b': 'ascii', 'c': 'int32_10', 'd': 'bytes'}, rename={'b': 'a', 'c': 'b', 'd': 'c', 'a': 'd'}), datasets=dict(source=d)).dataset()
	assert typed_d.hashlabel == 'd'
	assert list(typed_d.iterate(None)) == [(b'B', 'C', 1, b'\xc3\x85')], typed_d

	# Test various types for hashing and discarding of bad lines.
	for hl in (None, 'a', 'b', 'c'):
		dw = job.datasetwriter(name='hashed on %s' % (hl,), columns={'a': 'unicode', 'b': 'unicode', 'c': 'unicode'}, hashlabel=hl)
		w = dw.get_split_write()
		for ix in range(1000):
			w(unicode(ix), '%d.%d' % (ix, ix % 5 == 0), ('{"a": %s}' if ix % 3 else '%d is bad') % (ix,))
		src_ds = dw.finish()
		assert src_ds.hashlabel == hl
		test(src_ds, dict(column2type={'a': 'int32_10', 'b': 'number:int'}, filter_bad=True), 800)
		test(src_ds, dict(column2type={'a': 'int64_10', 'b': 'number', 'c': 'json'}, filter_bad=True), 666)
		test(src_ds, dict(column2type={'a': 'floatint32ei', 'b': 'number:int', 'c': 'json'}, filter_bad=True), 533)
		test(src_ds, dict(column2type={'from_a': 'number', 'from_b': 'float64', 'from_c': 'ascii'}, rename=dict(a='from_a', b='from_b', c='from_c')), 1000)
		test(src_ds, dict(column2type={'c': 'bits32_16', 'a': 'float32', 'b': 'bytes'}, rename=dict(a='c', b='a', c='b')), 1000)

	# this doesn't test as many permutations, it's just to test more column types.
	dw = job.datasetwriter(name='more types')
	cols = {
		'floatbooli': cycle(['1.42 or so', '0 maybe', '1 (exactly)']),
		'datetime:%Y%m%d %H:%M': ['2019%02d%02d 17:%02d' % (t % 12 + 1, t % 28 + 1, t % 60) for t in range(1000)],
		'date:%Y%m%d': ['2019%02d%02d' % (t % 12 + 1, t % 28 + 1,) for t in range(1000)],
		'time:%H:%M': ['%02d:%02d' % (t // 60, t % 60) for t in range(1000)],
		'timei:%H:%M': ['%02d:%02d%c' % (t // 60, t % 60, chr(t % 26 + 65)) for t in range(1000)],
	}
	gens = []
	for coltype, gen in cols.items():
		dw.add(coltype.split(':')[0], 'ascii')
		gens.append(iter(gen))
	dw.add('half', 'bytes')
	gens.append(cycle([b'1', b'no']))
	w = dw.get_split_write()
	for _ in range(1000):
		w(*map(next, gens))
	src_ds = dw.finish()
	assert src_ds.hashlabel == None
	column2type = {t.split(':')[0]: t for t in cols}
	for hl in column2type:
		hashed = subjobs.build('dataset_type', options=dict(column2type=column2type, hashlabel=hl), datasets=dict(source=src_ds)).dataset()
		assert hashed.hashlabel == hl
		unhashed = subjobs.build('dataset_type', options=dict(column2type=column2type), datasets=dict(source=src_ds)).dataset()
		assert unhashed.hashlabel == None
		rehashed = subjobs.build('dataset_rehash', options=dict(hashlabel=hl), datasets=dict(source=unhashed)).dataset()
		assert rehashed.hashlabel == hl
		assert hashed.lines == rehashed.lines
		assert sum(hashed.lines) == 1000
		assert set(hashed.columns.keys()) == set(unhashed.columns.keys()) == set(rehashed.columns.keys())
		# and again with a bad column
		column2type['half'] = 'float32'
		hashed = subjobs.build('dataset_type', options=dict(column2type=column2type, hashlabel=hl, filter_bad=True), datasets=dict(source=src_ds)).dataset()
		assert hashed.hashlabel == hl
		unhashed = subjobs.build('dataset_type', options=dict(column2type=column2type, filter_bad=True), datasets=dict(source=src_ds)).dataset()
		assert unhashed.hashlabel == None
		rehashed = subjobs.build('dataset_rehash', options=dict(hashlabel=hl), datasets=dict(source=unhashed)).dataset()
		assert rehashed.hashlabel == hl
		del column2type['half']
		assert hashed.lines == rehashed.lines
		assert sum(hashed.lines) == 500
		assert set(hashed.columns.keys()) == set(unhashed.columns.keys()) == set(rehashed.columns.keys())

def synthesis(job, slices):
	# Test keeping untyped columns.
	dw = job.datasetwriter(name='a', columns={'a': 'unicode', 'b': ('bytes', True), 'c': ('ascii', True), 'd': ('number', True)})
	write = dw.get_split_write()
	write('A', None, None, None)
	write('a', b'b', 'c', 0)
	a = dw.finish()
	assert a.hashlabel == None
	typed_a = subjobs.build('dataset_type', options=dict(hashlabel='a', column2type={'a': 'ascii'}), datasets=dict(source=a)).dataset()
	assert typed_a.hashlabel == 'a'
	assert set(typed_a.iterate(None)) == {('A', None, None, None), ('a', b'b', 'c', 0)}, typed_a

	# Test hashing on a column not explicitly typed.
	dw = job.datasetwriter(name='b', columns={'a': 'unicode', 'b': 'ascii', 'c': 'bytes', 'd': 'unicode'}, previous=a)
	write = dw.get_split_write()
	write('A', 'B', b'C', '1')
	b = dw.finish()
	assert b.hashlabel == None
	typed_b = subjobs.build('dataset_type', options=dict(hashlabel='a', column2type={'b': 'ascii'}), datasets=dict(source=b)).dataset()
	assert typed_b.hashlabel == 'a'
	assert set(typed_b.iterate(None)) == {('a', 'b', b'c'), ('A', None, None), ('A', 'B', b'C')}, typed_b

	# Test renaming over the original hashlabel
	dw = job.datasetwriter(name='c', columns={'a': 'unicode', 'b': 'ascii', 'c': 'bytes', 'd': 'unicode'}, hashlabel='a')
	write = dw.get_split_write()
	write('\xe5', 'b', b'c', '0')
	c = dw.finish()
	assert c.hashlabel == 'a'
	typed_c = subjobs.build('dataset_type', options=dict(column2type={'a': 'ascii', 'd': 'number'}, rename={'c': 'a'}), datasets=dict(source=c)).dataset()
	assert typed_c.hashlabel == None
	assert list(typed_c.iterate(None)) == [('c', 'b', b'c', 0)], typed_c

	# Test using the original names but for different columns (keeping hashlabel under new name)
	dw = job.datasetwriter(name='d', columns={'a': 'unicode', 'b': 'ascii', 'c': 'bytes', 'd': 'unicode'}, hashlabel='a')
	write = dw.get_split_write()
	write('\xc5', 'B', b'C', '1')
	d = dw.finish()
	assert d.hashlabel == 'a'
	typed_d = subjobs.build('dataset_type', options=dict(column2type={'a': 'bytes', 'b': 'ascii', 'c': 'int32_10', 'd': 'bytes'}, rename={'b': 'a', 'c': 'b', 'd': 'c', 'a': 'd'}), datasets=dict(source=d)).dataset()
	assert typed_d.hashlabel == 'd'
	assert list(typed_d.iterate(None)) == [(b'B', 'C', 1, b'\xc3\x85')], typed_d

	# Test various types for hashing and discarding of bad lines.
	for hl in (None, 'a', 'b', 'c'):
		dw = job.datasetwriter(name='hashed on %s' % (hl,), columns={'a': 'unicode', 'b': 'unicode', 'c': 'unicode'}, hashlabel=hl)
		w = dw.get_split_write()
		for ix in range(1000):
			w(unicode(ix), '%d.%d' % (ix, ix % 5 == 0), ('{"a": %s}' if ix % 3 else '%d is bad') % (ix,))
		src_ds = dw.finish()
		assert src_ds.hashlabel == hl
		test(src_ds, dict(column2type={'a': 'int32_10', 'b': 'number:int'}, filter_bad=True), 800)
		test(src_ds, dict(column2type={'a': 'int64_10', 'b': 'number', 'c': 'json'}, filter_bad=True), 666)
		test(src_ds, dict(column2type={'a': 'floatint32ei', 'b': 'number:int', 'c': 'json'}, filter_bad=True), 533)
		test(src_ds, dict(column2type={'from_a': 'number', 'from_b': 'float64', 'from_c': 'ascii'}, rename=dict(a='from_a', b='from_b', c='from_c')), 1000)
		test(src_ds, dict(column2type={'c': 'bits32_16', 'a': 'float32', 'b': 'bytes'}, rename=dict(a='c', b='a', c='b')), 1000)

	# this doesn't test as many permutations, it's just to test more column types.
	dw = job.datasetwriter(name='more types')
	cols = {
		'floatbooli': cycle(['1.42 or so', '0 maybe', '1 (exactly)']),
		'datetime:%Y%m%d %H:%M': ['2019%02d%02d 17:%02d' % (t % 12 + 1, t % 28 + 1, t % 60) for t in range(1000)],
		'date:%Y%m%d': ['2019%02d%02d' % (t % 12 + 1, t % 28 + 1,) for t in range(1000)],
		'time:%H:%M': ['%02d:%02d' % (t // 60, t % 60) for t in range(1000)],
		'timei:%H:%M': ['%02d:%02d%c' % (t // 60, t % 60, chr(t % 26 + 65)) for t in range(1000)],
	}
	gens = []
	for coltype, gen in cols.items():
		dw.add(coltype.split(':')[0], 'ascii')
		gens.append(iter(gen))
	dw.add('half', 'bytes')
	gens.append(cycle([b'1', b'no']))
	w = dw.get_split_write()
	for _ in range(1000):
		w(*map(next, gens))
	src_ds = dw.finish()
	assert src_ds.hashlabel == None
	column2type = {t.split(':')[0]: t for t in cols}
	for hl in column2type:
		hashed = subjobs.build('dataset_type', options=dict(column2type=column2type, hashlabel=hl), datasets=dict(source=src_ds)).dataset()
		assert hashed.hashlabel == hl
		unhashed = subjobs.build('dataset_type', options=dict(column2type=column2type), datasets=dict(source=src_ds)).dataset()
		assert unhashed.hashlabel == None
		rehashed = subjobs.build('dataset_hashpart', options=dict(hashlabel=hl), datasets=dict(source=unhashed)).dataset()
		assert rehashed.hashlabel == hl
		assert hashed.lines == rehashed.lines
		assert sum(hashed.lines) == 1000
		assert set(hashed.columns.keys()) == set(unhashed.columns.keys()) == set(rehashed.columns.keys())
		# and again with a bad column
		column2type['half'] = 'float32'
		hashed = subjobs.build('dataset_type', options=dict(column2type=column2type, hashlabel=hl, filter_bad=True), datasets=dict(source=src_ds)).dataset()
		assert hashed.hashlabel == hl
		unhashed = subjobs.build('dataset_type', options=dict(column2type=column2type, filter_bad=True), datasets=dict(source=src_ds)).dataset()
		assert unhashed.hashlabel == None
		rehashed = subjobs.build('dataset_hashpart', options=dict(hashlabel=hl), datasets=dict(source=unhashed)).dataset()
		assert rehashed.hashlabel == hl
		del column2type['half']
		assert hashed.lines == rehashed.lines
		assert sum(hashed.lines) == 500
		assert set(hashed.columns.keys()) == set(unhashed.columns.keys()) == set(rehashed.columns.keys())

	# test rehashing on a column we don't type, over all types.
	dw = job.datasetwriter(name='rehash all types', columns={
		'2type': ('ascii', True),
		'ascii': ('ascii', True),
		'bits32': ('bits32', False),
		'bits64': ('bits64', False),
		'bool': ('bool', True),
		'bytes': ('bytes', True),
		'date': ('date', True),
		'datetime': ('datetime', True),
		'float32': ('float32', True),
		'float64': ('float64', True),
		'int32': ('int32', True),
		'int64': ('int64', True),
		'json': ('json', True),
		'number': ('number', True),
		'time': ('time', True),
		'unicode': ('unicode', True),
	})
	write = dw.get_split_write()
	data = {
		'42': ('ascii string', 100, 1000, True, b'bytes string', date(2019, 12, 11), datetime(2019, 12, 11, 20, 7, 21), 1.5, 0.00000001, 99, -11, {"a": "b"}, 1e100, time(20, 7, 21), 'unicode string'),
		None: (None, 0, 0, None, None, None, None, None, None, None, None, None, None, None, None),
		'18': ('ASCII STRING', 111, 1111, False, b'BYTES STRING', date(1868, 1, 3), datetime(1868, 1, 3, 13, 14, 5), 2.5, -0.0000001, 67, -99, [42, ".."], 5e100, time(13, 14, 5), 'UNICODE STRING'),
	}
	write('42', *data['42'])
	write(None, *data[None])
	write('18', *data['18'])
	src_ds = dw.finish()
	data['None'] = data.pop(None)
	type2type = {
		'ascii': 'unicode:ascii',
		'bool': 'unicode:ascii',
		'date': 'unicode:ascii',
		'datetime': 'unicode:ascii',
		'time': 'unicode:ascii',
		'bits32': 'bits32_10',
		'bits64': 'bits64_10',
		'bytes': 'bytes',
		'float32': 'float32',
		'float64': 'float64',
		'int32': 'int32_10',
		'int64': 'int64_10',
		'number': 'number',
		'unicode': 'unicode:ascii',
	}
	for hl, typeas in sorted(type2type.items()):
		ds = subjobs.build('dataset_type', column2type={'2type': typeas}, hashlabel=hl, source=src_ds).dataset()
		seen = set()
		hl_hash = typed_writer(hl).hash
		for sliceno in range(slices):
			for line in ds.iterate(sliceno, None):
				key = line[0] or None
				if isinstance(key, float):
					key = int(key)
				if isinstance(key, bytes):
					key = key.decode('ascii')
				else:
					key = unicode(key)
				assert data.get(key) == line[1:], "%s (hl %s) didn't have the right data for line %r" % (ds, hl, line[0],)
				hv = line[sorted(src_ds.columns).index(hl)]
				assert hl_hash(hv) % slices == sliceno, "%s (hl %s) didn't hash %r correctly" % (ds, hl, hv,)
				assert key not in seen, "%s (hl %s) repeated line %s" % (ds, hl, line[0],)
				seen.add(key)
		assert seen == {'42', 'None', '18'}, "%s didn't have all lines (%r)" % (ds, seen,)

	def __reduce__(self):
		return unicode, (unicode(self),)

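	# Added note (hedged): returning (unicode, (unicode(self),)) makes pickle and
	# copy.copy() recreate the value as a plain unicode/str object rather than as
	# an instance of this (presumably str-subclassing) class.
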
def grep(ds, sliceno, out):
	out.start(ds)
	if len(patterns) == 1:
		chk = patterns[0].search
	else:
		def chk(s):
			return any(p.search(s) for p in patterns)
	first = [True]
	def mk_iter(col):
		kw = {}
		if first[0]:
			first[0] = False
			lines = ds.lines[sliceno]
			if lines > status_interval[sliceno]:
				def cb(n):
					q_status.put((sliceno, False))
					out.excite()
				kw['callback'] = cb
				kw['callback_interval'] = status_interval[sliceno]
		if ds.columns[col].type == 'ascii':
			kw['_type'] = 'unicode'
		it = ds._column_iterator(sliceno, col, **kw)
		if ds.columns[col].type == 'bytes':
			errors = 'replace' if PY2 else 'surrogateescape'
			if ds.columns[col].none_support:
				it = (None if v is None else v.decode('utf-8', errors) for v in it)
			else:
				it = (v.decode('utf-8', errors) for v in it)
		return it
	used_columns = columns_for_ds(ds)
	used_grep_columns = grep_columns and columns_for_ds(ds, grep_columns)
	if grep_columns and set(used_grep_columns) != set(used_columns):
		grep_iter = izip(*(mk_iter(col) for col in used_grep_columns))
	else:
		grep_iter = repeat(None)
	lines_iter = izip(*(mk_iter(col) for col in used_columns))
	if args.before_context:
		before = deque((), args.before_context)
	else:
		before = None
	if args.format == 'json':
		prefix = {}
		if args.show_dataset:
			prefix['dataset'] = ds
		if args.show_sliceno:
			prefix['sliceno'] = sliceno
		show = make_show(prefix, used_columns)
	else:
		prefix = []
		if args.show_dataset:
			prefix.append(ds)
		if args.show_sliceno:
			prefix.append(str(sliceno))
		prefix = tuple(prefix)
		show = make_show(prefix, used_columns)
	if args.invert_match:
		maybe_invert = operator.not_
	else:
		maybe_invert = bool
	to_show = 0
	for lineno, (grep_items, items) in enumerate(izip(grep_iter, lines_iter)):
		if maybe_invert(any(chk(unicode(item)) for item in grep_items or items)):
			if q_list:
				q_list.put((ds, sliceno))
				return
			while before:
				out.put(show(*before.popleft()))
			to_show = 1 + args.after_context
		if to_show:
			out.put(show(lineno, items))
			to_show -= 1
		elif before is not None:
			before.append((lineno, items))
	out.end(ds)

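# Reading note (added, hedged): in grep() above, the bounded deque implements
# grep-style before-context, the to_show counter implements after-context, and
# maybe_invert applies invert-match by negating the per-line match test.
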
def synthesis(job):
	def mk(name, types, lines, hashlabel=None, previous=None):
		columns = {chr(ix): typ for ix, typ in enumerate(types, 65)}
		dw = job.datasetwriter(name=name, columns=columns, hashlabel=hashlabel, previous=previous)
		w = dw.get_split_write_list()
		for line in lines:
			w(line)
		return dw.finish()

	def chk(job, colnames, types, ds2lines, previous={}, hashlabel=None):
		have_ds = set(ds.name for ds in job.datasets)
		want_ds = set(ds2lines)
		assert have_ds == want_ds, 'Job %r should have had datasets %r but had %r' % (job, want_ds, have_ds,)
		colnames = sorted(colnames)
		for ds, lines in ds2lines.items():
			ds = job.dataset(ds)
			assert ds.hashlabel == hashlabel, 'Dataset %s should have had hashlabel %s but had %s' % (ds.quoted, hashlabel, ds.hashlabel,)
			assert ds.previous == previous.get(ds.name), 'Dataset %s should have had previous %s but had %s' % (ds.quoted, previous.get(ds.name), ds.previous,)
			ds_colnames = sorted(ds.columns)
			assert ds_colnames == colnames, 'Dataset %s should have had columns %r but had %r' % (ds.quoted, colnames, ds_colnames,)
			ds_types = tuple(col.type for _, col in sorted(ds.columns.items()))
			assert ds_types == types, 'Dataset %s should have had columns with types %r but had %r' % (ds.quoted, types, ds_types,)
			have_lines = sorted(ds.iterate(None))
			want_lines = sorted(lines)
			assert have_lines == want_lines, 'Dataset %s should have contained %r but contained %r' % (ds.quoted, want_lines, have_lines,)

	# just a simple splitting
	a = mk('a', ('unicode', 'ascii', 'int64'), [('a', 'a', 1), ('b', 'b', 2), ('a', 'c', 3)], hashlabel='A')
	j_a_A = subjobs.build('dataset_fanout', source=a, column='A')
	chk(j_a_A, 'BC', ('ascii', 'int64'), {'a': [('a', 1), ('c', 3)], 'b': [('b', 2)]})
	j_a_B = subjobs.build('dataset_fanout', source=a, column='B')
	chk(j_a_B, 'AC', ('unicode', 'int64'), {'a': [('a', 1)], 'b': [('b', 2)], 'c': [('a', 3)]}, hashlabel='A')
	# non-text columns should work too
	j_a_C = subjobs.build('dataset_fanout', source=a, column='C')
	chk(j_a_C, 'AB', ('unicode', 'ascii'), {'1': [('a', 'a')], '2': [('b', 'b')], '3': [('a', 'c')]}, hashlabel='A')

	b = mk('b', ('ascii', 'unicode', 'int32', 'int32'), [('a', 'aa', 11, 111), ('b', 'bb', 12, 112), ('a', 'cc', 13, 113), ('d', 'dd', 14, 114)], previous=a)
	# with previous
	j_b_A = subjobs.build('dataset_fanout', source=b, column='A', previous=j_a_A)
	chk(
		j_b_A, 'BCD', ('unicode', 'int32', 'int32'),
		{'a': [('aa', 11, 111), ('cc', 13, 113)], 'b': [('bb', 12, 112)], 'd': [('dd', 14, 114)]},
		previous={'a': j_a_A.dataset('a'), 'b': j_a_A.dataset('b')},
	)
	# without previous, but only getting the data from b because of length=1
	j_b_A_len1 = subjobs.build('dataset_fanout', source=b, column='A', length=1)
	chk(
		j_b_A_len1, 'BCD', ('unicode', 'int32', 'int32'),
		{'a': [('aa', 11, 111), ('cc', 13, 113)], 'b': [('bb', 12, 112)], 'd': [('dd', 14, 114)]},
	)
	# with "wrong" previous, inheriting some empty datasets.
	j_b_A_C = subjobs.build('dataset_fanout', source=b, column='A', previous=j_a_C)
	chk(
		j_b_A_C, 'BCD', ('unicode', 'int32', 'int32'),
		{'a': [('aa', 11, 111), ('cc', 13, 113)], 'b': [('bb', 12, 112)], 'd': [('dd', 14, 114)], '1': [], '2': [], '3': []},
		previous={'1': j_a_C.dataset('1'), '2': j_a_C.dataset('2'), '3': j_a_C.dataset('3')},
	)
	# without previous, getting data from both a and b and the "widest" type for the columns.
	# (discards the D column since it doesn't exist in a.)
	j_b_A_None = subjobs.build('dataset_fanout', source=b, column='A')
	chk(
		j_b_A_None, 'BC', ('unicode', 'int64'),
		{'a': [('a', 1), ('aa', 11), ('c', 3), ('cc', 13)], 'b': [('b', 2), ('bb', 12)], 'd': [('dd', 14)]},
	)

	# test more type combinations, and switching hashlabel (to an included column)
	tt_a = mk(
		'tt_a',
		('ascii', 'int32', 'bits64', 'float32', 'number', 'complex32', 'number'),
		[('a', 1, 2, 2.5, 3, 1+2j, 3.14)],
		hashlabel='B',
	)
	tt_b = mk(
		'tt_b',
		('ascii', 'int64', 'bits32', 'float64', 'int32', 'complex64', 'float64'),
		[('a', 11, 12, 12.5, 13, 11+2j, 13.14)],
		hashlabel='B',
		previous=tt_a,
	)
	tt_c = mk(
		'tt_c',
		('ascii', 'int32', 'bits64', 'int64', 'float64', 'complex32', 'float32'),
		[('a', 111, 112, 112, 113.5, 111+2j, 314.0), ('b', 0, 0, 0, 0, 0, 0)],
		hashlabel='C',
		previous=tt_b,
	)
	# first two, some type changes
	j_tt_b = subjobs.build('dataset_fanout', source=tt_b, column='A')
	chk(
		j_tt_b, 'BCDEFG', ('int64', 'bits64', 'float64', 'number', 'complex64', 'number'),
		{'a': [(1, 2, 2.5, 3, 1+2j, 3.14), (11, 12, 12.5, 13, 11+2j, 13.14)]},
		hashlabel='B',
	)
	# all three in one, more types become number
	j_tt_c = subjobs.build('dataset_fanout', source=tt_c, column='A')
	chk(
		j_tt_c, 'BCDEFG', ('int64', 'bits64', 'number', 'number', 'complex64', 'number'),
		{'a': [(1, 2, 2.5, 3, 1+2j, 3.14), (11, 12, 12.5, 13, 11+2j, 13.14), (111, 112, 112, 113.5, 111+2j, 314.0)], 'b': [(0, 0, 0, 0, 0, 0)]},
		hashlabel=None,
	)
	# just two (checking that earlier types are not considered)
	j_tt_c_len2 = subjobs.build('dataset_fanout', source=tt_c, column='A', length=2)
	chk(
		j_tt_c_len2, 'BCDEFG', ('int64', 'bits64', 'number', 'number', 'complex64', 'float64'),
		{'a': [(11, 12, 12.5, 13, 11+2j, 13.14), (111, 112, 112, 113.5, 111+2j, 314.0)], 'b': [(0, 0, 0, 0, 0, 0)]},
		hashlabel=None,
	)
	# using previous to only get one source dataset, again checking that earlier
	# types are not considered and that only a gets a previous (and b doesn't)
	j_tt_c_b = subjobs.build('dataset_fanout', source=tt_c, column='A', previous=j_tt_b)
	chk(
		j_tt_c_b, 'BCDEFG', ('int32', 'bits64', 'int64', 'float64', 'complex32', 'float32'),
		{'a': [(111, 112, 112, 113.5, 111+2j, 314.0)], 'b': [(0, 0, 0, 0, 0, 0)]},
		hashlabel='C',
		previous={'a': j_tt_b.dataset('a')},
	)

	# it generally works, let's make an exhaustive test of compatible types
	# (to check that the values actually are compatible)
	previous = None
	all_types = []
	want_data = []
	for ix, types in enumerate(zip(
		cycle(['ascii']),  # this is the split column
		['bits32', 'bits64', 'int32', 'int64', 'float32', 'float64', 'number'],
		cycle(['bits64', 'bits32']),
		cycle(['complex64', 'complex32']),
		cycle(['float64', 'float32']),
		cycle(['int64', 'bits32', 'int32']),
		cycle(['unicode', 'ascii']),
	)):
		data = [('data',) + (ix + 1000,) * 5 + (unicode(ix),)]
		want_data.append(data[0][1:])
		all_types.append(mk('all types %d' % (ix,), types, data, previous=previous))
		previous = all_types[-1]
	j_all = subjobs.build('dataset_fanout', source=all_types[-1], column='A')
	chk(
		j_all, 'BCDEFG', ('number', 'bits64', 'complex64', 'float64', 'int64', 'unicode'),
		{'data': want_data},
	)
	# the B column doesn't have number any more here, but should still become number.
	j_all_except_number = subjobs.build('dataset_fanout', source=all_types[-2], column='A')
	chk(
		j_all_except_number, 'BCDEFG', ('number', 'bits64', 'complex64', 'float64', 'int64', 'unicode'),
		{'data': want_data[:-1]},
	)