def mkds(name, columns, data, **kw):
    columns = dict.fromkeys(columns, 'int32')
    dw = DatasetWriter(name=name, columns=columns, **kw)
    write = dw.get_split_write()
    for v in data:
        write(*v)
    return dw.finish()
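# A hedged usage sketch (added for illustration, not from the original tests;
# the dataset name and values are hypothetical). mkds types every column as
# 'int32', so each row is a tuple of ints in column order:
#
#   ds = mkds("small", ["a", "b"], [(1, 2), (3, 4)])
#   assert set(ds.columns) == {"a", "b"}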
def prepare(params):
    d = datasets.source
    caption = options.caption % dict(caption=d.caption, hashlabel=options.hashlabel)
    if len(d.chain(stop_ds={datasets.previous: 'source'}, length=options.length)) == 1:
        filename = d.filename
    else:
        filename = None
    dws = []
    previous = datasets.previous
    for sliceno in range(params.slices):
        if options.as_chain and sliceno == params.slices - 1:
            name = "default"
        else:
            name = str(sliceno)
        dw = DatasetWriter(
            caption="%s (slice %d)" % (caption, sliceno),
            hashlabel=options.hashlabel,
            filename=filename,
            previous=previous,
            name=name,
            for_single_slice=sliceno,
        )
        previous = (params.jobid, name)
        dws.append(dw)
    names = []
    for n, c in d.columns.items():
        # names has to be in the same order as the add calls
        # so the iterator returns the same order the writer expects.
        names.append(n)
        for dw in dws:
            dw.add(n, c.type)
    return dws, names, caption, filename
def prepare():
    columns = dict(
        bytes="bytes",
        float="float64",
        int="int64",
        json="json",
        unicode="unicode",
    )
    a = DatasetWriter(name="a", columns=columns)
    b = DatasetWriter(name="b", columns=columns, previous=a)
    c = DatasetWriter(name="c", columns=columns)
    return a, b, c
def test_filter_bad_across_types():
    columns = {
        'bytes': 'bytes',
        'float64': 'bytes',
        'int32_10': 'ascii',
        'json': 'unicode',
        'number:int': 'unicode',
        'unicode:utf-8': 'bytes',
    }
    # all_good, *values
    # Make sure all those types (except bytes) can filter other lines,
    # and be filtered by other lines. And that several filtering values
    # is not a problem (line 11).
    data = [
        (True, b'first', b'1.1', '1', '"a"', '001', b'ett',),
        (True, b'second', b'2.2', '2', '"b"', '02', b'tv\xc3\xa5',),
        (True, b'third', b'3.3', '3', '["c"]', '3.0', b'tre',),
        (False, b'fourth', b'4.4', '4', '"d"', '4.4', b'fyra',),      # number:int bad
        (False, b'fifth', b'5.5', '-', '"e"', '5', b'fem',),          # int32_10 bad
        (False, b'sixth', b'6.b', '6', '"f"', '6', b'sex',),          # float64 bad
        [False, b'seventh', b'7.7', '7', '{"g"}', '7', b'sju',],      # json bad
        (False, b'eigth', b'8.8', '8', '"h"', '8', b'\xa5\xc3tta',),  # unicode:utf-8 bad
        (True, b'ninth', b'9.9', '9', '"i"', '9', b'nio',),
        (True, b'tenth', b'10', '10', '"j"', '10', b'tio',),
        (False, b'eleventh', b'11a', '1-', '"k",', '1,', b'elva',),   # float64, int32_10 and number:int bad
        (True, b'twelfth', b'12', '12', '"l"', '12', b'tolv',),
    ]
    dw = DatasetWriter(name="filter bad across types", columns=columns)
    dw.set_slice(0)
    want = []
    def add_want(v):
        want.append((int(v[3]), v[1], json.loads(v[4]), v[6].decode('utf-8'),))
    for v in data:
        if v[0]:
            add_want(v)
        dw.write(*v[1:])
    for sliceno in range(1, g.slices):
        dw.set_slice(sliceno)
    source_ds = dw.finish()
    # Once with just filter_bad, once with some defaults too.
    defaults = {}
    for _ in range(2):
        jid = subjobs.build(
            'dataset_type',
            datasets=dict(source=source_ds),
            options=dict(column2type={t: t for t in columns}, filter_bad=True, defaults=defaults),
        )
        typed_ds = Dataset(jid)
        got = list(typed_ds.iterate(0, ['int32_10', 'bytes', 'json', 'unicode:utf-8']))
        assert got == want, "Expected %r, got %r from %s (from %r%s)" % (want, got, typed_ds, source_ds, ' with defaults' if defaults else '')
        # make more lines "ok" for the second lap
        defaults = {'number:int': '0', 'float64': '0', 'json': '"replacement"'}
        add_want(data[3])
        add_want(data[5])
        data[6][4] = '"replacement"'
        add_want(data[6])
        want.sort()  # adding them out of order, int32_10 sorts correctly.
def make_source(names):
    names = sorted(names)
    dsname = '+'.join(names)
    if dsname not in sources:
        dw = DatasetWriter(name=dsname, columns={'v': 'ascii'})
        write = dw.get_split_write()
        for name in names:
            for value in data[name][0]:
                write(value)
        sources[dsname] = (
            dw.finish(),
            min(unnan(data[name][1] for name in names)),
            max(unnan(data[name][2] for name in names)),
        )
    return sources[dsname]
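# Hedged usage sketch (group names are hypothetical): make_source memoizes on
# the sorted, '+'-joined names, so the same combination of groups (in any
# order) returns the same (dataset, min, max) tuple:
#
#   ds, mn, mx = make_source(["ints", "floats"])
#   assert make_source(["floats", "ints"])[0] is ds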
def datasetwriter(self, columns={}, filename=None, hashlabel=None, hashlabel_override=False, caption=None, previous=None, name='default', parent=None, meta_only=False, for_single_slice=None, copy_mode=False, allow_missing_slices=False):
    from accelerator.dataset import DatasetWriter
    return DatasetWriter(
        columns=columns, filename=filename, hashlabel=hashlabel,
        hashlabel_override=hashlabel_override, caption=caption, previous=previous,
        name=name, parent=parent, meta_only=meta_only,
        for_single_slice=for_single_slice, copy_mode=copy_mode,
        allow_missing_slices=allow_missing_slices)
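# Hedged usage sketch (not from the original source; names are hypothetical):
# job.datasetwriter() is a thin convenience wrapper around DatasetWriter, so
# inside a method these two writers behave the same:
#
#   dw_a = job.datasetwriter(name="a", columns={"v": "ascii"})
#   dw_b = DatasetWriter(name="b", columns={"v": "ascii"})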
def analysis(sliceno, prepare_res, job):
    dw_default = DatasetWriter()
    dw_named = job.datasetwriter(name='named')
    dw_passed, _ = prepare_res
    for name, dw in [('default', dw_default), ('named', dw_named), ('passed', dw_passed)]:
        for data in jobs.source.dataset(name).iterate(sliceno, copy_mode=True):
            dw.write(*data)
def synthesis(prepare_res, params):
    if not options.as_chain:
        # If we don't want a chain we abuse our knowledge of dataset internals
        # to avoid recompressing. Don't do this stuff yourself.
        dws, names, caption, filename = prepare_res
        merged_dw = DatasetWriter(
            caption=caption,
            hashlabel=options.hashlabel,
            filename=filename,
            previous=datasets.previous,
            meta_only=True,
            columns=datasets.source.columns,
        )
        for sliceno in range(params.slices):
            merged_dw.set_lines(sliceno, sum(dw._lens[sliceno] for dw in dws))
            for dwno, dw in enumerate(dws):
                merged_dw.set_minmax((sliceno, dwno), dw._minmax[sliceno])
            for n in names:
                fn = merged_dw.column_filename(n, sliceno=sliceno)
                with open(fn, "wb") as out_fh:
                    for dw in dws:
                        fn = dw.column_filename(n, sliceno=sliceno)
                        with open(fn, "rb") as in_fh:
                            copyfileobj(in_fh, out_fh)
        for dw in dws:
            dw.discard()
def prepare(job, slices):
    assert slices >= test_data.value_cnt
    dw_default = DatasetWriter()
    dw_default.add("a", "number")
    dw_default.add("b", "ascii")
    DatasetWriter(name="named", columns={"c": "bool", "d": "date"})
    dw_passed = job.datasetwriter(name="passed", columns=test_data.columns)
    return dw_passed, 42
def prepare():
    columns = dict(
        ascii="ascii",
        bytes="bytes",
        bytes_none=("bytes", True),
        float="float64",
        int="int64",
        json="json",
        unicode="unicode",
        unicode_none=("unicode", True),
    )
    if PY3:
        # z so it sorts last
        columns['zpickle'] = 'pickle'
        for ix, v in enumerate(test_data):
            test_data[ix] = v + ([ix, 'line %d' % (ix,), {'line': ix}, 42],)
        test_data[-1][-1][-1] = float('-inf')
    a = DatasetWriter(name="a", columns=columns)
    b = DatasetWriter(name="b", columns=columns, previous=a)
    c = DatasetWriter(name="c", columns=columns)
    return a, b, c, test_data
def prepare(params):
    assert params.slices >= 2, "Hashing won't do anything with just one slice"
    dws = DotDict()
    # all the numeric types should hash the same (for values they have in common)
    for name, hashlabel, typ in (
        ("unhashed_manual", None, "int32"),            # manually interleaved
        ("unhashed_split", None, "int64"),             # split_write interleaved
        ("up_checked", "up", "float32"),               # hashed on up using dw.hashcheck
        ("up_split", "up", "float64"),                 # hashed on up using split_write
        ("down_checked", "down", "bits32"),            # hashed on down using dw.hashcheck
        ("down_discarded", "down", "bits64"),          # hashed on down using discarding writes
        ("down_discarded_list", "down", "number"),     # hashed on down using discarding list writes
        ("down_discarded_dict", "down", "complex32"),  # hashed on down using discarding dict writes
        # we have too many types, so we need more datasets
        ("unhashed_complex64", None, "complex64"),
        ("unhashed_bytes", None, "bytes"),
        ("up_ascii", "up", "ascii"),
        ("down_unicode", "down", "unicode"),
        # datetime on 1970-01-01 hashes like time
        ("up_datetime", "up", "datetime"),
        ("down_time", "down", "time"),
        # date doesn't hash the same as anything else, so compare it to itself
        ("up_date", "up", "date"),
        ("down_date", "down", "date"),
    ):
        dw = DatasetWriter(name=name, hashlabel=hashlabel)
        dw.add("up", typ)
        dw.add("down", typ)
        dws[name] = dw
    return dws
def test_column_discarding():
    dw = DatasetWriter(name='column discarding')
    dw.add('a', 'bytes')
    dw.add('b', 'bytes')
    dw.add('c', 'bytes')
    w = dw.get_split_write()
    w(b'a', b'b', b'c')
    source = dw.finish()

    # Discard b because it's not typed
    ac_implicit = subjobs.build(
        'dataset_type',
        source=source,
        column2type=dict(a='ascii', c='ascii'),
        discard_untyped=True,
    ).dataset()
    assert sorted(ac_implicit.columns) == ['a', 'c'], '%s: %r' % (ac_implicit, sorted(ac_implicit.columns),)
    assert list(ac_implicit.iterate(None)) == [('a', 'c',)], ac_implicit

    # Discard b explicitly
    ac_explicit = subjobs.build(
        'dataset_type',
        source=source,
        column2type=dict(a='ascii', c='ascii'),
        rename=dict(b=None),
    ).dataset()
    assert sorted(ac_explicit.columns) == ['a', 'c'], '%s: %r' % (ac_explicit, sorted(ac_explicit.columns),)
    assert list(ac_explicit.iterate(None)) == [('a', 'c',)], ac_explicit

    # Discard c by overwriting it with b. Keep untyped b.
    ac_bASc = subjobs.build(
        'dataset_type',
        source=source,
        column2type=dict(a='ascii', c='ascii'),
        rename=dict(b='c'),
    ).dataset()
    assert sorted(ac_bASc.columns) == ['a', 'b', 'c'], '%s: %r' % (ac_bASc, sorted(ac_bASc.columns),)
    assert list(ac_bASc.iterate(None)) == [('a', b'b', 'b',)], ac_bASc

    # Discard c by overwriting it with b. Also type b as a different type.
    abc_bASc = subjobs.build(
        'dataset_type',
        source=source,
        column2type=dict(a='ascii', b='strbool', c='ascii'),
        rename=dict(b='c'),
    ).dataset()
    assert sorted(abc_bASc.columns) == ['a', 'b', 'c'], '%s: %r' % (abc_bASc, sorted(abc_bASc.columns),)
    assert list(abc_bASc.iterate(None)) == [('a', True, 'b',)], abc_bASc
def prepare(job):
    job.datasetwriter(columns=jobs.source.dataset().columns, copy_mode=True)
    DatasetWriter(name='named', columns=jobs.source.dataset('named').columns, copy_mode=True)
    dw_passed = job.datasetwriter(name='passed', copy_mode=True)
    # DatasetColumn in .add
    for n, c in sorted(jobs.source.dataset('passed').columns.items()):
        dw_passed.add(n, c)
    # verify that .add(none_support=) takes precedence over coltype
    dw_nonetest_removed = job.datasetwriter(name='nonetest_removed')
    for n, c in sorted(jobs.source.dataset('nonetest').columns.items()):
        dw_nonetest_removed.add(n, c, none_support=(n == 'unicode'))
    return dw_passed, dw_nonetest_removed
def prepare(params):
    dws = {}
    prev = None
    for name in "abcdefgh":
        dw = DatasetWriter(name=name, previous=prev)
        dw.add("ds", "ascii")
        dw.add("num", "number")
        dws[name] = dw
        prev = dw
    return dws
def prepare(params):
    d = datasets.source
    ds_list = d.chain(stop_ds={datasets.previous: 'source'})
    if options.sort_across_slices:
        columniter = partial(Dataset.iterate_list, None, datasets=ds_list)
        sort_idx = sort(columniter)
        total = len(sort_idx)
        per_slice = [total // params.slices] * params.slices
        extra = total % params.slices
        if extra:
            # spread the leftover length over pseudo-randomly selected slices
            # (using the start of sort_idx to select slices).
            # this will always select the first slices if data is already sorted
            # but at least it's deterministic.
            selector = sorted(range(min(params.slices, total)), key=sort_idx.__getitem__)
            for sliceno in selector[:extra]:
                per_slice[sliceno] += 1
        # change per_slice to be the actual sort indexes
        start = 0
        for ix, num in enumerate(per_slice):
            end = start + num
            per_slice[ix] = sort_idx[start:end]
            start = end
        assert sum(len(part) for part in per_slice) == total  # all rows used
        assert len(set(len(part) for part in per_slice)) < 3  # only 1 or 2 lengths possible
        sort_idx = per_slice
    else:
        sort_idx = None
    if options.sort_across_slices:
        hashlabel = None
    else:
        hashlabel = d.hashlabel
    if len(ds_list) == 1:
        filename = d.filename
    else:
        filename = None
    dw = DatasetWriter(
        columns=d.columns,
        caption=params.caption,
        hashlabel=hashlabel,
        filename=filename,
        previous=datasets.previous,
    )
    return dw, ds_list, sort_idx
def _verify(name, types, data, coltype, want, default, want_fail, kw):
    if callable(want):
        check = want
    else:
        def check(got, fromstr, filtered=False):
            want1 = want if isinstance(want, list) else want[typ]
            if filtered:
                want1 = want1[::2]
            assert got == want1, 'Expected %r, got %r from %s.' % (want1, got, fromstr,)
    dw = DatasetWriter(name=name, columns={'data': coltype, 'extra': 'bytes'})
    dw.set_slice(0)
    for ix, v in enumerate(data):
        dw.write(v, b'1' if ix % 2 == 0 else b'skip')
    for sliceno in range(1, g.slices):
        dw.set_slice(sliceno)
    bytes_ds = dw.finish()
    for typ in types:
        opts = dict(column2type=dict(data=typ))
        opts.update(kw)
        if default is not no_default:
            opts['defaults'] = {'data': default}
        try:
            jid = subjobs.build('dataset_type', datasets=dict(source=bytes_ds), options=opts)
        except JobError:
            if want_fail:
                continue
            raise Exception('Typing %r as %s failed.' % (bytes_ds, typ,))
        assert not want_fail, "Typing %r as %s should have failed, but didn't (%s)." % (bytes_ds, typ, jid)
        typed_ds = Dataset(jid)
        got = list(typed_ds.iterate(0, 'data'))
        check(got, '%s (typed as %s from %r)' % (typed_ds, typ, bytes_ds,))
        if 'filter_bad' not in opts and not callable(want):
            opts['filter_bad'] = True
            opts['column2type']['extra'] = 'int32_10'
            jid = subjobs.build('dataset_type', datasets=dict(source=bytes_ds), options=opts)
            typed_ds = Dataset(jid)
            got = list(typed_ds.iterate(0, 'data'))
            check(got, '%s (typed as %s from %r with every other line skipped from filter_bad)' % (typed_ds, typ, bytes_ds,), True)
        used_type(typ)
def analysis(sliceno, prepare_res, job):
    dw_default = job.datasetwriter()
    dw_named = DatasetWriter(name="named")
    dw_passed, num = prepare_res
    dw_default.write(a=sliceno, b="a")
    dw_default.write_list([num, str(sliceno)])
    dw_named.write(True, date(1536, 12, min(sliceno + 1, 31)))
    dw_named.write_dict({"c": False, "d": date(2236, 5, min(sliceno + 1, 31))})
    # slice 0 is written in synthesis
    if 0 < sliceno < test_data.value_cnt:
        dw_passed.write_dict({k: v[sliceno] for k, v in test_data.data.items()})
def synthesis(job):
    manual_chain = [Dataset(jobids.selfchain, name) for name in "abcdefgh"]
    manual_abf = [manual_chain[0], manual_chain[1], manual_chain[5]]
    # build a local abf chain
    prev = None
    for ix, ds in enumerate(manual_abf):
        name = "abf%d" % (ix,)
        prev = ds.link_to_here(name, override_previous=prev)
    manual_abf_data = list(Dataset.iterate_list(None, None, manual_abf))
    local_abf_data = list(Dataset(job, "abf2").iterate_chain(None, None))
    assert manual_abf_data == local_abf_data
    # disconnect h, verify there is no chain
    manual_chain[-1].link_to_here("alone", override_previous=None)
    assert len(Dataset(job, "alone").chain()) == 1
    # check that the original chain is unhurt
    assert manual_chain == manual_chain[-1].chain()

    # So far so good, now make a chain long enough to have a cache.
    prev = None
    ix = 0
    going = True
    while going:
        if prev and "cache" in prev._data:
            going = False
        name = "longchain%d" % (ix,)
        dw = DatasetWriter(name=name, previous=prev)
        dw.add("ix", "number")
        dw.get_split_write()(ix)
        prev = dw.finish()
        ix += 1
    # we now have a chain that goes one past the first cache point
    full_chain = Dataset(prev).chain()
    assert "cache" in full_chain[-2]._data  # just to check the above logic is correct
    assert "cache" not in full_chain[-1]._data  # just to be sure..
    full_chain[-2].link_to_here("nocache", override_previous=None)
    full_chain[-1].link_to_here("withcache", override_previous=full_chain[-3])
    assert "cache" not in Dataset(job, "nocache")._data
    assert "cache" in Dataset(job, "withcache")._data
    # And make sure they both get the right data too.
    assert list(Dataset(prev).iterate_chain(None, "ix")) == list(range(ix))
    assert list(Dataset(job, "nocache").iterate_chain(None, "ix")) == [ix - 2]
    assert list(Dataset(job, "withcache").iterate_chain(None, "ix")) == list(range(ix - 2)) + [ix - 1]
def test_rehash_with_empty_slices():
    dw = DatasetWriter(name='rehash with empty slices', hashlabel='a')
    dw.add('a', 'ascii')
    dw.add('b', 'ascii')
    w = dw.get_split_write()
    w('a', '42')
    w('42', 'b')
    source = dw.finish()
    hashfunc = typed_writer('int32').hash
    def verify_hashing(caption, want_values, **kw):
        ds = subjobs.build('dataset_type', source=source, column2type=dict(a='int32_10'), caption=caption, **kw).dataset()
        got_values = set()
        for sliceno in range(g.slices):
            for got in ds.iterate(sliceno):
                assert hashfunc(got[0]) % g.slices == sliceno
                assert got not in got_values
                got_values.add(got)
        assert want_values == got_values
    verify_hashing('with discard', {(42, 'b',)}, filter_bad=True)
    # using defaults uses some different code paths
    verify_hashing('with default=0 (probably two slices)', {(0, '42',), (42, 'b',)}, defaults=dict(a='0'))
    verify_hashing('with default=42 (one slice)', {(42, '42',), (42, 'b',)}, defaults=dict(a='42'))
def prepare(params):
    assert params.slices >= 2, "Hashing won't do anything with just one slice"
    dws = DotDict()
    for name, hashlabel in (
        ("unhashed_manual", None),        # manually interleaved
        ("unhashed_split", None),         # split_write interleaved
        ("up_checked", "up"),             # hashed on up using dw.hashcheck
        ("up_split", "up"),               # hashed on up using split_write
        ("down_checked", "down"),         # hashed on down using dw.hashcheck
        ("down_discarded", "down"),       # hashed on down using discarding writes
        ("down_discarded_list", "down"),  # hashed on down using discarding list writes
        ("down_discarded_dict", "down"),  # hashed on down using discarding dict writes
    ):
        dw = DatasetWriter(name=name, hashlabel=hashlabel)
        dw.add("up", "int32")
        dw.add("down", "int32")
        dws[name] = dw
    return dws
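# Hedged sketch of the write style the comments above refer to (assumed to
# mirror how analysis() fills these writers; the value source is hypothetical).
# With a hashlabel set, a slice can either pre-check values with dw.hashcheck()
# or write from every slice and let discarding writes drop rows that belong to
# other slices:
#
#   def fill(dw, sliceno, values):
#       for up, down in values:
#           if dw.hashcheck(up):  # keep only rows that hash to this slice
#               dw.write(up, down)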
def prepare():
    return DatasetWriter(columns={t: t for t in test_data.data})
def synthesis(prepare_res, slices, job):
    dw_passed, _ = prepare_res
    # Using set_slice on a dataset that was written in analysis is not
    # actually supported, but since it currently works (as long as that
    # particular slice wasn't written in analysis) let's test it.
    dw_passed.set_slice(0)
    dw_passed.write(**{k: v[0] for k, v in test_data.data.items()})
    dw_synthesis_split = DatasetWriter(name="synthesis_split", hashlabel="a")
    dw_synthesis_split.add("a", "int32")
    dw_synthesis_split.add("b", "unicode")
    dw_synthesis_split.get_split_write()(1, "a")
    dw_synthesis_split.get_split_write_list()([2, "b"])
    dw_synthesis_split.get_split_write_dict()({"a": 3, "b": "c"})
    dw_synthesis_manual = job.datasetwriter(name="synthesis_manual", columns={"sliceno": "int32"})
    dw_nonetest = job.datasetwriter(name="nonetest", columns={t: t for t in test_data.data})
    for sliceno in range(slices):
        dw_synthesis_manual.set_slice(sliceno)
        dw_synthesis_manual.write(sliceno)
        dw_nonetest.set_slice(sliceno)
        dw_nonetest.write(**{
            k: v[0] if k in test_data.not_none_capable else None
            for k, v in test_data.data.items()
        })
def test_filter_bad_across_types():
    columns = {
        'bytes': 'bytes',
        'float64': 'bytes',
        'int32_10': 'ascii',
        'json': 'unicode',
        'number:int': 'unicode',
        'unicode:utf-8': 'bytes',
    }
    # all_good, *values
    # Make sure all those types (except bytes) can filter other lines,
    # and be filtered by other lines. And that several filtering values
    # is not a problem (line 11).
    data = [
        [True, b'first', b'1.1', '1', '"a"', '001', b'ett'],
        [True, b'second', b'2.2', '2', '"b"', '02', b'tv\xc3\xa5'],
        [True, b'third', b'3.3', '3', '["c"]', '3.0', b'tre'],
        [False, b'fourth', b'4.4', '4', '"d"', '4.4', b'fyra'],      # number:int bad
        [False, b'fifth', b'5.5', '-', '"e"', '5', b'fem'],          # int32_10 bad
        [False, b'sixth', b'6.b', '6', '"f"', '6', b'sex'],          # float64 bad
        [False, b'seventh', b'7.7', '7', '{"g"}', '7', b'sju'],      # json bad
        [False, b'eigth', b'8.8', '8', '"h"', '8', b'\xa5\xc3tta'],  # unicode:utf-8 bad
        [True, b'ninth', b'9.9', '9', '"i"', '9', b'nio'],
        [True, b'tenth', b'10', '10', '"j"', '10', b'tio'],
        [False, b'eleventh', b'11a', '1-', '"k",', '1,', b'elva'],   # float64, int32_10 and number:int bad
        [True, b'twelfth', b'12', '12', '"l"', '12', b'tolv'],
    ]
    want_bad = [tuple(l[1:]) for l in data if not l[0]]
    dw = DatasetWriter(name="filter bad across types", columns=columns, allow_missing_slices=True)
    cols_to_check = ['int32_10', 'bytes', 'json', 'unicode:utf-8']
    if PY3:
        # z so it sorts last.
        dw.add('zpickle', 'pickle')
        cols_to_check.append('zpickle')
        for ix in range(len(data)):
            data[ix].append({ix})
    dw.set_slice(0)
    want = []
    def add_want(ix):
        v = data[ix]
        want.append((int(v[3]), v[1], json.loads(v[4]), v[6].decode('utf-8'),))
        if PY3:
            want[-1] = want[-1] + (v[7],)
    for ix, v in enumerate(data):
        if v[0]:
            add_want(ix)
        dw.write(*v[1:])
    source_ds = dw.finish()
    # Once with just filter_bad, once with some defaults too.
    defaults = {}
    for _ in range(2):
        jid = subjobs.build(
            'dataset_type',
            datasets=dict(source=source_ds),
            options=dict(column2type={t: t for t in columns}, filter_bad=True, defaults=defaults),
        )
        typed_ds = Dataset(jid)
        got = list(typed_ds.iterate(0, cols_to_check))
        assert got == want, "Expected %r, got %r from %s (from %r%s)" % (want, got, typed_ds, source_ds, ' with defaults' if defaults else '')
        bad_ds = Dataset(jid, 'bad')
        got_bad = list(bad_ds.iterate(0, sorted(columns)))
        assert got_bad == want_bad, "Expected %r, got %r from %s (from %r%s)" % (want_bad, got_bad, bad_ds, source_ds, ' with defaults' if defaults else '')
        # make more lines "ok" for the second lap
        if not defaults:
            want_bad.pop(0)  # number:int
            want_bad.pop(1)  # float64
            want_bad.pop(1)  # json
            defaults = {'number:int': '0', 'float64': '0', 'json': '"replacement"'}
            add_want(3)
            add_want(5)
            data[6][4] = '"replacement"'
            add_want(6)
            want.sort()  # adding them out of order, int32_10 sorts correctly.
def synthesis(params):
    ds = write(data)
    for colname in data[0]:
        verify(params.slices, data, ds, hashlabel=colname)
    # ok, all the hashing stuff works out, let's test the chaining options.
    bonus_ds = write(bonus_data, name="bonus", previous=ds)
    # no chaining options - full chain
    verify(params.slices, data + bonus_data, bonus_ds, hashlabel="date")
    # just the bonus ds
    verify(params.slices, bonus_data, bonus_ds, hashlabel="date", length=1)
    # built as a chain
    verify(params.slices, data + bonus_data, bonus_ds, hashlabel="date", as_chain=True)
    # normal chaining
    a = verify(params.slices, data, ds, hashlabel="date")
    b = verify(params.slices, data + bonus_data, bonus_ds, hashlabel="date", previous=a)
    assert b.chain() == [a, b], "chain of %s is not [%s, %s] as expected" % (b, a, b)
    # as_chain sparseness
    dw = DatasetWriter(columns=columns, name="empty")
    dw.get_split_write()
    ds = verify(params.slices, [], dw.finish(), hashlabel="date", as_chain=True)
    assert len(ds.chain()) == 1, ds + ": dataset_hashpart on empty dataset with as_chain=True did not produce a single dataset"
    # two populated slices with the same data, should end up in two datasets.
    dw = DatasetWriter(columns=columns, name="0 and 2")
    dw.set_slice(0)
    dw.write_dict(data[0])
    dw.set_slice(1)
    dw.set_slice(2)
    dw.write_dict(data[0])
    for s in range(3, params.slices):
        dw.set_slice(s)
    ds = verify(params.slices, [data[0]], dw.finish(), hashlabel="date", as_chain=True)
    got_slices = len(ds.chain())
    assert got_slices == 2, "%s (built with as_chain=True) has %d datasets in chain, expected 2." % (ds, got_slices,)
def write(data, **kw):
    dw = DatasetWriter(columns=columns, **kw)
    w = dw.get_split_write_dict()
    for values in data:
        w(values)
    return dw.finish()
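# Hedged usage sketch (hypothetical rows; `columns` is the module-level dict
# this helper closes over): each element of data is a dict keyed on column
# name, and chaining works through the usual previous= keyword:
#
#   ds = write(rows, name="example")
#   ds2 = write(more_rows, name="example2", previous=ds)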
def prepare(params):
    if options.trigger_column:
        assert options.sort_across_slices, 'trigger_column is meaningless without sort_across_slices'
        assert options.trigger_column in options.sort_columns, 'can only trigger on a column that is sorted on'
    d = datasets.source
    ds_list = d.chain(stop_ds={datasets.previous: 'source'})
    if options.sort_across_slices:
        columniter = partial(Dataset.iterate_list, None, datasets=ds_list)
        sort_idx, sort_extra = sort(columniter)
        total = len(sort_idx)
        per_slice = [total // params.slices] * params.slices
        extra = total % params.slices
        if extra:
            # spread the leftover length over pseudo-randomly selected slices
            # (using the start of sort_idx to select slices).
            # this will always select the first slices if data is already sorted
            # but at least it's deterministic.
            selector = sorted(range(min(params.slices, total)), key=sort_idx.__getitem__)
            for sliceno in selector[:extra]:
                per_slice[sliceno] += 1
        # Switch to tracking what line the slices end at
        slice_end = []
        end = 0
        for cnt in per_slice:
            end += cnt
            slice_end.append(end)
        if options.trigger_column:
            # extra definitely changed value last to simplify loop
            sort_extra.append(object())
            sort_idx.append(-1)
            # move slice_end counts around to only switch when trigger_column changes
            def fixup_fwd(cnt):
                trigger_v = sort_extra[sort_idx[cnt - 1]]
                while trigger_v == sort_extra[sort_idx[cnt]]:
                    cnt += 1
                return cnt
            def fixup_bck(cnt, min_cnt):
                trigger_v = sort_extra[sort_idx[cnt - 1]]
                while cnt > min_cnt and trigger_v == sort_extra[sort_idx[cnt]]:
                    cnt -= 1
                return cnt
            with status('Adjusting for trigger_column'):
                prev = 0
                for sliceno, cnt in enumerate(slice_end[:-1]):
                    if cnt:
                        cnt = max(cnt, prev)
                        chosen = fwd = fixup_fwd(cnt)
                        bck = fixup_bck(cnt, prev)
                        # This could be smarter
                        if (cnt - bck) <= (fwd - cnt):
                            chosen = bck
                        prev = slice_end[sliceno] = chosen
        # and now switch sort_idx to be per slice
        sort_idx = [sort_idx[start:end] for start, end in zip([0] + slice_end, slice_end)]
        assert sum(len(part) for part in sort_idx) == total  # all rows used
        if not options.trigger_column:
            assert len(set(len(part) for part in sort_idx)) < 3  # only 1 or 2 lengths possible
    else:
        sort_idx = None
    if options.sort_across_slices:
        hashlabel = None
    else:
        hashlabel = d.hashlabel
    if len(ds_list) == 1:
        filename = d.filename
    else:
        filename = None
    dw = DatasetWriter(
        columns=d.columns,
        caption=params.caption,
        hashlabel=hashlabel,
        filename=filename,
        previous=datasets.previous,
    )
    return dw, ds_list, sort_idx
def synthesis(job):
    dw = DatasetWriter(name='empty', columns={'v': 'ascii'})
    dw.get_split_write()
    empty_ds = dw.finish()
    assert empty_ds.min('non-existent column') is empty_ds.max('non-existent column') is None, \
        'Dataset.min/max() broken for non-existent columns'
    for typ, groups in tests.items():
        t_ds = subjobs.build('dataset_type', column2type={'v': typ}, source=empty_ds).dataset()
        minmax = (t_ds.columns['v'].min, t_ds.columns['v'].max)
        if minmax != (None, None):
            raise Exception('Typing empty dataset as %s did not give minmax == None, gave %r' % (typ, minmax,))
        all_names = list(chain.from_iterable(groupdata[group].keys() for group in groups))
        # just 1 and 2, so we don't make way too many
        for num_groups in (1, 2,):
            for names in combinations(all_names, num_groups):
                ds, mn, mx = make_source(names)
                t_ds = subjobs.build('dataset_type', column2type={'v': typ}, source=ds).dataset()
                got_minmax = (t_ds.columns['v'].min, t_ds.columns['v'].max)
                want_minmax = (mn, mx)
                chk_minmax(got_minmax, want_minmax, 'Typing %s as %s gave wrong minmax: expected %r, got %r (in %s)' % (ds, typ, want_minmax, got_minmax, t_ds,))
                chk_minmax(got_minmax, (t_ds.min('v'), t_ds.max('v')), 'Dataset.min/max() broken on ' + t_ds)
                # verify writing the same data normally also gives the correct result
                dw = DatasetWriter(name='rewrite ' + t_ds, columns=t_ds.columns)
                write = dw.get_split_write()
                for v in t_ds.iterate(None, 'v'):
                    write(v)
                re_ds = dw.finish()
                got_minmax = (re_ds.columns['v'].min, re_ds.columns['v'].max)
                want_minmax = (mn, mx)
                chk_minmax(got_minmax, want_minmax, 'Rewriting %s gave the wrong minmax: expected %r, got %r (in %s)' % (t_ds, want_minmax, got_minmax, re_ds,))
def prepare(job, slices):
    # use 256 as a marker value, because that's not a possible char value (assuming 8 bit chars)
    lf_char = char2int("newline", 256)
    # separator uses lf_char or \n as the empty value, because memchr might mishandle 256.
    separator = char2int("separator", 10 if lf_char == 256 else lf_char)
    comment_char = char2int("comment", 256)
    if options.quotes == 'True':
        quote_char = 256
    elif options.quotes == 'False':
        quote_char = 257
    else:
        quote_char = char2int("quotes", 257, "True/False/empty")
    filename = os.path.join(job.source_directory, options.filename)
    orig_filename = filename
    assert 1 <= options.compression <= 9
    fds = [os.pipe() for _ in range(slices)]
    read_fds = [t[0] for t in fds]
    write_fds = [t[1] for t in fds]
    if options.labelsonfirstline:
        labels_rfd, labels_wfd = os.pipe()
    else:
        labels_wfd = -1
    success_rfd, success_wfd = os.pipe()
    status_rfd, status_wfd = os.pipe()
    p = Process(target=reader_process, name="reader", args=(slices, filename, write_fds, labels_wfd, success_wfd, status_wfd, comment_char, lf_char))
    p.start()
    for fd in write_fds:
        os.close(fd)
    os.close(success_wfd)
    os.close(status_wfd)
    if options.labelsonfirstline:
        os.close(labels_wfd)
        # re-use import logic
        out_fns = ["labels"]
        r_num = cstuff.mk_uint64(3)
        res = cstuff.backend.import_slice(*cstuff.bytesargs(labels_rfd, -1, -1, -1, out_fns, b"wb1", separator, r_num, quote_char, lf_char, 0))
        os.close(labels_rfd)
        assert res == 0, "c backend failed in label parsing"
        with typed_reader("bytes")("labels") as fh:
            labels_from_file = [lab.decode("utf-8", "backslashreplace") for lab in fh]
        os.unlink("labels")
    else:
        labels_from_file = None
    labels = options.labels or labels_from_file
    assert labels, "No labels"
    labels = [options.rename.get(x, x) for x in labels]
    assert '' not in labels, "Empty label for column %d" % (labels.index(''),)
    assert len(labels) == len(set(labels)), "Duplicate labels: %r" % (labels,)
    dw = DatasetWriter(
        columns={n: 'bytes' for n in labels if n not in options.discard},
        filename=orig_filename,
        caption='csvimport of ' + orig_filename,
        previous=datasets.previous,
        meta_only=True,
    )
    if options.lineno_label:
        dw.add(options.lineno_label, "int64")
    if options.allow_bad:
        bad_dw = DatasetWriter(
            name="bad",
            columns=dict(lineno="int64", data="bytes"),
            caption='bad lines from csvimport of ' + orig_filename,
            meta_only=True,
        )
    else:
        bad_dw = None
    if options.comment or options.skip_lines:
        skipped_dw = DatasetWriter(
            name="skipped",
            columns=dict(lineno="int64", data="bytes"),
            caption='skipped lines from csvimport of ' + orig_filename,
            meta_only=True,
        )
    else:
        skipped_dw = None
    return separator, quote_char, lf_char, filename, orig_filename, labels, dw, bad_dw, skipped_dw, read_fds, success_rfd, status_rfd,
def mk_dw(name, cols, **kw):
    dw = DatasetWriter(name=name, **kw)
    for colname in cols:
        dw.add(colname, "unicode")
    return dw
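# Hedged usage sketch (names are hypothetical): every column is created as
# unicode, and extra keyword arguments pass straight through to DatasetWriter:
#
#   dw = mk_dw("example", ["a", "b"], hashlabel="a")
#   dw.get_split_write()("x", "y")
#   ds = dw.finish()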
def prepare(params):
    assert params.slices >= 3
    dw_3 = DatasetWriter(name="three", columns={"num": "int32"})
    dw_long = DatasetWriter(name="long", columns={"num": "int32"})
    dw_uneven = DatasetWriter(name="uneven", columns={"num": "int32"})
    return dw_3, dw_long, dw_uneven