def synthesis():
    lines = ['printing a bunch of lines, this is line %d.' % (n,) for n in range(150)]
    if options.inner:
        for s in lines:
            print(s)
        raise Exception('this is an exception, but nothing went wrong')
    else:
        try:
            subjobs.build('test_output_on_error', inner=True)
        except JobError as e:
            job = e.job
        else:
            raise Exception("test_output_on_error with inner=True didn't fail")
        # give the iowrapper some time to finish
        for attempt in range(25):
            got_lines = job.output().split('\n')
            if got_lines[:len(lines)] == lines:
                for line in got_lines:
                    if line == 'Exception: this is an exception, but nothing went wrong':
                        return
            # not yet, wait a little (total of 30s)
            if attempt > 1:
                print('Output from %s has not appeared yet, waiting more (%d).' % (job, attempt,))
            sleep(attempt / 10.0)
        raise Exception('Not all output from %s was saved in OUTPUT' % (job,))

def synthesis():
    assert options.level < 5, "Too deep subjob nesting allowed"
    try:
        subjobs.build('test_subjobs_nesting', options={'level': options.level + 1})
    except JobError:
        assert options.level == 4, "Not enough subjob nesting allowed"

def synthesis(params):
    source = Dataset(subjobs.build("test_sorting_gendata"))
    # Test that all datatypes work for sorting
    for key in test_data.data:
        check_one(params.slices, key, source)
    # Check reverse sorting
    check_one(params.slices, "int32", source, reverse=True)
    # Check that sorting across slices and by two columns works
    jid = subjobs.build(
        "dataset_sort",
        options=dict(
            sort_columns=["int64", "int32"],
            sort_order="descending",
            sort_across_slices=True,
        ),
        datasets=dict(source=source),
    )
    int64_off = sorted(test_data.data).index("int64")
    int32_off = sorted(test_data.data).index("int32")
    all_data = chain.from_iterable(test_data.sort_data_for_slice(sliceno) for sliceno in range(params.slices))
    good = sorted(all_data, key=lambda t: (noneninf(t[int64_off]), noneninf(t[int32_off]),), reverse=True)
    ds = Dataset(jid)
    check = list(ds.iterate(None))
    assert unnan(check) == unnan(good), "Sorting across slices on [int64, int32] bad (%s)" % (jid,)

def test_filter_bad_with_rename_and_chain():
    dw = DatasetWriter(name="filter bad with rename", allow_missing_slices=True)
    dw.add('a', 'ascii')
    dw.add('b', 'bytes')
    dw.add('c', 'unicode')
    dw.set_slice(0)
    dw.write('0', b'1', '2')
    dw.write('9', b'A', 'B')
    dw.write('C', b'D', 'E')
    source_ds = dw.finish()
    jid = subjobs.build(
        'dataset_type',
        column2type=dict(b='int32_10', c='int64_16', d='int32_16'),
        filter_bad=True,
        rename=dict(a='b', b='c', c='d'),
        source=source_ds,
    )
    typed_ds = jid.dataset()
    coltypes = sorted((name, col.type) for name, col in typed_ds.columns.items())
    assert coltypes == [('b', 'int32'), ('c', 'int64'), ('d', 'int32')], coltypes
    assert list(typed_ds.iterate(0)) == [(0, 1, 2), (9, 10, 11)]
    bad_ds = jid.dataset('bad')
    coltypes = sorted((name, col.type) for name, col in bad_ds.columns.items())
    assert coltypes == [('b', 'ascii'), ('c', 'bytes'), ('d', 'unicode')], coltypes
    assert list(bad_ds.iterate(0)) == [('C', b'D', 'E')]
    dw = DatasetWriter(name="filter bad with rename chain", allow_missing_slices=True, previous=source_ds)
    dw.add('a', 'ascii')
    dw.add('b', 'ascii')
    dw.add('c', 'ascii')
    dw.set_slice(0)
    dw.write('3', '4', '5')
    dw.write('6', '7', 'eight')
    source_ds = dw.finish()
    jid = subjobs.build(
        'dataset_type',
        column2type=dict(a='number', b='int32_10', c='int64_10'),
        defaults=dict(a='8'),
        filter_bad=True,
        rename=dict(a='b', b='c', c='a'),
        source=source_ds,
    )
    typed_ds = jid.dataset()
    coltypes = sorted((name, col.type) for name, col in typed_ds.columns.items())
    assert coltypes == [('a', 'number'), ('b', 'int32'), ('c', 'int64')], coltypes
    assert list(typed_ds.iterate(0)) == [(2, 0, 1), (5, 3, 4), (8, 6, 7)]
    bad_ds = jid.dataset('bad')
    coltypes = sorted((name, col.type) for name, col in bad_ds.columns.items())
    assert coltypes == [('a', 'unicode'), ('b', 'ascii'), ('c', 'bytes')], coltypes
    assert list(bad_ds.iterate(0)) == [('B', '9', b'A'), ('E', 'C', b'D')]

def test(src_ds, opts, expect_lines):
    opts = DotDict(opts)
    def rename(colname):
        return opts.get('rename', {}).get(colname, colname)
    cols = set(opts.column2type)
    opts.discard_untyped = True
    msg = 'Testing with types %s' % (', '.join(v for k, v in sorted(opts.column2type.items())),)
    expect_hl = None
    if src_ds.hashlabel and opts.column2type.get(src_ds.hashlabel) == 'json':
        # json is not hashable, so we have to override the hashlabel to nothing in this case.
        opts.hashlabel = ''
        msg += ' (clearing hashlabel)'
    elif src_ds.hashlabel:
        expect_hl = rename(src_ds.hashlabel)
        if expect_hl in opts.column2type:
            msg += ' (hashed on %s)' % (opts.column2type[expect_hl],)
        else:
            expect_hl = None
            msg += ' (hashed on <untyped column>)'
    print(msg)
    just_typed = subjobs.build('dataset_type', options=opts, datasets=dict(source=src_ds)).dataset()
    assert just_typed.hashlabel == expect_hl, just_typed
    assert set(just_typed.columns) == cols, just_typed
    assert sum(just_typed.lines) == expect_lines, just_typed
    if rename(src_ds.hashlabel) not in opts.column2type or opts.get('hashlabel') == '':
        assert just_typed.hashlabel is None, just_typed
    else:
        assert just_typed.hashlabel == rename(src_ds.hashlabel), just_typed
    del opts.discard_untyped
    rev_rename = {v: k for k, v in opts.get('rename', {}).items()}
    discard = set(src_ds.columns) - set(rev_rename.get(n, n) for n in cols)
    if discard:
        d = opts.get('rename', {})
        d.update({k: None for k in discard})
        opts.rename = d
    for hashlabel in cols:
        if opts.column2type[hashlabel] == 'json':
            # not hashable
            continue
        opts['hashlabel'] = hashlabel
        print('%s rehashed on %s' % (msg, opts.column2type[hashlabel],))
        hashed_by_type = subjobs.build('dataset_type', options=opts, datasets=dict(source=src_ds)).dataset()
        assert hashed_by_type.hashlabel == hashlabel, hashed_by_type
        assert set(hashed_by_type.columns) == cols, hashed_by_type
        assert sum(hashed_by_type.lines) == expect_lines, hashed_by_type
        hashed_after = subjobs.build('dataset_hashpart', options=dict(hashlabel=hashlabel), datasets=dict(source=just_typed)).dataset()
        assert hashed_after.hashlabel == hashlabel, hashed_after
        assert set(hashed_after.columns) == cols, hashed_after
        assert sum(hashed_after.lines) == expect_lines, hashed_after
        if src_ds.hashlabel:
            # If src_ds has a hashlabel then just_typed will already be hashed, so hashed_after
            # will have been hashed twice and therefore have a different order than hashed_by_type.
            if rename(src_ds.hashlabel) == hashlabel:
                # These should be the same though.
                subjobs.build('test_compare_datasets', datasets=dict(a=hashed_by_type, b=just_typed))
            hashed_by_type = subjobs.build('dataset_sort', options=dict(sort_columns=rename('a')), datasets=dict(source=hashed_by_type))
            hashed_after = subjobs.build('dataset_sort', options=dict(sort_columns=rename('a')), datasets=dict(source=hashed_after))
        subjobs.build('test_compare_datasets', datasets=dict(a=hashed_by_type, b=hashed_after))

def synthesis(job):
    dw = DatasetWriter(name='empty', columns={'v': 'ascii'})
    dw.get_split_write()
    empty_ds = dw.finish()
    assert empty_ds.min('non-existent column') is empty_ds.max('non-existent column') is None, 'Dataset.min/max() broken for non-existent columns'
    for typ, groups in tests.items():
        t_ds = subjobs.build('dataset_type', column2type={'v': typ}, source=empty_ds).dataset()
        minmax = (t_ds.columns['v'].min, t_ds.columns['v'].max)
        if minmax != (None, None):
            raise Exception('Typing empty dataset as %s did not give minmax == None, gave %r' % (typ, minmax,))
        all_names = list(chain.from_iterable(groupdata[group].keys() for group in groups))
        # just 1 and 2 groups, so we don't make way too many combinations
        for num_groups in (1, 2,):
            for names in combinations(all_names, num_groups):
                ds, mn, mx = make_source(names)
                t_ds = subjobs.build('dataset_type', column2type={'v': typ}, source=ds).dataset()
                got_minmax = (t_ds.columns['v'].min, t_ds.columns['v'].max)
                want_minmax = (mn, mx)
                chk_minmax(got_minmax, want_minmax, 'Typing %s as %s gave wrong minmax: expected %r, got %r (in %s)' % (ds, typ, want_minmax, got_minmax, t_ds,))
                chk_minmax(got_minmax, (t_ds.min('v'), t_ds.max('v')), 'Dataset.min/max() broken on ' + t_ds)
                # verify writing the same data normally also gives the correct result
                dw = DatasetWriter(name='rewrite ' + t_ds, columns=t_ds.columns)
                write = dw.get_split_write()
                for v in t_ds.iterate(None, 'v'):
                    write(v)
                re_ds = dw.finish()
                got_minmax = (re_ds.columns['v'].min, re_ds.columns['v'].max)
                want_minmax = (mn, mx)
                chk_minmax(got_minmax, want_minmax, 'Rewriting %s gave the wrong minmax: expected %r, got %r (in %s)' % (t_ds, want_minmax, got_minmax, re_ds,))
    # make sure renaming doesn't mix anything up
    dw = DatasetWriter(name='rename', columns={'a': 'ascii', 'b': 'ascii'})
    write = dw.get_split_write()
    write('5', '3')
    write('7', 'oops')
    ds = dw.finish()
    t_ds = subjobs.build(
        'dataset_type',
        column2type=dict(num='number', int='int32_10'),
        defaults=dict(num='1', int='2'),
        rename=dict(a='num', b='int'),
        source=ds,
    ).dataset()
    for name, want_minmax in (
        ('num', (5, 7)),
        ('int', (2, 3)),
    ):
        got_minmax = (t_ds.columns[name].min, t_ds.columns[name].max)
        msg = 'Typing %s gave wrong minmax: expected %r, got %r (in %s)' % (ds, want_minmax, got_minmax, t_ds,)
        chk_minmax(got_minmax, want_minmax, msg)

def synthesis():
    dw_a = DatasetWriter(name='a', columns={'num': 'int32'})
    dw_b = DatasetWriter(name='b', columns={'num': 'int32'}, previous=dw_a)
    dw_c = DatasetWriter(name='c', columns={'num': 'int32'}, previous=dw_b)
    w = dw_a.get_split_write()
    w(3)
    w(2)
    w = dw_b.get_split_write()
    w(2)
    w(1)
    w = dw_c.get_split_write()
    w(0)
    a = dw_a.finish()
    b = dw_b.finish()
    c = dw_c.finish()
    opts = dict(
        sort_columns='num',
        sort_across_slices=True,
    )
    # sort as a chain
    jid = subjobs.build('dataset_sort', options=opts, datasets=dict(source=a, previous=None))
    assert list(Dataset(jid).iterate(None, 'num')) == [2, 3]
    sorted_a = jid
    jid = subjobs.build('dataset_sort', options=opts, datasets=dict(source=b, previous=jid))
    assert list(Dataset(jid).iterate_chain(None, 'num')) == [2, 3, 1, 2]
    jid = subjobs.build('dataset_sort', options=opts, datasets=dict(source=c, previous=jid))
    assert list(Dataset(jid).iterate_chain(None, 'num')) == [2, 3, 1, 2, 0]
    # sort all as a single dataset
    jid = subjobs.build('dataset_sort', options=opts, datasets=dict(source=c, previous=None))
    assert list(Dataset(jid).iterate_chain(None, 'num')) == [0, 1, 2, 2, 3]
    # merge b and c but not a
    jid = subjobs.build('dataset_sort', options=opts, datasets=dict(source=c, previous=sorted_a))
    # test with new-style job.dataset()
    assert list(jid.dataset().iterate(None, 'num')) == [0, 1, 2]
    assert list(jid.dataset().iterate_chain(None, 'num')) == [2, 3, 0, 1, 2]

def synthesis(job):
    dw = job.datasetwriter()
    dw.add('', 'number')
    dw.add('word', 'ascii')
    w = dw.get_split_write()
    w(0, 'foo')
    w(1, 'bar')
    ds = dw.finish()
    assert set(ds.columns) == {'', 'word'}
    assert list(ds.iterate(None, '')) == [0, 1]
    assert list(ds.iterate(None)) == [(0, 'foo'), (1, 'bar')]
    # the empty column name should survive a round trip through csvexport, csvimport and dataset_type
    job = subjobs.build('csvexport', source=ds, filename='out.csv')
    job = subjobs.build('csvimport', filename=job.filename('out.csv'))
    job = subjobs.build('dataset_type', source=job, column2type={'': 'number', 'word': 'ascii'})
    assert list(job.dataset().iterate(None)) == list(ds.iterate(None))

def test_column_discarding():
    dw = DatasetWriter(name='column discarding')
    dw.add('a', 'bytes')
    dw.add('b', 'bytes')
    dw.add('c', 'bytes')
    w = dw.get_split_write()
    w(b'a', b'b', b'c')
    source = dw.finish()
    # Discard b because it's not typed
    ac_implicit = subjobs.build(
        'dataset_type',
        source=source,
        column2type=dict(a='ascii', c='ascii'),
        discard_untyped=True,
    ).dataset()
    assert sorted(ac_implicit.columns) == ['a', 'c'], '%s: %r' % (ac_implicit, sorted(ac_implicit.columns),)
    assert list(ac_implicit.iterate(None)) == [('a', 'c',)], ac_implicit
    # Discard b explicitly
    ac_explicit = subjobs.build(
        'dataset_type',
        source=source,
        column2type=dict(a='ascii', c='ascii'),
        rename=dict(b=None),
    ).dataset()
    assert sorted(ac_explicit.columns) == ['a', 'c'], '%s: %r' % (ac_explicit, sorted(ac_explicit.columns),)
    assert list(ac_explicit.iterate(None)) == [('a', 'c',)], ac_explicit
    # Discard c by overwriting it with b. Keep untyped b.
    ac_bASc = subjobs.build(
        'dataset_type',
        source=source,
        column2type=dict(a='ascii', c='ascii'),
        rename=dict(b='c'),
    ).dataset()
    assert sorted(ac_bASc.columns) == ['a', 'b', 'c'], '%s: %r' % (ac_bASc, sorted(ac_bASc.columns),)
    assert list(ac_bASc.iterate(None)) == [('a', b'b', 'b',)], ac_bASc
    # Discard c by overwriting it with b. Also type b as a different type.
    abc_bASc = subjobs.build(
        'dataset_type',
        source=source,
        column2type=dict(a='ascii', b='strbool', c='ascii'),
        rename=dict(b='c'),
    ).dataset()
    assert sorted(abc_bASc.columns) == ['a', 'b', 'c'], '%s: %r' % (abc_bASc, sorted(abc_bASc.columns),)
    assert list(abc_bASc.iterate(None)) == [('a', True, 'b',)], abc_bASc

def synthesis(job):
    dw = job.datasetwriter(name='a')
    dw.add('a', 'int32')
    w = dw.get_split_write()
    w(0)
    w(1)
    w(2)
    a = dw.finish()
    for filename, sliced, out_filename, open_func in (
        ('name', False, 'name', open),
        ('name', True, 'name.1', open),
        ('name%dend', True, 'name1end', open),
        ('name.gz', False, 'name.gz', gzip.open),
        ('name.gz', True, 'name.gz.1', gzip.open),
        ('a%02d.gz.b', True, 'a01.gz.b', gzip.open),
        ('a%02d.gz.b', False, 'a%02d.gz.b', gzip.open),
        ('name.gzonk', False, 'name.gzonk', open),
    ):
        job = subjobs.build('csvexport', filename=filename, sliced=sliced, source=a, labels=['a'])
        fn = job.filename(out_filename)
        with open_func(fn, mode='rb') as fh:
            got = fh.read()
        if sliced:
            want = b'a\n1\n'
        else:
            want = b'a\n0\n1\n2\n'
        assert want == got, 'wanted %r, got %r in %s' % (want, got, fn)

def verify(source, lazy_quotes, q, sep, expect, **kw):
    j = subjobs.build('csvexport', chain_source=True, source=source, lazy_quotes=lazy_quotes, quote_fields=q, separator=sep, **kw)
    with j.open('result.csv', 'r' if PY3 else 'rb') as fh:
        got = fh.read()
    if lazy_quotes and sep:
        quote_func = make_lazy(sep, q)
    else:
        quote_func = lambda v: q + v.replace(q, q + q) + q
    want = '\n'.join(sep.join(map(quote_func, line)) for line in expect)
    if want != got:
        print('Unhappy with %s:' % (j.filename('result.csv'),))
        print()
        print('Expected:')
        print(want)
        print('Got:')
        print(got)
        raise Exception('csvexport failed with quote_fields=%r, separator=%r, lazy_quotes=%r' % (q, sep, lazy_quotes,))

def verify(params, jwf):
    jid = subjobs.build('test_jobwithfile', options=dict(inner=True, file=jwf))
    for sliceno in range(params.slices):
        assert jid.load('inner.pickle', sliceno) == {'inner': sliceno}
        assert jid.json_load('inner.json', sliceno) == {'inner': sliceno}
    assert jid.load('inner.pickle') == {'inner': None}
    assert jid.json_load('inner.json') == {'inner': None}

def synthesis(job):
    if options.pos == -1:
        previous = job
        other = None
        j2p = {job: -1}
        alles = [job]
        for pos in range(5):
            previous = subjobs.build('test_jobchain', pos=pos, previous=previous, other=other)
            alles.append(previous)
            j2p[previous] = pos
            if pos == 2:
                other = alles[1]
            else:
                other = None
        assert alles == previous.chain()
        assert list(reversed(alles)) == previous.chain(reverse=True)
        def chk(tip, first, **kw):
            c = alles[tip].chain(**kw)
            c = [j2p[j] for j in c]
            assert c == list(range(first, tip)), (tip, first, kw)
        chk(5, -1)
        chk(5, 3, stop_job=alles[3])
        chk(5, 2, length=3)
        chk(4, 1, length=3)
        chk(5, 3, stop_job={alles[4]: 'previous'})
        chk(5, 1, stop_job={alles[4]: 'other'})
        chk(4, 1, stop_job={alles[4]: 'other'})
        chk(5, 2, stop_job={alles[4]: 'other'}, length=3)
        chk(5, 1, stop_job={alles[4]: 'other'}, length=5)
        assert alles[2].chain(length=0) == []
        assert job.chain() == [job]
        assert job.chain(stop_job=job) == []

def test_filter_bad_across_types():
    columns = {
        'bytes': 'bytes',
        'float64': 'bytes',
        'int32_10': 'ascii',
        'json': 'unicode',
        'number:int': 'unicode',
        'unicode:utf-8': 'bytes',
    }
    # Each row is (all_good, *values).
    # Make sure all those types (except bytes) can filter other lines,
    # and be filtered by other lines. And that several filtering values
    # on one line is not a problem (line 11).
    data = [
        (True,  b'first',    b'1.1', '1',  '"a"',   '001', b'ett',),
        (True,  b'second',   b'2.2', '2',  '"b"',   '02',  b'tv\xc3\xa5',),
        (True,  b'third',    b'3.3', '3',  '["c"]', '3.0', b'tre',),
        (False, b'fourth',   b'4.4', '4',  '"d"',   '4.4', b'fyra',),  # number:int bad
        (False, b'fifth',    b'5.5', '-',  '"e"',   '5',   b'fem',),  # int32_10 bad
        (False, b'sixth',    b'6.b', '6',  '"f"',   '6',   b'sex',),  # float64 bad
        [False, b'seventh',  b'7.7', '7',  '{"g"}', '7',   b'sju',],  # json bad (a list, so it can be patched below)
        (False, b'eighth',   b'8.8', '8',  '"h"',   '8',   b'\xa5\xc3tta',),  # unicode:utf-8 bad
        (True,  b'ninth',    b'9.9', '9',  '"i"',   '9',   b'nio',),
        (True,  b'tenth',    b'10',  '10', '"j"',   '10',  b'tio',),
        (False, b'eleventh', b'11a', '1-', '"k",',  '1,',  b'elva',),  # float64, int32_10 and number:int bad
        (True,  b'twelfth',  b'12',  '12', '"l"',   '12',  b'tolv',),
    ]
    dw = DatasetWriter(name="filter bad across types", columns=columns)
    dw.set_slice(0)
    want = []
    def add_want(v):
        want.append((int(v[3]), v[1], json.loads(v[4]), v[6].decode('utf-8'),))
    for v in data:
        if v[0]:
            add_want(v)
        dw.write(*v[1:])
    for sliceno in range(1, g.slices):
        dw.set_slice(sliceno)
    source_ds = dw.finish()
    # Once with just filter_bad, once with some defaults too.
    defaults = {}
    for _ in range(2):
        jid = subjobs.build(
            'dataset_type',
            datasets=dict(source=source_ds),
            options=dict(column2type={t: t for t in columns}, filter_bad=True, defaults=defaults),
        )
        typed_ds = Dataset(jid)
        got = list(typed_ds.iterate(0, ['int32_10', 'bytes', 'json', 'unicode:utf-8']))
        assert got == want, "Expected %r, got %r from %s (from %r%s)" % (want, got, typed_ds, source_ds, ' with defaults' if defaults else '')
        # make more lines "ok" for the second lap
        defaults = {'number:int': '0', 'float64': '0', 'json': '"replacement"'}
        add_want(data[3])
        add_want(data[5])
        data[6][4] = '"replacement"'
        add_want(data[6])
        want.sort()  # these were added out of order; sorting on the int32_10 value restores the expected order

def synthesis():
    typerename = dict(
        int64="int64_10",
        int32="int32_10",
        bits64="bits64_10",
        bits32="bits32_10",
        bool="strbool",
        datetime="datetime:%Y-%m-%d %H:%M:%S.%f",
        date="date:%Y-%m-%d",
        time="time:%H:%M:%S.%f",
        unicode="unicode:utf-8",
    )
    columns = {k: typerename.get(v.type, v.type) for k, v in datasets.typed.columns.items()}
    retyped = subjobs.build(
        "dataset_type",
        options=dict(column2type=columns),
        datasets=dict(source=datasets.untyped),
    )
    subjobs.build("test_compare_datasets", datasets=dict(a=datasets.typed, b=retyped))

def sort(src, ix, **kw):
    ds = subjobs.build('dataset_sort', source=src, sort_across_slices=True, **kw).dataset()
    want = sorted(src.iterate(None), key=itemgetter(ix))
    assert list(ds.iterate(None)) == want, '%s != sorted(%s)' % (ds, src,)
    return ds

def verify(zipname, inside_filenames, want_ds, **kw):
    opts = dict(
        filename=g.job.filename(zipname),
        inside_filenames=inside_filenames,
    )
    opts.update(kw)
    jid = subjobs.build('csvimport_zip', options=opts)
    for dsn, want_data in want_ds.items():
        got_data = list(Dataset(jid, dsn).iterate(None, '0'))
        assert got_data == want_data, "%s/%s from %s didn't contain %r, instead contained %r" % (jid, dsn, zipname, want_data, got_data)

def getImageCluster(lat_deg, lon_deg, delta_lat, delta_long, zoom):
    xmin, ymax = deg2num(lat_deg, lon_deg, zoom)
    xmax, ymin = deg2num(lat_deg + delta_lat, lon_deg + delta_long, zoom)
    # each tile is 256x256 px, so the cluster is an exact multiple of 256 in both dimensions
    size = ((xmax - xmin + 1) * 256, (ymax - ymin + 1) * 256)
    cluster = Image.new('RGB', size)
    for xtile in range(xmin, xmax + 1):
        for ytile in range(ymin, ymax + 1):
            t = build('tile', zoom=zoom, xtile=xtile, ytile=ytile)
            tile = Image.open(t.filename('tile.png'))
            cluster.paste(tile, box=((xtile - xmin) * 256, (ytile - ymin) * 256))
    return cluster, size, num2deg(xmin, ymin, zoom), num2deg(xmax + 1, ymax + 1, zoom)

def _verify(name, types, data, coltype, want, default, want_fail, kw):
    if callable(want):
        check = want
    else:
        def check(got, fromstr, filtered=False):
            want1 = want if isinstance(want, list) else want[typ]
            if filtered:
                want1 = want1[::2]
            assert got == want1, 'Expected %r, got %r from %s.' % (want1, got, fromstr,)
    dw = DatasetWriter(name=name, columns={'data': coltype, 'extra': 'bytes'})
    dw.set_slice(0)
    for ix, v in enumerate(data):
        dw.write(v, b'1' if ix % 2 == 0 else b'skip')
    for sliceno in range(1, g.slices):
        dw.set_slice(sliceno)
    bytes_ds = dw.finish()
    for typ in types:
        opts = dict(column2type=dict(data=typ))
        opts.update(kw)
        if default is not no_default:
            opts['defaults'] = {'data': default}
        try:
            jid = subjobs.build('dataset_type', datasets=dict(source=bytes_ds), options=opts)
        except JobError:
            if want_fail:
                continue
            raise Exception('Typing %r as %s failed.' % (bytes_ds, typ,))
        assert not want_fail, "Typing %r as %s should have failed, but didn't (%s)." % (bytes_ds, typ, jid)
        typed_ds = Dataset(jid)
        got = list(typed_ds.iterate(0, 'data'))
        check(got, '%s (typed as %s from %r)' % (typed_ds, typ, bytes_ds,))
        if 'filter_bad' not in opts and not callable(want):
            opts['filter_bad'] = True
            opts['column2type']['extra'] = 'int32_10'
            jid = subjobs.build('dataset_type', datasets=dict(source=bytes_ds), options=opts)
            typed_ds = Dataset(jid)
            got = list(typed_ds.iterate(0, 'data'))
            check(got, '%s (typed as %s from %r with every other line skipped from filter_bad)' % (typed_ds, typ, bytes_ds,), True)
        used_type(typ)

def synthesis(job):
    ix = 0
    jobs = []
    for current, subdirs, files in walk(options.path):
        files = sorted(
            x for x in files
            if splitext(x)[1].upper() in options.validextensions
            and not islink(join(current, x))
        )
        if files:
            jobs.append(build('scandir', directory=current, files=files))  # files is already sorted
            print(ix, current)
            ix += 1
    return jobs

def verify_hashing(caption, want_values, **kw):
    ds = subjobs.build('dataset_type', source=source, column2type=dict(a='int32_10'), caption=caption, **kw).dataset()
    got_values = set()
    for sliceno in range(g.slices):
        for got in ds.iterate(sliceno):
            assert hashfunc(got[0]) % g.slices == sliceno
            assert got not in got_values
            got_values.add(got)
    assert want_values == got_values

def check_one(slices, key, source, reverse=False):
    jid = subjobs.build(
        "dataset_sort",
        options=dict(
            sort_columns=key,
            sort_order="descending" if reverse else "ascending",
        ),
        datasets=dict(source=source),
    )
    ds = Dataset(jid)
    key_off = sorted(test_data.data).index(key)
    # This provides better separation than the replacement values
    # used in the actual sort method (but this is slow).
    if 'date' in key or 'time' in key:
        nonepos = 1
    else:
        nonepos = -1
    def cmp(a, b):
        a = a[key_off]
        b = b[key_off]
        if a is None:
            if b is None:
                return 0
            return nonepos
        if b is None:
            return -nonepos
        if isinstance(a, float):
            if isnan(a):
                if isnan(b):
                    return 0
                return 1
            if isnan(b):
                return -1
        if a < b:
            return -1
        return a > b  # booleans compare as integers, so this is 1 when a > b and 0 when equal
    keycmp = cmp_to_key(cmp)
    for sliceno in range(slices):
        good = sorted(test_data.sort_data_for_slice(sliceno), key=keycmp, reverse=reverse)
        check = list(ds.iterate(sliceno))
        assert unnan(check) == unnan(good), "Slice %d sorted on %s bad (%s)" % (sliceno, key, jid,)

def synthesis():
    total = 0
    jobs = datasets.source.chain(length=options.chain_length, stop_ds=datasets.stop)
    for src in jobs:
        jid = build('dataset_checksum', options=dict(columns=options.columns, sort=options.sort), datasets=dict(source=src))
        data = blob.load(jobid=jid)
        total ^= data.sum
    print("Total: %016x" % (total,))
    return DotDict(sum=total, columns=data.columns, sort=options.sort, sources=jobs)

def synthesis(params, prepare_res):
    dw = prepare_res
    source = dw.finish()
    jid = subjobs.build(
        "dataset_sort",
        options=dict(
            sort_columns="num",
            sort_across_slices=True,
        ),
        datasets=dict(source=source),
    )
    ds = Dataset(jid)
    data = list(ds.iterate(None, "str"))
    good = list("cghjabdefi") + [str(sliceno) for sliceno in range(params.slices)] * 64
    assert data == good

def test(params, p=False, a=False, s=False):
    prefix = "A bit of text."
    opts = {'prefix': prefix}
    name = 'test_output_'
    cookie = randint(10000, 99999)
    if p:
        name += 'p'
        opts['p'] = "Some words\nfrom prepare\nwith %d in them." % (cookie,)
    if a:
        name += 'a'
        opts['a'] = "A few words\nfrom analysis(%%d)\nwith the cookie %d in them." % (cookie,)
    if s:
        name += 's'
        opts['s'] = "Words\nfrom synthesis\ncookie is %d." % (cookie,)
    jid = subjobs.build(name, options=opts)
    d = jid.filename('OUTPUT/')
    chked = set()
    all_output = []
    def chk(part):
        output = jid.output(part)
        if isinstance(part, int):
            data = opts['a'] % (part,)
            part = str(part)
        else:
            data = opts[part[0]]
        chked.add(part)
        with open(d + part, 'r') as fh:
            got = fh.read().replace('\r\n', '\n')
        want = prefix + '\n' + data + '\n'
        assert got == want, "%s produced %r in %s, expected %r" % (jid, got, part, want,)
        assert output == got, 'job.output disagrees with manual file reading for %s in %s. %r != %r' % (part, jid, output, got,)
        all_output.append(got)
    if p:
        chk('prepare')
    if a:
        for sliceno in range(params.slices):
            chk(sliceno)
    if s:
        chk('synthesis')
    unchked = set(os.listdir(d)) - chked
    assert not unchked, "Unexpected OUTPUT files from %s: %r" % (jid, unchked,)
    output = jid.output()
    got = ''.join(all_output)
    assert output == got, 'job.output disagrees with manual file reading for <all> in %s. %r != %r' % (jid, output, got,)

def synthesis(prepare_res):
    opts = DotDict((k, v) for k, v in options.items() if k in a_csvimport.options)
    lst = prepare_res
    previous = datasets.previous
    for fn, info, dsn in lst:
        opts.filename = fn
        jid = subjobs.build('csvimport', options=opts, datasets=dict(previous=previous), caption="Import of %s from %s" % (info.filename, options.filename,))
        previous = Dataset(jid).link_to_here(dsn)
        if options.chaining == 'off':
            previous = None
    # jid and dsn still refer to the last import in the loop above
    if (len(lst) == 1 or options.chaining != 'off') and dsn != 'default':
        Dataset(jid).link_to_here('default')

def synthesis():
    if options.inner:
        res = DotDict()
        res.datetime = options.datetime + options.timedelta
        res.time = options.time.replace(minute=0)
        res.date = options.date.replace(month=1)
        return res
    else:
        opts = dict(
            datetime=datetime(2019, 11, 6, 17, 37, 2, 987654),
            time=time(17, 37, 2, 987654),
            date=date(2019, 11, 6),
            timedelta=timedelta(microseconds=987654),
            inner=True,
        )
        jid = subjobs.build('test_datetime', options=opts)
        res = jid.load()
        assert res.datetime == datetime(2019, 11, 6, 17, 37, 3, 975308)
        assert res.time == time(17, 0, 2, 987654)
        assert res.date == date(2019, 1, 6)

def check_no_separator(job):
    def write(data):
        fh.write(data + nl_b)
        wrote_c[data] += 1
        if q_b:
            data = q_b + data + q_b
            fh.write(q_b + data.replace(q_b, q_b + q_b) + q_b + nl_b)
            wrote_c[data] += 1
    for nl in (10, 0, 255):
        for q in (None, 0, 34, 13, 10, 228):
            if nl == q:
                continue
            filename = "no separator.%r.%r.txt" % (nl, q,)
            nl_b = bytechr(nl)
            q_b = bytechr(q) if q else b''
            wrote_c = Counter()
            with openx(filename) as fh:
                for splitpoint in range(256):
                    write(byteline(0, splitpoint, nl, q))
                    write(byteline(splitpoint, 256, nl, q))
            try:
                jid = subjobs.build("csvimport", options=dict(
                    filename=job.filename(filename),
                    quotes=q_b.decode("iso-8859-1"),
                    newline=nl_b.decode("iso-8859-1"),
                    separator='',
                    labelsonfirstline=False,
                    labels=["data"],
                ))
            except JobError:
                raise Exception("Importing %r failed" % (filename,))
            got_c = Counter(Dataset(jid).iterate(None, "data"))
            assert got_c == wrote_c, "Importing %r (%s) gave wrong contents" % (filename, jid,)

def synthesis(job):
    dw = job.datasetwriter()
    dw.add('a', 'ascii')
    dw.add('b', 'unicode')
    w = dw.get_split_write()
    w('A', 'B')
    w('\0', '\xe4')
    ds = dw.finish()
    def verify(data, filename):
        want = []
        for line in [['a', 'b']] + data:
            want.append(separator.join(quote + item + quote for item in line))
            want.append(line_separator)
        want = ''.join(want).encode('utf-8')
        if ext == '.gz':
            open_func = gzip.open
        else:
            open_func = open
        with open_func(j.filename(filename), 'rb') as fh:
            got = fh.read()
        assert want == got, "Expected %s/%s to contain %r, but contained %r" % (j, filename, want, got,)
    for separator in ('', '\0', 'wheeee'):
        for line_separator in ('', '\0', 'woooooo'):
            for quote in ('', 'qqq'):
                for ext in ('.csv', '.gz'):
                    for sliced, filename in ((False, 'out' + ext), (True, 'out.%d' + ext)):
                        j = subjobs.build(
                            'csvexport',
                            filename=filename,
                            separator=separator,
                            line_separator=line_separator,
                            quote_fields=quote,
                            sliced=sliced,
                            source=ds,
                        )
                        if sliced:
                            for sliceno, data in ((0, ['A', 'B']), (1, ['\0', '\xe4'])):
                                verify([data], filename % (sliceno,))
                        else:
                            verify([['A', 'B'], ['\0', '\xe4']], filename)

def verify(slices, data, source, previous=None, **options):
    jid = subjobs.build(
        "dataset_hashpart",
        datasets=dict(source=source, previous=previous),
        options=options,
    )
    hl = options["hashlabel"]
    h = typed_writer(columns[hl][0]).hash
    ds = Dataset(jid)
    good = {row[hl]: row for row in data}
    names = list(source.columns)
    for sliceno in range(slices):
        for row in ds.iterate_chain(sliceno, names):
            row = dict(zip(names, row))
            assert h(row[hl]) % slices == sliceno, "row %r is incorrectly in slice %d in %s" % (row, sliceno, ds)
            want = good[row[hl]]
            assert row == want, '%s (rehashed from %s) did not contain the right data for "%s".\nWanted\n%r\ngot\n%r' % (ds, source, hl, want, row)
    return ds