def analysis(sliceno, params):
    """Verify the per-slice contents of datasets.source and the named
    datasets ("named", "passed", "synthesis_split", "synthesis_manual",
    "nonetest") derived from it."""
    # Columns written per slice in the generating job.
    assert list(datasets.source.iterate(sliceno, "a")) == [sliceno, 42]
    assert list(datasets.source.iterate(sliceno, "b")) == ["a", str(sliceno)]
    named = Dataset(datasets.source, "named")
    assert list(named.iterate(sliceno, "c")) == [True, False]
    # Day is clamped so slice counts above 30 still give valid dates.
    expect_dates = [
        date(1536, 12, min(sliceno + 1, 31)),
        date(2236, 5, min(sliceno + 1, 31)),
    ]
    assert list(named.iterate(sliceno, "d")) == expect_dates
    if sliceno < test_data.value_cnt:
        # "passed" only has data in the first value_cnt slices.
        passed = Dataset(datasets.source, "passed")
        expect = tuple(v[sliceno] for _, v in sorted(test_data.data.items()))
        assert list(passed.iterate(sliceno)) == [expect]
    # "synthesis_split" was hashed on the first column; recompute which
    # rows land in this slice using the same hash function.
    synthesis_split = Dataset(datasets.source, "synthesis_split")
    rows = zip((1, 2, 3), "abc")
    hashfunc = typed_writer("int32").hash
    expect = [row for row in rows if hashfunc(row[0]) % params.slices == sliceno]
    assert list(synthesis_split.iterate(sliceno)) == expect
    synthesis_manual = Dataset(datasets.source, "synthesis_manual")
    assert list(synthesis_manual.iterate(sliceno, "sliceno")) == [sliceno]
    # Columns not in not_none_capable hold None instead of a value.
    nonetest = Dataset(datasets.source, "nonetest")
    expect = tuple(
        v[0] if k in test_data.not_none_capable else None
        for k, v in sorted(test_data.data.items())
    )
    assert list(nonetest.iterate(sliceno)) == [expect]
def synthesis(params):
    """Exercise dataset_sort: every datatype, reverse order, and a
    two-column descending sort across slices."""
    source = Dataset(subjobs.build("test_sorting_gendata"))
    # Test that all datatypes work for sorting
    for key in test_data.data:
        check_one(params.slices, key, source)
    # Check reverse sorting
    check_one(params.slices, "int32", source, reverse=True)
    # Check that sorting across slices and by two columns works
    jid = subjobs.build(
        "dataset_sort",
        options=dict(
            sort_columns=["int64", "int32"],
            sort_order="descending",
            sort_across_slices=True,
        ),
        datasets=dict(source=source),
    )
    column_order = sorted(test_data.data)
    int64_off = column_order.index("int64")
    int32_off = column_order.index("int32")
    all_data = chain.from_iterable(
        test_data.sort_data_for_slice(sliceno)
        for sliceno in range(params.slices)
    )
    # noneninf maps None so it orders like the sort method does.
    def sortkey(row):
        return (noneninf(row[int64_off]), noneninf(row[int32_off]))
    good = sorted(all_data, key=sortkey, reverse=True)
    ds = Dataset(jid)
    check = list(ds.iterate(None))
    # unnan presumably normalises NaN values so equality works (NaN != NaN).
    assert unnan(check) == unnan(good), \
        "Sorting across slices on [int64, int32] bad (%s)" % (jid,)
def test_filter_bad_across_types():
    """Type a dataset where several columns contain untypeable values and
    verify filter_bad drops exactly the bad lines; run once with only
    filter_bad and once with defaults rescuing some of the lines.

    Fix: assertion message typo "Exptected" -> "Expected".
    """
    columns = {
        'bytes': 'bytes',
        'float64': 'bytes',
        'int32_10': 'ascii',
        'json': 'unicode',
        'number:int': 'unicode',
        'unicode:utf-8': 'bytes',
    }
    # all_good, *values
    # Make sure all those types (except bytes) can filter other lines,
    # and be filtered by other lines. And that several filtering values
    # is not a problem (line 11).
    data = [
        (True, b'first', b'1.1', '1', '"a"', '001', b'ett',),
        (True, b'second', b'2.2', '2', '"b"', '02', b'tv\xc3\xa5',),
        (True, b'third', b'3.3', '3', '["c"]', '3.0', b'tre',),
        (False, b'fourth', b'4.4', '4', '"d"', '4.4', b'fyra',),  # number:int bad
        (False, b'fifth', b'5.5', '-', '"e"', '5', b'fem',),  # int32_10 bad
        (False, b'sixth', b'6.b', '6', '"f"', '6', b'sex',),  # float64 bad
        # A list, not a tuple: element 4 is mutated below for the second lap.
        [False, b'seventh', b'7.7', '7', '{"g"}', '7', b'sju',],  # json bad
        (False, b'eigth', b'8.8', '8', '"h"', '8', b'\xa5\xc3tta',),  # unicode:utf-8 bad
        (True, b'ninth', b'9.9', '9', '"i"', '9', b'nio',),
        (True, b'tenth', b'10', '10', '"j"', '10', b'tio',),
        (False, b'eleventh', b'11a', '1-', '"k",', '1,', b'elva',),  # float64, int32_10 and number:int bad
        (True, b'twelfth', b'12', '12', '"l"', '12', b'tolv',),
    ]
    dw = DatasetWriter(name="filter bad across types", columns=columns)
    dw.set_slice(0)
    want = []
    def add_want(v):
        # Expected typed values for a good line: int32_10, bytes, json, unicode.
        want.append((int(v[3]), v[1], json.loads(v[4]), v[6].decode('utf-8'),))
    for v in data:
        if v[0]:
            add_want(v)
        # Bad lines are written too - filter_bad is what must drop them.
        dw.write(*v[1:])
    # Everything lives in slice 0; remaining slices are left empty.
    for sliceno in range(1, g.slices):
        dw.set_slice(sliceno)
    source_ds = dw.finish()
    # Once with just filter_bad, once with some defaults too.
    defaults = {}
    for _ in range(2):
        jid = subjobs.build(
            'dataset_type',
            datasets=dict(source=source_ds),
            options=dict(column2type={t: t for t in columns}, filter_bad=True, defaults=defaults),
        )
        typed_ds = Dataset(jid)
        got = list(typed_ds.iterate(0, ['int32_10', 'bytes', 'json', 'unicode:utf-8']))
        assert got == want, "Expected %r, got %r from %s (from %r%s)" % (want, got, typed_ds, source_ds, ' with defaults' if defaults else '')
        # make more lines "ok" for the second lap
        defaults = {'number:int': '0', 'float64': '0', 'json': '"replacement"'}
        add_want(data[3])
        add_want(data[5])
        data[6][4] = '"replacement"'
        add_want(data[6])
        want.sort()  # adding them out of order, int32_10 sorts correctly.
def _verify(name, types, data, coltype, want, default, want_fail, kw):
    """Write `data` as a single 'data' column of `coltype`, then type it as
    each type in `types` through a dataset_type subjob and compare to `want`.

    want: a callable(got, fromstr, filtered=False) doing its own check, or a
          list (same expectation for every type), or a dict keyed by type.
    default: default value for typing, or the no_default sentinel for none.
    want_fail: the dataset_type job is expected to fail for these types.
    kw: extra dataset_type options merged into each job's options.
    """
    if callable(want):
        check = want
    else:
        def check(got, fromstr, filtered=False):
            # `typ` is read late-bound from the loop over `types` below.
            want1 = want if isinstance(want, list) else want[typ]
            if filtered:
                # filter_bad drops every other line (the b'skip' ones below).
                want1 = want1[::2]
            assert got == want1, 'Expected %r, got %r from %s.' % (want1, got, fromstr,)
    dw = DatasetWriter(name=name, columns={'data': coltype, 'extra': 'bytes'})
    dw.set_slice(0)
    for ix, v in enumerate(data):
        # 'extra' types as int32_10 only on even lines, so typing it with
        # filter_bad keeps exactly every other line.
        dw.write(v, b'1' if ix % 2 == 0 else b'skip')
    # All data goes in slice 0; remaining slices are left empty.
    for sliceno in range(1, g.slices):
        dw.set_slice(sliceno)
    bytes_ds = dw.finish()
    for typ in types:
        opts = dict(column2type=dict(data=typ))
        opts.update(kw)
        if default is not no_default:
            opts['defaults'] = {'data': default}
        try:
            jid = subjobs.build('dataset_type', datasets=dict(source=bytes_ds), options=opts)
        except JobError:
            if want_fail:
                continue
            raise Exception('Typing %r as %s failed.' % (bytes_ds, typ,))
        assert not want_fail, "Typing %r as %s should have failed, but didn't (%s)." % (bytes_ds, typ, jid)
        typed_ds = Dataset(jid)
        got = list(typed_ds.iterate(0, 'data'))
        check(got, '%s (typed as %s from %r)' % (typed_ds, typ, bytes_ds,))
        if 'filter_bad' not in opts and not callable(want):
            # Second pass: same typing but with filter_bad on 'extra',
            # which drops the odd lines (see the write loop above).
            opts['filter_bad'] = True
            opts['column2type']['extra'] = 'int32_10'
            jid = subjobs.build('dataset_type', datasets=dict(source=bytes_ds), options=opts)
            typed_ds = Dataset(jid)
            got = list(typed_ds.iterate(0, 'data'))
            check(got, '%s (typed as %s from %r with every other line skipped from filter_bad)' % (typed_ds, typ, bytes_ds,), True)
        used_type(typ)
def check_one(slices, key, source, reverse=False):
    """Sort `source` on column `key` via a dataset_sort subjob and verify
    each slice against a locally sorted copy of the same test data."""
    jid = subjobs.build(
        "dataset_sort",
        options=dict(
            sort_columns=key,
            sort_order="descending" if reverse else "ascending",
        ),
        datasets=dict(source=source),
    )
    ds = Dataset(jid)
    # Rows are tuples in sorted-column-name order; find our key's position.
    key_off = sorted(test_data.data).index(key)
    # This provides better separation than the replacement values
    # used in the actual sort method (but this is slow).
    if 'date' in key or 'time' in key:
        nonepos = 1  # None compares as larger (sorts last ascending)
    else:
        nonepos = -1  # None compares as smaller (sorts first ascending)
    def cmp(a, b):
        # Three-way compare of two rows on the key column, placing None
        # according to nonepos and NaN floats after everything else.
        a = a[key_off]
        b = b[key_off]
        if a is None:
            if b is None:
                return 0
            return nonepos
        if b is None:
            return -nonepos
        if isinstance(a, float):
            if isnan(a):
                if isnan(b):
                    return 0
                return 1
            if isnan(b):
                return -1
        if a < b:
            return -1
        return a > b  # bool: 1 if greater, 0 if equal
    keycmp = cmp_to_key(cmp)
    for sliceno in range(slices):
        good = sorted(test_data.sort_data_for_slice(sliceno), key=keycmp, reverse=reverse)
        check = list(ds.iterate(sliceno))
        # unnan presumably normalises NaNs so equality works (NaN != NaN).
        assert unnan(check) == unnan(good), "Slice %d sorted on %s bad (%s)" % (
            sliceno,
            key,
            jid,
        )
def synthesis(params, prepare_res):
    """Sort the dataset prepared in prepare across slices on "num" and
    verify the resulting order of the "str" column."""
    dw = prepare_res
    source = dw.finish()
    jid = subjobs.build(
        "dataset_sort",
        options=dict(sort_columns="num", sort_across_slices=True),
        datasets=dict(source=source),
    )
    got = list(Dataset(jid).iterate(None, "str"))
    # Ten fixed letters first, then each slice number repeated 64 times.
    expected = list("cghjabdefi")
    expected += [str(sliceno) for sliceno in range(params.slices)] * 64
    assert got == expected
def dataset(dsid):
    """Bottle view: with ?column= return a JSON sample of that column,
    otherwise return a dict with the Dataset for the template."""
    ds = Dataset(dsid.rstrip('/'))
    q = bottle.request.query
    if not q.column:
        return dict(ds=ds)
    lines = int(q.lines or 10)
    it = itertools.islice(ds.iterate(None, q.column), lines)
    coltype = ds.columns[q.column].type
    # Convert values json.dumps can't handle directly.
    if coltype in ('datetime', 'date', 'time',):
        it = map(str, it)
    elif coltype in ('bytes', 'pickle',):
        it = map(repr, it)
    res = list(it)
    bottle.response.content_type = 'application/json; charset=UTF-8'
    return json.dumps(res)
def verify_ds(options, d, d_bad, d_skipped, filename):
    """Build a csvimport job with `options` and verify the resulting
    default/"bad"/"skipped" datasets against the expected dicts
    (all keyed {line id: data}; each dict is consumed while checking).

    Fix: the "skipped" completeness message said "bad lines" (copy-paste
    from the bad-dataset check above); it now says "skipped lines".
    """
    jid = subjobs.build("csvimport", options=options)
    ds = Dataset(jid)
    expected_columns = {"ix", "0", "1"}
    if options.get("lineno_label"):
        expected_columns.add(options["lineno_label"])
        # Only computed under the label guard: int(ix) would raise for the
        # files containing non-numeric ix values (handled below).
        lineno_want = {ix: int(ix) for ix in ds.iterate(None, "ix")}
    assert set(ds.columns) == expected_columns
    # Order varies depending on slice count, so we use a dict {ix: data}
    for ix, a, b in ds.iterate(None, ["ix", "0", "1"]):
        try:
            ix = int(ix)
        except ValueError:
            # We have a few non-numeric ones
            pass
        assert ix in d, "Bad index %r in %r (%s)" % (
            ix,
            filename,
            jid,
        )
        assert a == b == d[ix], "Wrong data for line %r in %r (%s)" % (
            ix,
            filename,
            jid,
        )
        del d[ix]
    assert not d, "Not all lines returned from %r (%s), %r missing" % (
        filename,
        jid,
        set(d.keys()),
    )
    if options.get("allow_bad"):
        # Rejected lines end up in the "bad" sub-dataset with their lineno.
        for ix, data in Dataset(jid, "bad").iterate(None, ["lineno", "data"]):
            assert ix in d_bad, "Bad bad_lineno %d in %r (%s/bad) %r" % (
                ix,
                filename,
                jid,
                data,
            )
            assert data == d_bad[ix], "Wrong saved bad line %d in %r (%s/bad).\nWanted %r.\nGot %r." % (
                ix,
                filename,
                jid,
                d_bad[ix],
                data,
            )
            del d_bad[ix]
        assert not d_bad, "Not all bad lines returned from %r (%s), %r missing" % (
            filename,
            jid,
            set(d_bad.keys()),
        )
    if options.get("comment") or options.get("skip_lines"):
        # Comment/skip_lines matches end up in the "skipped" sub-dataset.
        for ix, data in Dataset(jid, "skipped").iterate(None, ["lineno", "data"]):
            assert ix in d_skipped, "Bad skipped_lineno %d in %r (%s/skipped) %r" % (
                ix,
                filename,
                jid,
                data,
            )
            assert data == d_skipped[ix], "Wrong saved skipped line %d in %r (%s/skipped).\nWanted %r.\nGot %r." % (
                ix,
                filename,
                jid,
                d_skipped[ix],
                data,
            )
            del d_skipped[ix]
        assert not d_skipped, "Not all skipped lines returned from %r (%s), %r missing" % (
            filename,
            jid,
            set(d_skipped.keys()),
        )
    if options.get("lineno_label"):
        lineno_got = dict(ds.iterate(None, ["ix", options.get("lineno_label")]))
        assert lineno_got == lineno_want, "%r != %r" % (
            lineno_got,
            lineno_want,
        )
def check_one(job, newline, sep, data, want_res=None, prefix="", quotes=False, leave_bad=False): sep_c = uni(chr(sep)) # Can't have separator character in unquoted values if not quotes and not leave_bad: data = [[el.replace(sep_c, "") for el in line] for line in data] if not want_res: want_res = [ tuple(s.encode("ascii") for s in line) for line in data[1:] ] filename = "%s_csv.%d.%s.txt" % (prefix, sep, "CRLF" if newline == "\r\n" else ord(newline)) newline = uni(newline) with job.open(filename, "w", encoding="iso-8859-1", temp=True) as fh: for line in data: if quotes: line = [ quotes + el.replace(quotes, quotes + quotes) + quotes for el in line ] fh.write(sep_c.join(line)) fh.write(newline) try: jid = subjobs.build("csvimport", options=dict( filename=job.filename(filename), separator=sep_c, quotes=quotes, newline='' if "\n" in newline else newline, )) except JobError as e: raise CSVImportException( "Failed to csvimport for separator %d with newline %r, csvimport error was:\n%s" % (sep, newline, e.format_msg())) ds = Dataset(jid) labels = sorted(ds.columns) if labels != data[0]: raise WrongLabelsException( "csvimport gave wrong labels for separator %d with newline %r: %r (expected %r)" % ( sep, newline, labels, data[0], )) res = list(ds.iterate(None, data[0])) if res != want_res: raise WrongDataException( "csvimport gave wrong data for separator %d with newline %r: %r (expected %r)" % ( sep, newline, res, want_res, ))
def test_filter_bad_across_types():
    """Type a dataset where most columns contain some untypeable values with
    filter_bad=True, verifying both the surviving lines and the lines landing
    in the "bad" sub-dataset; then run a second lap where defaults rescue
    some of the previously bad lines."""
    columns = {
        'bytes': 'bytes',
        'float64': 'bytes',
        'int32_10': 'ascii',
        'json': 'unicode',
        'number:int': 'unicode',
        'unicode:utf-8': 'bytes',
    }
    # all_good, *values
    # Make sure all those types (except bytes) can filter other lines,
    # and be filtered by other lines. And that several filtering values
    # is not a problem (line 11).
    # Lists (not tuples): mutated below (zpickle append, seventh's json fix).
    data = [
        [True, b'first', b'1.1', '1', '"a"', '001', b'ett', ],
        [True, b'second', b'2.2', '2', '"b"', '02', b'tv\xc3\xa5', ],
        [True, b'third', b'3.3', '3', '["c"]', '3.0', b'tre', ],
        [False, b'fourth', b'4.4', '4', '"d"', '4.4', b'fyra', ],  # number:int bad
        [False, b'fifth', b'5.5', '-', '"e"', '5', b'fem', ],  # int32_10 bad
        [False, b'sixth', b'6.b', '6', '"f"', '6', b'sex', ],  # float64 bad
        [False, b'seventh', b'7.7', '7', '{"g"}', '7', b'sju', ],  # json bad
        [False, b'eigth', b'8.8', '8', '"h"', '8', b'\xa5\xc3tta', ],  # unicode:utf-8 bad
        [True, b'ninth', b'9.9', '9', '"i"', '9', b'nio', ],
        [True, b'tenth', b'10', '10', '"j"', '10', b'tio', ],
        [False, b'eleventh', b'11a', '1-', '"k",', '1,', b'elva', ],  # float64, int32_10 and number:int bad
        [True, b'twelfth', b'12', '12', '"l"', '12', b'tolv', ],
    ]
    # Computed before the zpickle column is added, so these tuples match
    # sorted(columns) (which never includes zpickle).
    want_bad = [tuple(l[1:]) for l in data if not l[0]]
    dw = DatasetWriter(name="filter bad across types", columns=columns, allow_missing_slices=True)
    cols_to_check = ['int32_10', 'bytes', 'json', 'unicode:utf-8']
    if PY3:
        # z so it sorts last.
        dw.add('zpickle', 'pickle')
        cols_to_check.append('zpickle')
        for ix in range(len(data)):
            data[ix].append({ix})
    dw.set_slice(0)
    want = []
    def add_want(ix):
        # Expected typed values for good line `ix`, in cols_to_check order.
        v = data[ix]
        want.append((
            int(v[3]),
            v[1],
            json.loads(v[4]),
            v[6].decode('utf-8'),
        ))
        if PY3:
            want[-1] = want[-1] + (v[7], )
    for ix, v in enumerate(data):
        if v[0]:
            add_want(ix)
        # Bad lines are written too - filter_bad is what must drop them.
        dw.write(*v[1:])
    source_ds = dw.finish()
    # Once with just filter_bad, once with some defaults too.
    defaults = {}
    for _ in range(2):
        jid = subjobs.build(
            'dataset_type',
            datasets=dict(source=source_ds),
            options=dict(column2type={t: t for t in columns}, filter_bad=True, defaults=defaults),
        )
        typed_ds = Dataset(jid)
        got = list(typed_ds.iterate(0, cols_to_check))
        assert got == want, "Expected %r, got %r from %s (from %r%s)" % (
            want, got, typed_ds, source_ds, ' with defaults' if defaults else '')
        bad_ds = Dataset(jid, 'bad')
        got_bad = list(bad_ds.iterate(0, sorted(columns)))
        assert got_bad == want_bad, "Expected %r, got %r from %s (from %r%s)" % (
            want_bad, got_bad, bad_ds, source_ds, ' with defaults' if defaults else '')
        # make more lines "ok" for the second lap
        if not defaults:
            want_bad.pop(0)  # number:int
            want_bad.pop(1)  # float64
            want_bad.pop(1)  # json
            defaults = {'number:int': '0', 'float64': '0', 'json': '"replacement"'}
            add_want(3)
            add_want(5)
            data[6][4] = '"replacement"'
            add_want(6)
            want.sort()  # adding them out of order, int32_10 sorts correctly.
def synthesis(params):
    """Verify the "data" column of the dataset in params.jobid holds
    exactly foo and bar."""
    expected = {"foo", "bar"}
    got = set(Dataset(params.jobid).iterate(None, "data"))
    assert got == expected
def analysis(sliceno, params):
    """Each analysis slice verifies the full (all-slice) "data" column of
    the dataset in params.jobid is exactly foo and bar."""
    expected = {"foo", "bar"}
    got = set(Dataset(params.jobid).iterate(None, "data"))
    assert got == expected