Example #1
def synthesis(params):
    source = Dataset(subjobs.build("test_sorting_gendata"))
    # Test that all datatypes work for sorting
    for key in test_data.data:
        check_one(params.slices, key, source)
    # Check reverse sorting
    check_one(params.slices, "int32", source, reverse=True)
    # Check that sorting across slices and by two columns works
    jid = subjobs.build(
        "dataset_sort",
        options=dict(
            sort_columns=["int64", "int32"],
            sort_order="descending",
            sort_across_slices=True,
        ),
        datasets=dict(source=source),
    )
    int64_off = sorted(test_data.data).index("int64")
    int32_off = sorted(test_data.data).index("int32")
    all_data = chain.from_iterable(
        test_data.sort_data_for_slice(sliceno)
        for sliceno in range(params.slices))
    good = sorted(all_data,
                  key=lambda t: (
                      noneninf(t[int64_off]),
                      noneninf(t[int32_off]),
                  ),
                  reverse=True)
    ds = Dataset(jid)
    check = list(ds.iterate(None))
    assert unnan(check) == unnan(
        good), "Sorting across slices on [int64, int32] bad (%s)" % (jid, )
def analysis(sliceno, params):
    assert list(datasets.source.iterate(sliceno, "a")) == [sliceno, 42]
    assert list(datasets.source.iterate(sliceno, "b")) == ["a", str(sliceno)]
    named = Dataset(datasets.source, "named")
    assert list(named.iterate(sliceno, "c")) == [True, False]
    assert list(named.iterate(sliceno, "d")) == [
        date(1536, 12, min(sliceno + 1, 31)),
        date(2236, 5, min(sliceno + 1, 31))
    ]
    if sliceno < test_data.value_cnt:
        passed = Dataset(datasets.source, "passed")
        good = tuple(v[sliceno] for _, v in sorted(test_data.data.items()))
        assert list(passed.iterate(sliceno)) == [good]
    synthesis_split = Dataset(datasets.source, "synthesis_split")
    values = zip((
        1,
        2,
        3,
    ), "abc")
    hash = typed_writer("int32").hash
    good = [v for v in values if hash(v[0]) % params.slices == sliceno]
    assert list(synthesis_split.iterate(sliceno)) == good
    synthesis_manual = Dataset(datasets.source, "synthesis_manual")
    assert list(synthesis_manual.iterate(sliceno, "sliceno")) == [sliceno]
    nonetest = Dataset(datasets.source, "nonetest")
    good = tuple(v[0] if k in test_data.not_none_capable else None
                 for k, v in sorted(test_data.data.items()))
    assert list(nonetest.iterate(sliceno)) == [good]
Example #3
def synthesis():
    dw_a = DatasetWriter(name='a', columns={'num': 'int32'})
    dw_b = DatasetWriter(name='b', columns={'num': 'int32'}, previous=dw_a)
    dw_c = DatasetWriter(name='c', columns={'num': 'int32'}, previous=dw_b)
    w = dw_a.get_split_write()
    w(3)
    w(2)
    w = dw_b.get_split_write()
    w(2)
    w(1)
    w = dw_c.get_split_write()
    w(0)
    a = dw_a.finish()
    b = dw_b.finish()
    c = dw_c.finish()

    opts = dict(
        sort_columns='num',
        sort_across_slices=True,
    )

    # sort as a chain
    jid = subjobs.build('dataset_sort',
                        options=opts,
                        datasets=dict(source=a, previous=None))
    assert list(Dataset(jid).iterate(None, 'num')) == [2, 3]
    sorted_a = jid
    jid = subjobs.build('dataset_sort',
                        options=opts,
                        datasets=dict(source=b, previous=jid))
    assert list(Dataset(jid).iterate_chain(None, 'num')) == [2, 3, 1, 2]
    jid = subjobs.build('dataset_sort',
                        options=opts,
                        datasets=dict(source=c, previous=jid))
    assert list(Dataset(jid).iterate_chain(None, 'num')) == [2, 3, 1, 2, 0]

    # sort all as a single dataset
    jid = subjobs.build('dataset_sort',
                        options=opts,
                        datasets=dict(source=c, previous=None))
    assert list(Dataset(jid).iterate_chain(None, 'num')) == [0, 1, 2, 2, 3]

    # merge b and c but not a
    jid = subjobs.build('dataset_sort',
                        options=opts,
                        datasets=dict(source=c, previous=sorted_a))
    # test with new style job.dataset
    assert list(jid.dataset().iterate(None, 'num')) == [0, 1, 2]
    assert list(jid.dataset().iterate_chain(None, 'num')) == [2, 3, 0, 1, 2]
Example #4
def name2ds(n):
	if exists(n):
		# it's a path - dig out parts, maybe update WORKDIRS
		n = realpath(n)
		if n.endswith("/dataset.pickle"):
			n = n.rsplit("/", 1)[0]
		if exists(join(n, "dataset.pickle")):
			# includes ds name
			base, jid, name = n.rsplit("/", 2)
			n = (jid, name)
		else:
			# bare jid (no ds name)
			base, jid = n.rsplit("/", 1)
			n = jid
		k = jid.rsplit("-", 1)[0]
		if WORKDIRS.get(k, base) != base:
			print("### Overriding workdir %s to %s" % (k, base,))
		WORKDIRS[k] = base
	elif n.startswith('/'):
		# meant to be a path, but it does not exist
		return None
	try:
		ds = Dataset(n)
	except IOError:
		return None
	slices = ds.jobid.params.slices
	from accelerator import g
	if hasattr(g, 'slices'):
		assert g.slices == slices, "Dataset %s needs %d slices, but we are already using %d slices" % (ds, slices, g.slices)
	else:
		g.slices = slices
	return ds
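name2ds accepts either a plain job id (optionally with a dataset name) or a filesystem path into a workdir; a small usage sketch (the job ids and paths below are placeholders):

ds = name2ds('test-42')                               # bare job id -> default dataset
ds = name2ds('/workdirs/test/test-42')                # path to a job directory
ds = name2ds('/workdirs/test/test-42/named')          # path including a dataset name
ds = name2ds('/workdirs/test/test-42/dataset.pickle') # path to the pickle itself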
def test_filter_bad_across_types():
	columns={
		'bytes': 'bytes',
		'float64': 'bytes',
		'int32_10': 'ascii',
		'json': 'unicode',
		'number:int': 'unicode',
		'unicode:utf-8': 'bytes',
	}
	# all_good, *values
	# Make sure all those types (except bytes) can filter other lines,
	# and be filtered by other lines. And that several filtering values
	# are not a problem (line 11).
	data = [
		(True,  b'first',    b'1.1', '1',  '"a"',   '001', b'ett',),
		(True,  b'second',   b'2.2', '2',  '"b"',   '02',  b'tv\xc3\xa5',),
		(True,  b'third',    b'3.3', '3',  '["c"]', '3.0', b'tre',),
		(False, b'fourth',   b'4.4', '4',  '"d"',   '4.4', b'fyra',),       # number:int bad
		(False, b'fifth',    b'5.5', '-',  '"e"',   '5',   b'fem',),        # int32_10 bad
		(False, b'sixth',    b'6.b', '6',  '"f"',   '6',   b'sex',),        # float64 bad
		[False, b'seventh',  b'7.7', '7',  '{"g"}', '7',   b'sju',],        # json bad
		(False, b'eigth',    b'8.8', '8',  '"h"',   '8',   b'\xa5\xc3tta',),# unicode:utf-8 bad
		(True,  b'ninth',    b'9.9', '9',  '"i"',   '9',   b'nio',),
		(True,  b'tenth',    b'10',  '10', '"j"',   '10',  b'tio',),
		(False, b'eleventh', b'11a', '1-', '"k",',  '1,',  b'elva',),       # float64, int32_10 and number:int bad
		(True,  b'twelfth',  b'12',  '12', '"l"',   '12',  b'tolv',),
	]
	dw = DatasetWriter(name="filter bad across types", columns=columns)
	dw.set_slice(0)
	want = []
	def add_want(v):
		want.append((int(v[3]), v[1], json.loads(v[4]), v[6].decode('utf-8'),))
	for v in data:
		if v[0]:
			add_want(v)
		dw.write(*v[1:])
	for sliceno in range(1, g.slices):
		dw.set_slice(sliceno)
	source_ds = dw.finish()
	# Once with just filter_bad, once with some defaults too.
	defaults = {}
	for _ in range(2):
		jid = subjobs.build(
			'dataset_type',
			datasets=dict(source=source_ds),
			options=dict(column2type={t: t for t in columns}, filter_bad=True, defaults=defaults),
		)
		typed_ds = Dataset(jid)
		got = list(typed_ds.iterate(0, ['int32_10', 'bytes', 'json', 'unicode:utf-8']))
		assert got == want, "Expected %r, got %r from %s (from %r%s)" % (want, got, typed_ds, source_ds, ' with defaults' if defaults else '')
		# make more lines "ok" for the second lap
		defaults = {'number:int': '0', 'float64': '0', 'json': '"replacement"'}
		add_want(data[3])
		add_want(data[5])
		data[6][4] = '"replacement"'
		add_want(data[6])
		want.sort() # adding them out of order, int32_10 sorts correctly.
Example #6
def synthesis(prepare_res):
    opts = DotDict(
        (k, v) for k, v in options.items() if k in a_csvimport.options)
    lst = prepare_res
    previous = datasets.previous
    for fn, info, dsn in lst:
        opts.filename = fn
        jid = subjobs.build('csvimport',
                            options=opts,
                            datasets=dict(previous=previous),
                            caption="Import of %s from %s" % (
                                info.filename,
                                options.filename,
                            ))
        previous = Dataset(jid).link_to_here(dsn)
        if options.chaining == 'off':
            previous = None
    if (len(lst) == 1 or options.chaining != 'off') and dsn != 'default':
        Dataset(jid).link_to_here('default')
Example #7
def verify(zipname, inside_filenames, want_ds, **kw):
    opts = dict(
        filename=g.job.filename(zipname),
        inside_filenames=inside_filenames,
    )
    opts.update(kw)
    jid = subjobs.build('csvimport_zip', options=opts)
    for dsn, want_data in want_ds.items():
        got_data = list(Dataset(jid, dsn).iterate(None, '0'))
        assert got_data == want_data, "%s/%s from %s didn't contain %r, instead contained %r" % (
            jid, dsn, zipname, want_data, got_data)
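A hedged example of how verify might be invoked; the zip name and values are made up, and the mapping shape of inside_filenames (zip member -> dataset name) is an assumption about the csvimport_zip method:

verify(
    'two_files.zip',
    inside_filenames={'a.csv': 'a', 'b.csv': 'b'},  # assumed shape
    want_ds={'a': [b'1', b'2'], 'b': [b'3']},       # dataset name -> expected column "0"
)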
Example #8
def synthesis(job):
    manual_chain = [Dataset(jobids.selfchain, name) for name in "abcdefgh"]
    manual_abf = [manual_chain[0], manual_chain[1], manual_chain[5]]
    # build a local abf chain
    prev = None
    for ix, ds in enumerate(manual_abf):
        name = "abf%d" % (ix, )
        prev = ds.link_to_here(name, override_previous=prev)
    manual_abf_data = list(Dataset.iterate_list(None, None, manual_abf))
    local_abf_data = list(Dataset(job, "abf2").iterate_chain(None, None))
    assert manual_abf_data == local_abf_data
    # disconnect h, verify there is no chain
    manual_chain[-1].link_to_here("alone", override_previous=None)
    assert len(Dataset(job, "alone").chain()) == 1
    # check that the original chain is unhurt
    assert manual_chain == manual_chain[-1].chain()

    # So far so good, now make a chain long enough to have a cache.
    prev = None
    ix = 0
    going = True
    while going:
        if prev and "cache" in prev._data:
            going = False
        name = "longchain%d" % (ix, )
        dw = DatasetWriter(name=name, previous=prev)
        dw.add("ix", "number")
        dw.get_split_write()(ix)
        prev = dw.finish()
        ix += 1
    # we now have a chain that goes one past the first cache point
    full_chain = Dataset(prev).chain()
    assert "cache" in full_chain[
        -2]._data  # just to check the above logic is correct
    assert "cache" not in full_chain[-1]._data  # just to be sure..
    full_chain[-2].link_to_here("nocache", override_previous=None)
    full_chain[-1].link_to_here("withcache", override_previous=full_chain[-3])
    assert "cache" not in Dataset(job, "nocache")._data
    assert "cache" in Dataset(job, "withcache")._data
    # And make sure they both get the right data too.
    assert list(Dataset(prev).iterate_chain(None, "ix")) == list(range(ix))
    assert list(Dataset(job, "nocache").iterate_chain(None, "ix")) == [ix - 2]
    assert list(Dataset(job, "withcache").iterate_chain(
        None, "ix")) == list(range(ix - 2)) + [ix - 1]
def _verify(name, types, data, coltype, want, default, want_fail, kw):
	if callable(want):
		check = want
	else:
		def check(got, fromstr, filtered=False):
			want1 = want if isinstance(want, list) else want[typ]
			if filtered:
				want1 = want1[::2]
			assert got == want1, 'Expected %r, got %r from %s.' % (want1, got, fromstr,)
	dw = DatasetWriter(name=name, columns={'data': coltype, 'extra': 'bytes'})
	dw.set_slice(0)
	for ix, v in enumerate(data):
		dw.write(v, b'1' if ix % 2 == 0 else b'skip')
	for sliceno in range(1, g.slices):
		dw.set_slice(sliceno)
	bytes_ds = dw.finish()
	for typ in types:
		opts = dict(column2type=dict(data=typ))
		opts.update(kw)
		if default is not no_default:
			opts['defaults'] = {'data': default}
		try:
			jid = subjobs.build('dataset_type', datasets=dict(source=bytes_ds), options=opts)
		except JobError:
			if want_fail:
				continue
			raise Exception('Typing %r as %s failed.' % (bytes_ds, typ,))
		assert not want_fail, "Typing %r as %s should have failed, but didn't (%s)." % (bytes_ds, typ, jid)
		typed_ds = Dataset(jid)
		got = list(typed_ds.iterate(0, 'data'))
		check(got, '%s (typed as %s from %r)' % (typed_ds, typ, bytes_ds,))
		if 'filter_bad' not in opts and not callable(want):
			opts['filter_bad'] = True
			opts['column2type']['extra'] = 'int32_10'
			jid = subjobs.build('dataset_type', datasets=dict(source=bytes_ds), options=opts)
			typed_ds = Dataset(jid)
			got = list(typed_ds.iterate(0, 'data'))
			check(got, '%s (typed as %s from %r with every other line skipped from filter_bad)' % (typed_ds, typ, bytes_ds,), True)
		used_type(typ)
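_verify refers to no_default and used_type, which are defined elsewhere in the test module; plausible stand-ins (assumptions, not the real definitions):

no_default = object()  # sentinel meaning "no default value was supplied"

_used_types = set()
def used_type(typ):
	# record which types the test has exercised, so coverage can be checked later
	_used_types.add(typ)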
Example #10
def check_one(slices, key, source, reverse=False):
    jid = subjobs.build(
        "dataset_sort",
        options=dict(
            sort_columns=key,
            sort_order="descending" if reverse else "ascending",
        ),
        datasets=dict(source=source),
    )
    ds = Dataset(jid)
    key_off = sorted(test_data.data).index(key)
    # This provides better separation than the replacement values
    # used in the actual sort method (but this is slow).
    if 'date' in key or 'time' in key:
        nonepos = 1
    else:
        nonepos = -1

    def cmp(a, b):
        a = a[key_off]
        b = b[key_off]
        if a is None:
            if b is None:
                return 0
            return nonepos
        if b is None:
            return -nonepos
        if isinstance(a, float):
            if isnan(a):
                if isnan(b):
                    return 0
                return 1
            if isnan(b):
                return -1
        if a < b:
            return -1
        return a > b

    keycmp = cmp_to_key(cmp)
    for sliceno in range(slices):
        good = sorted(test_data.sort_data_for_slice(sliceno),
                      key=keycmp,
                      reverse=reverse)
        check = list(ds.iterate(sliceno))
        assert unnan(check) == unnan(
            good), "Slice %d sorted on %s bad (%s)" % (
                sliceno,
                key,
                jid,
            )
def synthesis(params, prepare_res):
    dw = prepare_res
    source = dw.finish()
    jid = subjobs.build(
        "dataset_sort",
        options=dict(
            sort_columns="num",
            sort_across_slices=True,
        ),
        datasets=dict(source=source),
    )
    ds = Dataset(jid)
    data = list(ds.iterate(None, "str"))
    good = list("cghjabdefi") + \
           [str(sliceno) for sliceno in range(params.slices)] * 64
    assert data == good
Example #12
	def dataset(dsid):
		ds = Dataset(dsid.rstrip('/'))
		q = bottle.request.query
		if q.column:
			lines = int(q.lines or 10)
			it = ds.iterate(None, q.column)
			it = itertools.islice(it, lines)
			t = ds.columns[q.column].type
			if t in ('datetime', 'date', 'time',):
				it = map(str, it)
			elif t in ('bytes', 'pickle',):
				it = map(repr, it)
			res = list(it)
			bottle.response.content_type = 'application/json; charset=UTF-8'
			return json.dumps(res)
		else:
			return dict(ds=ds)
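The handler above is presumably registered as a bottle route; one plausible wiring and request (the URL layout is an assumption, the snippet does not show how it is mounted):

import bottle

bottle.route('/dataset/<dsid:path>')(dataset)
# GET /dataset/<jobid>?column=num&lines=5 would then return a JSON list of the
# first five values of that column, with date/time values stringified and
# bytes/pickle values repr()d.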
def check_no_separator(job):
    def write(data):
        fh.write(data + nl_b)
        wrote_c[data] += 1
        if q_b:
            data = q_b + data + q_b
            fh.write(q_b + data.replace(q_b, q_b + q_b) + q_b + nl_b)
            wrote_c[data] += 1

    for nl in (10, 0, 255):
        for q in (None, 0, 34, 13, 10, 228):
            if nl == q:
                continue
            filename = "no separator.%r.%r.txt" % (
                nl,
                q,
            )
            nl_b = bytechr(nl)
            q_b = bytechr(q) if q else b''
            wrote_c = Counter()
            with openx(filename) as fh:
                for splitpoint in range(256):
                    write(byteline(0, splitpoint, nl, q))
                    write(byteline(splitpoint, 256, nl, q))
            try:
                jid = subjobs.build("csvimport",
                                    options=dict(
                                        filename=job.filename(filename),
                                        quotes=q_b.decode("iso-8859-1"),
                                        newline=nl_b.decode("iso-8859-1"),
                                        separator='',
                                        labelsonfirstline=False,
                                        labels=["data"],
                                    ))
            except JobError:
                raise Exception("Importing %r failed" % (filename, ))
            got_c = Counter(Dataset(jid).iterate(None, "data"))
            assert got_c == wrote_c, "Importing %r (%s) gave wrong contents" % (
                filename,
                jid,
            )
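check_no_separator uses a few helpers that are not part of the snippet (bytechr, byteline, openx); rough stand-ins showing the intent (assumptions, not the real definitions):

def bytechr(i):
    # a single byte from an integer, portable across python 2 and 3
    return bytes(bytearray([i]))

def byteline(low, high, nl, q):
    # every byte value in [low, high) except the newline and quote characters
    return bytes(bytearray(c for c in range(low, high) if c != nl and c != q))

def openx(filename):
    # create the file exclusively in binary mode so reruns fail loudly
    return open(filename, 'xb')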
Example #14
def verify(slices, data, source, previous=None, **options):
    jid = subjobs.build(
        "dataset_hashpart",
        datasets=dict(source=source, previous=previous),
        options=options,
    )
    hl = options["hashlabel"]
    h = typed_writer(columns[hl][0]).hash
    ds = Dataset(jid)
    good = {row[hl]: row for row in data}
    names = list(source.columns)
    for slice in range(slices):
        for row in ds.iterate_chain(slice, names):
            row = dict(zip(names, row))
            assert h(
                row[hl]
            ) % slices == slice, "row %r is incorrectly in slice %d in %s" % (
                row, slice, ds)
            want = good[row[hl]]
            assert row == want, '%s (rehashed from %s) did not contain the right data for "%s".\nWanted\n%r\ngot\n%r' % (
                ds, source, hl, want, row)
    return ds
Example #15
def dataset(self, name='default'):
    from accelerator.dataset import Dataset
    return Dataset(self, name)
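This is the new-style accessor used elsewhere in these examples (e.g. jid.dataset() in Example #3); a short usage sketch, assuming job is a finished Job instance:

ds = job.dataset()                # the job's "default" dataset
named = job.dataset('named')      # a named dataset on the same job
rows = list(named.iterate(None))  # iterate over all slices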
def verify_ds(options, d, d_bad, d_skipped, filename):
    jid = subjobs.build("csvimport", options=options)
    ds = Dataset(jid)
    expected_columns = {"ix", "0", "1"}
    if options.get("lineno_label"):
        expected_columns.add(options["lineno_label"])
        lineno_want = {ix: int(ix) for ix in ds.iterate(None, "ix")}
    assert set(ds.columns) == expected_columns
    # Order varies depending on slice count, so we use a dict {ix: data}
    for ix, a, b in ds.iterate(None, ["ix", "0", "1"]):
        try:
            ix = int(ix)
        except ValueError:
            # We have a few non-numeric ones
            pass
        assert ix in d, "Bad index %r in %r (%s)" % (
            ix,
            filename,
            jid,
        )
        assert a == b == d[ix], "Wrong data for line %r in %r (%s)" % (
            ix,
            filename,
            jid,
        )
        del d[ix]
    assert not d, "Not all lines returned from %r (%s), %r missing" % (
        filename,
        jid,
        set(d.keys()),
    )
    if options.get("allow_bad"):
        for ix, data in Dataset(jid, "bad").iterate(None, ["lineno", "data"]):
            assert ix in d_bad, "Bad bad_lineno %d in %r (%s/bad) %r" % (
                ix,
                filename,
                jid,
                data,
            )
            assert data == d_bad[
                ix], "Wrong saved bad line %d in %r (%s/bad).\nWanted %r.\nGot    %r." % (
                    ix,
                    filename,
                    jid,
                    d_bad[ix],
                    data,
                )
            del d_bad[ix]
    assert not d_bad, "Not all bad lines returned from %r (%s), %r missing" % (
        filename,
        jid,
        set(d_bad.keys()),
    )

    if options.get("comment") or options.get("skip_lines"):
        for ix, data in Dataset(jid,
                                "skipped").iterate(None, ["lineno", "data"]):
            assert ix in d_skipped, "Bad skipped_lineno %d in %r (%s/skipped) %r" % (
                ix,
                filename,
                jid,
                data,
            )
            assert data == d_skipped[
                ix], "Wrong saved skipped line %d in %r (%s/skipped).\nWanted %r.\nGot    %r." % (
                    ix,
                    filename,
                    jid,
                    d_skipped[ix],
                    data,
                )
            del d_skipped[ix]
    assert not d_skipped, "Not all skipped lines returned from %r (%s), %r missing" % (
        filename,
        jid,
        set(d_skipped.keys()),
    )

    if options.get("lineno_label"):
        lineno_got = dict(ds.iterate(None,
                                     ["ix", options.get("lineno_label")]))
        assert lineno_got == lineno_want, "%r != %r" % (
            lineno_got,
            lineno_want,
        )
Example #17
def main(urd):
    assert urd.info.slices >= 3, "The tests don't work with less than 3 slices (you have %d)." % (
        urd.info.slices, )

    print()
    print("Testing urd.build and job.load")
    want = ({
        'foo': 'foo',
        'a': 'a'
    }, {
        'foo': None,
        'b': None
    }, {
        'foo': None,
        'c': None
    })
    job = urd.build("test_build_kws")
    assert job.load() == want
    bad = None
    try:
        urd.build("test_build_kws", options=dict(foo='bar'), foo='baz')
        bad = 'Allowed ambiguous keyword "foo"'
    except Exception:
        pass
    assert not bad, bad
    want[0]['foo'] = 'bar'
    want[0]['a'] = 'A'
    job = urd.build("test_build_kws", options=dict(foo='bar'), a='A')
    assert job.load() == want
    assert urd.build("test_build_kws",
                     options=dict(foo='bar'),
                     a='A',
                     b=None,
                     c=None) == job
    want[2]['c'] = job
    job = urd.build("test_build_kws",
                    options=dict(foo='bar', a='override this from kw'),
                    a='A',
                    c=job)
    assert job.load() == want
    want[0]['foo'] = 'foo'
    want[2]['c'] = job
    job = urd.build("test_build_kws",
                    a='A',
                    b=None,
                    c=job,
                    datasets=dict(b='overridden'))
    assert job.load() == want

    print()
    print("Testing urd.begin/end/truncate/get/peek/latest/first/since")
    urd.truncate("tests_urd", 0)
    assert not urd.peek_latest("tests_urd").joblist
    urd.begin("tests_urd", 1, caption="first")
    urd.build("test_build_kws")
    fin = urd.finish("tests_urd")
    assert fin == {'new': True, 'changed': False, 'is_ghost': False}, fin
    urd.begin("tests_urd", 1)
    job = urd.build("test_build_kws")
    fin = urd.finish("tests_urd", caption="first")
    assert fin == {'new': False, 'changed': False, 'is_ghost': False}, fin
    urd.begin("tests_urd", 1)  # will be overridden to 2 in finish
    jl = urd.latest("tests_urd").joblist
    assert jl == [job], '%r != [%r]' % (
        jl,
        job,
    )
    urd.build("test_build_kws", options=dict(foo='bar', a='A'))
    urd.finish("tests_urd", 2, caption="second")
    u = urd.peek_latest("tests_urd")
    assert u.caption == "second"
    dep0 = list(u.deps.values())[0]
    assert dep0.caption == "first", dep0.caption
    assert dep0.joblist == jl, '%r != %r' % (
        dep0.joblist,
        jl,
    )
    assert urd.since("tests_urd", 0) == ['1', '2']
    urd.truncate("tests_urd", 2)
    assert urd.since("tests_urd", 0) == ['1']
    urd.truncate("tests_urd", 0)
    assert urd.since("tests_urd", 0) == []
    ordered_ts = [
        1, 2, 1000000000, '1978-01-01', '1978-01-01+0', '1978-01-01+2',
        '1978-01-01 00:00', '1978-01-01T00:00+42', '2017-06-27',
        '2017-06-27T17:00:00', '2017-06-27 17:00:00+42'
    ]
    for ts in ordered_ts:
        urd.begin("tests_urd")
        if ts == 1000000000:
            urd.get("tests_urd", '1')
        urd.build("test_build_kws")
        urd.finish("tests_urd", ts)
    urd.begin("tests_urd")
    urd.build("test_build_kws")
    urd.finish("tests_urd", ('2019-12', 3))
    ordered_ts.append('2019-12+3')
    ordered_ts = [str(v).replace(' ', 'T') for v in ordered_ts]
    assert urd.since("tests_urd", 0) == ordered_ts
    assert urd.since("tests_urd", '1978-01-01') == ordered_ts[4:]
    assert urd.peek_first("tests_urd").timestamp == '1'
    assert not urd.peek("tests_urd", 2).deps
    dep_jl = list(urd.peek("tests_urd", 1000000000).deps.values())[0].joblist
    assert dep_jl == [job]
    assert urd.peek(
        "tests_urd",
        ('2017-06-27 17:00:00', 42)).timestamp == '2017-06-27T17:00:00+42'
    while ordered_ts:
        urd.truncate("tests_urd", ordered_ts.pop())
        assert urd.since("tests_urd", 0) == ordered_ts, ordered_ts
    want = [date.today() - timedelta(10), datetime.utcnow()]
    for ts in want:
        urd.begin("tests_urd", ts)
        urd.build("test_build_kws")
        urd.finish("tests_urd")
    assert urd.since("tests_urd",
                     0) == [str(ts).replace(' ', 'T') for ts in want]
    urd.truncate("tests_urd", 0)

    for how in (
            "exiting",
            "dying",
    ):
        print()
        print("Verifying that an analysis process %s kills the job" % (how, ))
        time_before = monotonic()
        try:
            job = urd.build("test_analysis_died", how=how)
            print(
                "test_analysis_died completed successfully (%s), that shouldn't happen"
                % (job, ))
            exit(1)
        except JobError:
            time_after = monotonic()
        time_to_die = time_after - time_before
        if time_to_die > 13:
            print(
                "test_analysis_died took %d seconds to die, it should be faster"
                % (time_to_die, ))
            exit(1)
        elif time_to_die > 2:
            print(
                "test_analysis_died took %d seconds to die, so death detection is slow, but works"
                % (time_to_die, ))
        else:
            print(
                "test_analysis_died took %.1f seconds to die, so death detection works"
                % (time_to_die, ))

    print()
    print("Testing dataset creation, export, import")
    source = urd.build("test_datasetwriter")
    urd.build("test_datasetwriter_verify", source=source)
    source = urd.build("test_datasetwriter_copy", source=source)
    urd.build("test_datasetwriter_verify", source=source)
    urd.build("test_datasetwriter_parent")
    urd.build("test_dataset_in_prepare")
    ds = Dataset(source, "passed")
    csvname = "out.csv.gz"
    csvname_uncompressed = "out.csv"
    csv = urd.build("csvexport", filename=csvname, separator="\t", source=ds)
    csv_uncompressed = urd.build("csvexport",
                                 filename=csvname_uncompressed,
                                 separator="\t",
                                 source=ds)
    csv_quoted = urd.build("csvexport",
                           filename=csvname,
                           quote_fields='"',
                           source=ds)
    urd.build("csvexport", filename='slice%d.csv', sliced=True,
              source=ds)  # unused
    reimp_csv = urd.build("csvimport",
                          filename=csv.filename(csvname),
                          separator="\t")
    reimp_csv_uncompressed = urd.build(
        "csvimport",
        filename=csv_uncompressed.filename(csvname_uncompressed),
        separator="\t")
    reimp_csv_quoted = urd.build("csvimport",
                                 filename=csv_quoted.filename(csvname),
                                 quotes=True)
    urd.build("test_compare_datasets", a=reimp_csv, b=reimp_csv_uncompressed)
    urd.build("test_compare_datasets", a=reimp_csv, b=reimp_csv_quoted)

    print()
    print("Testing subjobs")
    urd.build("test_subjobs_type", typed=ds, untyped=reimp_csv)
    urd.build("test_subjobs_nesting")

    print()
    print("Testing datasets more")
    urd.build("test_dataset_column_names")
    urd.build("test_dataset_merge")
    urd.build("test_dataset_filter_columns")
    urd.build("test_dataset_empty_colname")
    urd.build("test_dataset_nan")

    print()
    print("Testing csvimport with more difficult files")
    urd.build("test_csvimport_corner_cases")
    urd.build("test_csvimport_separators")

    print()
    print("Testing csvexport with all column types, strange separators, ...")
    urd.build("test_csvexport_naming")
    urd.build("test_csvexport_all_coltypes")
    urd.build("test_csvexport_separators")
    urd.build("test_csvexport_chains")
    urd.build("test_csvexport_quoting")

    print()
    print("Testing dataset typing")
    try:
        # Test if numeric_comma is broken (presumably because no suitable locale
        # was found, since there are not actually any commas in the source dataset.)
        urd.build("dataset_type",
                  source=source,
                  numeric_comma=True,
                  column2type=dict(b="float64"),
                  defaults=dict(b="0"))
        comma_broken = False
    except JobError as e:
        comma_broken = True
        urd.warn()
        urd.warn('SKIPPED NUMERIC COMMA TESTS')
        urd.warn(
            'Follow the instructions in this error to enable numeric comma:')
        urd.warn()
        urd.warn(e.format_msg())
    urd.build("test_dataset_type_corner_cases", numeric_comma=not comma_broken)

    print()
    print("Testing dataset chaining, filtering, callbacks and rechaining")
    selfchain = urd.build("test_selfchain")
    urd.build("test_rechain", jobs=dict(selfchain=selfchain))
    urd.build("test_dataset_callbacks")

    print()
    print("Testing dataset sorting and rehashing (with subjobs again)")
    urd.build("test_sorting")
    urd.build("test_sort_stability")
    urd.build("test_sort_chaining")
    urd.build("test_sort_trigger")
    urd.build("test_hashpart")
    urd.build("test_dataset_type_hashing")
    urd.build("test_dataset_type_chaining")

    print()
    print("Test hashlabels")
    urd.build("test_hashlabel")

    print()
    print("Test dataset roundrobin iteration and slicing")
    urd.build("test_dataset_roundrobin")
    urd.build("test_dataset_slice")
    urd.build("test_dataset_unroundrobin")
    urd.build("test_dataset_unroundrobin_trigger")

    print()
    print("Test dataset_checksum")
    urd.build("test_dataset_checksum")

    print()
    print("Test csvimport_zip")
    urd.build("test_csvimport_zip")

    print()
    print("Test output handling")
    urd.build("test_output")
    urd.build("test_output_on_error")

    print()
    print("Test datetime types in options")
    urd.build("test_datetime")

    print()
    print("Test various utility functions")
    urd.build("test_optionenum")
    urd.build("test_json")
    urd.build("test_jobwithfile")
    urd.build("test_jobchain")
    summary = urd.build("test_summary", joblist=urd.joblist)
    summary.link_result('summary.html')
def check_one(job,
              newline,
              sep,
              data,
              want_res=None,
              prefix="",
              quotes=False,
              leave_bad=False):
    sep_c = uni(chr(sep))
    # Can't have separator character in unquoted values
    if not quotes and not leave_bad:
        data = [[el.replace(sep_c, "") for el in line] for line in data]
    if not want_res:
        want_res = [
            tuple(s.encode("ascii") for s in line) for line in data[1:]
        ]
    filename = "%s_csv.%d.%s.txt" % (prefix, sep, "CRLF"
                                     if newline == "\r\n" else ord(newline))
    newline = uni(newline)
    with job.open(filename, "w", encoding="iso-8859-1", temp=True) as fh:
        for line in data:
            if quotes:
                line = [
                    quotes + el.replace(quotes, quotes + quotes) + quotes
                    for el in line
                ]
            fh.write(sep_c.join(line))
            fh.write(newline)
    try:
        jid = subjobs.build("csvimport",
                            options=dict(
                                filename=job.filename(filename),
                                separator=sep_c,
                                quotes=quotes,
                                newline='' if "\n" in newline else newline,
                            ))
    except JobError as e:
        raise CSVImportException(
            "Failed to csvimport for separator %d with newline %r, csvimport error was:\n%s"
            % (sep, newline, e.format_msg()))
    ds = Dataset(jid)
    labels = sorted(ds.columns)
    if labels != data[0]:
        raise WrongLabelsException(
            "csvimport gave wrong labels for separator %d with newline %r: %r (expected %r)"
            % (
                sep,
                newline,
                labels,
                data[0],
            ))
    res = list(ds.iterate(None, data[0]))
    if res != want_res:
        raise WrongDataException(
            "csvimport gave wrong data for separator %d with newline %r: %r (expected %r)"
            % (
                sep,
                newline,
                res,
                want_res,
            ))
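check_one assumes a uni() helper and three exception classes that are defined elsewhere; minimal stand-ins (assumptions, not the originals):

def uni(s):
    # make sure we have a text string on both python 2 and 3
    return s.decode('iso-8859-1') if isinstance(s, bytes) else s

class CSVImportException(Exception): pass
class WrongLabelsException(Exception): pass
class WrongDataException(Exception): pass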
Example #19
def synthesis(params):
	ds = Dataset(params.jobid)
	assert set(ds.iterate(None, "data")) == {"foo", "bar"}
Example #20
def synthesis(prepare_res, params, job, slices):
    dws = prepare_res
    for dw in (
            dws.unhashed_split,
            dws.up_split,
    ):
        w = dw.get_split_write_list()
        for row in all_data:
            w(row)
    for dw in dws.values():
        dw.finish()

    # Verify that the different ways of writing gave the same result
    for names in (
        ("unhashed_split", "unhashed_manual"),
        ("up_checked", "up_split"),
        ("down_checked", "down_discarded", "down_discarded_list",
         "down_discarded_dict"),
    ):
        dws = {name: job.dataset(name) for name in names}
        assert dws == {name: Dataset((params.jobid, name))
                       for name in names
                       }, "Old style Dataset((params.jobid, name)) broken"
        for sliceno in range(slices):
            data = {name: list(dws[name].iterate(sliceno)) for name in names}
            good = data[names[0]]
            for name in names[1:]:
                assert data[
                    name] == good, "%s doesn't match %s in slice %d" % (
                        names[0],
                        name,
                        sliceno,
                    )

    # Verify that both up and down hashed on the expected column
    hash = typed_writer("int32").hash
    for colname in ("up", "down"):
        ds = job.dataset(colname + "_checked")
        for sliceno in range(slices):
            for value in ds.iterate(sliceno, colname):
                assert hash(
                    value
                ) % slices == sliceno, "Bad hashing on %s in slice %d" % (
                    colname,
                    sliceno,
                )

    # Verify that up and down are not the same, to catch hashing
    # not actually hashing.
    up = list(job.dataset("up_checked").iterate(None))
    down = list(job.dataset("down_checked").iterate(None))
    assert up != down, "Hashlabel did not change slice distribution"
    # And check that the data is still the same.
    assert sorted(up) == sorted(
        down) == all_data, "Hashed datasets have wrong data"

    # Verify that rehashing works.
    # (Can't use sliceno None, because that won't rehash, and even if it did
    # the order wouldn't match. Order doesn't even match in the rehashed
    # individual slices.)
    up = job.dataset("up_checked")
    down = job.dataset("down_checked")
    unhashed = job.dataset("unhashed_manual")
    for sliceno in range(slices):
        a = list(up.iterate(sliceno))
        b = list(down.iterate(sliceno, hashlabel="up", rehash=True))
        c = list(unhashed.iterate(sliceno, hashlabel="up", rehash=True))
        assert sorted(a) == sorted(b) == sorted(
            c), "Rehashing is broken (slice %d)" % (sliceno, )

    # And finally verify that we are not allowed to specify the wrong hashlabel
    good = True
    try:
        up.iterate(None, hashlabel="down")
        good = False
    except AssertionError:
        pass
    try:
        unhashed.iterate(None, hashlabel="down")
        good = False
    except AssertionError:
        pass
    assert good, "Iteration allowed on the wrong hashlabel"
Example #21
def analysis(sliceno, params):
	ds = Dataset(params.jobid)
	assert set(ds.iterate(None, "data")) == {"foo", "bar"}
def synthesis(job, slices):
    def verify(a, b):
        for col in 'abcd':
            for sliceno in range(slices):
                a_data = list(Dataset.iterate_list(sliceno, col, a))
                b_data = list(map(str, Dataset.iterate_list(sliceno, col, b)))
                assert a_data == b_data, '%r has different contents to %r in slice %d column %s' % (
                    a,
                    b,
                    sliceno,
                    col,
                )

    def verify_sorted(a, b):
        for col in 'abcd':
            a_data = list(Dataset.iterate_list(None, col, a))
            b_data = list(map(str, Dataset.iterate_list(None, col, b)))
            a_data.sort()
            b_data.sort()
            assert a_data == b_data, '%r has different contents to %r in column %s' % (
                a,
                b,
                col,
            )

    def write(name, previous, low, high, filter=lambda ix: True):
        dw = job.datasetwriter(
            name=name,
            previous=previous,
            columns={
                'a': 'unicode',
                'b': 'unicode',
                'c': 'unicode',
                'd': 'unicode',
            },
        )
        w = dw.get_split_write()
        for ix in range(low, high):
            if filter(ix):
                w('%d' % (ix, ), '%d.2' % (ix, ),
                  '%d%s' % (ix, '.5' if ix % 2 else ''), '[%d]' % (ix, ))
        return dw.finish()

    untyped_A = write('A', None, 0, 100)
    untyped_B = write('B', untyped_A, 100, 1000)
    untyped_C = write('C', untyped_B, 1000, 2000)
    untyped_D = write('D', untyped_C, 2000, 10000)
    untyped_E = write('E', untyped_D, 10000, 10100)

    # All four different classes of converters
    opts = DotDict(column2type=dict(a='int32_10',
                                    b='number',
                                    c='ascii',
                                    d='json'),
                   as_chain=False)
    src_chain = []
    simple_chain = []
    previous = None
    for src in (untyped_A, untyped_B, untyped_C, untyped_D, untyped_E):
        previous = subjobs.build('dataset_type',
                                 datasets=dict(source=src, previous=previous),
                                 options=opts)
        simple_chain.append(previous)
        src_chain.append(src)
        verify([src], [previous])
        assert simple_chain == Dataset(previous).chain(), previous
        verify(src_chain, simple_chain)
    typed_B = simple_chain[1]
    typed_D = simple_chain[3]

    # No previous -> should contain both A and B
    typed_AB = subjobs.build('dataset_type',
                             datasets=dict(source=untyped_B),
                             options=opts)
    verify(src_chain[:2], [typed_AB])
    typed_CDE = subjobs.build('dataset_type',
                              datasets=dict(source=untyped_E,
                                            previous=typed_B),
                              options=opts)
    verify(src_chain[2:], [typed_CDE])
    verify(src_chain, Dataset(typed_CDE).chain())
    # A and B through typed_B, but length=2 only gets D and E, not C.
    opts.length = 2
    typed_DE_noC = subjobs.build('dataset_type',
                                 datasets=dict(source=untyped_E,
                                               previous=typed_B),
                                 options=opts)
    del opts.length
    verify((untyped_A, untyped_B, untyped_D, untyped_E),
           Dataset(typed_DE_noC).chain())

    # with as_chain (and a hashlabel so as_chain happens)
    opts.as_chain = True
    opts.hashlabel = 'a'
    previous = None
    for ix, src in enumerate(src_chain, 1):
        previous = subjobs.build('dataset_type',
                                 datasets=dict(source=src, previous=previous),
                                 options=opts)
        ds = Dataset(previous)
        assert len(ds.chain()) == ix * slices, ds
        verify_sorted([src], ds.chain(length=slices))
        verify_sorted(src_chain[:ix], ds.chain())

    # And one with as_chain just on the last job, discarding half the rows from bad typing.
    opts.column2type['b'] = 'ascii'
    opts.column2type['c'] = 'number:int'
    opts.filter_bad = True
    typed_and_hashed_Ehalf = subjobs.build('dataset_type',
                                           datasets=dict(source=untyped_E,
                                                         previous=typed_D),
                                           options=opts)
    typed_and_hashed_Ehalf = Dataset(typed_and_hashed_Ehalf)
    assert len(
        typed_and_hashed_Ehalf.chain()) == slices + 4, typed_and_hashed_Ehalf
    untyped_Ehalf = write('Ehalf',
                          untyped_D,
                          10000,
                          10100,
                          filter=lambda ix: ix % 2 == 0)
    verify_sorted([untyped_Ehalf], typed_and_hashed_Ehalf.chain(length=slices))
Example #23
def test_filter_bad_across_types():
    columns = {
        'bytes': 'bytes',
        'float64': 'bytes',
        'int32_10': 'ascii',
        'json': 'unicode',
        'number:int': 'unicode',
        'unicode:utf-8': 'bytes',
    }
    # all_good, *values
    # Make sure all those types (except bytes) can filter other lines,
    # and be filtered by other lines. And that several filtering values
    # are not a problem (line 11).
    data = [
        [
            True,
            b'first',
            b'1.1',
            '1',
            '"a"',
            '001',
            b'ett',
        ],
        [
            True,
            b'second',
            b'2.2',
            '2',
            '"b"',
            '02',
            b'tv\xc3\xa5',
        ],
        [
            True,
            b'third',
            b'3.3',
            '3',
            '["c"]',
            '3.0',
            b'tre',
        ],
        [
            False,
            b'fourth',
            b'4.4',
            '4',
            '"d"',
            '4.4',
            b'fyra',
        ],  # number:int bad
        [
            False,
            b'fifth',
            b'5.5',
            '-',
            '"e"',
            '5',
            b'fem',
        ],  # int32_10 bad
        [
            False,
            b'sixth',
            b'6.b',
            '6',
            '"f"',
            '6',
            b'sex',
        ],  # float64 bad
        [
            False,
            b'seventh',
            b'7.7',
            '7',
            '{"g"}',
            '7',
            b'sju',
        ],  # json bad
        [
            False,
            b'eigth',
            b'8.8',
            '8',
            '"h"',
            '8',
            b'\xa5\xc3tta',
        ],  # unicode:utf-8 bad
        [
            True,
            b'ninth',
            b'9.9',
            '9',
            '"i"',
            '9',
            b'nio',
        ],
        [
            True,
            b'tenth',
            b'10',
            '10',
            '"j"',
            '10',
            b'tio',
        ],
        [
            False,
            b'eleventh',
            b'11a',
            '1-',
            '"k",',
            '1,',
            b'elva',
        ],  # float64, int32_10 and number:int bad
        [
            True,
            b'twelfth',
            b'12',
            '12',
            '"l"',
            '12',
            b'tolv',
        ],
    ]
    want_bad = [tuple(l[1:]) for l in data if not l[0]]
    dw = DatasetWriter(name="filter bad across types",
                       columns=columns,
                       allow_missing_slices=True)
    cols_to_check = ['int32_10', 'bytes', 'json', 'unicode:utf-8']
    if PY3:
        # z so it sorts last.
        dw.add('zpickle', 'pickle')
        cols_to_check.append('zpickle')
        for ix in range(len(data)):
            data[ix].append({ix})
    dw.set_slice(0)
    want = []

    def add_want(ix):
        v = data[ix]
        want.append((
            int(v[3]),
            v[1],
            json.loads(v[4]),
            v[6].decode('utf-8'),
        ))
        if PY3:
            want[-1] = want[-1] + (v[7], )

    for ix, v in enumerate(data):
        if v[0]:
            add_want(ix)
        dw.write(*v[1:])
    source_ds = dw.finish()
    # Once with just filter_bad, once with some defaults too.
    defaults = {}
    for _ in range(2):
        jid = subjobs.build(
            'dataset_type',
            datasets=dict(source=source_ds),
            options=dict(column2type={t: t
                                      for t in columns},
                         filter_bad=True,
                         defaults=defaults),
        )
        typed_ds = Dataset(jid)
        got = list(typed_ds.iterate(0, cols_to_check))
        assert got == want, "Expected %r, got %r from %s (from %r%s)" % (
            want, got, typed_ds, source_ds,
            ' with defaults' if defaults else '')
        bad_ds = Dataset(jid, 'bad')
        got_bad = list(bad_ds.iterate(0, sorted(columns)))
        assert got_bad == want_bad, "Expected %r, got %r from %s (from %r%s)" % (
            want_bad, got_bad, bad_ds, source_ds,
            ' with defaults' if defaults else '')
        # make more lines "ok" for the second lap
        if not defaults:
            want_bad.pop(0)  # number:int
            want_bad.pop(1)  # float64
            want_bad.pop(1)  # json
        defaults = {'number:int': '0', 'float64': '0', 'json': '"replacement"'}
        add_want(3)
        add_want(5)
        data[6][4] = '"replacement"'
        add_want(6)
        want.sort()  # adding them out of order, int32_10 sorts correctly.
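The PY3 flag used in this version is presumably a simple interpreter check, something like:

import sys
PY3 = sys.version_info[0] >= 3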
Example #24
def main(urd):
	assert urd.info.slices >= 3, "The tests don't work with less than 3 slices (you have %d)." % (urd.info.slices,)

	print()
	print("Testing urd.build and job.load")
	want = ({'foo': 'foo', 'a': 'a'}, {'foo': None, 'b': None}, {'foo': None, 'c': None})
	job = urd.build("test_build_kws")
	assert job.load() == want
	bad = None
	try:
		urd.build("test_build_kws", options=dict(foo='bar'), foo='baz')
		bad = 'Allowed ambiguous keyword "foo"'
	except Exception:
		pass
	assert not bad, bad
	want[0]['foo'] = 'bar'
	want[0]['a'] = 'A'
	job = urd.build("test_build_kws", options=dict(foo='bar'), a='A')
	assert job.load() == want
	assert urd.build("test_build_kws", options=dict(foo='bar'), a='A', b=None, c=None) == job
	want[2]['c'] = job
	job = urd.build("test_build_kws", options=dict(foo='bar', a='override this from kw'), a='A', c=job)
	assert job.load() == want
	want[0]['foo'] = 'foo'
	want[2]['c'] = job
	job = urd.build("test_build_kws", a='A', b=None, c=job, datasets=dict(b='overridden'))
	assert job.load() == want

	print()
	print("Testing urd.begin/end/truncate/get/peek/latest/first/since")
	urd.truncate("tests_urd", 0)
	assert not urd.peek_latest("tests_urd").joblist
	urd.begin("tests_urd", 1, caption="first")
	urd.build("test_build_kws")
	fin = urd.finish("tests_urd")
	assert fin == {'new': True, 'changed': False, 'is_ghost': False}, fin
	urd.begin("tests_urd", 1)
	job = urd.build("test_build_kws")
	fin = urd.finish("tests_urd", caption="first")
	assert fin == {'new': False, 'changed': False, 'is_ghost': False}, fin
	urd.begin("tests_urd", 1) # will be overridden to 2 in finish
	jl = urd.latest("tests_urd").joblist
	assert jl == [job], '%r != [%r]' % (jl, job,)
	urd.build("test_build_kws", options=dict(foo='bar', a='A'))
	urd.finish("tests_urd", 2, caption="second")
	u = urd.peek_latest("tests_urd")
	assert u.caption == "second"
	dep0 = list(u.deps.values())[0]
	assert dep0.caption == "first", dep0.caption
	assert dep0.joblist == jl, '%r != %r' % (dep0.joblist, jl,)
	assert urd.since("tests_urd", 0) == ['1', '2']
	urd.truncate("tests_urd", 2)
	assert urd.since("tests_urd", 0) == ['1']
	urd.truncate("tests_urd", 0)
	assert urd.since("tests_urd", 0) == []
	ordered_ts = [1, 2, 1000000000, '1978-01-01', '1978-01-01+0', '1978-01-01+2', '1978-01-01 00:00', '1978-01-01T00:00+42', '2017-06-27', '2017-06-27T17:00:00', '2017-06-27 17:00:00+42']
	for ts in ordered_ts:
		urd.begin("tests_urd")
		if ts == 1000000000:
			urd.get("tests_urd", '1')
		urd.build("test_build_kws")
		urd.finish("tests_urd", ts)
	urd.begin("tests_urd")
	urd.build("test_build_kws")
	urd.finish("tests_urd", ('2019-12', 3))
	ordered_ts.append('2019-12+3')
	ordered_ts = [str(v).replace(' ', 'T') for v in ordered_ts]
	assert urd.since("tests_urd", 0) == ordered_ts
	assert urd.since("tests_urd", '1978-01-01') == ordered_ts[4:]
	assert urd.peek_first("tests_urd").timestamp == '1'
	assert not urd.peek("tests_urd", 2).deps
	dep_jl = list(urd.peek("tests_urd", 1000000000).deps.values())[0].joblist
	assert dep_jl == [job]
	assert urd.peek("tests_urd", ('2017-06-27 17:00:00', 42)).timestamp == '2017-06-27T17:00:00+42'
	while ordered_ts:
		urd.truncate("tests_urd", ordered_ts.pop())
		assert urd.since("tests_urd", 0) == ordered_ts, ordered_ts
	want = [date.today() - timedelta(10), datetime.utcnow()]
	for ts in want:
		urd.begin("tests_urd", ts)
		urd.build("test_build_kws")
		urd.finish("tests_urd")
	assert urd.since("tests_urd", 0) == [str(ts).replace(' ', 'T') for ts in want]
	urd.truncate("tests_urd", 0)

	for how in ("exiting", "dying",):
		print()
		print("Verifying that an analysis process %s kills the job" % (how,))
		time_before = monotonic()
		try:
			job = urd.build("test_analysis_died", how=how)
			print("test_analysis_died completed successfully (%s), that shouldn't happen" % (job,))
			exit(1)
		except JobError:
			time_after = monotonic()
		time_to_die = time_after - time_before
		if time_to_die > 13:
			print("test_analysis_died took %d seconds to die, it should be faster" % (time_to_die,))
			exit(1)
		elif time_to_die > 2:
			print("test_analysis_died took %d seconds to die, so death detection is slow, but works" % (time_to_die,))
		else:
			print("test_analysis_died took %.1f seconds to die, so death detection works" % (time_to_die,))

	print()
	print("Testing dataset creation, export, import")
	source = urd.build("test_datasetwriter")
	urd.build("test_datasetwriter_verify", source=source)
	source = urd.build("test_datasetwriter_copy", source=source)
	urd.build("test_datasetwriter_verify", source=source)
	urd.build("test_datasetwriter_parent")
	urd.build("test_datasetwriter_missing_slices")
	urd.build("test_dataset_in_prepare")
	ds = Dataset(source, "passed")
	csvname = "out.csv.gz"
	csvname_uncompressed = "out.csv"
	csv = urd.build("csvexport", filename=csvname, separator="\t", source=ds)
	csv_uncompressed = urd.build("csvexport", filename=csvname_uncompressed, separator="\t", source=ds)
	csv_quoted = urd.build("csvexport", filename=csvname, quote_fields='"', source=ds)
	urd.build("csvexport", filename='slice%d.csv', sliced=True, source=ds) # unused
	reimp_csv = urd.build("csvimport", filename=csv.filename(csvname), separator="\t")
	reimp_csv_uncompressed = urd.build("csvimport", filename=csv_uncompressed.filename(csvname_uncompressed), separator="\t")
	reimp_csv_quoted = urd.build("csvimport", filename=csv_quoted.filename(csvname), quotes=True)
	urd.build("test_compare_datasets", a=reimp_csv, b=reimp_csv_uncompressed)
	urd.build("test_compare_datasets", a=reimp_csv, b=reimp_csv_quoted)

	print()
	print("Testing subjobs")
	urd.build("test_subjobs_type", typed=ds, untyped=reimp_csv)
	urd.build("test_subjobs_nesting")

	print()
	print("Testing datasets more")
	dsnamejob = urd.build("test_dataset_names")
	# make sure .datasets works with these names (only possible after job finishes)
	assert [ds.name for ds in dsnamejob.datasets] == dsnamejob.load()
	urd.build("test_dataset_column_names")
	urd.build("test_dataset_merge")
	urd.build("test_dataset_filter_columns")
	urd.build("test_dataset_empty_colname")
	urd.build("test_dataset_nan")
	urd.build('test_dataset_parsing_writer')

	print()
	print("Testing csvimport with more difficult files")
	urd.build("test_csvimport_corner_cases")
	urd.build("test_csvimport_separators")

	print()
	print("Testing csvexport with all column types, strange separators, ...")
	urd.build("test_csvexport_naming")
	urd.build("test_csvexport_all_coltypes")
	urd.build("test_csvexport_separators")
	urd.build("test_csvexport_chains")
	urd.build("test_csvexport_quoting")

	print()
	print("Testing dataset typing")
	try:
		# Test if numeric_comma is broken (presumably because no suitable locale
		# was found, since there are not actually any commas in the source dataset.)
		urd.build("dataset_type", source=source, numeric_comma=True, column2type=dict(b="float64"), defaults=dict(b="0"))
		comma_broken = False
	except JobError as e:
		comma_broken = True
		urd.warn()
		urd.warn('SKIPPED NUMERIC COMMA TESTS')
		urd.warn('Follow the instructions in this error to enable numeric comma:')
		urd.warn()
		urd.warn(e.format_msg())
	urd.build("test_dataset_type_corner_cases", numeric_comma=not comma_broken)
	urd.build("test_dataset_type_minmax")

	print()
	print("Testing dataset chaining, filtering, callbacks and rechaining")
	selfchain = urd.build("test_selfchain")
	urd.build("test_rechain", jobs=dict(selfchain=selfchain))
	urd.build("test_dataset_callbacks")

	print()
	print("Testing dataset sorting and rehashing (with subjobs again)")
	urd.build("test_sorting")
	urd.build("test_sort_stability")
	urd.build("test_sort_chaining")
	urd.build("test_sort_trigger")
	urd.build("test_hashpart")
	urd.build("test_dataset_type_hashing")
	urd.build("test_dataset_type_chaining")

	print()
	print("Test hashlabels")
	urd.build("test_hashlabel")

	print()
	print("Test dataset roundrobin iteration and slicing")
	urd.build("test_dataset_roundrobin")
	urd.build("test_dataset_slice")
	urd.build("test_dataset_unroundrobin")
	urd.build("test_dataset_unroundrobin_trigger")
	urd.build("test_number")

	print()
	print("Test dataset_checksum")
	urd.build("test_dataset_checksum")

	print()
	print("Test csvimport_zip")
	urd.build("test_csvimport_zip")

	print()
	print("Test output handling")
	urd.build("test_output")
	urd.build("test_output_on_error")

	print()
	print("Test datetime types in options")
	urd.build("test_datetime")

	print()
	print("Test various utility functions")
	urd.build("test_optionenum")
	urd.build("test_json")
	urd.build("test_jobwithfile")
	urd.build("test_jobchain")

	print()
	print("Test shell commands")
	from sys import argv
	from accelerator.shell import cfg
	command_prefix = [argv[0], '--config', cfg.config_filename]
	urd.truncate("tests_urd", 0)
	# These have to be rebuilt every time, or the resolving might give other jobs.
	urd.begin("tests_urd", 1)
	a = urd.build('test_shell_data', force_build=True)
	b = urd.build('test_shell_data', force_build=True)
	c = urd.build('test_shell_data', datasets={'previous': a})
	urd.finish("tests_urd")
	urd.begin("tests_urd", "2021-09-27T03:14")
	d = urd.build('test_shell_data', datasets={'previous': c, 'parent': a + '/j'}, jobs={'previous': b})
	urd.finish("tests_urd")
	urd.begin("tests_urd", "2021-09-27T03:14+1")
	e = urd.build('test_shell_data', jobs={'previous': d})
	urd.finish("tests_urd")
	# ~ finds earlier jobs with that method, ^ follows jobs.previous falling back to datasets.previous.
	want = {
		'test_shell_data': e, # just the plain method -> job resolution.
		c + '~~': a, # not using .previous, just going back jobs
		'test_shell_data~3': b, # numbered tildes
		'test_shell_data~2^': a, # ~~ goes to c, ^ follows .previous to a.
		d + '^': b, # prefers jobs.previous to .datasets.previous
		':tests_urd:': e,
		':tests_urd/2021-09-27T03:14:': d,
		':tests_urd/1:1': b, # 1 is the second entry
		':tests_urd/1:-3': a, # third entry from the end
		':tests_urd:^': d,
		':tests_urd/2021-09-27T03:14+1^^:0': a, # ^ in :: goes to earlier entries
		':tests_urd/1~:': d, # ~ in :: goes to later entries
	}
	urd.build('test_shell_job', command_prefix=command_prefix, want=want)
	# the job is resolved first, so the old specs give the same results
	want = {spec: job + '/default' for spec, job in want.items()}
	want.update({
		d + '/j^': a + '/j', # .parent
		d + '/j~': b + '/default', # .previous
		'test_shell_data~/j^': a + '/j', # both job and ds movement
		e + '/j~^': a + '/j', # .previous.parent
		# some urdlist ones with datasets on
		':tests_urd:/j': e + '/j',
		':tests_urd/1:1/j': b + '/j',
		':tests_urd:^/j': d + '/j',
		':tests_urd/2021-09-27T03:14:/j': d + '/j',
		# finally one with : in the list and / in the ds name
		':tests_urd/2021-09-27T03:14+1:0/name/with/slash': e + '/name/with/slash',
	})
	urd.build('test_shell_ds', command_prefix=command_prefix, want=want)
	urd.truncate("tests_urd", 0)
	urd.build('test_shell_grep', command_prefix=command_prefix)

	summary = urd.build("test_summary", joblist=urd.joblist)
	summary.link_result('summary.html')
Example #25
def main(urd):
    assert urd.info.slices >= 3, "The tests don't work with less than 3 slices (you have %d)." % (
        urd.info.slices, )

    print()
    print("Testing dataset creation, export, import")
    source = urd.build("test_datasetwriter")
    urd.build("test_datasetwriter_verify", datasets=dict(source=source))
    urd.build("test_dataset_in_prepare")
    ds = Dataset(source, "passed")
    csvname = "out.csv.gz"
    csvname_uncompressed = "out.csv"
    csv = urd.build("csvexport",
                    options=dict(filename=csvname, separator="\t"),
                    datasets=dict(source=ds))
    csv_uncompressed = urd.build("csvexport",
                                 options=dict(filename=csvname_uncompressed,
                                              separator="\t"),
                                 datasets=dict(source=ds))
    csv_quoted = urd.build("csvexport",
                           options=dict(filename=csvname, quote_fields='"'),
                           datasets=dict(source=ds))
    reimp_csv = urd.build("csvimport",
                          options=dict(filename=csv.filename(csvname),
                                       separator="\t"))
    reimp_csv_uncompressed = urd.build(
        "csvimport",
        options=dict(filename=csv_uncompressed.filename(csvname_uncompressed),
                     separator="\t"))
    reimp_csv_quoted = urd.build("csvimport",
                                 options=dict(
                                     filename=csv_quoted.filename(csvname),
                                     quotes=True))
    urd.build("test_compare_datasets",
              datasets=dict(a=reimp_csv, b=reimp_csv_uncompressed))
    urd.build("test_compare_datasets",
              datasets=dict(a=reimp_csv, b=reimp_csv_quoted))
    urd.build("test_dataset_column_names")
    urd.build("test_dataset_merge")

    print()
    print("Testing csvimport with more difficult files")
    urd.build("test_csvimport_corner_cases")
    urd.build("test_csvimport_separators")

    print()
    print("Testing subjobs and dataset typing")
    urd.build("test_subjobs_type", datasets=dict(typed=ds, untyped=reimp_csv))
    urd.build("test_subjobs_nesting")
    try:
        # Test if numeric_comma is broken (presumably because no suitable locale
        # was found, since there are not actually any commas in the source dataset.)
        urd.build("dataset_type",
                  datasets=dict(source=source),
                  options=dict(numeric_comma=True,
                               column2type=dict(b="float64"),
                               defaults=dict(b="0")))
        comma_broken = False
    except JobError as e:
        comma_broken = True
        urd.warn()
        urd.warn('SKIPPED NUMERIC COMMA TESTS')
        urd.warn(
            'Follow the instructions in this error to enable numeric comma:')
        urd.warn()
        urd.warn(e.format_msg())
    urd.build("test_dataset_type_corner_cases",
              options=dict(numeric_comma=not comma_broken))

    print()
    print("Testing dataset chaining, filtering, callbacks and rechaining")
    selfchain = urd.build("test_selfchain")
    urd.build("test_rechain", jobids=dict(selfchain=selfchain))

    print()
    print("Testing dataset sorting and rehashing (with subjobs again)")
    urd.build("test_sorting")
    urd.build("test_sort_stability")
    urd.build("test_sort_chaining")
    urd.build("test_rehash")
    urd.build("test_dataset_type_hashing")
    urd.build("test_dataset_type_chaining")

    print()
    print("Test hashlabels")
    urd.build("test_hashlabel")

    print()
    print("Test dataset roundrobin iteration")
    urd.build("test_dataset_roundrobin")

    print()
    print("Test dataset_checksum")
    urd.build("test_dataset_checksum")

    print()
    print("Test csvimport_zip")
    urd.build("test_csvimport_zip")

    print()
    print("Test output handling")
    urd.build("test_output")

    print()
    print("Test datetime types in options")
    urd.build("test_datetime")

    print()
    print("Test various utility functions")
    urd.build("test_optionenum")
    urd.build("test_json")
    urd.build("test_jobwithfile")
    urd.build("test_report")