def synthesis(params):
	dw = DatasetWriter(name="parent")
	in_parent = [ # list because order matters
		"-",      # becomes _ because everything must be a valid python identifier.
		"a b",    # becomes a_b because everything must be a valid python identifier.
		"42",     # becomes _42 because everything must be a valid python identifier.
		"print",  # becomes print_ because print is a keyword (in py2).
		"print@", # becomes print__ because print_ is taken.
		"None",   # becomes None_ because None is a keyword (in py3).
	]
	for colname in in_parent:
		dw.add(colname, "unicode")
	w = dw.get_split_write()
	w(_="- 1", a_b="a b 1", _42="42 1", print_="print 1", None_="None 1", print__="Will be overwritten 1")
	w(_="- 2", a_b="a b 2", _42="42 2", print_="print 2", None_="None 2", print__="Will be overwritten 2")
	parent = dw.finish()
	dw = DatasetWriter(name="child", parent=parent)
	in_child = [ # order still matters
		"print_*", # becomes print___ because print__ is taken.
		"print_",  # becomes print____ because all shorter are taken.
		"normal",  # no collision.
		"Normal",  # no collision.
		"print@",  # re-uses print__ from the parent dataset.
	]
	for colname in in_child:
		dw.add(colname, "unicode")
	w = dw.get_split_write()
	w(print__="print@ 1", print___="print_* 1", print____="print_ 1", normal="normal 1", Normal="Normal 1")
	w(print__="print@ 2", print___="print_* 2", print____="print_ 2", normal="normal 2", Normal="Normal 2")
	child = dw.finish()
	for colname in in_parent + in_child:
		data = set(child.iterate(None, colname))
		assert data == {colname + " 1", colname + " 2"}, "Bad data for %s: %r" % (colname, data)
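The renaming rules the comments above walk through fit in a few lines. Here is a minimal standalone sketch of the same logic (an illustration, not the writer's actual implementation; it also skips the parent-reuse case shown in the child dataset):

import keyword
import re

# "print" and "exec" are keywords only in python 2, but the example
# treats them as reserved too.
PY2_ONLY_KEYWORDS = {"print", "exec"}

def sanitize_column_names(names):
	taken = set()
	fixed = {}
	for name in names:
		# Non-identifier characters become _, a leading digit gets a _ prefix.
		cleaned = re.sub(r"\W", "_", name)
		if cleaned[0].isdigit():
			cleaned = "_" + cleaned
		# Keywords and already taken names grow trailing underscores.
		while keyword.iskeyword(cleaned) or cleaned in PY2_ONLY_KEYWORDS or cleaned in taken:
			cleaned += "_"
		taken.add(cleaned)
		fixed[name] = cleaned
	return fixed

# sanitize_column_names(["-", "a b", "42", "print", "print@", "None"])
# -> {'-': '_', 'a b': 'a_b', '42': '_42',
#     'print': 'print_', 'print@': 'print__', 'None': 'None_'}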
Example #2
def synthesis(jobid):
    manual_chain = [Dataset(jobids.selfchain, name) for name in "abcdefgh"]
    manual_abf = [manual_chain[0], manual_chain[1], manual_chain[5]]
    # build a local abf chain
    prev = None
    for ix, ds in enumerate(manual_abf):
        name = "abf%d" % (ix, )
        ds.link_to_here(name, override_previous=prev)
        prev = (jobid, name)
    manual_abf_data = list(Dataset.iterate_list(None, None, manual_abf))
    local_abf_data = list(Dataset(jobid, "abf2").iterate_chain(None, None))
    assert manual_abf_data == local_abf_data
    # disconnect h, verify there is no chain
    manual_chain[-1].link_to_here("alone", override_previous=None)
    assert len(Dataset(jobid, "alone").chain()) == 1
    # check that the original chain is unhurt
    assert manual_chain == manual_chain[-1].chain()

    # So far so good, now make a chain long enough to have a cache.
    prev = None
    ix = 0
    going = True
    while going:
        if prev and "cache" in Dataset(prev)._data:
            going = False
        name = "longchain%d" % (ix, )
        dw = DatasetWriter(name=name, previous=prev)
        dw.add("ix", "number")
        dw.get_split_write()(ix)
        dw.finish()
        prev = (jobid, name)
        ix += 1
    # we now have a chain that goes one past the first cache point
    full_chain = Dataset(prev).chain()
    assert "cache" in full_chain[
        -2]._data  # just to check the above logic is correct
    assert "cache" not in full_chain[-1]._data  # just to be sure..
    full_chain[-2].link_to_here("nocache", override_previous=None)
    full_chain[-1].link_to_here("withcache", override_previous=full_chain[-3])
    assert "cache" not in Dataset(jobid, "nocache")._data
    assert "cache" in Dataset(jobid, "withcache")._data
    # And make sure they both get the right data too.
    assert list(Dataset(prev).iterate_chain(None, "ix")) == list(range(ix))
    assert list(Dataset(jobid, "nocache").iterate_chain(None,
                                                        "ix")) == [ix - 2]
    assert list(Dataset(jobid, "withcache").iterate_chain(
        None, "ix")) == list(range(ix - 2)) + [ix - 1]
Example #3
def _verify(name, types, data, coltype, want, default, want_fail, kw):
    if callable(want):
        check = want
    else:

        def check(got, fromstr, filtered=False):
            want1 = want if isinstance(want, list) else want[typ]
            if filtered:
                want1 = want1[::2]
            assert got == want1, 'Expected %r, got %r from %s.' % (want1, got, fromstr)

    dw = DatasetWriter(name=name, columns={'data': coltype, 'extra': 'bytes'})
    dw.set_slice(0)
    for ix, v in enumerate(data):
        dw.write(v, b'1' if ix % 2 == 0 else b'skip')
    for sliceno in range(1, g.SLICES):
        dw.set_slice(sliceno)
    bytes_ds = dw.finish()
    for typ in types:
        opts = dict(column2type=dict(data=typ))
        opts.update(kw)
        if default is not no_default:
            opts['defaults'] = {'data': default}
        try:
            jid = subjobs.build('dataset_type',
                                datasets=dict(source=bytes_ds),
                                options=opts)
        except JobError:
            if want_fail:
                continue
            raise Exception('Typing %r as %s failed.' % (bytes_ds, typ))
        assert not want_fail, "Typing %r as %s should have failed, but didn't (%s)." % (
            bytes_ds, typ, jid)
        typed_ds = Dataset(jid)
        got = list(typed_ds.iterate(0, 'data'))
        check(got, '%s (typed as %s from %r)' % (typed_ds, typ, bytes_ds))
        if 'filter_bad' not in opts and not callable(want):
            opts['filter_bad'] = True
            opts['column2type']['extra'] = 'int32_10'
            jid = subjobs.build('dataset_type',
                                datasets=dict(source=bytes_ds),
                                options=opts)
            typed_ds = Dataset(jid)
            got = list(typed_ds.iterate(0, 'data'))
            check(got, '%s (typed as %s from %r with every other line skipped from filter_bad)' % (typed_ds, typ, bytes_ds), True)
        used_type(typ)
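A call to the helper might look like this (hypothetical values; no_default and used_type come from the enclosing test module):

# Write b'1', b'2', b'3' into a bytes column, type it as int32_10 and
# expect the ints back; no default, typing expected to succeed, no extra
# options.
_verify('ints', ['int32_10'], [b'1', b'2', b'3'], 'bytes',
        [1, 2, 3], no_default, False, {})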
Example #4
def test_filter_bad_across_types():
    columns = {
        'bytes': 'bytes',
        'float64': 'bytes',
        'int32_10': 'ascii',
        'json': 'unicode',
        'number:int': 'unicode',
        'unicode:utf-8': 'bytes',
    }
    # all_good, *values
    # Make sure all those types (except bytes) can filter other lines,
    # and can be filtered by other lines. Also check that several bad
    # values on one line are not a problem (line 11).
    data = [
        (True,  b'first',    b'1.1', '1',  '"a"',   '001', b'ett'),
        (True,  b'second',   b'2.2', '2',  '"b"',   '02',  b'tv\xc3\xa5'),
        (True,  b'third',    b'3.3', '3',  '["c"]', '3.0', b'tre'),
        (False, b'fourth',   b'4.4', '4',  '"d"',   '4.4', b'fyra'),  # number:int bad
        (False, b'fifth',    b'5.5', '-',  '"e"',   '5',   b'fem'),  # int32_10 bad
        (False, b'sixth',    b'6.b', '6',  '"f"',   '6',   b'sex'),  # float64 bad
        [False, b'seventh',  b'7.7', '7',  '{"g"}', '7',   b'sju'],  # json bad (a list: patched below)
        (False, b'eigth',    b'8.8', '8',  '"h"',   '8',   b'\xa5\xc3tta'),  # unicode:utf-8 bad
        (True,  b'ninth',    b'9.9', '9',  '"i"',   '9',   b'nio'),
        (True,  b'tenth',    b'10',  '10', '"j"',   '10',  b'tio'),
        (False, b'eleventh', b'11a', '1-', '"k",',  '1,',  b'elva'),  # float64, int32_10 and number:int bad
        (True,  b'twelfth',  b'12',  '12', '"l"',   '12',  b'tolv'),
    ]
    dw = DatasetWriter(name="filter bad across types", columns=columns)
    dw.set_slice(0)
    want = []

    def add_want(v):
        want.append((int(v[3]), v[1], json.loads(v[4]), v[6].decode('utf-8')))

    for v in data:
        if v[0]:
            add_want(v)
        dw.write(*v[1:])
    for sliceno in range(1, g.SLICES):
        dw.set_slice(sliceno)
    source_ds = dw.finish()
    # Once with just filter_bad, once with some defaults too.
    defaults = {}
    for _ in range(2):
        jid = subjobs.build(
            'dataset_type',
            datasets=dict(source=source_ds),
            options=dict(
                column2type={t: t for t in columns},
                filter_bad=True,
                defaults=defaults,
            ),
        )
        typed_ds = Dataset(jid)
        got = list(typed_ds.iterate(0, ['int32_10', 'bytes', 'json', 'unicode:utf-8']))
        assert got == want, "Expected %r, got %r from %s (from %r%s)" % (
            want, got, typed_ds, source_ds, ' with defaults' if defaults else '')
        # make more lines "ok" for the second lap
        defaults = {'number:int': '0', 'float64': '0', 'json': '"replacement"'}
        add_want(data[3])
        add_want(data[5])
        data[6][4] = '"replacement"'
        add_want(data[6])
        want.sort()  # they were added out of order; sorting on int32_10 restores row order.
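The rule the test leans on: with filter_bad, a line survives only if every column either types cleanly or has a default to fall back on. A sketch of that bookkeeping under that assumption (not dataset_type's actual code):

def surviving(rows, parsers, defaults):
    # rows: list of dicts column -> raw value; parsers: column -> callable
    # raising ValueError on bad input; defaults: columns with a fallback.
    kept = []
    for row in rows:
        ok = True
        for col, parse in parsers.items():
            try:
                parse(row[col])
            except ValueError:
                if col not in defaults:
                    ok = False
                    break
        if ok:
            kept.append(row)
    return kept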
Example #5
def write(data, **kw):
    dw = DatasetWriter(columns=columns, **kw)
    w = dw.get_split_write_dict()
    for values in data:
        w(values)
    return dw.finish()
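Typical use, chaining two datasets (hypothetical call; rows_a and rows_b would be lists of dicts matching columns):

first = write(rows_a, name="first")
second = write(rows_b, name="second", previous=first)  # chains after first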
Example #6
def do_one(params, name, data):
	dw = DatasetWriter(name=name, columns=columns)
	dw.set_slice(0)
	for v in data:
		if v is None:
			d = dict(
				ascii_new=None,
				ascii_old=None,
				bytes_new=None,
				bytes_old=None,
				unicode_new=None,
				unicode_old=None,
			)
		else:
			d = dict(
				ascii_new=v,
				ascii_old=v,
				bytes_new=uni(v).encode("ascii"),
				bytes_old=uni(v).encode("ascii"),
				unicode_new=uni(v),
				unicode_old=uni(v),
			)
		dw.write_dict(d)
	# We don't really want the other slices, but write one thing to
	# each, to make sure it doesn't show up in slice 0.
	# (Small slice merging will put it in the same file, so this is
	# a real risk.)
	for sliceno in range(1, params.slices):
		dw.set_slice(sliceno)
		dw.write_dict(d)
	dw.finish()

	# verify we got what we asked for
	me_ds = Dataset(params.jobid, name)
	for colname, coltype in columns.items():
		col = me_ds.columns[colname]
		assert col.type == coltype.split("_")[-1], colname
		assert col.backing_type == coltype, colname
		for want, got in zip(data, me_ds.iterate(0, colname)):
			if want is not None:
				if PY2 and "unicode" in coltype:
					want = uni(want)
				if PY3 and "bytes" in coltype:
					want = want.encode("ascii")
			assert want == got, "%s in %s did not contain the expected value. Wanted %r but got %r." % (colname, me_ds, want, got)

	# check that both types of bytes filter correctly through typing
	jid = subjobs.build("dataset_type", datasets=dict(source=me_ds), options=dict(
		column2type=dict(
			ascii_new="bytes",
			ascii_old="number", # fails on the string, so that gets filtered out everywhere
			bytes_new="bytes",
			bytes_old="bytes",
		),
		filter_bad=True,
	))
	ds = Dataset(jid)
	# verify the number first
	data_it = iter(raw_data)
	next(data_it) # skip the filtered out string
	for got in ds.iterate(0, "ascii_old"):
		want = next(data_it)
		if want is None:
			# Becomes 0 because the typer (unfortunately) sees it as an empty string
			want = 0
		assert want == got, "ascii_old in %s did not type correctly as number. Wanted %r but got %r." % (ds, want, got)
	# now verify all the bytes ones are ok, no longer containing the string.
	for colname in ("ascii_new", "bytes_new", "bytes_old",):
		data_it = iter(data)
		next(data_it) # skip the filtered out string
		for got in ds.iterate(0, colname):
			want = next(data_it)
			if want is not None:
				want = want.encode("ascii")
			assert want == got, "%s in %s did not roundtrip correctly as bytes. Wanted %r but got %r." % (colname, ds, want, got)

	# and now check that the Nones are ok after making bytes from ascii and unicode from bytes.
	jid = subjobs.build("dataset_type", datasets=dict(source=me_ds), options=dict(
		column2type=dict(
			ascii_new="bytes",
			ascii_old="bytes",
			bytes_new="unicode:ascii",
			bytes_old="unicode:ascii",
		),
	))
	ds = Dataset(jid)
	for colname in ("ascii_new", "ascii_old", "bytes_new", "bytes_old",):
		for want, got in ds.iterate(0, ["unicode_new", colname]):
			assert uni(want) == uni(got), "%s in %s did not roundtrip correctly as bytes. Wanted %r but got %r." % (colname, ds, want, got)