def analysis(sliceno, params):
    """Verify the per-slice contents of datasets.source and datasets derived from it.

    Checks plain columns, a named sub-dataset, the optionally passed
    dataset, a hash-partitioned synthesis dataset, a manually sliced
    synthesis dataset and a None-capability dataset.
    """
    assert list(datasets.source.iterate(sliceno, "a")) == [sliceno, 42]
    assert list(datasets.source.iterate(sliceno, "b")) == ["a", str(sliceno)]
    named = Dataset(datasets.source, "named")
    assert list(named.iterate(sliceno, "c")) == [True, False]
    # Day-of-month is clamped to 31 so high slice numbers still form valid dates.
    assert list(named.iterate(sliceno, "d")) == [
        date(1536, 12, min(sliceno + 1, 31)),
        date(2236, 5, min(sliceno + 1, 31))
    ]
    if sliceno < test_data.value_cnt:
        passed = Dataset(datasets.source, "passed")
        good = tuple(v[sliceno] for _, v in sorted(test_data.data.items()))
        assert list(passed.iterate(sliceno)) == [good]
    synthesis_split = Dataset(datasets.source, "synthesis_split")
    values = zip((
        1,
        2,
        3,
    ), "abc")
    # Named hashfunc (not "hash") to avoid shadowing the builtin.
    hashfunc = typed_writer("int32").hash
    good = [v for v in values if hashfunc(v[0]) % params.slices == sliceno]
    assert list(synthesis_split.iterate(sliceno)) == good
    synthesis_manual = Dataset(datasets.source, "synthesis_manual")
    assert list(synthesis_manual.iterate(sliceno, "sliceno")) == [sliceno]
    nonetest = Dataset(datasets.source, "nonetest")
    good = tuple(v[0] if k in test_data.not_none_capable else None
                 for k, v in sorted(test_data.data.items()))
    assert list(nonetest.iterate(sliceno)) == [good]
# Exemplo n.º 2
# 0
def synthesis(params):
    """Exercise dataset_sort: every column type, reverse order, and
    sorting on two columns across all slices."""
    source = Dataset(subjobs.build("test_sorting_gendata"))
    # Every datatype must be sortable.
    for column in test_data.data:
        check_one(params.slices, column, source)
    # Reverse sorting must work too.
    check_one(params.slices, "int32", source, reverse=True)
    # Sort on [int64, int32] descending, across slices.
    sort_job = subjobs.build(
        "dataset_sort",
        options=dict(
            sort_columns=["int64", "int32"],
            sort_order="descending",
            sort_across_slices=True,
        ),
        datasets=dict(source=source),
    )
    ordered_keys = sorted(test_data.data)
    off64 = ordered_keys.index("int64")
    off32 = ordered_keys.index("int32")
    everything = []
    for sliceno in range(params.slices):
        everything.extend(test_data.sort_data_for_slice(sliceno))
    expected = sorted(
        everything,
        key=lambda row: (noneninf(row[off64]), noneninf(row[off32])),
        reverse=True,
    )
    got = list(Dataset(sort_job).iterate(None))
    assert unnan(got) == unnan(
        expected), "Sorting across slices on [int64, int32] bad (%s)" % (sort_job, )
def test_filter_bad_across_types():
	"""Check filter_bad in dataset_type across several types.

	A line is dropped when any of its columns fails to type; defaults
	given on the second lap rescue some previously bad lines.
	"""
	columns={
		'bytes': 'bytes',
		'float64': 'bytes',
		'int32_10': 'ascii',
		'json': 'unicode',
		'number:int': 'unicode',
		'unicode:utf-8': 'bytes',
	}
	# all_good, *values
	# Make sure all those types (except bytes) can filter other lines,
	# and be filtered by other lines. And that several filtering values
	# is not a problem (line 11).
	data = [
		(True,  b'first',    b'1.1', '1',  '"a"',   '001', b'ett',),
		(True,  b'second',   b'2.2', '2',  '"b"',   '02',  b'tv\xc3\xa5',),
		(True,  b'third',    b'3.3', '3',  '["c"]', '3.0', b'tre',),
		(False, b'fourth',   b'4.4', '4',  '"d"',   '4.4', b'fyra',),       # number:int bad
		(False, b'fifth',    b'5.5', '-',  '"e"',   '5',   b'fem',),        # int32_10 bad
		(False, b'sixth',    b'6.b', '6',  '"f"',   '6',   b'sex',),        # float64 bad
		# A list (not a tuple) because the json value is patched below.
		[False, b'seventh',  b'7.7', '7',  '{"g"}', '7',   b'sju',],        # json bad
		(False, b'eigth',    b'8.8', '8',  '"h"',   '8',   b'\xa5\xc3tta',),# unicode:utf-8 bad
		(True,  b'ninth',    b'9.9', '9',  '"i"',   '9',   b'nio',),
		(True,  b'tenth',    b'10',  '10', '"j"',   '10',  b'tio',),
		(False, b'eleventh', b'11a', '1-', '"k",',  '1,',  b'elva',),       # float64, int32_10 and number:int bad
		(True,  b'twelfth',  b'12',  '12', '"l"',   '12',  b'tolv',),
	]
	dw = DatasetWriter(name="filter bad across types", columns=columns)
	dw.set_slice(0)
	want = []
	def add_want(v):
		want.append((int(v[3]), v[1], json.loads(v[4]), v[6].decode('utf-8'),))
	for v in data:
		if v[0]:
			add_want(v)
		dw.write(*v[1:])
	# All data goes in slice 0; mark the remaining slices as written (empty).
	for sliceno in range(1, g.slices):
		dw.set_slice(sliceno)
	source_ds = dw.finish()
	# Once with just filter_bad, once with some defaults too.
	defaults = {}
	for _ in range(2):
		jid = subjobs.build(
			'dataset_type',
			datasets=dict(source=source_ds),
			options=dict(column2type={t: t for t in columns}, filter_bad=True, defaults=defaults),
		)
		typed_ds = Dataset(jid)
		got = list(typed_ds.iterate(0, ['int32_10', 'bytes', 'json', 'unicode:utf-8']))
		# Typo fix: "Exptected" -> "Expected".
		assert got == want, "Expected %r, got %r from %s (from %r%s)" % (want, got, typed_ds, source_ds, ' with defaults' if defaults else '')
		# make more lines "ok" for the second lap
		defaults = {'number:int': '0', 'float64': '0', 'json': '"replacement"'}
		add_want(data[3])
		add_want(data[5])
		data[6][4] = '"replacement"'
		add_want(data[6])
		want.sort() # adding them out of order, int32_10 sorts correctly.
def _verify(name, types, data, coltype, want, default, want_fail, kw):
	"""Write data as coltype into a dataset, then type it as each of types
	via dataset_type and check the result.

	want may be a callable (used directly as the checker), a list (same
	expectation for every type) or a dict keyed by type name.  default
	(unless it is no_default) is passed as the typing default for the
	data column, want_fail inverts the expectation to a JobError, and kw
	is merged into the dataset_type options.
	"""
	if callable(want):
		check = want
	else:
		def check(got, fromstr, filtered=False):
			# NOTE: typ is looked up late (closure), so this sees the
			# type of the current iteration of the loop below.
			want1 = want if isinstance(want, list) else want[typ]
			if filtered:
				# filter_bad drops every other line (the b'skip' ones).
				want1 = want1[::2]
			assert got == want1, 'Expected %r, got %r from %s.' % (want1, got, fromstr,)
	dw = DatasetWriter(name=name, columns={'data': coltype, 'extra': 'bytes'})
	dw.set_slice(0)
	for ix, v in enumerate(data):
		# 'extra' alternates typeable/untypeable so filter_bad (below)
		# removes every odd line.
		dw.write(v, b'1' if ix % 2 == 0 else b'skip')
	# All data goes in slice 0; mark the rest as written (empty).
	for sliceno in range(1, g.slices):
		dw.set_slice(sliceno)
	bytes_ds = dw.finish()
	for typ in types:
		opts = dict(column2type=dict(data=typ))
		opts.update(kw)
		if default is not no_default:
			opts['defaults'] = {'data': default}
		try:
			jid = subjobs.build('dataset_type', datasets=dict(source=bytes_ds), options=opts)
		except JobError:
			if want_fail:
				continue
			raise Exception('Typing %r as %s failed.' % (bytes_ds, typ,))
		assert not want_fail, "Typing %r as %s should have failed, but didn't (%s)." % (bytes_ds, typ, jid)
		typed_ds = Dataset(jid)
		got = list(typed_ds.iterate(0, 'data'))
		check(got, '%s (typed as %s from %r)' % (typed_ds, typ, bytes_ds,))
		if 'filter_bad' not in opts and not callable(want):
			# Second pass: also type 'extra' as int32_10 with filter_bad,
			# which should drop the b'skip' lines.
			opts['filter_bad'] = True
			opts['column2type']['extra'] = 'int32_10'
			jid = subjobs.build('dataset_type', datasets=dict(source=bytes_ds), options=opts)
			typed_ds = Dataset(jid)
			got = list(typed_ds.iterate(0, 'data'))
			check(got, '%s (typed as %s from %r with every other line skipped from filter_bad)' % (typed_ds, typ, bytes_ds,), True)
		used_type(typ)
# Exemplo n.º 5
# 0
def check_one(slices, key, source, reverse=False):
    """Sort source on column key with dataset_sort (per slice) and verify
    each slice against a locally computed reference ordering.

    None and NaN need explicit placement, hence the custom cmp below.
    """
    jid = subjobs.build(
        "dataset_sort",
        options=dict(
            sort_columns=key,
            sort_order="descending" if reverse else "ascending",
        ),
        datasets=dict(source=source),
    )
    ds = Dataset(jid)
    # Rows are tuples of the columns in sorted name order.
    key_off = sorted(test_data.data).index(key)
    # This provides better separation than the replacement values
    # used in the actual sort method (but this is slow).
    if 'date' in key or 'time' in key:
        nonepos = 1
    else:
        nonepos = -1

    def cmp(a, b):
        # Three-way comparison on the key column. None sorts after or
        # before everything depending on nonepos; NaN always sorts last.
        a = a[key_off]
        b = b[key_off]
        if a is None:
            if b is None:
                return 0
            return nonepos
        if b is None:
            return -nonepos
        if isinstance(a, float):
            if isnan(a):
                if isnan(b):
                    return 0
                return 1
            if isnan(b):
                return -1
        if a < b:
            return -1
        # bool result: True (1) when a > b, False (0) when equal.
        return a > b

    keycmp = cmp_to_key(cmp)
    for sliceno in range(slices):
        good = sorted(test_data.sort_data_for_slice(sliceno),
                      key=keycmp,
                      reverse=reverse)
        check = list(ds.iterate(sliceno))
        # unnan presumably normalises NaN values so rows compare with ==
        # (NaN != NaN otherwise) — confirm against its definition.
        assert unnan(check) == unnan(
            good), "Slice %d sorted on %s bad (%s)" % (
                sliceno,
                key,
                jid,
            )
def synthesis(params, prepare_res):
    """Sort the dataset prepared in prepare on "num" across all slices
    and verify the resulting order of the "str" column."""
    writer = prepare_res
    sorted_job = subjobs.build(
        "dataset_sort",
        options=dict(
            sort_columns="num",
            sort_across_slices=True,
        ),
        datasets=dict(source=writer.finish()),
    )
    got = list(Dataset(sorted_job).iterate(None, "str"))
    # First the ten letter rows, then 64 repeats of one row per slice.
    expected = list("cghjabdefi")
    expected += [str(s) for s in range(params.slices)] * 64
    assert got == expected
# Exemplo n.º 7
# 0
	def dataset(dsid):
		"""Return dataset info, or — with ?column=NAME — the first lines
		(default 10, override with ?lines=N) of that column as JSON."""
		ds = Dataset(dsid.rstrip('/'))
		q = bottle.request.query
		if not q.column:
			return dict(ds=ds)
		line_cnt = int(q.lines or 10)
		values = itertools.islice(ds.iterate(None, q.column), line_cnt)
		coltype = ds.columns[q.column].type
		# Stringify types json.dumps can not serialise directly.
		if coltype in ('datetime', 'date', 'time',):
			values = map(str, values)
		elif coltype in ('bytes', 'pickle',):
			values = map(repr, values)
		bottle.response.content_type = 'application/json; charset=UTF-8'
		return json.dumps(list(values))
def verify_ds(options, d, d_bad, d_skipped, filename):
    """Run csvimport with options and verify the resulting datasets.

    d, d_bad and d_skipped map index/lineno to expected data; entries
    are deleted as they are matched, so anything left at the end is
    reported as missing.
    """
    jid = subjobs.build("csvimport", options=options)
    ds = Dataset(jid)
    expected_columns = {"ix", "0", "1"}
    if options.get("lineno_label"):
        expected_columns.add(options["lineno_label"])
        # The ix column doubles as the expected line number.
        lineno_want = {ix: int(ix) for ix in ds.iterate(None, "ix")}
    assert set(ds.columns) == expected_columns
    # Order varies depending on slice count, so we use a dict {ix: data}
    for ix, a, b in ds.iterate(None, ["ix", "0", "1"]):
        try:
            ix = int(ix)
        except ValueError:
            # We have a few non-numeric ones
            pass
        assert ix in d, "Bad index %r in %r (%s)" % (
            ix,
            filename,
            jid,
        )
        assert a == b == d[ix], "Wrong data for line %r in %r (%s)" % (
            ix,
            filename,
            jid,
        )
        del d[ix]
    assert not d, "Not all lines returned from %r (%s), %r missing" % (
        filename,
        jid,
        set(d.keys()),
    )
    if options.get("allow_bad"):
        for ix, data in Dataset(jid, "bad").iterate(None, ["lineno", "data"]):
            assert ix in d_bad, "Bad bad_lineno %d in %r (%s/bad) %r" % (
                ix,
                filename,
                jid,
                data,
            )
            assert data == d_bad[
                ix], "Wrong saved bad line %d in %r (%s/bad).\nWanted %r.\nGot    %r." % (
                    ix,
                    filename,
                    jid,
                    d_bad[ix],
                    data,
                )
            del d_bad[ix]
    # Outside the if: without allow_bad the caller must pass an empty dict.
    assert not d_bad, "Not all bad lines returned from %r (%s), %r missing" % (
        filename,
        jid,
        set(d_bad.keys()),
    )

    if options.get("comment") or options.get("skip_lines"):
        for ix, data in Dataset(jid,
                                "skipped").iterate(None, ["lineno", "data"]):
            assert ix in d_skipped, "Bad skipped_lineno %d in %r (%s/skipped) %r" % (
                ix,
                filename,
                jid,
                data,
            )
            assert data == d_skipped[
                ix], "Wrong saved skipped line %d in %r (%s/skipped).\nWanted %r.\nGot    %r." % (
                    ix,
                    filename,
                    jid,
                    d_skipped[ix],
                    data,
                )
            del d_skipped[ix]
    # Message fix: this checks skipped lines, not bad lines.
    assert not d_skipped, "Not all skipped lines returned from %r (%s), %r missing" % (
        filename,
        jid,
        set(d_skipped.keys()),
    )

    if options.get("lineno_label"):
        lineno_got = dict(ds.iterate(None,
                                     ["ix", options.get("lineno_label")]))
        assert lineno_got == lineno_want, "%r != %r" % (
            lineno_got,
            lineno_want,
        )
def check_one(job,
              newline,
              sep,
              data,
              want_res=None,
              prefix="",
              quotes=False,
              leave_bad=False):
    """Write data (header row first) to a temp csv file with the given
    separator/newline/quoting, csvimport it and verify labels and rows.

    sep is the separator's character code; quotes is falsy or the quote
    character; leave_bad keeps separator characters inside unquoted
    values (to provoke import errors).  Raises CSVImportException,
    WrongLabelsException or WrongDataException on mismatch.
    """
    sep_c = uni(chr(sep))
    # Can't have separator character in unquoted values
    if not quotes and not leave_bad:
        data = [[el.replace(sep_c, "") for el in line] for line in data]
    if not want_res:
        # Default expectation: every non-header row, ascii-encoded.
        want_res = [
            tuple(s.encode("ascii") for s in line) for line in data[1:]
        ]
    filename = "%s_csv.%d.%s.txt" % (prefix, sep, "CRLF"
                                     if newline == "\r\n" else ord(newline))
    newline = uni(newline)
    with job.open(filename, "w", encoding="iso-8859-1", temp=True) as fh:
        for line in data:
            if quotes:
                # Quote each value, doubling embedded quote characters.
                line = [
                    quotes + el.replace(quotes, quotes + quotes) + quotes
                    for el in line
                ]
            fh.write(sep_c.join(line))
            fh.write(newline)
    try:
        jid = subjobs.build("csvimport",
                            options=dict(
                                filename=job.filename(filename),
                                separator=sep_c,
                                quotes=quotes,
                                # '' presumably means autodetect when the
                                # newline contains "\n" — confirm against
                                # the csvimport options.
                                newline='' if "\n" in newline else newline,
                            ))
    except JobError as e:
        raise CSVImportException(
            "Failed to csvimport for separator %d with newline %r, csvimport error was:\n%s"
            % (sep, newline, e.format_msg()))
    ds = Dataset(jid)
    labels = sorted(ds.columns)
    if labels != data[0]:
        raise WrongLabelsException(
            "csvimport gave wrong labels for separator %d with newline %r: %r (expected %r)"
            % (
                sep,
                newline,
                labels,
                data[0],
            ))
    res = list(ds.iterate(None, data[0]))
    if res != want_res:
        raise WrongDataException(
            "csvimport gave wrong data for separator %d with newline %r: %r (expected %r)"
            % (
                sep,
                newline,
                res,
                want_res,
            ))
# Exemplo n.º 10
# 0
def test_filter_bad_across_types():
    """Check filter_bad in dataset_type across several types.

    A line is dropped (and lands in the "bad" dataset) when any column
    fails to type; defaults supplied on the second lap rescue some of
    the previously bad lines.
    """
    columns = {
        'bytes': 'bytes',
        'float64': 'bytes',
        'int32_10': 'ascii',
        'json': 'unicode',
        'number:int': 'unicode',
        'unicode:utf-8': 'bytes',
    }
    # Rows are [all_good, bytes, float64, int32_10, json, number:int,
    # unicode:utf-8].  Make sure all those types (except bytes) can
    # filter other lines, and be filtered by other lines.  And that
    # several filtering values is not a problem (line 11).
    # Lists (not tuples) because rows are mutated below.
    data = [
        [True, b'first', b'1.1', '1', '"a"', '001', b'ett'],
        [True, b'second', b'2.2', '2', '"b"', '02', b'tv\xc3\xa5'],
        [True, b'third', b'3.3', '3', '["c"]', '3.0', b'tre'],
        [False, b'fourth', b'4.4', '4', '"d"', '4.4', b'fyra'],  # number:int bad
        [False, b'fifth', b'5.5', '-', '"e"', '5', b'fem'],  # int32_10 bad
        [False, b'sixth', b'6.b', '6', '"f"', '6', b'sex'],  # float64 bad
        [False, b'seventh', b'7.7', '7', '{"g"}', '7', b'sju'],  # json bad
        [False, b'eigth', b'8.8', '8', '"h"', '8', b'\xa5\xc3tta'],  # unicode:utf-8 bad
        [True, b'ninth', b'9.9', '9', '"i"', '9', b'nio'],
        [True, b'tenth', b'10', '10', '"j"', '10', b'tio'],
        [False, b'eleventh', b'11a', '1-', '"k",', '1,', b'elva'],  # float64, int32_10 and number:int bad
        [True, b'twelfth', b'12', '12', '"l"', '12', b'tolv'],
    ]
    # Computed before any zpickle value is appended, and sorted(columns)
    # does not include zpickle, so bad rows never carry it.
    want_bad = [tuple(row[1:]) for row in data if not row[0]]
    dw = DatasetWriter(name="filter bad across types",
                       columns=columns,
                       allow_missing_slices=True)
    cols_to_check = ['int32_10', 'bytes', 'json', 'unicode:utf-8']
    if PY3:
        # z so it sorts last.
        dw.add('zpickle', 'pickle')
        cols_to_check.append('zpickle')
        for ix, row in enumerate(data):
            row.append({ix})
    dw.set_slice(0)
    want = []

    def add_want(ix):
        # Expected typed output for data[ix], in cols_to_check order.
        row = data[ix]
        expect = (
            int(row[3]),
            row[1],
            json.loads(row[4]),
            row[6].decode('utf-8'),
        )
        if PY3:
            expect += (row[7],)
        want.append(expect)

    for ix, row in enumerate(data):
        if row[0]:
            add_want(ix)
        dw.write(*row[1:])
    source_ds = dw.finish()
    # Once with just filter_bad, once with some defaults too.
    defaults = {}
    for _ in range(2):
        jid = subjobs.build(
            'dataset_type',
            datasets=dict(source=source_ds),
            options=dict(
                column2type={t: t for t in columns},
                filter_bad=True,
                defaults=defaults,
            ),
        )
        typed_ds = Dataset(jid)
        got = list(typed_ds.iterate(0, cols_to_check))
        assert got == want, "Expected %r, got %r from %s (from %r%s)" % (
            want, got, typed_ds, source_ds,
            ' with defaults' if defaults else '')
        bad_ds = Dataset(jid, 'bad')
        got_bad = list(bad_ds.iterate(0, sorted(columns)))
        assert got_bad == want_bad, "Expected %r, got %r from %s (from %r%s)" % (
            want_bad, got_bad, bad_ds, source_ds,
            ' with defaults' if defaults else '')
        # make more lines "ok" for the second lap
        if not defaults:
            want_bad.pop(0)  # number:int
            want_bad.pop(1)  # float64
            want_bad.pop(1)  # json
        defaults = {'number:int': '0', 'float64': '0', 'json': '"replacement"'}
        add_want(3)
        add_want(5)
        data[6][4] = '"replacement"'
        add_want(6)
        want.sort()  # adding them out of order, int32_10 sorts correctly.
# Exemplo n.º 11
# 0
def synthesis(params):
	"""The parent job's dataset must contain exactly foo and bar."""
	found = set(Dataset(params.jobid).iterate(None, "data"))
	assert found == {"foo", "bar"}
# Exemplo n.º 12
# 0
def analysis(sliceno, params):
	"""Every analysis process sees the full dataset when iterating slice None."""
	found = set(Dataset(params.jobid).iterate(None, "data"))
	assert found == {"foo", "bar"}