def synthesis():
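    # The outer run builds this same method again with inner=True, expects that
    # subjob to fail, then polls job.output() until both the printed lines and
    # the exception text have been captured (the iowrapper writes OUTPUT
    # asynchronously, hence the retry loop).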
    lines = [
        'printing a bunch of lines, this is line %d.' % (n, )
        for n in range(150)
    ]
    if options.inner:
        for s in lines:
            print(s)
        raise Exception('this is an exception, but nothing went wrong')
    else:
        try:
            subjobs.build('test_output_on_error', inner=True)
        except JobError as e:
            job = e.job
        else:
            raise Exception("test_output_on_error with inner=True didn't fail")
        # give the iowrapper some time to finish
        for attempt in range(25):
            got_lines = job.output().split('\n')
            if got_lines[:len(lines)] == lines:
                for line in got_lines:
                    if line == 'Exception: this is an exception, but nothing went wrong':
                        return
            # not yet, wait a little (total of 30s)
            if attempt > 1:
                print(
                    'Output from %s has not appeared yet, waiting more (%d).' %
                    (
                        job,
                        attempt,
                    ))
            sleep(attempt / 10.0)
        raise Exception('Not all output from %s was saved in OUTPUT' % (job, ))
def synthesis():
    assert options.level < 5, "Too deep subjob nesting allowed"
    try:
        subjobs.build('test_subjobs_nesting',
                      options={'level': options.level + 1})
    except JobError:
        assert options.level == 4, "Not enough subjob nesting allowed"
def synthesis(params):
    source = Dataset(subjobs.build("test_sorting_gendata"))
    # Test that all datatypes work for sorting
    for key in test_data.data:
        check_one(params.slices, key, source)
    # Check reverse sorting
    check_one(params.slices, "int32", source, reverse=True)
    # Check that sorting across slices and by two columns works
    jid = subjobs.build(
        "dataset_sort",
        options=dict(
            sort_columns=["int64", "int32"],
            sort_order="descending",
            sort_across_slices=True,
        ),
        datasets=dict(source=source),
    )
    int64_off = sorted(test_data.data).index("int64")
    int32_off = sorted(test_data.data).index("int32")
    all_data = chain.from_iterable(
        test_data.sort_data_for_slice(sliceno)
        for sliceno in range(params.slices))
    good = sorted(all_data,
                  key=lambda t: (
                      noneninf(t[int64_off]),
                      noneninf(t[int32_off]),
                  ),
                  reverse=True)
    ds = Dataset(jid)
    check = list(ds.iterate(None))
    assert unnan(check) == unnan(
        good), "Sorting across slices on [int64, int32] bad (%s)" % (jid, )
Example #4
def test_filter_bad_with_rename_and_chain():
    dw = DatasetWriter(name="filter bad with rename",
                       allow_missing_slices=True)
    dw.add('a', 'ascii')
    dw.add('b', 'bytes')
    dw.add('c', 'unicode')
    dw.set_slice(0)
    dw.write('0', b'1', '2')
    dw.write('9', b'A', 'B')
    dw.write('C', b'D', 'E')
    source_ds = dw.finish()
    jid = subjobs.build(
        'dataset_type',
        column2type=dict(b='int32_10', c='int64_16', d='int32_16'),
        filter_bad=True,
        rename=dict(a='b', b='c', c='d'),
        source=source_ds,
    )
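    # With rename a->b, b->c, c->d the new b is parsed base 10 while c and d are
    # parsed base 16, so ('0', b'1', '2') becomes (0, 1, 2) and ('9', b'A', 'B')
    # becomes (9, 10, 11); ('C', b'D', 'E') fails int32_10 and is diverted to
    # the 'bad' dataset.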
    typed_ds = jid.dataset()
    coltypes = sorted(
        (name, col.type) for name, col in typed_ds.columns.items())
    assert coltypes == [('b', 'int32'), ('c', 'int64'),
                        ('d', 'int32')], coltypes
    assert list(typed_ds.iterate(0)) == [(0, 1, 2), (9, 10, 11)]
    bad_ds = jid.dataset('bad')
    coltypes = sorted((name, col.type) for name, col in bad_ds.columns.items())
    assert coltypes == [('b', 'ascii'), ('c', 'bytes'),
                        ('d', 'unicode')], coltypes
    assert list(bad_ds.iterate(0)) == [('C', b'D', 'E')]

    dw = DatasetWriter(name="filter bad with rename chain",
                       allow_missing_slices=True,
                       previous=source_ds)
    dw.add('a', 'ascii')
    dw.add('b', 'ascii')
    dw.add('c', 'ascii')
    dw.set_slice(0)
    dw.write('3', '4', '5')
    dw.write('6', '7', 'eight')
    source_ds = dw.finish()
    jid = subjobs.build(
        'dataset_type',
        column2type=dict(a='number', b='int32_10', c='int64_10'),
        defaults=dict(a='8'),
        filter_bad=True,
        rename=dict(a='b', b='c', c='a'),
        source=source_ds,
    )
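    # dataset_type follows the chain back through source_ds: rename a->b, b->c,
    # c->a means the typed a comes from the old c (as number, defaulting to 8
    # when unparseable), b from the old a (int32_10) and c from the old b
    # (int64_10); lines where an undefaulted column fails to parse go to 'bad'.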
    typed_ds = jid.dataset()
    coltypes = sorted(
        (name, col.type) for name, col in typed_ds.columns.items())
    assert coltypes == [('a', 'number'), ('b', 'int32'),
                        ('c', 'int64')], coltypes
    assert list(typed_ds.iterate(0)) == [(2, 0, 1), (5, 3, 4), (8, 6, 7)]
    bad_ds = jid.dataset('bad')
    coltypes = sorted((name, col.type) for name, col in bad_ds.columns.items())
    assert coltypes == [('a', 'unicode'), ('b', 'ascii'),
                        ('c', 'bytes')], coltypes
    assert list(bad_ds.iterate(0)) == [('B', '9', b'A'), ('E', 'C', b'D')]
Example #5
def test(src_ds, opts, expect_lines):
	opts = DotDict(opts)
	def rename(colname):
		return opts.get('rename', {}).get(colname, colname)
	cols = set(opts.column2type)
	opts.discard_untyped = True
	msg = 'Testing with types %s' % (', '.join(v for k, v in sorted(opts.column2type.items())),)
	expect_hl = None
	if src_ds.hashlabel and opts.column2type.get(src_ds.hashlabel) == 'json':
		# json is not hashable, so we have to override the hashlabel to nothing in this case.
		opts.hashlabel = ''
		msg += ' (clearing hashlabel)'
	elif src_ds.hashlabel:
		expect_hl = rename(src_ds.hashlabel)
		if expect_hl in opts.column2type:
			msg += ' (hashed on %s)' % (opts.column2type[expect_hl],)
		else:
			expect_hl = None
			msg += ' (hashed on <untyped column>)'
	print(msg)
	just_typed = subjobs.build('dataset_type', options=opts, datasets=dict(source=src_ds)).dataset()
	assert just_typed.hashlabel == expect_hl, just_typed
	assert set(just_typed.columns) == cols, just_typed
	assert sum(just_typed.lines) == expect_lines, just_typed
	if rename(src_ds.hashlabel) not in opts.column2type or opts.get('hashlabel') == '':
		assert just_typed.hashlabel is None, just_typed
	else:
		assert just_typed.hashlabel == rename(src_ds.hashlabel), just_typed
	del opts.discard_untyped
	rev_rename = {v: k for k, v in opts.get('rename', {}).items()}
	discard = set(src_ds.columns) - set(rev_rename.get(n, n) for n in cols)
	if discard:
		d = opts.get('rename', {})
		d.update({k: None for k in discard})
		opts.rename = d
	for hashlabel in cols:
		if opts.column2type[hashlabel] == 'json':
			# not hashable
			continue
		opts['hashlabel'] = hashlabel
		print('%s rehashed on %s' % (msg, opts.column2type[hashlabel],))
		hashed_by_type = subjobs.build('dataset_type', options=opts, datasets=dict(source=src_ds)).dataset()
		assert hashed_by_type.hashlabel == hashlabel, hashed_by_type
		assert set(hashed_by_type.columns) == cols, hashed_by_type
		assert sum(hashed_by_type.lines) == expect_lines, hashed_by_type
		hashed_after = subjobs.build('dataset_hashpart', options=dict(hashlabel=hashlabel), datasets=dict(source=just_typed)).dataset()
		assert hashed_after.hashlabel == hashlabel, hashed_after
		assert set(hashed_after.columns) == cols, hashed_after
		assert sum(hashed_after.lines) == expect_lines, hashed_after
		if src_ds.hashlabel:
			# if src_ds has a hashlabel then just_typed will actually already be hashed, so hashed_after
			# will have been hashed twice and therefore have a different order than hashed_by_type.
			if rename(src_ds.hashlabel) == hashlabel:
				# These should be the same though.
				subjobs.build('test_compare_datasets', datasets=dict(a=hashed_by_type, b=just_typed))
			hashed_by_type = subjobs.build('dataset_sort', options=dict(sort_columns=rename('a')), datasets=dict(source=hashed_by_type))
			hashed_after = subjobs.build('dataset_sort', options=dict(sort_columns=rename('a')), datasets=dict(source=hashed_after))
		subjobs.build('test_compare_datasets', datasets=dict(a=hashed_by_type, b=hashed_after))
def synthesis(job):
	dw = DatasetWriter(name='empty', columns={'v': 'ascii'})
	dw.get_split_write()
	empty_ds = dw.finish()
	assert empty_ds.min('non-existent column') is empty_ds.max('non-existent column') is None, 'Dataset.min/max() broken for non-existent columns'
	for typ, groups in tests.items():
		t_ds = subjobs.build('dataset_type', column2type={'v': typ}, source=empty_ds).dataset()
		minmax = (t_ds.columns['v'].min, t_ds.columns['v'].max)

		if minmax != (None, None):
			raise Exception('Typing empty dataset as %s did not give minmax == None, gave %r' % (typ, minmax,))
		all_names = list(chain.from_iterable(groupdata[group].keys() for group in groups))
		# just 1 and 2, so we don't make way too many
		for num_groups in (1, 2,):
			for names in combinations(all_names, num_groups):
				ds, mn, mx = make_source(names)
				t_ds = subjobs.build('dataset_type', column2type={'v': typ}, source=ds).dataset()
				got_minmax = (t_ds.columns['v'].min, t_ds.columns['v'].max)
				want_minmax = (mn, mx)
				chk_minmax(got_minmax, want_minmax, 'Typing %s as %s gave wrong minmax: expected %r, got %r (in %s)' % (ds, typ, want_minmax, got_minmax, t_ds,))
				chk_minmax(got_minmax, (t_ds.min('v'), t_ds.max('v')), 'Dataset.min/max() broken on ' + t_ds)
				# verify writing the same data normally also gives the correct result
				dw = DatasetWriter(name='rewrite ' + t_ds, columns=t_ds.columns)
				write = dw.get_split_write()
				for v in t_ds.iterate(None, 'v'):
					write(v)
				re_ds = dw.finish()
				got_minmax = (re_ds.columns['v'].min, re_ds.columns['v'].max)
				want_minmax = (mn, mx)
				chk_minmax(got_minmax, want_minmax, 'Rewriting %s gave the wrong minmax: expected %r, got %r (in %s)' % (t_ds, want_minmax, got_minmax, re_ds,))

	# make sure renaming doesn't mix anything up
	dw = DatasetWriter(name='rename', columns={'a': 'ascii', 'b': 'ascii'})
	write = dw.get_split_write()
	write('5', '3')
	write('7', 'oops')
	ds = dw.finish()
	t_ds = subjobs.build(
		'dataset_type',
		column2type=dict(num='number', int='int32_10'),
		defaults=dict(num='1', int='2'),
		rename=dict(a='num', b='int'),
		source=ds,
	).dataset()
	for name, want_minmax in (
		('num', (5, 7)),
		('int', (2, 3)),
	):
		got_minmax = (t_ds.columns[name].min, t_ds.columns[name].max)
		msg = 'Typing %s gave wrong minmax: expected %r, got %r (in %s)' % (ds, want_minmax, got_minmax, t_ds,)
		chk_minmax(got_minmax, want_minmax, msg)
Example #7
def synthesis():
    dw_a = DatasetWriter(name='a', columns={'num': 'int32'})
    dw_b = DatasetWriter(name='b', columns={'num': 'int32'}, previous=dw_a)
    dw_c = DatasetWriter(name='c', columns={'num': 'int32'}, previous=dw_b)
    w = dw_a.get_split_write()
    w(3)
    w(2)
    w = dw_b.get_split_write()
    w(2)
    w(1)
    w = dw_c.get_split_write()
    w(0)
    a = dw_a.finish()
    b = dw_b.finish()
    c = dw_c.finish()

    opts = dict(
        sort_columns='num',
        sort_across_slices=True,
    )

    # sort as a chain
    jid = subjobs.build('dataset_sort',
                        options=opts,
                        datasets=dict(source=a, previous=None))
    assert list(Dataset(jid).iterate(None, 'num')) == [2, 3]
    sorted_a = jid
    jid = subjobs.build('dataset_sort',
                        options=opts,
                        datasets=dict(source=b, previous=jid))
    assert list(Dataset(jid).iterate_chain(None, 'num')) == [2, 3, 1, 2]
    jid = subjobs.build('dataset_sort',
                        options=opts,
                        datasets=dict(source=c, previous=jid))
    assert list(Dataset(jid).iterate_chain(None, 'num')) == [2, 3, 1, 2, 0]

    # sort all as a single dataset
    jid = subjobs.build('dataset_sort',
                        options=opts,
                        datasets=dict(source=c, previous=None))
    assert list(Dataset(jid).iterate_chain(None, 'num')) == [0, 1, 2, 2, 3]

    # merge b and c but not a
    jid = subjobs.build('dataset_sort',
                        options=opts,
                        datasets=dict(source=c, previous=sorted_a))
    # test with new style job.dataset
    assert list(jid.dataset().iterate(None, 'num')) == [0, 1, 2]
    assert list(jid.dataset().iterate_chain(None, 'num')) == [2, 3, 0, 1, 2]
Example #8
def synthesis(job):
	dw = job.datasetwriter()
	dw.add('', 'number')
	dw.add('word', 'ascii')
	w = dw.get_split_write()
	w(0, 'foo')
	w(1, 'bar')
	ds = dw.finish()
	assert set(ds.columns) == {'', 'word'}
	assert list(ds.iterate(None, '')) == [0, 1]
	assert list(ds.iterate(None)) == [(0, 'foo'), (1, 'bar')]
	job = subjobs.build('csvexport', source=ds, filename='out.csv')
	job = subjobs.build('csvimport', filename=job.filename('out.csv'))
	job = subjobs.build('dataset_type', source=job, column2type={'': 'number', 'word': 'ascii'})
	assert list(job.dataset().iterate(None)) == list(ds.iterate(None))
def test_column_discarding():
	dw = DatasetWriter(name='column discarding')
	dw.add('a', 'bytes')
	dw.add('b', 'bytes')
	dw.add('c', 'bytes')
	w = dw.get_split_write()
	w(b'a', b'b', b'c')
	source = dw.finish()

	# Discard b because it's not typed
	ac_implicit = subjobs.build(
		'dataset_type',
		source=source,
		column2type=dict(a='ascii', c='ascii'),
		discard_untyped=True,
	).dataset()
	assert sorted(ac_implicit.columns) == ['a', 'c'], '%s: %r' % (ac_implicit, sorted(ac_implicit.columns),)
	assert list(ac_implicit.iterate(None)) == [('a', 'c',)], ac_implicit

	# Discard b explicitly
	ac_explicit = subjobs.build(
		'dataset_type',
		source=source,
		column2type=dict(a='ascii', c='ascii'),
		rename=dict(b=None),
	).dataset()
	assert sorted(ac_explicit.columns) == ['a', 'c'], '%s: %r' % (ac_explicit, sorted(ac_explicit.columns),)
	assert list(ac_explicit.iterate(None)) == [('a', 'c',)], ac_explicit

	# Discard c by overwriting it with b. Keep untyped b.
	ac_bASc = subjobs.build(
		'dataset_type',
		source=source,
		column2type=dict(a='ascii', c='ascii'),
		rename=dict(b='c'),
	).dataset()
	assert sorted(ac_bASc.columns) == ['a', 'b', 'c'], '%s: %r' % (ac_bASc, sorted(ac_bASc.columns),)
	assert list(ac_bASc.iterate(None)) == [('a', b'b', 'b',)], ac_bASc

	# Discard c by overwriting it with b. Also type b as a different type.
	abc_bASc = subjobs.build(
		'dataset_type',
		source=source,
		column2type=dict(a='ascii', b='strbool', c='ascii'),
		rename=dict(b='c'),
	).dataset()
	assert sorted(abc_bASc.columns) == ['a', 'b', 'c'], '%s: %r' % (abc_bASc, sorted(abc_bASc.columns),)
	assert list(abc_bASc.iterate(None)) == [('a', True, 'b',)], abc_bASc
def synthesis(job):
    dw = job.datasetwriter(name='a')
    dw.add('a', 'int32')
    w = dw.get_split_write()
    w(0)
    w(1)
    w(2)
    a = dw.finish()

    for filename, sliced, out_filename, open_func in (
        ('name', False, 'name', open),
        ('name', True, 'name.1', open),
        ('name%dend', True, 'name1end', open),
        ('name.gz', False, 'name.gz', gzip.open),
        ('name.gz', True, 'name.gz.1', gzip.open),
        ('a%02d.gz.b', True, 'a01.gz.b', gzip.open),
        ('a%02d.gz.b', False, 'a%02d.gz.b', gzip.open),
        ('name.gzonk', False, 'name.gzonk', open),
    ):
        job = subjobs.build('csvexport',
                            filename=filename,
                            sliced=sliced,
                            source=a,
                            labels=['a'])
        fn = job.filename(out_filename)
        with open_func(fn, mode='rb') as fh:
            got = fh.read()
        if sliced:
            want = b'a\n1\n'
        else:
            want = b'a\n0\n1\n2\n'
        assert want == got, 'wanted %r, got %r in %s' % (want, got, fn)
Example #11
def verify(source, lazy_quotes, q, sep, expect, **kw):
    j = subjobs.build('csvexport',
                      chain_source=True,
                      source=source,
                      lazy_quotes=lazy_quotes,
                      quote_fields=q,
                      separator=sep,
                      **kw)
    with j.open('result.csv', 'r' if PY3 else 'rb') as fh:
        got = fh.read()
    if lazy_quotes and sep:
        quote_func = make_lazy(sep, q)
    else:
        quote_func = lambda v: q + v.replace(q, q + q) + q
    want = '\n'.join(sep.join(map(quote_func, line)) for line in expect)
    if want != got:
        print('Unhappy with %s:' % (j.filename('result.csv'), ))
        print()
        print('Expected:')
        print(want)
        print('Got:')
        print(got)
        raise Exception(
            'csvexport failed with quote_fields=%r, separator=%r, lazy_quotes=%r'
            % (
                q,
                sep,
                lazy_quotes,
            ))
Example #12
def verify(params, jwf):
    jid = subjobs.build('test_jobwithfile', options=dict(inner=True, file=jwf))
    for sliceno in range(params.slices):
        assert jid.load('inner.pickle', sliceno) == {'inner': sliceno}
        assert jid.json_load('inner.json', sliceno) == {'inner': sliceno}
    assert jid.load('inner.pickle') == {'inner': None}
    assert jid.json_load('inner.json') == {'inner': None}
Example #13
def synthesis(job):
	if options.pos == -1:
		previous = job
		other = None
		j2p = {job: -1}
		alles = [job]
		for pos in range(5):
			previous = subjobs.build('test_jobchain', pos=pos, previous=previous, other=other)
			alles.append(previous)
			j2p[previous] = pos
			if pos == 2:
				other = alles[1]
			else:
				other = None
		assert alles == previous.chain()
		assert list(reversed(alles)) == previous.chain(reverse=True)
		def chk(tip, first, **kw):
			c = alles[tip].chain(**kw)
			c = [j2p[j] for j in c]
			assert c == list(range(first, tip)), (tip, first, kw)
		chk(5, -1)
		chk(5, 3, stop_job=alles[3])
		chk(5, 2, length=3)
		chk(4, 1, length=3)
		chk(5, 3, stop_job={alles[4]: 'previous'})
		chk(5, 1, stop_job={alles[4]: 'other'})
		chk(4, 1, stop_job={alles[4]: 'other'})
		chk(5, 2, stop_job={alles[4]: 'other'}, length=3)
		chk(5, 1, stop_job={alles[4]: 'other'}, length=5)
		assert alles[2].chain(length=0) == []
		assert job.chain() == [job]
		assert job.chain(stop_job=job) == []
def test_filter_bad_across_types():
	columns={
		'bytes': 'bytes',
		'float64': 'bytes',
		'int32_10': 'ascii',
		'json': 'unicode',
		'number:int': 'unicode',
		'unicode:utf-8': 'bytes',
	}
	# all_good, *values
	# Make sure all those types (except bytes) can filter other lines,
	# and be filtered by other lines. And that several filtering values
	# is not a problem (line 11).
	data = [
		(True,  b'first',    b'1.1', '1',  '"a"',   '001', b'ett',),
		(True,  b'second',   b'2.2', '2',  '"b"',   '02',  b'tv\xc3\xa5',),
		(True,  b'third',    b'3.3', '3',  '["c"]', '3.0', b'tre',),
		(False, b'fourth',   b'4.4', '4',  '"d"',   '4.4', b'fyra',),       # number:int bad
		(False, b'fifth',    b'5.5', '-',  '"e"',   '5',   b'fem',),        # int32_10 bad
		(False, b'sixth',    b'6.b', '6',  '"f"',   '6',   b'sex',),        # float64 bad
		[False, b'seventh',  b'7.7', '7',  '{"g"}', '7',   b'sju',],        # json bad
		(False, b'eigth',    b'8.8', '8',  '"h"',   '8',   b'\xa5\xc3tta',),# unicode:utf-8 bad
		(True,  b'ninth',    b'9.9', '9',  '"i"',   '9',   b'nio',),
		(True,  b'tenth',    b'10',  '10', '"j"',   '10',  b'tio',),
		(False, b'eleventh', b'11a', '1-', '"k",',  '1,',  b'elva',),       # float64, int32_10 and number:int bad
		(True,  b'twelfth',  b'12',  '12', '"l"',   '12',  b'tolv',),
	]
	dw = DatasetWriter(name="filter bad across types", columns=columns)
	dw.set_slice(0)
	want = []
	def add_want(v):
		want.append((int(v[3]), v[1], json.loads(v[4]), v[6].decode('utf-8'),))
	for v in data:
		if v[0]:
			add_want(v)
		dw.write(*v[1:])
	for sliceno in range(1, g.slices):
		dw.set_slice(sliceno)
	source_ds = dw.finish()
	# Once with just filter_bad, once with some defaults too.
	defaults = {}
	for _ in range(2):
		jid = subjobs.build(
			'dataset_type',
			datasets=dict(source=source_ds),
			options=dict(column2type={t: t for t in columns}, filter_bad=True, defaults=defaults),
		)
		typed_ds = Dataset(jid)
		got = list(typed_ds.iterate(0, ['int32_10', 'bytes', 'json', 'unicode:utf-8']))
		assert got == want, "Expected %r, got %r from %s (from %r%s)" % (want, got, typed_ds, source_ds, ' with defaults' if defaults else '')
		# make more lines "ok" for the second lap
		defaults = {'number:int': '0', 'float64': '0', 'json': '"replacement"'}
		add_want(data[3])
		add_want(data[5])
		data[6][4] = '"replacement"'
		add_want(data[6])
		want.sort() # adding them out of order, int32_10 sorts correctly.
def synthesis():
	typerename = dict(
		int64="int64_10",
		int32="int32_10",
		bits64="bits64_10",
		bits32="bits32_10",
		bool="strbool",
		datetime="datetime:%Y-%m-%d %H:%M:%S.%f",
		date="date:%Y-%m-%d",
		time="time:%H:%M:%S.%f",
		unicode="unicode:utf-8",
	)
	columns = {k: typerename.get(v.type, v.type) for k, v in datasets.typed.columns.items()}
	retyped = subjobs.build(
		"dataset_type",
		options=dict(column2type=columns),
		datasets=dict(source=datasets.untyped)
	)
	subjobs.build("test_compare_datasets", datasets=dict(a=datasets.typed, b=retyped))
Example #16
def sort(src, ix, **kw):
    ds = subjobs.build('dataset_sort',
                       source=src,
                       sort_across_slices=True,
                       **kw).dataset()
    want = sorted(src.iterate(None), key=itemgetter(ix))
    assert list(ds.iterate(None)) == want, '%s != sorted(%s)' % (
        ds,
        src,
    )
    return ds
Example #17
def verify(zipname, inside_filenames, want_ds, **kw):
    opts = dict(
        filename=g.job.filename(zipname),
        inside_filenames=inside_filenames,
    )
    opts.update(kw)
    jid = subjobs.build('csvimport_zip', options=opts)
    for dsn, want_data in want_ds.items():
        got_data = list(Dataset(jid, dsn).iterate(None, '0'))
        assert got_data == want_data, "%s/%s from %s didn't contain %r, instead contained %r" % (
            jid, dsn, zipname, want_data, got_data)
Example #18
def getImageCluster(lat_deg, lon_deg, delta_lat, delta_long, zoom):
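    # Stitch 256x256 map tiles (each produced by the 'tile' build job) into one
    # RGB image covering the requested lat/lon extent.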
    xmin, ymax = deg2num(lat_deg, lon_deg, zoom)
    xmax, ymin = deg2num(lat_deg + delta_lat, lon_deg + delta_long, zoom)
    size = ((xmax - xmin + 1) * 256 - 1, (ymax - ymin + 1) * 256 - 1)
    cluster = Image.new('RGB', size)
    for xtile in range(xmin, xmax + 1):
        for ytile in range(ymin, ymax + 1):
            t = build('tile', zoom=zoom, xtile=xtile, ytile=ytile)
            tile = Image.open(t.filename('tile.png'))
            cluster.paste(tile,
                          box=((xtile - xmin) * 256, (ytile - ymin) * 256))
    return cluster, size, num2deg(xmin, ymin,
                                  zoom), num2deg(xmax + 1, ymax + 1, zoom)
def _verify(name, types, data, coltype, want, default, want_fail, kw):
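	# Write 'data' into slice 0 of a fresh dataset, type it as each type in
	# 'types' and verify the result with 'check'; want_fail means the typing
	# subjob itself is expected to raise JobError.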
	if callable(want):
		check = want
	else:
		def check(got, fromstr, filtered=False):
			want1 = want if isinstance(want, list) else want[typ]
			if filtered:
				want1 = want1[::2]
			assert got == want1, 'Expected %r, got %r from %s.' % (want1, got, fromstr,)
	dw = DatasetWriter(name=name, columns={'data': coltype, 'extra': 'bytes'})
	dw.set_slice(0)
	for ix, v in enumerate(data):
		dw.write(v, b'1' if ix % 2 == 0 else b'skip')
	for sliceno in range(1, g.slices):
		dw.set_slice(sliceno)
	bytes_ds = dw.finish()
	for typ in types:
		opts = dict(column2type=dict(data=typ))
		opts.update(kw)
		if default is not no_default:
			opts['defaults'] = {'data': default}
		try:
			jid = subjobs.build('dataset_type', datasets=dict(source=bytes_ds), options=opts)
		except JobError:
			if want_fail:
				continue
			raise Exception('Typing %r as %s failed.' % (bytes_ds, typ,))
		assert not want_fail, "Typing %r as %s should have failed, but didn't (%s)." % (bytes_ds, typ, jid)
		typed_ds = Dataset(jid)
		got = list(typed_ds.iterate(0, 'data'))
		check(got, '%s (typed as %s from %r)' % (typed_ds, typ, bytes_ds,))
		if 'filter_bad' not in opts and not callable(want):
			opts['filter_bad'] = True
			opts['column2type']['extra'] = 'int32_10'
			jid = subjobs.build('dataset_type', datasets=dict(source=bytes_ds), options=opts)
			typed_ds = Dataset(jid)
			got = list(typed_ds.iterate(0, 'data'))
			check(got, '%s (typed as %s from %r with every other line skipped from filter_bad)' % (typed_ds, typ, bytes_ds,), True)
		used_type(typ)
Example #20
def synthesis(job):
    ix = 0
    jobs = []
    for current, subdirs, files in walk(options.path):
        files = sorted(x for x in files
                       if splitext(x)[1].upper() in options.validextensions
                       and not islink(join(current, x)))
        if files:
            jobs.append(
                build('scandir', directory=current, files=sorted(files)))
            print(ix, current)
            ix += 1
    return jobs
Example #21
def verify_hashing(caption, want_values, **kw):
    ds = subjobs.build('dataset_type',
                       source=source,
                       column2type=dict(a='int32_10'),
                       caption=caption,
                       **kw).dataset()
    got_values = set()
    for sliceno in range(g.slices):
        for got in ds.iterate(sliceno):
            assert hashfunc(got[0]) % g.slices == sliceno
            assert got not in got_values
            got_values.add(got)
    assert want_values == got_values
Example #22
def check_one(slices, key, source, reverse=False):
    jid = subjobs.build(
        "dataset_sort",
        options=dict(
            sort_columns=key,
            sort_order="descending" if reverse else "ascending",
        ),
        datasets=dict(source=source),
    )
    ds = Dataset(jid)
    key_off = sorted(test_data.data).index(key)
    # This provides better separation than the replacement values
    # used in the actual sort method (but this is slow).
    if 'date' in key or 'time' in key:
        nonepos = 1
    else:
        nonepos = -1

    def cmp(a, b):
        a = a[key_off]
        b = b[key_off]
        if a is None:
            if b is None:
                return 0
            return nonepos
        if b is None:
            return -nonepos
        if isinstance(a, float):
            if isnan(a):
                if isnan(b):
                    return 0
                return 1
            if isnan(b):
                return -1
        if a < b:
            return -1
        return a > b

    keycmp = cmp_to_key(cmp)
    for sliceno in range(slices):
        good = sorted(test_data.sort_data_for_slice(sliceno),
                      key=keycmp,
                      reverse=reverse)
        check = list(ds.iterate(sliceno))
        assert unnan(check) == unnan(
            good), "Slice %d sorted on %s bad (%s)" % (
                sliceno,
                key,
                jid,
            )
def synthesis():
    sum = 0
    jobs = datasets.source.chain(length=options.chain_length,
                                 stop_ds=datasets.stop)
    for src in jobs:
        jid = build('dataset_checksum',
                    options=dict(columns=options.columns, sort=options.sort),
                    datasets=dict(source=src))
        data = blob.load(jobid=jid)
        sum ^= data.sum
    print("Total: %016x" % (sum, ))
    return DotDict(sum=sum,
                   columns=data.columns,
                   sort=options.sort,
                   sources=jobs)
def synthesis(params, prepare_res):
    dw = prepare_res
    source = dw.finish()
    jid = subjobs.build(
        "dataset_sort",
        options=dict(
            sort_columns="num",
            sort_across_slices=True,
        ),
        datasets=dict(source=source),
    )
    ds = Dataset(jid)
    data = list(ds.iterate(None, "str"))
    good = list("cghjabdefi") + \
           [str(sliceno) for sliceno in range(params.slices)] * 64
    assert data == good
Example #25
def test(params, p=False, a=False, s=False):
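	# Build the matching test_output_* method and verify that every file under
	# OUTPUT/ (prepare, one per analysis slice, synthesis) contains the prefix
	# followed by the configured text, and that job.output() agrees with the
	# files on disk.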
	prefix = "A bit of text."
	opts = {'prefix': prefix}
	name = 'test_output_'
	cookie = randint(10000, 99999)
	if p:
		name += 'p'
		opts['p'] = "Some words\nfrom prepare\nwith %d in them." % (cookie,)
	if a:
		name += 'a'
		opts['a'] = "A few words\nfrom analysis(%%d)\nwith the cookie %d in them." % (cookie,)
	if s:
		name += 's'
		opts['s'] = "Words\nfrom synthesis\ncookie is %d." % (cookie,)
	jid = subjobs.build(name, options=opts)
	d = jid.filename('OUTPUT/')
	chked = set()
	all = []
	def chk(part):
		output = jid.output(part)
		if isinstance(part, int):
			data = opts['a'] % (part,)
			part = str(part)
		else:
			data = opts[part[0]]
		chked.add(part)
		with open(d + part, 'r') as fh:
			got = fh.read().replace('\r\n', '\n')
		want = prefix + '\n' + data + '\n'
		assert got == want, "%s produced %r in %s, expected %r" % (jid, got, part, want,)
		assert output == got, 'job.output disagrees with manual file reading for %s in %s. %r != %r' % (part, jid, output, got,)
		all.append(got)
	if p:
		chk('prepare')
	if a:
		for sliceno in range(params.slices):
			chk(sliceno)
	if s:
		chk('synthesis')
	unchked = set(os.listdir(d)) - chked
	assert not unchked, "Unexpected OUTPUT files from %s: %r" % (jid, unchked,)
	output = jid.output()
	got = ''.join(all)
	assert output == got, 'job.output disagrees with manual file reading for <all> in %s. %r != %r' % (jid, output, got,)
Example #26
def synthesis(prepare_res):
    opts = DotDict(
        (k, v) for k, v in options.items() if k in a_csvimport.options)
    lst = prepare_res
    previous = datasets.previous
    for fn, info, dsn in lst:
        opts.filename = fn
        jid = subjobs.build('csvimport',
                            options=opts,
                            datasets=dict(previous=previous),
                            caption="Import of %s from %s" % (
                                info.filename,
                                options.filename,
                            ))
        previous = Dataset(jid).link_to_here(dsn)
        if options.chaining == 'off':
            previous = None
    if (len(lst) == 1 or options.chaining != 'off') and dsn != 'default':
        Dataset(jid).link_to_here('default')
Example #27
def synthesis():
	if options.inner:
		res = DotDict()
		res.datetime = options.datetime + options.timedelta
		res.time = options.time.replace(minute=0)
		res.date = options.date.replace(month=1)
		return res
	else:
		opts = dict(
			datetime=datetime(2019, 11, 6, 17, 37, 2, 987654),
			time=time(17, 37, 2, 987654),
			date=date(2019, 11, 6),
			timedelta=timedelta(microseconds=987654),
			inner=True,
		)
		jid = subjobs.build('test_datetime', options=opts)
		res = jid.load()
		assert res.datetime == datetime(2019, 11, 6, 17, 37, 3, 975308)
		assert res.time == time(17, 0, 2, 987654)
		assert res.date == date(2019, 1, 6)
def check_no_separator(job):
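    # csvimport with separator='' should treat each line as a single field. For
    # several newline and quote bytes, write lines (plain and, when a quote is
    # set, quoted) and check that the imported 'data' column contains exactly
    # what was written, with the same multiplicities.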
    def write(data):
        fh.write(data + nl_b)
        wrote_c[data] += 1
        if q_b:
            data = q_b + data + q_b
            fh.write(q_b + data.replace(q_b, q_b + q_b) + q_b + nl_b)
            wrote_c[data] += 1

    for nl in (10, 0, 255):
        for q in (None, 0, 34, 13, 10, 228):
            if nl == q:
                continue
            filename = "no separator.%r.%r.txt" % (
                nl,
                q,
            )
            nl_b = bytechr(nl)
            q_b = bytechr(q) if q else b''
            wrote_c = Counter()
            with openx(filename) as fh:
                for splitpoint in range(256):
                    write(byteline(0, splitpoint, nl, q))
                    write(byteline(splitpoint, 256, nl, q))
            try:
                jid = subjobs.build("csvimport",
                                    options=dict(
                                        filename=job.filename(filename),
                                        quotes=q_b.decode("iso-8859-1"),
                                        newline=nl_b.decode("iso-8859-1"),
                                        separator='',
                                        labelsonfirstline=False,
                                        labels=["data"],
                                    ))
            except JobError:
                raise Exception("Importing %r failed" % (filename, ))
            got_c = Counter(Dataset(jid).iterate(None, "data"))
            assert got_c == wrote_c, "Importing %r (%s) gave wrong contents" % (
                filename,
                jid,
            )
def synthesis(job):
	dw = job.datasetwriter()
	dw.add('a', 'ascii')
	dw.add('b', 'unicode')
	w = dw.get_split_write()
	w('A', 'B')
	w('\0', '\xe4')
	ds = dw.finish()
	def verify(data, filename):
		want = []
		for line in [['a', 'b']] + data:
			want.append(separator.join(quote + item + quote for item in line))
			want.append(line_separator)
		want = ''.join(want).encode('utf-8')
		if ext == '.gz':
			open_func = gzip.open
		else:
			open_func = open
		with open_func(j.filename(filename), 'rb') as fh:
			got = fh.read()
		assert want == got, "Expected %s/%s to contain %r, but contained %r" % (j, filename, want, got,)
	for separator in ('', '\0', 'wheeee'):
		for line_separator in ('', '\0', 'woooooo'):
			for quote in ('', 'qqq'):
				for ext in ('.csv', '.gz'):
					for sliced, filename in ((False, 'out' + ext), (True, 'out.%d' + ext)):
						j = subjobs.build(
							'csvexport',
							filename=filename,
							separator=separator,
							line_separator=line_separator,
							quote_fields=quote,
							sliced=sliced,
							source=ds,
						)
						if sliced:
							for sliceno, data in ((0, ['A', 'B']), (1, ['\0', '\xe4'])):
								verify([data], filename % (sliceno,))
						else:
							verify([['A', 'B'], ['\0', '\xe4']], filename)
Example #30
def verify(slices, data, source, previous=None, **options):
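    # Repartition source with dataset_hashpart and check that every row ends up
    # in the slice given by the hashlabel column's hash, with its data intact.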
    jid = subjobs.build(
        "dataset_hashpart",
        datasets=dict(source=source, previous=previous),
        options=options,
    )
    hl = options["hashlabel"]
    h = typed_writer(columns[hl][0]).hash
    ds = Dataset(jid)
    good = {row[hl]: row for row in data}
    names = list(source.columns)
    for slice in range(slices):
        for row in ds.iterate_chain(slice, names):
            row = dict(zip(names, row))
            assert h(
                row[hl]
            ) % slices == slice, "row %r is incorrectly in slice %d in %s" % (
                row, slice, ds)
            want = good[row[hl]]
            assert row == want, '%s (rehashed from %s) did not contain the right data for "%s".\nWanted\n%r\ngot\n%r' % (
                ds, source, hl, want, row)
    return ds