def analysis(sliceno, prepare_res):
    """Write this slice's rows to the default, the named and the passed writer.

    ``prepare_res`` is unpacked as ``(dw_passed, num)``. ``num`` ends up in
    the row written with ``write_list``. The default and named writers are
    looked up with ``DatasetWriter()`` / ``DatasetWriter(name="named")``.

    Each writer call style is exercised once: ``write`` with keywords,
    ``write`` with positional values, ``write_list`` and ``write_dict``.
    """
    dw_default = DatasetWriter()
    dw_named = DatasetWriter(name="named")
    dw_passed, num = prepare_res

    # Day of month follows the slice number, capped at 31.
    day = min(sliceno + 1, 31)

    dw_default.write(a=sliceno, b="a")
    dw_default.write_list([num, str(sliceno)])

    dw_named.write(True, date(1536, 12, day))
    dw_named.write_dict({"c": False, "d": date(2236, 5, day)})

    # slice 0 is written in synthesis, and slices at or beyond
    # test_data.value_cnt write nothing to the passed writer.
    if sliceno <= 0 or sliceno >= test_data.value_cnt:
        return
    row = {}
    for colname, values in test_data.data.items():
        row[colname] = values[sliceno]
    dw_passed.write_dict(row)
def do_one(params, name, data):
    """Write ``data`` to a dataset called ``name`` and check it through typing.

    Steps:
      1. Write every value of ``data`` to slice 0 in all six columns
         (ascii/bytes/unicode, each in a ``_new`` and ``_old`` flavour),
         plus one row in every other slice.
      2. Verify the type, backing type and slice 0 contents of each column
         in the module-level ``columns`` mapping.
      3. Run ``dataset_type`` with ``filter_bad=True`` and verify the
         surviving rows.
      4. Run ``dataset_type`` again (bytes <-> unicode) and verify every
         typed column still matches ``unicode_new``.

    params: uses ``params.slices`` and ``params.jobid``.
    name:   name of the dataset to create.
    data:   re-iterable sequence of values (``None`` or something ``uni()``
            accepts). Its first element is expected to be the string that
            fails number typing; it is skipped in step 3.

    ``columns`` and ``raw_data`` are defined outside this function.
    """
    writer = DatasetWriter(name=name, columns=columns)
    writer.set_slice(0)
    for value in data:
        if value is None:
            as_unicode = as_bytes = None
        else:
            as_unicode = uni(value)
            as_bytes = as_unicode.encode("ascii")
        row = {
            "ascii_new": value,
            "ascii_old": value,
            "bytes_new": as_bytes,
            "bytes_old": as_bytes,
            "unicode_new": as_unicode,
            "unicode_old": as_unicode,
        }
        writer.write_dict(row)
    # We don't really want the other slices, but write one thing to
    # each, to make sure it doesn't show up in slice 0.
    # (Small slice merging will put it in the same file, so this is
    # a real risk.)
    # "row" is still the last row written to slice 0 above.
    for sliceno in range(1, params.slices):
        writer.set_slice(sliceno)
        writer.write_dict(row)
    writer.finish()

    # verify we got what we asked for
    written_ds = Dataset(params.jobid, name)
    content_msg = "%s in %s did not contain the expected value. Wanted %r but got %r."
    for colname, coltype in columns.items():
        column = written_ds.columns[colname]
        # The visible type is whatever follows the last "_" in the backing type.
        assert column.type == coltype.rpartition("_")[2], colname
        assert column.backing_type == coltype, colname
        # How the written value has to be converted before comparing
        # depends only on the column type and the python version.
        to_unicode = PY2 and "unicode" in coltype
        to_bytes = PY3 and "bytes" in coltype
        for want, got in zip(data, written_ds.iterate(0, colname)):
            if want is not None:
                if to_unicode:
                    want = uni(want)
                if to_bytes:
                    want = want.encode("ascii")
            assert want == got, content_msg % (colname, written_ds, want, got)

    # check that both types of bytes filter correctly through typing
    filtering_options = {
        "column2type": {
            "ascii_new": "bytes",
            # fails on the string, so that gets filtered out everywhere
            "ascii_old": "number",
            "bytes_new": "bytes",
            "bytes_old": "bytes",
        },
        "filter_bad": True,
    }
    typed_jid = subjobs.build(
        "dataset_type",
        datasets={"source": written_ds},
        options=filtering_options,
    )
    typed_ds = Dataset(typed_jid)

    # verify the number first
    number_msg = "ascii_old in %s did not type correctly as number. Wanted %r but got %r."
    expected = iter(raw_data)
    next(expected)  # skip the filtered out string
    for got in typed_ds.iterate(0, "ascii_old"):
        want = next(expected)
        # None becomes 0 because the typer (unfortunately) sees it as an
        # empty string
        want = 0 if want is None else want
        assert want == got, number_msg % (typed_ds, want, got)

    # now verify all the bytes ones are ok, no longer containing the string.
    bytes_msg = "%s in %s did not roundtrip correctly as bytes. Wanted %r but got %r."
    for colname in ("ascii_new", "bytes_new", "bytes_old"):
        expected = iter(data)
        next(expected)  # skip the filtered out string
        for got in typed_ds.iterate(0, colname):
            want = next(expected)
            if want is not None:
                want = want.encode("ascii")
            assert want == got, bytes_msg % (colname, typed_ds, want, got)

    # and now check that the Nones are ok after making bytes from ascii
    # and unicode from bytes.
    roundtrip_options = {
        "column2type": {
            "ascii_new": "bytes",
            "ascii_old": "bytes",
            "bytes_new": "unicode:ascii",
            "bytes_old": "unicode:ascii",
        },
    }
    typed_jid = subjobs.build(
        "dataset_type",
        datasets={"source": written_ds},
        options=roundtrip_options,
    )
    typed_ds = Dataset(typed_jid)
    for colname in ("ascii_new", "ascii_old", "bytes_new", "bytes_old"):
        # unicode_new is not retyped here, so it serves as the reference.
        for want, got in typed_ds.iterate(0, ["unicode_new", colname]):
            assert uni(want) == uni(got), bytes_msg % (colname, typed_ds, want, got)