def test(name, input, want_obj, want_bytes, **kw):
    json_save(input, name, **kw)
    with open(name, "rb") as fh:
        got_bytes_raw = fh.read()
    assert got_bytes_raw[-1:] == b"\n", name + " didn't even end with a newline"
    got_bytes_raw = got_bytes_raw[:-1]
    as_str = json_encode(input, as_str=True, **kw)
    as_bytes = json_encode(input, as_str=False, **kw)
    assert isinstance(as_str, str) and isinstance(as_bytes, bytes), \
        "json_encode returns the wrong types: %s %s" % (type(as_str), type(as_bytes),)
    assert as_bytes == got_bytes_raw, "json_save doesn't save the same thing json_encode returns for " + name
    if PY3:
        as_str = as_str.encode("utf-8")
    assert as_bytes == as_str, "json_encode doesn't return the same data for as_str=True and False"
    got_obj = json_load(name)
    assert want_obj == got_obj, "%s roundtrips wrong (wanted %r, got %r)" % (name, want_obj, got_obj)
    with open(name, "rb") as fh:
        got_bytes_fuzzy = b"".join(line.strip() for line in fh)
    assert want_bytes == got_bytes_fuzzy, "%s wrong on disk (but decoded right)" % (name,)

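# Hypothetical smoke-test call of the helper above. The exact bytes depend on
# json_encode's formatting settings, so an empty dict is used here on the
# assumption that it serializes as b"{}" regardless of indentation; the
# filename is illustrative only.
test("test_empty.json", {}, {}, b"{}")
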
def run_job(self, jobid, subjob_cookie=None, parent_pid=0):
    W = self.workspaces[get_workspace_name(jobid)]
    active_workspaces = {}
    for name in self.source_workdirs:
        active_workspaces[name] = self.workspaces[name].get_path()
    slices = self.workspaces[self.target_workdir].get_slices()
    t0 = time.time()
    setup = update_setup(jobid, starttime=t0)
    prof = setup.profile or DotDict()
    new_prof, files, subjobs = dispatch.launch(
        W.path, setup, self.config, self.Methods, active_workspaces,
        slices, self.debug, self.daemon_url, subjob_cookie, parent_pid,
    )
    # Remove files whose temp level is at or above the deletion threshold;
    # the threshold depends on debug mode.
    if self.debug:
        delete_from = Temp.TEMP
    else:
        delete_from = Temp.DEBUG
    for filename, temp in list(files.items()):
        if temp >= delete_from:
            unlink(join(W.path, jobid, filename))
            del files[filename]
    prof.update(new_prof)
    # Reset 'total' first so a previous total is not included in the new sum.
    prof.total = 0
    prof.total = sum(v for v in prof.values() if isinstance(v, (float, int)))
    data = dict(
        starttime=t0,
        endtime=time.time(),
        profile=prof,
    )
    update_setup(jobid, **data)
    data['files'] = files
    data['subjobs'] = subjobs
    json_save(data, resolve_jobid_filename(jobid, 'post.json'))

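# Worked example of the profile totaling in run_job above, using a plain dict
# in place of DotDict (keys and values are illustrative): 'total' is zeroed
# before summing so a stale total is not counted in the new sum over all
# numeric profile entries.
prof_example = dict(prepare=1.5, analysis=2.0, synthesis=0.5, total=99.0)
prof_example['total'] = 0
prof_example['total'] = sum(v for v in prof_example.values() if isinstance(v, (float, int)))
assert prof_example['total'] == 4.0
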
def real_synthesis(params, options, datasets, minmax_index, prepare_res, we_have_spill, save_discard=False):
    stats = DotDict(
        included_lines = [0] * params.slices,
        discarded_lines = [0] * params.slices,
        spilled_lines = [0] * params.slices,
        virtually_spilled_lines = [0] * params.slices,
        split_date = str(options.split_date) if options.split_date else None,
        discard_before_date = str(options.discard_before_date) if options.discard_before_date else None,
    )
    minmax_per_slice = [{} for _ in range(params.slices)]
    # sliceno is taken from the enclosing loop below.
    def update_stats(data):
        for item in data.itervalues():
            stats.included_lines[sliceno] += item.counters[2]
            stats.discarded_lines[sliceno] += item.counters[1]
            if item.virtual_spill:
                stats.virtually_spilled_lines[sliceno] += item.counters[3]
            else:
                stats.spilled_lines[sliceno] += item.counters[3]
            update_minmax(minmax_per_slice[sliceno], item.minmax)
    def update_minmax(dest, src):
        # Each value holds three minimums followed by three maximums,
        # one pair per minmax_select offset.
        for name, lst0 in src.iteritems():
            lst1 = dest.get(name, lst0)
            mins = map(min, zip(lst0[:3], lst1[:3]))
            maxs = map(max, zip(lst0[3:], lst1[3:]))
            dest[name] = mins + maxs
    for sliceno in range(params.slices):
        update_stats(blob.load('stats', sliceno=sliceno))
    minmax = {}
    for item in minmax_per_slice:
        update_minmax(minmax, item)
    def minmax_select(offset, stringify=False):
        d = {}
        for k, v in minmax.iteritems():
            mn = v[offset]
            mx = v[3 + offset]
            if mn <= mx:
                if stringify and isinstance(mn, (date, time,)):
                    d[k] = [str(mn), str(mx)]
                else:
                    d[k] = [mn, mx]
        return d
    dw, dw_spill = prepare_res[:2]
    dw.set_minmax(None, minmax_select(minmax_index))
    dw_spill.set_minmax(None, minmax_select(2))
    if save_discard:
        included_lines = stats.discarded_lines
    else:
        included_lines = stats.included_lines
    for sliceno in range(params.slices):
        dw.set_lines(sliceno, included_lines[sliceno])
        dw_spill.set_lines(sliceno, stats.spilled_lines[sliceno])
    if not we_have_spill:
        dw_spill.discard()
    stats.minmax_discarded = minmax_select(0, True)
    stats.minmax = minmax_select(1, True)
    stats.minmax_spilled = minmax_select(2, True)
    json_save(stats)

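# A minimal standalone sketch of the per-column minmax merge above (hypothetical
# names). It assumes each value is a list of three minimums followed by three
# maximums, matching the [:3]/[3:] split and the minmax_select offsets in
# real_synthesis.
def merge_minmax(dest, src):
    for name, lst0 in src.items():
        lst1 = dest.get(name, lst0)
        mins = [min(a, b) for a, b in zip(lst0[:3], lst1[:3])]
        maxs = [max(a, b) for a, b in zip(lst0[3:], lst1[3:])]
        dest[name] = mins + maxs

merged = {}
merge_minmax(merged, {'ts': [1, 2, 3, 10, 20, 30]})
merge_minmax(merged, {'ts': [0, 5, 1, 9, 25, 40]})
assert merged['ts'] == [0, 2, 1, 10, 25, 40]
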
def synthesis(params, prepare_res):
    source_params = job_params(datasets.source)
    source_params.options.caption = options.caption
    a_dataset_datesplit.real_synthesis(params, source_params.options, source_params.datasets, 0, prepare_res, False, save_discard=True)
    stats = json_load()
    json_save(dict(
        minmax = stats.minmax_discarded,
        included_lines = stats.discarded_lines,
        split_date = stats.split_date,
        discard_before_date = stats.discard_before_date,
    ))

def synthesis(params, analysis_res, prepare_res):
    r = report()
    res = DotDict()
    d = datasets.source
    analysis_res = list(analysis_res)
    if options.filter_bad:
        num_lines_per_split = [num - data[1] for num, data in zip(d.lines, analysis_res)]
        res.bad_line_count_per_slice = [data[1] for data in analysis_res]
        res.bad_line_count_total = sum(res.bad_line_count_per_slice)
        r.println('Slice Bad line count')
        for sliceno, cnt in enumerate(res.bad_line_count_per_slice):
            r.println('%5d %d' % (sliceno, cnt,))
        r.println('total %d' % (res.bad_line_count_total,))
        r.line()
        r.println('Slice Bad line number')
        reported_count = 0
        # Each badmap file is a bitmap over line numbers: bit jx of byte ix set
        # means line ix * 8 + jx in that slice was bad. At most 32 are reported.
        for sliceno, data in enumerate(analysis_res):
            fn = 'badmap%d' % (sliceno,)
            if data[1] and reported_count < 32:
                with open(fn, 'rb') as fh:
                    badmap = mmap(fh.fileno(), 0, prot=PROT_READ)
                    for ix, v in enumerate(imap(ord, badmap)):
                        if v:
                            for jx in range(8):
                                if v & (1 << jx):
                                    r.println('%5d %d' % (sliceno, ix * 8 + jx,))
                                    reported_count += 1
                                    if reported_count >= 32:
                                        break
                        if reported_count >= 32:
                            break
                    badmap.close()
            unlink(fn)
        if reported_count >= 32:
            r.println('...')
        r.line()
        res.bad_line_count_per_column = {}
        r.println('Bad line count Column')
        for colname in sorted(analysis_res[0][0]):
            cnt = sum(data[0][colname] for data in analysis_res)
            r.println('%14d %s' % (cnt, colname,))
            res.bad_line_count_per_column[colname] = cnt
        r.line()
    else:
        num_lines_per_split = d.lines
    dw = prepare_res
    for sliceno, count in enumerate(num_lines_per_split):
        dw.set_lines(sliceno, count)
    if options.defaults:
        r.println('Defaulted values')
        res.defaulted_per_slice = {}
        res.defaulted_total = {}
        for colname in sorted(options.defaults):
            r.println(' %s:' % (colname,))
            r.println(' Slice Defaulted line count')
            res.defaulted_per_slice[colname] = [data[2][colname] for data in analysis_res]
            res.defaulted_total[colname] = sum(res.defaulted_per_slice[colname])
            for sliceno, cnt in enumerate(res.defaulted_per_slice[colname]):
                r.println(' %5d %d' % (sliceno, cnt,))
            r.println(' total %d' % (res.defaulted_total[colname],))
        r.line()
    for sliceno, data in enumerate(analysis_res):
        dw.set_minmax(sliceno, data[3])
    d = dw.finish()
    res.good_line_count_per_slice = num_lines_per_split
    res.good_line_count_total = sum(num_lines_per_split)
    r.line()
    r.println('Total of %d lines converted' % (res.good_line_count_total,))
    r.close()
    json_save(res)

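# Standalone sketch of the bad-line bitmap decoding used above (hypothetical
# helper name and test bytes): bit jx of byte ix being set marks line number
# ix * 8 + jx as bad.
def bad_line_numbers(bitmap_bytes):
    for ix, v in enumerate(bytearray(bitmap_bytes)):
        for jx in range(8):
            if v & (1 << jx):
                yield ix * 8 + jx

assert list(bad_line_numbers(b'\x05\x00\x80')) == [0, 2, 23]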