def synthesis(prepare_res, params):
    """Merge the per-writer outputs into one dataset without recompressing.

    Only runs when options.as_chain is false: the separate writers built in
    prepare() are combined into a single meta-only DatasetWriter, and each
    column file is produced by concatenating the writers' already-compressed
    column files byte-for-byte.
    """
    if options.as_chain:
        return
    # We abuse our knowledge of dataset internals to avoid recompressing.
    # Don't do this stuff yourself.
    writers, column_names, prev_source, caption, filename = prepare_res
    merged_dw = DatasetWriter(
        caption=caption,
        hashlabel=options.hashlabel,
        filename=filename,
        previous=datasets.previous,
        meta_only=True,
        columns=datasets.source.columns,
    )
    for sliceno in range(params.slices):
        # Line count for this slice is the total over all partial writers.
        merged_dw.set_lines(sliceno, sum(w._lens[sliceno] for w in writers))
        for dwno, w in enumerate(writers):
            merged_dw.set_minmax((sliceno, dwno), w._minmax[sliceno])
        for colname in column_names:
            merged_fn = merged_dw.column_filename(colname, sliceno=sliceno)
            with open(merged_fn, "wb") as out_fh:
                # Append every writer's piece of this column in order.
                for w in writers:
                    part_fn = w.column_filename(colname, sliceno=sliceno)
                    with open(part_fn, "rb") as in_fh:
                        copyfileobj(in_fh, out_fh)
    # The partial datasets are no longer needed once merged.
    for w in writers:
        w.discard()
def analysis_lap(sliceno, badmap_fh, first_lap):
    """Type-convert all configured columns for one slice.

    Runs (up to) twice per slice: on the first lap bad lines are recorded
    in a shared bitmap (badmap_fh, one bit per line); on the second lap
    previously recorded bad lines are skipped. Returns a 4-tuple of
    (res_bad_count, res_default_count, res_minmax, link_candidates),
    each keyed/listed per column.

    NOTE(review): a second definition of analysis_lap appears later in this
    file and shadows this one. This version is Python 2 only (.iteritems(),
    ord()/chr() on mmap bytes, bare `unicode`) — confirm it is meant to be
    kept at all.
    """
    known_line_count = 0
    badmap_size = 0
    badmap_fd = -1
    res_bad_count = {}
    res_default_count = {}
    res_minmax = {}
    link_candidates = []
    if first_lap:
        # Lap 1: convert everything, mark failures in the badmap.
        record_bad = options.filter_bad
        skip_bad = 0
    else:
        # Lap 2: skip the lines lap 1 marked as bad.
        record_bad = 0
        skip_bad = options.filter_bad
    minmax_fn = 'minmax%d' % (sliceno, )
    dw = DatasetWriter()
    for colname, coltype in options.column2type.iteritems():
        out_fn = dw.column_filename(options.rename.get(colname, colname)).encode('ascii')
        if ':' in coltype and not coltype.startswith('number:'):
            # Parameterized type, e.g. "datetime:%Y-%m-%d" -> look up the
            # wildcard converter for the base type.
            coltype, fmt = coltype.split(':', 1)
            _, cfunc, pyfunc = dataset_typing.convfuncs[coltype + ':*']
            if '%f' in fmt:
                # needs to fall back to python version
                cfunc = None
            if not cfunc:
                # pyfunc is a factory here; instantiate it for this fmt.
                pyfunc = pyfunc(coltype, fmt)
        else:
            _, cfunc, pyfunc = dataset_typing.convfuncs[coltype]
            fmt = ffi.NULL
        d = datasets.source
        # Only byte-backed source columns can be typed.
        assert d.columns[colname].type in ('bytes', 'ascii', ), colname
        if options.filter_bad:
            line_count = d.lines[sliceno]
            if known_line_count:
                # All columns in a slice must have the same line count.
                assert line_count == known_line_count, (colname, line_count, known_line_count)
            else:
                known_line_count = line_count
                # Size the bad-line bitmap (1 bit/line) up to a page multiple.
                pagesize = getpagesize()
                badmap_size = (line_count // 8 // pagesize + 1) * pagesize
                badmap_fh.truncate(badmap_size)
                badmap_fd = badmap_fh.fileno()
        if d.columns[colname].backing_type.startswith('_v2_'):
            backing_format = 2
        else:
            backing_format = 3
        in_fn = d.column_filename(colname, sliceno).encode('ascii')
        if d.columns[colname].offsets:
            # Column file is shared; read only this slice's byte range.
            offset = d.columns[colname].offsets[sliceno]
            max_count = d.lines[sliceno]
        else:
            offset = 0
            max_count = -1
        # "number" (and "number:int") always uses the C converter.
        if coltype == 'number':
            cfunc = True
        if coltype == 'number:int':
            coltype = 'number'
            cfunc = True
            fmt = "int"
        if cfunc:
            # C path: convert the whole column file in native code.
            default_value = options.defaults.get(colname, ffi.NULL)
            if default_value is None:
                default_value = ffi.NULL
                default_value_is_None = True
            else:
                default_value_is_None = False
            # Out-parameters for the C function.
            bad_count = ffi.new('uint64_t [1]', [0])
            default_count = ffi.new('uint64_t [1]', [0])
            c = getattr(backend, 'convert_column_' + coltype)
            res = c(in_fn, out_fn, minmax_fn, default_value, default_value_is_None, fmt, record_bad, skip_bad, badmap_fd, badmap_size, bad_count, default_count, offset, max_count, backing_format)
            assert not res, 'Failed to convert ' + colname
            res_bad_count[colname] = bad_count[0]
            res_default_count[colname] = default_count[0]
            # The C converter wrote min/max to a temp file; read it back.
            with type2iter[dataset_typing.typerename.get(coltype, coltype)](minmax_fn) as it:
                res_minmax[colname] = list(it)
            unlink(minmax_fn)
        elif pyfunc is str:
            # We skip it the first time around, and link it from
            # the source dataset if there were no bad lines.
            # (That happens at the end of analysis.)
            # We can't do that if the file is not slice-specific though.
            # And we also can't do it if the column is in the wrong (old) format.
            if skip_bad or '%s' not in d.column_filename(colname, '%s') or backing_format != 3:
                res = backend.filter_strings(in_fn, out_fn, badmap_fd, badmap_size, offset, max_count, backing_format)
                assert not res, 'Failed to convert ' + colname
            else:
                link_candidates.append((in_fn, out_fn, ))
            res_bad_count[colname] = 0
            res_default_count[colname] = 0
        elif pyfunc is str.strip:
            # Special-cased in C: copy strings with whitespace stripped.
            res = backend.filter_stringstrip(in_fn, out_fn, badmap_fd, badmap_size, offset, max_count, backing_format)
            assert not res, 'Failed to convert ' + colname
            res_bad_count[colname] = 0
            res_default_count[colname] = 0
        else:
            # python func
            nodefault = object()  # sentinel: "no default configured"
            if colname in options.defaults:
                if options.defaults[colname] is None:
                    default_value = None
                else:
                    # Defaults are given untyped; convert through pyfunc.
                    default_value = pyfunc(options.defaults[colname])
            else:
                default_value = nodefault
            if options.filter_bad:
                badmap = mmap(badmap_fd, badmap_size)
            bad_count = 0
            default_count = 0
            with typed_writer(dataset_typing.typerename.get(coltype, coltype))(out_fn) as fh:
                col_min = col_max = None
                for ix, v in enumerate(d.iterate(sliceno, colname)):
                    if skip_bad:
                        # Bit set => another column marked this line bad.
                        if ord(badmap[ix // 8]) & (1 << (ix % 8)):
                            bad_count += 1
                            continue
                    try:
                        v = pyfunc(v)
                    except ValueError:
                        if default_value is not nodefault:
                            v = default_value
                            default_count += 1
                        elif record_bad:
                            # Mark this line bad in the shared bitmap.
                            bad_count += 1
                            bv = ord(badmap[ix // 8])
                            badmap[ix // 8] = chr(bv | (1 << (ix % 8)))
                            continue
                        else:
                            raise Exception("Invalid value %r with no default in %s" % (v, colname, ))
                    # Track min/max only for comparable (non-string) values.
                    if not isinstance(v, (NoneType, str, unicode, )):
                        if col_min is None:
                            col_min = col_max = v
                        if v < col_min:
                            col_min = v
                        if v > col_max:
                            col_max = v
                    fh.write(v)
            if options.filter_bad:
                badmap.close()
            res_bad_count[colname] = bad_count
            res_default_count[colname] = default_count
            res_minmax[colname] = [col_min, col_max]
    return res_bad_count, res_default_count, res_minmax, link_candidates
def analysis_lap(sliceno, badmap_fh, first_lap):
    """Type-convert all configured columns for one slice (py2/py3 version).

    Runs (up to) twice per slice: on the first lap bad lines are recorded
    in a shared bitmap (badmap_fh, one bit per line); on the second lap
    previously recorded bad lines are skipped. Returns a 4-tuple of
    (res_bad_count, res_default_count, res_minmax, link_candidates).

    NOTE(review): this shadows an earlier definition of analysis_lap in this
    file; link_candidates is returned but never appended to here — confirm
    that is intentional.
    """
    known_line_count = 0
    badmap_size = 0
    badmap_fd = -1
    res_bad_count = {}
    res_default_count = {}
    res_minmax = {}
    link_candidates = []
    if first_lap:
        # Lap 1: convert everything, mark failures in the badmap.
        record_bad = options.filter_bad
        skip_bad = 0
    else:
        # Lap 2: skip the lines lap 1 marked as bad.
        record_bad = 0
        skip_bad = options.filter_bad
    minmax_fn = 'minmax%d' % (sliceno, )
    dw = DatasetWriter()
    for colname, coltype in iteritems(options.column2type):
        out_fn = dw.column_filename(options.rename.get(colname, colname))
        fmt = fmt_b = None
        if coltype in dataset_typing.convfuncs:
            # Exact match (unparameterized type).
            shorttype = coltype
            _, cfunc, pyfunc = dataset_typing.convfuncs[coltype]
        else:
            # Parameterized type, e.g. "datetime:%Y-%m-%d" -> wildcard entry.
            shorttype, fmt = coltype.split(':', 1)
            _, cfunc, pyfunc = dataset_typing.convfuncs[shorttype + ':*']
        if cfunc:
            # C converter is selected by name, derived from the type.
            cfunc = shorttype.replace(':', '_')
        if pyfunc:
            # pyfunc here is a factory: it returns either a python converter
            # (callable) or a (cfunc, fmt, fmt_b) tuple for the C path.
            tmp = pyfunc(coltype)
            if callable(tmp):
                pyfunc = tmp
                cfunc = None
            else:
                pyfunc = None
                cfunc, fmt, fmt_b = tmp
        if coltype == 'number':
            cfunc = 'number'
        elif coltype == 'number:int':
            coltype = 'number'
            cfunc = 'number'
            fmt = "int"
        assert cfunc or pyfunc, coltype + " didn't have cfunc or pyfunc"
        coltype = shorttype
        d = datasets.source
        # Only byte-backed source columns can be typed.
        assert d.columns[colname].type in byteslike_types, colname
        if options.filter_bad:
            line_count = d.lines[sliceno]
            if known_line_count:
                # All columns in a slice must have the same line count.
                assert line_count == known_line_count, (colname, line_count, known_line_count)
            else:
                known_line_count = line_count
                # Size the bad-line bitmap (1 bit/line) up to a page multiple.
                pagesize = getpagesize()
                badmap_size = (line_count // 8 // pagesize + 1) * pagesize
                badmap_fh.truncate(badmap_size)
                badmap_fd = badmap_fh.fileno()
        if d.columns[colname].backing_type.startswith('_v2_'):
            backing_format = 2
        else:
            backing_format = 3
        in_fn = d.column_filename(colname, sliceno)
        if d.columns[colname].offsets:
            # Column file is shared; read only this slice's byte range.
            offset = d.columns[colname].offsets[sliceno]
            max_count = d.lines[sliceno]
        else:
            offset = 0
            max_count = -1
        if cfunc:
            # C path: convert the whole column file in native code.
            default_value = options.defaults.get(colname, ffi.NULL)
            default_len = 0
            if default_value is None:
                default_value = ffi.NULL
                default_value_is_None = True
            else:
                default_value_is_None = False
                if default_value != ffi.NULL:
                    # C side wants utf-8 bytes plus an explicit length.
                    if isinstance(default_value, unicode):
                        default_value = default_value.encode("utf-8")
                    default_len = len(default_value)
            # Out-parameters for the C function.
            bad_count = ffi.new('uint64_t [1]', [0])
            default_count = ffi.new('uint64_t [1]', [0])
            c = getattr(backend, 'convert_column_' + cfunc)
            res = c(*bytesargs(in_fn, out_fn, minmax_fn, default_value, default_len, default_value_is_None, fmt, fmt_b, record_bad, skip_bad, badmap_fd, badmap_size, bad_count, default_count, offset, max_count, backing_format))
            assert not res, 'Failed to convert ' + colname
            res_bad_count[colname] = bad_count[0]
            res_default_count[colname] = default_count[0]
            coltype = coltype.split(':', 1)[0]
            # The C converter wrote min/max to a temp file; read it back.
            with type2iter[dataset_typing.typerename.get(coltype, coltype)](minmax_fn) as it:
                res_minmax[colname] = list(it)
            unlink(minmax_fn)
        else:
            # python func
            nodefault = object()  # sentinel: "no default configured"
            if colname in options.defaults:
                default_value = options.defaults[colname]
                if default_value is not None:
                    # Defaults are given untyped; convert through pyfunc.
                    if isinstance(default_value, unicode):
                        default_value = default_value.encode('utf-8')
                    default_value = pyfunc(default_value)
            else:
                default_value = nodefault
            if options.filter_bad:
                badmap = mmap(badmap_fd, badmap_size)
                if PY2:
                    # Make py2 mmap indexing return ints like py3 does.
                    badmap = IntegerBytesWrapper(badmap)
            bad_count = 0
            default_count = 0
            # Types whose values are not meaningfully ordered get no min/max.
            dont_minmax_types = {'bytes', 'ascii', 'unicode', 'json'}
            real_coltype = dataset_typing.typerename.get(coltype, coltype)
            do_minmax = real_coltype not in dont_minmax_types
            with typed_writer(real_coltype)(out_fn) as fh:
                col_min = col_max = None
                for ix, v in enumerate(d._column_iterator(sliceno, colname, _type='bytes' if backing_format == 3 else '_v2_bytes')):
                    if skip_bad:
                        # Bit set => another column marked this line bad.
                        if badmap[ix // 8] & (1 << (ix % 8)):
                            bad_count += 1
                            continue
                    try:
                        v = pyfunc(v)
                    except ValueError:
                        if default_value is not nodefault:
                            v = default_value
                            default_count += 1
                        elif record_bad:
                            # Mark this line bad in the shared bitmap.
                            bad_count += 1
                            bv = badmap[ix // 8]
                            badmap[ix // 8] = bv | (1 << (ix % 8))
                            continue
                        else:
                            raise Exception("Invalid value %r with no default in %s" % (v, colname, ))
                    if do_minmax and not isinstance(v, NoneType):
                        if col_min is None:
                            col_min = col_max = v
                        if v < col_min:
                            col_min = v
                        if v > col_max:
                            col_max = v
                    fh.write(v)
            if options.filter_bad:
                badmap.close()
            res_bad_count[colname] = bad_count
            res_default_count[colname] = default_count
            res_minmax[colname] = [col_min, col_max]
    return res_bad_count, res_default_count, res_minmax, link_candidates