# NOTE: these functions are excerpts from a larger Python 2/3 compatible
# codebase. Stdlib imports they rely on:
from inspect import getargspec  # callback arity detection (removed in Python 3.11+; getfullargspec is the modern equivalent)
from itertools import compress
from json import dumps
from mmap import mmap, PROT_READ
from os import unlink
# imap, izip, ifilter, iteritems, PY2, PY3 and enc are assumed to come from the
# project's Py2/Py3 compat module; range_check_function, SkipSlice, SkipJob,
# report, DotDict, json_save, job_params, datasets, options, jobids and status
# from its other support modules.


def _iterate_datasets(to_iter, columns, pre_callback, post_callback, filter_func, translation_func, translators, want_tuple, range, status_reporting):
    skip_ds = None
    def argfixup(func, is_post):
        # A one-argument callback fires once per dataset; a two-argument
        # callback is called as (dataset, sliceno) for every slice.
        if func:
            if len(getargspec(func).args) == 1:
                seen_ds = [None]
                def wrapper(d, sliceno=None):
                    if d != seen_ds[0]:
                        if is_post:
                            if seen_ds[0] and seen_ds[0] != skip_ds:
                                func(seen_ds[0])
                        else:
                            func(d)
                        seen_ds[0] = d
                return wrapper, True
        return func, False
    pre_callback, unsliced_pre_callback = argfixup(pre_callback, False)
    post_callback, unsliced_post_callback = argfixup(post_callback, True)
    if not to_iter:
        return
    if range:
        # range is a single-item dict: {column_name: (bottom, top)}.
        range_k, (range_bottom, range_top,) = next(iteritems(range))
        range_check = range_check_function(range_bottom, range_top)
        if range_k in columns and range_k not in translators and not translation_func:
            # The range column is part of the output and untranslated, so the
            # range test can run directly on the yielded values.
            has_range_column = True
            range_i = columns.index(range_k)
            if want_tuple:
                range_f = lambda t: range_check(t[range_i])
            else:
                range_f = range_check
        else:
            has_range_column = False
    if status_reporting:
        from status import status
    else:
        from status import dummy_status as status
    def fmt_dsname(d, sliceno, rehash):
        if rehash:
            return d + ':REHASH'
        else:
            return '%s:%d' % (d, sliceno)
    if len(to_iter) == 1:
        msg_head = 'Iterating ' + fmt_dsname(*to_iter[0])
        def update_status(update, ix, d, sliceno, rehash):
            pass
    else:
        msg_head = 'Iterating %s to %s' % (fmt_dsname(*to_iter[0]), fmt_dsname(*to_iter[-1]),)
        def update_status(update, ix, d, sliceno, rehash):
            update('%s, %d/%d (%s)' % (msg_head, ix, len(to_iter), fmt_dsname(d, sliceno, rehash)))
    with status(msg_head) as update:
        for ix, (d, sliceno, rehash) in enumerate(to_iter, 1):
            if unsliced_post_callback:
                post_callback(d)
            update_status(update, ix, d, sliceno, rehash)
            if pre_callback:
                if d == skip_ds:
                    continue
                try:
                    pre_callback(d, sliceno)
                except SkipSlice:
                    if unsliced_pre_callback:
                        skip_ds = d
                    continue
                except SkipJob:
                    skip_ds = d
                    continue
            it = d._iterator(None if rehash else sliceno, columns)
            for col_ix, trans in translators.items():
                it[col_ix] = imap(trans, it[col_ix])
            if want_tuple:
                it = izip(*it)
            else:
                it = it[0]
            if rehash:
                it = d._hashfilter(sliceno, rehash, it)
            if translation_func:
                it = imap(translation_func, it)
            if range:
                c = d.columns[range_k]
                # Only filter when the stored column min/max show that some
                # values fall outside the requested range.
                if c.min is not None and (not range_check(c.min) or not range_check(c.max)):
                    if has_range_column:
                        it = ifilter(range_f, it)
                    else:
                        # Run the range test on a parallel iterator over the
                        # range column, compressing the main iterator to
                        # matching rows.
                        if rehash:
                            filter_it = d._hashfilter(sliceno, rehash, d._column_iterator(None, range_k))
                        else:
                            filter_it = d._column_iterator(sliceno, range_k)
                        it = compress(it, imap(range_check, filter_it))
            if filter_func:
                it = ifilter(filter_func, it)
            yield it
            if post_callback and not unsliced_post_callback:
                post_callback(d, sliceno)
    if unsliced_post_callback:
        post_callback(None)
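
# Example (hedged): a minimal, self-contained sketch of the callback
# arity-dispatch that argfixup above performs. A one-argument callback is
# wrapped so it fires once per dataset; a two-argument callback would run for
# every slice. The names _demo_argfixup, per_dataset and the sample data are
# illustrative only, not part of the original module.
try:
    from inspect import getargspec  # Py2 / Py3 < 3.11
except ImportError:
    from inspect import getfullargspec as getargspec  # Py3.11+: same .args attribute

def _demo_argfixup(func):
    if len(getargspec(func).args) == 1:
        seen = [None]
        def wrapper(d, sliceno=None):
            if d != seen[0]:  # only call through on a new dataset
                func(d)
                seen[0] = d
        return wrapper
    return func

def per_dataset(d):  # one argument: called once per dataset
    print('dataset', d)

cb = _demo_argfixup(per_dataset)
for d, sliceno in [('ds0', 0), ('ds0', 1), ('ds1', 0)]:
    cb(d, sliceno)  # prints 'dataset ds0' then 'dataset ds1'
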
def synthesis(params, analysis_res, prepare_res):
    r = report()
    res = DotDict()
    d = datasets.source
    analysis_res = list(analysis_res)
    if options.filter_bad:
        # Per slice: data[0] is bad counts per column, data[1] is the bad line
        # count, data[2] is defaulted counts per column, data[3] is minmax.
        num_lines_per_split = [num - data[1] for num, data in zip(d.lines, analysis_res)]
        res.bad_line_count_per_slice = [data[1] for data in analysis_res]
        res.bad_line_count_total = sum(res.bad_line_count_per_slice)
        r.println('Slice Bad line count')
        for sliceno, cnt in enumerate(res.bad_line_count_per_slice):
            r.println('%5d %d' % (sliceno, cnt,))
        r.println('total %d' % (res.bad_line_count_total,))
        r.line()
        # Report the first 32 bad line numbers, decoded from the per-slice
        # badmap bitmaps written in analysis: byte ix, bit jx -> line ix * 8 + jx.
        r.println('Slice Bad line number')
        reported_count = 0
        for sliceno, data in enumerate(analysis_res):
            fn = 'badmap%d' % (sliceno,)
            if data[1] and reported_count < 32:
                with open(fn, 'rb') as fh:
                    badmap = mmap(fh.fileno(), 0, prot=PROT_READ)
                    for ix, v in enumerate(imap(ord, badmap)):
                        if v:
                            for jx in range(8):
                                if v & (1 << jx):
                                    r.println('%5d %d' % (sliceno, ix * 8 + jx,))
                                    reported_count += 1
                                    if reported_count >= 32:
                                        break
                            if reported_count >= 32:
                                break
                    badmap.close()
            unlink(fn)  # badmap files are removed whether reported or not
        if reported_count >= 32:
            r.println('...')
        r.line()
        res.bad_line_count_per_column = {}
        r.println('Bad line count Column')
        for colname in sorted(analysis_res[0][0]):
            cnt = sum(data[0][colname] for data in analysis_res)
            r.println('%14d %s' % (cnt, colname,))
            res.bad_line_count_per_column[colname] = cnt
        r.line()
    else:
        num_lines_per_split = d.lines
    dw = prepare_res
    for sliceno, count in enumerate(num_lines_per_split):
        dw.set_lines(sliceno, count)
    if options.defaults:
        r.println('Defaulted values')
        res.defaulted_per_slice = {}
        res.defaulted_total = {}
        for colname in sorted(options.defaults):
            r.println(' %s:' % (colname,))
            r.println(' Slice Defaulted line count')
            res.defaulted_per_slice[colname] = [data[2][colname] for data in analysis_res]
            res.defaulted_total[colname] = sum(res.defaulted_per_slice[colname])
            for sliceno, cnt in enumerate(res.defaulted_per_slice[colname]):
                r.println(' %5d %d' % (sliceno, cnt,))
            r.println(' total %d' % (res.defaulted_total[colname],))
        r.line()
    for sliceno, data in enumerate(analysis_res):
        dw.set_minmax(sliceno, data[3])
    d = dw.finish()
    res.good_line_count_per_slice = num_lines_per_split
    res.good_line_count_total = sum(num_lines_per_split)
    r.line()
    r.println('Total of %d lines converted' % (res.good_line_count_total,))
    r.close()
    json_save(res)
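
# Example (hedged): the badmap decoding used above, as a standalone helper.
# Each set bit in a per-slice 'badmap%d' file marks one bad input line: byte
# ix, bit jx -> line number ix * 8 + jx. _demo_bad_lines is an illustrative
# name, not part of the original module.
def _demo_bad_lines(badmap_bytes):
    for ix, v in enumerate(bytearray(badmap_bytes)):  # bytearray yields ints on both Py2 and Py3
        if v:
            for jx in range(8):
                if v & (1 << jx):
                    yield ix * 8 + jx

# byte 0 = 0x05 -> bits 0 and 2 -> lines 0 and 2; byte 2 = 0x80 -> bit 7 -> line 23
assert list(_demo_bad_lines(b'\x05\x00\x80')) == [0, 2, 23]
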
def csvexport(sliceno, filename, labelsonfirstline):
    assert len(options.separator) == 1
    assert options.quote_fields in ('', "'", '"',)
    d = datasets.source[0]
    if not options.labels:
        options.labels = sorted(d.columns)
    if options.chain_source:
        # Extend each source dataset with its chain, stopping where the
        # previous job's corresponding source left off (if there was one).
        if jobids.previous:
            prev_source = job_params(jobids.previous).datasets.source
            assert len(datasets.source) == len(prev_source)
        else:
            prev_source = [None] * len(datasets.source)
        lst = []
        for src, stop in zip(datasets.source, prev_source):
            lst.extend(src.chain(stop_ds=stop))
        datasets.source = lst
    if filename.lower().endswith('.gz'):
        mkwrite = mkwrite_gz
    elif filename.lower().endswith('.csv'):
        mkwrite = mkwrite_uncompressed
    else:
        raise Exception("Filename should end with .gz for compressed or .csv for uncompressed")
    iters = []
    first = True
    for label in options.labels:
        it = d.iterate_list(sliceno, label, datasets.source, status_reporting=first)
        first = False
        # Stringify each column according to its type.
        t = d.columns[label].type
        if t == 'unicode' and PY2:
            it = imap(enc, it)
        elif t == 'bytes' and PY3:
            it = imap(lambda s: s.decode('utf-8', errors='backslashreplace'), it)
        elif t in ('float32', 'float64', 'number'):
            it = imap(repr, it)
        elif t == 'json':
            it = imap(dumps, it)
        elif t not in ('unicode', 'ascii', 'bytes'):
            it = imap(str, it)
        iters.append(it)
    it = izip(*iters)
    with mkwrite(filename) as write:
        q = options.quote_fields
        sep = options.separator
        if q:
            qq = q + q  # quote characters inside fields are escaped by doubling
            if labelsonfirstline:
                write(enc(sep.join(q + n.replace(q, qq) + q for n in options.labels)))
            for data in it:
                write(sep.join(q + n.replace(q, qq) + q for n in data))
        else:
            if labelsonfirstline:
                write(enc(sep.join(options.labels)))
            for data in it:
                write(sep.join(data))
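
# Example (hedged): the quoting scheme used above, standalone. With a quote
# character configured, every field is wrapped in it and embedded quote
# characters are doubled (CSV-style escaping); otherwise fields are joined
# bare. _demo_quote and its defaults are illustrative only.
def _demo_quote(fields, sep=',', q='"'):
    if q:
        qq = q + q
        return sep.join(q + f.replace(q, qq) + q for f in fields)
    return sep.join(fields)

assert _demo_quote(['a', 'say "hi"']) == '"a","say ""hi"""'
assert _demo_quote(['a', 'b'], q='') == 'a,b'
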
# Older variant of _iterate_datasets: to_iter holds (jobid, dataset, sliceno,
# rehash) tuples and skipping is tracked per jobid rather than per dataset.
def _iterate_datasets(to_iter, columns, pre_callback, post_callback, filter_func, translation_func, translators, want_tuple, range):
    skip_jobid = None
    def argfixup(func, is_post):
        # Same arity dispatch as the newer variant: one-argument callbacks
        # fire once per jobid, two-argument callbacks once per slice.
        if func:
            if len(getargspec(func).args) == 1:
                seen_jobid = [None]
                def wrapper(jobid, sliceno=None):
                    if jobid != seen_jobid[0]:
                        if is_post:
                            if seen_jobid[0] and seen_jobid[0] != skip_jobid:
                                func(seen_jobid[0])
                        else:
                            func(jobid)
                        seen_jobid[0] = jobid
                return wrapper, True
        return func, False
    pre_callback, unsliced_pre_callback = argfixup(pre_callback, False)
    post_callback, unsliced_post_callback = argfixup(post_callback, True)
    if not to_iter:
        return
    if range:
        range_k, (range_bottom, range_top,) = next(iteritems(range))
        range_check = range_check_function(range_bottom, range_top)
        if range_k in columns and range_k not in translators and not translation_func:
            has_range_column = True
            range_i = columns.index(range_k)
            if want_tuple:
                range_f = lambda t: range_check(t[range_i])
            else:
                range_f = range_check
        else:
            has_range_column = False
    starting_at = '%s:%d' % (to_iter[0][0], to_iter[0][2],)
    if len(to_iter) == 1:
        msg = 'Iterating ' + starting_at
    else:
        msg = 'Iterating %d dataset slices starting at %s' % (len(to_iter), starting_at,)
    with status(msg):
        for ix, (jobid, d, sliceno, rehash) in enumerate(to_iter, 1):
            if unsliced_post_callback:
                post_callback(jobid)
            if pre_callback:
                if jobid == skip_jobid:
                    continue
                try:
                    pre_callback(jobid, sliceno)
                except SkipSlice:
                    if unsliced_pre_callback:
                        skip_jobid = jobid
                    continue
                except SkipJob:
                    skip_jobid = jobid
                    continue
            it = d._iterator(None if rehash else sliceno, columns)
            for col_ix, trans in translators.items():
                it[col_ix] = imap(trans, it[col_ix])
            if want_tuple:
                it = izip(*it)
            else:
                it = it[0]
            if rehash:
                it = d._hashfilter(sliceno, rehash, it)
            if translation_func:
                it = imap(translation_func, it)
            if range:
                c = d.columns[range_k]
                if c.min is not None and (not range_check(c.min) or not range_check(c.max)):
                    if has_range_column:
                        it = ifilter(range_f, it)
                    else:
                        if rehash:
                            filter_it = d._hashfilter(sliceno, rehash, d._column_iterator(None, range_k))
                        else:
                            filter_it = d._column_iterator(sliceno, range_k)
                        it = compress(it, imap(range_check, filter_it))
            if filter_func:
                it = ifilter(filter_func, it)
            with status('(%d/%d) %s:%s' % (ix, len(to_iter), jobid, 'REHASH' if rehash else sliceno,)):
                yield it
            if post_callback and not unsliced_post_callback:
                post_callback(jobid, sliceno)
    if unsliced_post_callback:
        post_callback(None)
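
# Example (hedged): the skip protocol both variants of _iterate_datasets
# expect from pre_callback. The real SkipSlice/SkipJob live in the surrounding
# module; they are sketched here only to keep the example self-contained, and
# the jobid value is illustrative.
class SkipSlice(Exception):
    """Raise in pre_callback to skip the current slice. (With a one-argument,
    per-dataset pre_callback this skips the whole dataset, since that
    callback only fires once per dataset.)"""

class SkipJob(Exception):
    """Raise in pre_callback to skip all remaining slices of the current job."""

def pre_callback(jobid, sliceno):
    if sliceno == 7:
        raise SkipSlice  # drop just this slice
    if jobid == 'jid_with_known_bad_data':
        raise SkipJob    # drop every remaining slice from this job
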