示例#1
0
    def _iterate_datasets(to_iter, columns, pre_callback, post_callback,
                          filter_func, translation_func, translators,
                          want_tuple, range, status_reporting):
        # Generator yielding one iterator per (dataset, sliceno, rehash)
        # entry in to_iter.  Each yielded iterator produces single values
        # (one column) or per-line tuples (want_tuple), after optional
        # rehashing, translators/translation_func, range filtering and
        # filter_func are applied lazily.
        # NOTE: 'range' shadows the builtin here; it is a one-item mapping
        # {column_name: (bottom, top)} used for range filtering.
        skip_ds = None

        def argfixup(func, is_post):
            # Adapt a callback that takes a single argument (just the
            # dataset) to the (dataset, sliceno) convention used in the
            # main loop.  Such a callback is "unsliced": it should run
            # once per dataset rather than once per slice.
            # Returns (callback, is_unsliced).
            if func:
                if len(getargspec(func).args) == 1:
                    seen_ds = [None]  # last dataset passed to wrapper

                    def wrapper(d, sliceno=None):
                        if d != seen_ds[0]:
                            if is_post:
                                # The post callback fires for the *previous*
                                # dataset once we have moved past it, and is
                                # suppressed for a skipped dataset.
                                if seen_ds[0] and seen_ds[0] != skip_ds:
                                    func(seen_ds[0])
                            else:
                                func(d)
                            seen_ds[0] = d

                    return wrapper, True
            return func, False

        pre_callback, unsliced_pre_callback = argfixup(pre_callback, False)
        post_callback, unsliced_post_callback = argfixup(post_callback, True)
        if not to_iter:
            return
        if range:
            # Only the first item of the range mapping is used.
            range_k, (
                range_bottom,
                range_top,
            ) = next(iteritems(range))
            range_check = range_check_function(range_bottom, range_top)
            if range_k in columns and range_k not in translators and not translation_func:
                # The range column is iterated untranslated, so each row
                # can be checked directly; otherwise a parallel column
                # iterator is used further down.
                has_range_column = True
                range_i = columns.index(range_k)
                if want_tuple:
                    range_f = lambda t: range_check(t[range_i])
                else:
                    range_f = range_check
            else:
                has_range_column = False
        if status_reporting:
            from status import status
        else:
            # Same interface as status, but reports nothing.
            from status import dummy_status as status

        def fmt_dsname(d, sliceno, rehash):
            # Human readable name of one (dataset, slice) for status messages.
            if rehash:
                return d + ':REHASH'
            else:
                return '%s:%d' % (d, sliceno)

        if len(to_iter) == 1:
            msg_head = 'Iterating ' + fmt_dsname(*to_iter[0])

            def update_status(update, ix, d, sliceno, rehash):
                # Single dataset slice: the initial message says it all.
                pass
        else:
            msg_head = 'Iterating %s to %s' % (
                fmt_dsname(*to_iter[0]),
                fmt_dsname(*to_iter[-1]),
            )

            def update_status(update, ix, d, sliceno, rehash):
                update('%s, %d/%d (%s)' % (msg_head, ix, len(to_iter),
                                           fmt_dsname(d, sliceno, rehash)))

        with status(msg_head) as update:
            for ix, (d, sliceno, rehash) in enumerate(to_iter, 1):
                if unsliced_post_callback:
                    # Fires the callback for the previous dataset and
                    # remembers d; see argfixup's wrapper above.
                    post_callback(d)
                update_status(update, ix, d, sliceno, rehash)
                if pre_callback:
                    if d == skip_ds:
                        continue
                    try:
                        pre_callback(d, sliceno)
                    except SkipSlice:
                        # An unsliced pre callback has already run for this
                        # dataset, so skip the rest of it entirely.
                        if unsliced_pre_callback:
                            skip_ds = d
                        continue
                    except SkipJob:
                        skip_ds = d
                        continue
                it = d._iterator(None if rehash else sliceno, columns)
                for ix, trans in translators.items():
                    it[ix] = imap(trans, it[ix])
                if want_tuple:
                    it = izip(*it)
                else:
                    it = it[0]
                if rehash:
                    it = d._hashfilter(sliceno, rehash, it)
                if translation_func:
                    it = imap(translation_func, it)
                if range:
                    c = d.columns[range_k]
                    # Skip filtering entirely when the column's stored
                    # min/max show that every line is already in range.
                    if c.min is not None and (not range_check(c.min)
                                              or not range_check(c.max)):
                        if has_range_column:
                            it = ifilter(range_f, it)
                        else:
                            # Range column is not iterated (or is
                            # translated): read it in parallel and use it
                            # as a selection mask.
                            if rehash:
                                filter_it = d._hashfilter(
                                    sliceno, rehash,
                                    d._column_iterator(None, range_k))
                            else:
                                filter_it = d._column_iterator(
                                    sliceno, range_k)
                            it = compress(it, imap(range_check, filter_it))
                if filter_func:
                    it = ifilter(filter_func, it)
                yield it
                if post_callback and not unsliced_post_callback:
                    post_callback(d, sliceno)
            if unsliced_post_callback:
                # Flush the post callback for the final dataset.
                post_callback(None)
示例#2
0
def synthesis(params, analysis_res, prepare_res):
    # Collect per-slice analysis results, write a human readable report
    # and a json result file, and finish the dataset begun in prepare.
    #
    # Each entry in analysis_res appears to be a per-slice tuple of
    # (bad_count_per_column, bad_line_count, defaulted_per_column, minmax)
    # judging by the indexing below -- confirm against analysis().
    r = report()
    res = DotDict()
    d = datasets.source
    analysis_res = list(analysis_res)
    if options.filter_bad:
        # Bad lines were filtered out, so reduce the per-slice line counts.
        num_lines_per_split = [
            num - data[1] for num, data in zip(d.lines, analysis_res)
        ]
        res.bad_line_count_per_slice = [data[1] for data in analysis_res]
        res.bad_line_count_total = sum(res.bad_line_count_per_slice)
        r.println('Slice   Bad line count')
        for sliceno, cnt in enumerate(res.bad_line_count_per_slice):
            r.println('%5d   %d' % (
                sliceno,
                cnt,
            ))
        r.println('total   %d' % (res.bad_line_count_total, ))
        r.line()
        r.println('Slice   Bad line number')
        reported_count = 0
        for sliceno, data in enumerate(analysis_res):
            # badmapN is a bitmap file: bit jx of byte ix set means line
            # number ix * 8 + jx of slice N was bad.
            fn = 'badmap%d' % (sliceno, )
            if data[1] and reported_count < 32:
                with open(fn, 'rb') as fh:
                    badmap = mmap(fh.fileno(), 0, prot=PROT_READ)
                    for ix, v in enumerate(imap(ord, badmap)):
                        if v:
                            for jx in range(8):
                                if v & (1 << jx):
                                    r.println('%5d   %d' % (
                                        sliceno,
                                        ix * 8 + jx,
                                    ))
                                    reported_count += 1
                                    # Report at most 32 bad line numbers.
                                    if reported_count >= 32: break
                            if reported_count >= 32: break
                    badmap.close()
            unlink(fn)
        if reported_count >= 32:
            r.println('...')
        r.line()
        res.bad_line_count_per_column = {}
        r.println('Bad line count   Column')
        for colname in sorted(analysis_res[0][0]):
            # Sum each column's bad line count over all slices.
            cnt = sum(data[0][colname] for data in analysis_res)
            r.println('%14d   %s' % (
                cnt,
                colname,
            ))
            res.bad_line_count_per_column[colname] = cnt
        r.line()
    else:
        num_lines_per_split = d.lines
    dw = prepare_res
    for sliceno, count in enumerate(num_lines_per_split):
        dw.set_lines(sliceno, count)
    if options.defaults:
        # Report how often each column with a default fell back to it.
        r.println('Defaulted values')
        res.defaulted_per_slice = {}
        res.defaulted_total = {}
        for colname in sorted(options.defaults):
            r.println('    %s:' % (colname, ))
            r.println('        Slice   Defaulted line count')
            res.defaulted_per_slice[colname] = [
                data[2][colname] for data in analysis_res
            ]
            res.defaulted_total[colname] = sum(
                res.defaulted_per_slice[colname])
            for sliceno, cnt in enumerate(res.defaulted_per_slice[colname]):
                r.println('        %5d   %d' % (
                    sliceno,
                    cnt,
                ))
            r.println('        total   %d' % (res.defaulted_total[colname], ))
        r.line()
    for sliceno, data in enumerate(analysis_res):
        dw.set_minmax(sliceno, data[3])
    d = dw.finish()
    res.good_line_count_per_slice = num_lines_per_split
    res.good_line_count_total = sum(num_lines_per_split)
    r.line()
    r.println('Total of %d lines converted' % (res.good_line_count_total, ))
    r.close()
    json_save(res)
示例#3
0
def csvexport(sliceno, filename, labelsonfirstline):
    # Export the selected labels from datasets.source as one CSV file,
    # optionally gzipped (.gz) and optionally with a label header line.
    assert len(options.separator) == 1
    assert options.quote_fields in (
        '',
        "'",
        '"',
    )
    d = datasets.source[0]
    if not options.labels:
        options.labels = sorted(d.columns)
    if options.chain_source:
        # Extend each source with its chain, stopping at the dataset the
        # previous job exported (if any).
        if jobids.previous:
            prev_source = job_params(jobids.previous).datasets.source
            assert len(datasets.source) == len(prev_source)
        else:
            prev_source = [None] * len(datasets.source)
        chained = []
        for ds, stop_at in zip(datasets.source, prev_source):
            chained.extend(ds.chain(stop_ds=stop_at))
        datasets.source = chained
    lowered = filename.lower()
    if lowered.endswith('.gz'):
        mkwrite = mkwrite_gz
    elif lowered.endswith('.csv'):
        mkwrite = mkwrite_uncompressed
    else:
        raise Exception(
            "Filename should end with .gz for compressed or .csv for uncompressed"
        )
    column_iters = []
    want_status = True  # only the first column iterator reports status
    for label in options.labels:
        colit = d.iterate_list(sliceno,
                               label,
                               datasets.source,
                               status_reporting=want_status)
        want_status = False
        # Convert values to text depending on the column type.
        coltype = d.columns[label].type
        if coltype == 'unicode' and PY2:
            colit = imap(enc, colit)
        elif coltype == 'bytes' and PY3:
            colit = imap(
                lambda s: s.decode('utf-8', errors='backslashreplace'),
                colit)
        elif coltype in ('float32', 'float64', 'number'):
            colit = imap(repr, colit)
        elif coltype == 'json':
            colit = imap(dumps, colit)
        elif coltype not in ('unicode', 'ascii', 'bytes'):
            colit = imap(str, colit)
        column_iters.append(colit)
    rows = izip(*column_iters)
    with mkwrite(filename) as write:
        quote = options.quote_fields
        delim = options.separator
        if quote:
            doubled = quote + quote

            def quoted(value):
                # Quote a field, doubling any embedded quote characters.
                return quote + value.replace(quote, doubled) + quote

            if labelsonfirstline:
                write(enc(delim.join(quoted(n) for n in options.labels)))
            for row in rows:
                write(delim.join(quoted(v) for v in row))
        else:
            if labelsonfirstline:
                write(enc(delim.join(options.labels)))
            for row in rows:
                write(delim.join(row))
示例#4
0
    def _iterate_datasets(to_iter, columns, pre_callback, post_callback,
                          filter_func, translation_func, translators,
                          want_tuple, range):
        # Generator yielding one iterator per (jobid, dataset, sliceno,
        # rehash) entry in to_iter.  Each yielded iterator produces single
        # values (one column) or per-line tuples (want_tuple), after
        # optional rehashing, translators/translation_func, range
        # filtering and filter_func are applied lazily.
        # NOTE: 'range' shadows the builtin here; it is a one-item mapping
        # {column_name: (bottom, top)} used for range filtering.
        skip_jobid = None

        def argfixup(func, is_post):
            # Adapt a callback that takes a single argument (just the
            # jobid) to the (jobid, sliceno) convention used in the main
            # loop.  Such a callback is "unsliced": it should run once per
            # jobid rather than once per slice.
            # Returns (callback, is_unsliced).
            if func:
                if len(getargspec(func).args) == 1:
                    seen_jobid = [None]  # last jobid passed to wrapper

                    def wrapper(jobid, sliceno=None):
                        if jobid != seen_jobid[0]:
                            if is_post:
                                # The post callback fires for the *previous*
                                # jobid once we have moved past it, and is
                                # suppressed for a skipped jobid.
                                if seen_jobid[0] and seen_jobid[0] != skip_jobid:
                                    func(seen_jobid[0])
                            else:
                                func(jobid)
                            seen_jobid[0] = jobid

                    return wrapper, True
            return func, False

        pre_callback, unsliced_pre_callback = argfixup(pre_callback, False)
        post_callback, unsliced_post_callback = argfixup(post_callback, True)
        if not to_iter:
            return
        if range:
            # Only the first item of the range mapping is used.
            range_k, (
                range_bottom,
                range_top,
            ) = next(iteritems(range))
            range_check = range_check_function(range_bottom, range_top)
            if range_k in columns and range_k not in translators and not translation_func:
                # The range column is iterated untranslated, so each row
                # can be checked directly; otherwise a parallel column
                # iterator is used further down.
                has_range_column = True
                range_i = columns.index(range_k)
                if want_tuple:
                    range_f = lambda t: range_check(t[range_i])
                else:
                    range_f = range_check
            else:
                has_range_column = False
        starting_at = '%s:%d' % (
            to_iter[0][0],
            to_iter[0][2],
        )
        if len(to_iter) == 1:
            msg = 'Iterating ' + starting_at
        else:
            msg = 'Iterating %d dataset slices starting at %s' % (
                len(to_iter),
                starting_at,
            )
        with status(msg):
            # Start at 1 so the per-slice status below reads 1/N .. N/N.
            for ix, (jobid, d, sliceno, rehash) in enumerate(to_iter, 1):
                if unsliced_post_callback:
                    # Fires the callback for the previous jobid and
                    # remembers this one; see argfixup's wrapper above.
                    post_callback(jobid)
                if pre_callback:
                    if jobid == skip_jobid:
                        continue
                    try:
                        pre_callback(jobid, sliceno)
                    except SkipSlice:
                        # An unsliced pre callback has already run for this
                        # jobid, so skip the rest of it entirely.
                        if unsliced_pre_callback:
                            skip_jobid = jobid
                        continue
                    except SkipJob:
                        skip_jobid = jobid
                        continue
                it = d._iterator(None if rehash else sliceno, columns)
                # BUGFIX: use a distinct loop variable here.  Reusing 'ix'
                # clobbered the enumerate index, so the status message
                # below showed a column index whenever translators were
                # in use.
                for col_ix, trans in translators.items():
                    it[col_ix] = imap(trans, it[col_ix])
                if want_tuple:
                    it = izip(*it)
                else:
                    it = it[0]
                if rehash:
                    it = d._hashfilter(sliceno, rehash, it)
                if translation_func:
                    it = imap(translation_func, it)
                if range:
                    c = d.columns[range_k]
                    # Skip filtering entirely when the column's stored
                    # min/max show that every line is already in range.
                    if c.min is not None and (not range_check(c.min)
                                              or not range_check(c.max)):
                        if has_range_column:
                            it = ifilter(range_f, it)
                        else:
                            # Range column is not iterated (or is
                            # translated): read it in parallel and use it
                            # as a selection mask.
                            if rehash:
                                filter_it = d._hashfilter(
                                    sliceno, rehash,
                                    d._column_iterator(None, range_k))
                            else:
                                filter_it = d._column_iterator(
                                    sliceno, range_k)
                            it = compress(it, imap(range_check, filter_it))
                if filter_func:
                    it = ifilter(filter_func, it)
                with status('(%d/%d) %s:%s' % (
                        ix,
                        len(to_iter),
                        jobid,
                        'REHASH' if rehash else sliceno,
                )):
                    yield it
                if post_callback and not unsliced_post_callback:
                    post_callback(jobid, sliceno)
            if unsliced_post_callback:
                # Flush the post callback for the final jobid.
                post_callback(None)