Example #1
def sort(columniter):
    with status('Determining sort order'):
        info = datasets.source.columns
        if any(info[column].type not in nononehandling_types
               for column in options.sort_columns):
            # At least one sort column can have unsortable values
            first = True
            iters = []
            for column in options.sort_columns:
                it = columniter(column, status_reporting=first)
                first = False
                if info[column].type not in nononehandling_types:
                    it = filter_unsortable(column, it)
                iters.append(it)
            if len(iters) == 1:
                # Special case to not make tuples when there is only one column.
                lst = list(iters[0])
            else:
                lst = list(izip(*iters))
        else:
            columns = options.sort_columns
            if len(columns) == 1:
                # Special case to not make tuples when there is only one column.
                columns = columns[0]
            lst = list(columniter(columns))
        reverse = (options.sort_order == 'descending')
        with status('Creating sort list'):
            return sorted(range(len(lst)),
                          key=lst.__getitem__,
                          reverse=reverse)
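The return value is an argsort: a list of row indices in sorted order, not the sorted values themselves. A minimal standalone illustration of the sorted(range(len(lst)), key=lst.__getitem__) idiom, with made-up data:

lst = ['b', 'a', 'c']
order = sorted(range(len(lst)), key=lst.__getitem__)
assert order == [1, 0, 2]                         # the indices that would sort lst
assert [lst[i] for i in order] == ['a', 'b', 'c']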
Example #2
def analysis(sliceno, prepare_res):
    write = prepare_res[0].write_list
    ix = prepare_res[1]
    d = datasets.source
    to_copy = d.lines[sliceno]
    if to_copy == 0:
        # bail out on empty slices right away
        return
    to_skip = sum(d.lines[:sliceno])
    if to_skip:
        it = d.iterate('roundrobin',
                       slice=to_skip - bool(options.trigger_column))
        if options.trigger_column:
            trigger_v = next(it)[ix]
            # keep skipping until trigger value changes
            for v in it:
                to_copy -= 1
                if v[ix] != trigger_v:
                    write(v)
                    break
                if to_copy == 0:
                    return  # no lines left for this slice
    else:
        it = d.iterate('roundrobin')
    # write the lines belonging here
    # (zip so we don't have to count down to_copy manually)
    for _, v in izip(range(to_copy), it):
        write(v)
    if options.trigger_column:
        trigger_v = v[ix]
        # keep copying until trigger value changes or lines run out
        for v in it:
            if trigger_v != v[ix]:
                break
            write(v)
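The izip(range(to_copy), it) loop takes exactly to_copy items while leaving the iterator usable afterwards, which the trigger handling after the loop depends on. A self-contained sketch of the idiom:

it = iter('abcdef')
copied = [v for _, v in zip(range(3), it)]  # consumes exactly 3 items, no manual countdown
assert copied == ['a', 'b', 'c']
assert next(it) == 'd'                      # the iterator continues where the loop stopped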
Example #3
def analysis(sliceno, prepare_res):
    writers, columns, chain = prepare_res
    key_it = chain.iterate(sliceno, options.column)
    # we can't just use chain.iterate because of protections against changing types with copy_mode
    values_it = itertools.chain.from_iterable(
        ds.iterate(sliceno, columns, copy_mode=True, status_reporting=False)
        for ds in chain)
    for key, values in izip(key_it, values_it):
        writers[unicode(key)].write(*values)
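These examples freely mix izip, imap, unicode and the PY2/PY3 flags; they are written to run on both Python 2 and 3. A minimal sketch of the compatibility shim they appear to assume (the project presumably centralises this in a shared compat module):

try:
    from itertools import izip, imap  # Python 2
except ImportError:
    izip, imap = zip, map             # Python 3: zip and map are already lazy
try:
    unicode                           # built in on Python 2
except NameError:
    unicode = str                     # on Python 3, str is the unicode type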
Example #4
	def __init__(self, slices):
		slices = range(slices)
		self._slices = iter(slices)
		tuple_len = pickle_load("Analysis.tuple")
		if tuple_len is False:
			self._is_tupled = False
		else:
			self._is_tupled = True
			self._loaders = [self._loader(ix, iter(slices)) for ix in range(tuple_len)]
			self._tupled = izip(*self._loaders)
Example #5
def csvexport(sliceno, filename, labelsonfirstline):
	assert len(options.separator) == 1
	assert options.quote_fields in ('', "'", '"',)
	d = datasets.source[0]
	if not options.labels:
		options.labels = sorted(d.columns)
	if options.chain_source:
		if jobids.previous:
			prev_source = job_params(jobids.previous).datasets.source
			assert len(datasets.source) == len(prev_source)
		else:
			prev_source = [None] * len(datasets.source)
		lst = []
		for src, stop in zip(datasets.source, prev_source):
			lst.extend(src.chain(stop_ds=stop))
		datasets.source = lst
	if filename.lower().endswith('.gz'):
		mkwrite = mkwrite_gz
	elif filename.lower().endswith('.csv'):
		mkwrite = mkwrite_uncompressed
	else:
		raise Exception("Filename should end with .gz for compressed or .csv for uncompressed")
	iters = []
	first = True
	for label in options.labels:
		it = d.iterate_list(sliceno, label, datasets.source, status_reporting=first)
		first = False
		t = d.columns[label].type
		if t == 'unicode' and PY2:
			it = imap(enc, it)
		elif t == 'bytes' and PY3:
			it = imap(lambda s: s.decode('utf-8', errors='backslashreplace'), it)
		elif t in ('float32', 'float64', 'number'):
			it = imap(repr, it)
		elif t == 'json':
			it = imap(dumps, it)
		elif t not in ('unicode', 'ascii', 'bytes'):
			it = imap(str, it)
		iters.append(it)
	it = izip(*iters)
	with mkwrite(filename) as write:
		q = options.quote_fields
		sep = options.separator
		if q:
			qq = q + q
			if labelsonfirstline:
				write(enc(sep.join(q + n.replace(q, qq) + q for n in options.labels)))
			for data in it:
				write(sep.join(q + n.replace(q, qq) + q for n in data))
		else:
			if labelsonfirstline:
				write(enc(sep.join(options.labels)))
			for data in it:
				write(sep.join(data))
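The q + n.replace(q, qq) + q expression implements the usual CSV escaping convention: a field is wrapped in the quote character and any embedded quote is doubled. For example:

q, qq = '"', '""'
field = 'say "hi"'
assert q + field.replace(q, qq) + q == '"say ""hi"""'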
Example #6
def sort(columniter):
    with status('Determining sort order'):
        info = datasets.source.columns
        special_handling = set()
        for column in options.sort_columns:
            if info[column].type.startswith(
                    'float') or info[column].type == 'number':
                # for NaN
                special_handling.add(column)
            if info[column].none_support:
                special_handling.add(column)
        if special_handling:
            # At least one sort column can have unsortable values
            first = True
            iters = []
            for column in options.sort_columns:
                it = columniter(column, status_reporting=first)
                first = False
                if column in special_handling:
                    it = filter_unsortable(column, it)
                iters.append(it)
            if len(iters) == 1:
                # Special case to not make tuples when there is only one column.
                lst = list(iters[0])
            else:
                lst = list(izip(*iters))
        else:
            columns = options.sort_columns
            if len(columns) == 1:
                # Special case to not make tuples when there is only one column.
                columns = columns[0]
            lst = list(columniter(columns))
        if options.trigger_column:
            if len(options.sort_columns) == 1:
                sort_extra = lst
            else:
                with status('Creating trigger list'):
                    ix = options.sort_columns.index(options.trigger_column)
                    sort_extra = [el[ix] for el in lst]
        else:
            sort_extra = None
        reverse = (options.sort_order == 'descending')
        with status('Creating sort list'):
            return sorted(range(len(lst)),
                          key=lst.__getitem__,
                          reverse=reverse), sort_extra
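The special_handling set exists because NaN (possible in float and number columns) is unordered, and None is not comparable at all on Python 3, so such values must be filtered out (here via filter_unsortable) before sorting. A quick demonstration of the NaN problem:

nan = float('nan')
assert not (nan < 1.0) and not (nan > 1.0) and nan != nan
# sorted() on data containing NaN does not raise, it just silently
# fails to produce a total order.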
Example #7
    def grep(ds, sliceno):
        def no_conv(v):
            return v

        def mk_conv(col):
            if ds.columns[col].type in (
                    'bytes',
                    'unicode',
                    'ascii',
            ):
                if not ds.columns[col].none_support:
                    return no_conv
            return unicode

        chk = pat_s.search

        def mk_iter(col):
            if ds.columns[col].type == 'ascii':
                it = ds._column_iterator(sliceno, col, _type='unicode')
            else:
                it = ds._column_iterator(sliceno, col)
            if ds.columns[col].type == 'bytes':
                errors = 'replace' if PY2 else 'surrogateescape'
                if ds.columns[col].none_support:
                    it = (None if v is None else v.decode('utf-8', errors)
                          for v in it)
                else:
                    it = (v.decode('utf-8', errors) for v in it)
            return it

        def colour_item(item):
            pos = 0
            parts = []
            for m in pat_s.finditer(item):
                a, b = m.span()
                parts.extend((item[pos:a], colour.red(item[a:b])))
                pos = b
            parts.append(item[pos:])
            return ''.join(parts)

        if args.format == 'json':
            prefix = {}
            dumps = json.JSONEncoder(ensure_ascii=False,
                                     default=json_default).encode
            if args.show_dataset:
                prefix['dataset'] = ds
            if args.show_sliceno:
                prefix['sliceno'] = sliceno

            def show():
                d = dict(zip(used_columns, items))
                if args.show_lineno:
                    prefix['lineno'] = lineno
                if prefix:
                    prefix['data'] = d
                    d = prefix
                return dumps(d).encode('utf-8', 'surrogatepass')
        else:
            prefix = []
            if args.show_dataset:
                prefix.append(ds)
            if args.show_sliceno:
                prefix.append(str(sliceno))
            prefix = tuple(prefix)

            def show():
                data = list(prefix)
                if args.show_lineno:
                    data.append(unicode(lineno))
                if PY2:
                    show_items = (v if isinstance(v, unicode) else
                                  str(v).decode('utf-8', 'replace')
                                  for v in items)
                else:
                    show_items = map(str, items)
                show_items = list(show_items)
                lens = (len(item) for item in data + show_items)
                if highlight_matches:
                    show_items = list(map(colour_item, show_items))
                if escape_item:
                    lens_unesc = (len(item) for item in data + show_items)
                    show_items = list(map(escape_item, show_items))
                    lens_esc = (len(item) for item in data + show_items)
                    lens = (
                        l + esc - unesc
                        for l, unesc, esc in zip(lens, lens_unesc, lens_esc))
                data.extend(show_items)
                return separate(data, lens).encode('utf-8', errors)

        used_columns = columns or sorted(ds.columns)
        if grep_columns and grep_columns != set(used_columns):
            grep_iter = izip(*(mk_iter(col) for col in grep_columns))
            conv_items = [mk_conv(col) for col in grep_columns]
        else:
            grep_iter = repeat(None)
            conv_items = [mk_conv(col) for col in used_columns]
        lines_iter = izip(*(mk_iter(col) for col in used_columns))
        for lineno, (grep_items,
                     items) in enumerate(izip(grep_iter, lines_iter)):
            if any(
                    chk(conv(item))
                    for conv, item in izip(conv_items, grep_items or items)):
                # This will be atomic if the line is not too long
                # (at least up to PIPE_BUF bytes, should be at least 512).
                write(1, show() + b'\n')
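The 'surrogateescape' handler used when decoding bytes columns (on Python 3; Python 2 falls back to 'replace') keeps undecodable bytes recoverable: they become lone surrogates that encode back to the original bytes, which is also why the JSON branch encodes its output with 'surrogatepass'. A small round-trip check:

b = b'caf\xe9'                                    # not valid UTF-8
s = b.decode('utf-8', 'surrogateescape')
assert s.encode('utf-8', 'surrogateescape') == b  # lossless round trip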
Example #8
def csvexport(sliceno, filename, labelsonfirstline):
    d = datasets.source[0]
    if not options.labels:
        options.labels = sorted(d.columns)
    if options.chain_source:
        if jobs.previous:
            prev_source = jobs.previous.params.datasets.source
            assert len(datasets.source) == len(prev_source)
        else:
            prev_source = [None] * len(datasets.source)
        lst = []
        for src, stop in zip(datasets.source, prev_source):
            lst.extend(src.chain(stop_ds=stop))
        datasets.source = lst
    if filename.lower().endswith('.gz'):
        open_func = partial(gzip.open, compresslevel=options.compression)
    elif filename.lower().endswith('.csv'):
        open_func = open
    else:
        raise Exception(
            "Filename should end with .gz for compressed or .csv for uncompressed"
        )
    if PY2:
        open_func = partial(open_func, mode='wb')
    else:
        open_func = partial(open_func, mode='xt', encoding='utf-8')
    iters = []
    first = True
    dumps = JSONEncoder(
        sort_keys=True,
        ensure_ascii=True,
        check_circular=False,
    ).encode
    for label in options.labels:
        it = d.iterate_list(sliceno,
                            label,
                            datasets.source,
                            status_reporting=first)
        first = False
        t = d.columns[label].type
        if d.columns[label].none_support:
            if t == 'bytes' or (PY2 and t == 'ascii'):
                it = imap(nonefix_b, it)
            elif t in (
                    'ascii',
                    'unicode',
            ):
                it = imap(nonefix_u, it)
        if t == 'unicode' and PY2:
            it = imap(enc, it)
        elif t == 'bytes' and PY3:
            it = imap(lambda s: s.decode('utf-8', errors='backslashreplace'),
                      it)
        elif t in (
                'float32',
                'float64',
        ):
            it = imap(repr, it)
        elif t == 'number':
            if PY2:
                it = imap(lambda n: str(n)
                          if isinstance(n, long) else repr(n), it)
            else:
                it = imap(repr, it)
        elif t == 'json':
            it = imap(dumps, it)
        elif t not in ('unicode', 'ascii', 'bytes'):
            it = imap(str, it)
        iters.append(it)
    it = izip(*iters)
    with writer(open_func(filename)) as write:
        q = options.quote_fields
        sep = options.separator
        if q:
            qq = q + q
            if labelsonfirstline:
                write(
                    enc(
                        sep.join(q + n.replace(q, qq) + q
                                 for n in options.labels)))
            for data in it:
                write(sep.join(q + n.replace(q, qq) + q for n in data))
        else:
            if labelsonfirstline:
                write(enc(sep.join(options.labels)))
            for data in it:
                write(sep.join(data))
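Float columns are formatted with repr rather than str because repr round-trips exactly (on Python 2, str(float) keeps only 12 significant digits). For instance:

x = 0.1 + 0.2
assert float(repr(x)) == x  # repr is lossless for floats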
Example #9
def csvexport(sliceno, filename, labelsonfirstline):
    d = datasets.source[0]
    if not options.labels:
        options.labels = sorted(d.columns)
    if options.chain_source:
        if jobs.previous:
            prev_source = jobs.previous.params.datasets.source
            assert len(datasets.source) == len(prev_source)
        else:
            prev_source = [None] * len(datasets.source)
        lst = []
        for src, stop in zip(datasets.source, prev_source):
            lst.extend(src.chain(stop_ds=stop))
        datasets.source = lst
    if options.filename.lower().endswith(
            '.gz') or '.gz.' in options.filename.lower():
        open_func = partial(gzip.open, compresslevel=options.compression)
    else:
        open_func = open
    if PY2:
        open_func = partial(open_func, mode='wb')
    else:
        open_func = partial(open_func, mode='xt', encoding='utf-8')
    if options.none_as:
        if isinstance(options.none_as, dict):
            bad_none = set(options.none_as) - set(options.labels)
            assert not bad_none, 'Unknown labels in none_as: %r' % (bad_none, )
        else:
            assert isinstance(options.none_as,
                              str), "What did you pass as none_as?"

    def resolve_none(label, col):
        d = options.none_as or {}
        if col.type in (
                'json',
                'pickle',
        ):
            if isinstance(options.none_as, str):
                return options.none_as
            return d.get(label)
        elif col.none_support:
            if isinstance(options.none_as, str):
                return options.none_as
            return d.get(label, 'None')

    q = options.quote_fields
    qq = q + q
    sep = options.separator

    def quote_always(v):
        return q + v.replace(q, qq) + q

    if q in '"\'':
        # special case so that either quote character triggers quoting, not just q
        def quote_if_needed(v):
            if v and (v[0] in '"\'' or v[-1] in '"\'' or sep in v):
                return q + v.replace(q, qq) + q
            else:
                return v
    else:

        def quote_if_needed(v):
            if v.startswith(q) or v.endswith(q) or sep in v:
                return q + v.replace(q, qq) + q
            else:
                return v

    if not q:
        quote_func = str
    elif options.lazy_quotes and sep:  # always quote if no separator
        quote_func = quote_if_needed
    else:
        quote_func = quote_always

    def needs_quoting(typ):
        if not q:
            return False
        if not options.lazy_quotes:
            return True
        # maybe we can skip quoting because values that need quoting are impossible?
        if typ in (
                'int32',
                'int64',
                'bits32',
                'bits64',
        ):
            possible = '0123456789-'
        elif typ in (
                'float32',
                'float64',
                'number',
        ):
            possible = '0123456789-+einfa.'
        else:
            possible = False
        if possible:
            q_s = set(q)
            sep_s = set(sep)
            possible_s = set(possible)
            if q_s - possible_s and sep_s - possible_s:
                return False
        return True

    def column_iterator(d, label, first):
        col = d.columns[label]
        f = format.get(col.type, str)
        it = d.iterate(sliceno, label, status_reporting=first)
        none_as = resolve_none(label, col)
        if none_as is not None:
            none_as = quote_func(none_as)
            if needs_quoting(col.type):
                if f:
                    it = (none_as if v is None else quote_func(f(v))
                          for v in it)
                else:
                    it = (none_as if v is None else quote_func(v) for v in it)
            else:
                if f:
                    it = (none_as if v is None else f(v) for v in it)
                else:
                    it = (none_as if v is None else v for v in it)
        elif f:
            if needs_quoting(col.type):
                it = (quote_func(f(v)) for v in it)
            else:
                it = imap(f, it)
        elif needs_quoting(col.type):
            it = imap(quote_func, it)
        return it

    def outer_iterator(label, first):
        return chain.from_iterable(
            column_iterator(d, label, first) for d in datasets.source)

    iters = []
    first = True
    for label in options.labels:
        iters.append(outer_iterator(label, first))
        first = False
    it = izip(*iters)
    with writer(open_func(filename)) as write:
        if labelsonfirstline:
            write(enc(sep.join(map(quote_func, options.labels))))
        for data in it:
            write(sep.join(data))
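needs_quoting can skip quoting for numeric columns entirely: if both the quote character and the separator contain at least one character outside the alphabet that the column type can ever format to, no value of that type can require quoting. The set arithmetic, condensed:

possible = set('0123456789-')                   # every char an int column can produce
q_s, sep_s = set('"'), set(',')
assert (q_s - possible) and (sep_s - possible)  # so int values never need quoting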
Example #10
    def grep(ds, sliceno, out):
        out.start(ds)
        if len(patterns) == 1:
            chk = patterns[0].search
        else:

            def chk(s):
                return any(p.search(s) for p in patterns)

        first = [True]

        def mk_iter(col):
            kw = {}
            if first[0]:
                first[0] = False
                lines = ds.lines[sliceno]
                if lines > status_interval[sliceno]:

                    def cb(n):
                        q_status.put((sliceno, False))
                        out.excite()

                    kw['callback'] = cb
                    kw['callback_interval'] = status_interval[sliceno]
            if ds.columns[col].type == 'ascii':
                kw['_type'] = 'unicode'
            it = ds._column_iterator(sliceno, col, **kw)
            if ds.columns[col].type == 'bytes':
                errors = 'replace' if PY2 else 'surrogateescape'
                if ds.columns[col].none_support:
                    it = (None if v is None else v.decode('utf-8', errors)
                          for v in it)
                else:
                    it = (v.decode('utf-8', errors) for v in it)
            return it

        used_columns = columns_for_ds(ds)
        used_grep_columns = grep_columns and columns_for_ds(ds, grep_columns)
        if grep_columns and set(used_grep_columns) != set(used_columns):
            grep_iter = izip(*(mk_iter(col) for col in used_grep_columns))
        else:
            grep_iter = repeat(None)
        lines_iter = izip(*(mk_iter(col) for col in used_columns))
        if args.before_context:
            before = deque((), args.before_context)
        else:
            before = None
        if args.format == 'json':
            prefix = {}
            if args.show_dataset:
                prefix['dataset'] = ds
            if args.show_sliceno:
                prefix['sliceno'] = sliceno
            show = make_show(prefix, used_columns)
        else:
            prefix = []
            if args.show_dataset:
                prefix.append(ds)
            if args.show_sliceno:
                prefix.append(str(sliceno))
            prefix = tuple(prefix)
            show = make_show(prefix, used_columns)
        if args.invert_match:
            maybe_invert = operator.not_
        else:
            maybe_invert = bool
        to_show = 0
        for lineno, (grep_items,
                     items) in enumerate(izip(grep_iter, lines_iter)):
            if maybe_invert(
                    any(chk(unicode(item)) for item in grep_items or items)):
                if q_list:
                    q_list.put((ds, sliceno))
                    return
                while before:
                    out.put(show(*before.popleft()))
                to_show = 1 + args.after_context
            if to_show:
                out.put(show(lineno, items))
                to_show -= 1
            elif before is not None:
                before.append((lineno, items))
        out.end(ds)
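The before-context buffer relies on deque's maxlen: deque((), n) is an empty deque capped at n items, so appending silently drops the oldest entry, giving grep-style -B behaviour for free:

from collections import deque
before = deque((), 2)          # maxlen=2
for lineno in range(5):
    before.append(lineno)
assert list(before) == [3, 4]  # only the two most recent lines survive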
Example #11
    def grep(ds, sliceno):
        # Use bytes for everything if anything is bytes, str otherwise. (For speed.)
        if any(ds.columns[col].backing_type == 'bytes'
               for col in (grep_columns or columns or ds.columns)):

            def strbytes(v):
                return str(v).encode('utf-8', 'replace')

            def mk_iter(col):
                if ds.columns[col].backing_type in (
                        'bytes',
                        'unicode',
                        'ascii',
                ):
                    return ds._column_iterator(sliceno, col, _type='bytes')
                else:
                    return imap(strbytes, ds._column_iterator(sliceno, col))

            chk = pat_b.search
        else:

            def mk_iter(col):
                if ds.columns[col].backing_type in (
                        'unicode',
                        'ascii',
                ):
                    return ds._column_iterator(sliceno, col, _type='unicode')
                else:
                    return imap(str, ds._column_iterator(sliceno, col))

            chk = pat_s.search

        def fmt(v):
            if not isinstance(v, (unicode, bytes)):
                v = str(v)
            if isinstance(v, unicode):
                v = v.encode('utf-8', 'replace')
            return v

        def color(item):
            pos = 0
            parts = []
            for m in pat_b.finditer(item):
                a, b = m.span()
                parts.extend((item[pos:a], b'\x1b[31m', item[a:b], b'\x1b[m'))
                pos = b
            parts.append(item[pos:])
            return b''.join(parts)

        prefix = []
        if args.show_dataset:
            prefix.append(ds.encode('utf-8'))
        if args.show_sliceno:
            prefix.append(str(sliceno).encode('utf-8'))
        prefix = tuple(prefix)

        def show(prefix, items):
            items = map(fmt, items)
            if args.color:
                items = map(color, items)
            # This will be atomic if the line is not too long
            # (at least up to PIPE_BUF bytes, should be at least 512).
            write(1, separator_b.join(prefix + tuple(items)) + b'\n')

        if grep_columns and grep_columns != set(columns or ds.columns):
            grep_iter = izip(*(mk_iter(col) for col in grep_columns))
            lines_iter = ds.iterate(sliceno, columns)
        else:
            grep_iter = repeat(None)
            lines_iter = izip(*(mk_iter(col)
                                for col in (columns or sorted(ds.columns))))
        lines = izip(grep_iter, lines_iter)
        if args.show_lineno:
            for lineno, (grep_items, items) in enumerate(lines):
                if any(imap(chk, grep_items or items)):
                    show(prefix + (str(lineno).encode('utf-8'), ), items)
        else:
            for grep_items, items in lines:
                if any(imap(chk, grep_items or items)):
                    show(prefix, items)
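pat_s and pat_b are not defined in this excerpt; presumably they are the same user-supplied pattern compiled once for str and once for bytes, since Python's re cannot match a str pattern against bytes. A plausible sketch:

import re
pattern = 'needle'                           # hypothetical user input
pat_s = re.compile(pattern)                  # for the str path
pat_b = re.compile(pattern.encode('utf-8'))  # for the all-bytes fast path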