Example #1
def analysis(sliceno, prepare_res, params):
    dws = prepare_res
    dws.down_discarded.enable_hash_discard()
    dws.down_discarded_list.enable_hash_discard()
    dws.down_discarded_dict.enable_hash_discard()
    dws.up_datetime.enable_hash_discard()
    dws.down_time.enable_hash_discard()
    dws.up_ascii.enable_hash_discard()
    dws.down_unicode.enable_hash_discard()
    dws.up_date.enable_hash_discard()
    dws.down_date.enable_hash_discard()
    for ix, (up, down) in enumerate(all_data):
        if dws.up_checked.hashcheck(up):
            dws.up_checked.write(up, down)
        if dws.down_checked.hashcheck(down):
            dws.down_checked.write(up, down)
        if ix % params.slices == sliceno:
            dws.unhashed_manual.write(up, down)
            dws.unhashed_complex64.write(up, down)
            dws.unhashed_bytes.write(
                str(up).encode("ascii"),
                str(down).encode("ascii"))
        dws.down_discarded.write(up, down)
        dws.down_discarded_list.write_list([up, down])
        dws.down_discarded_dict.write_dict(dict(up=up, down=down))
        dt_up = datetime(1970, 1, 1, 0, 0, 0, up)
        dt_down = datetime(1970, 1, 1, 0, 0, 0, down)
        dws.up_datetime.write(dt_up, dt_down)
        dws.down_time.write(dt_up.time(), dt_down.time())
        dws.up_date.write(date.fromordinal(up + 1), date.fromordinal(down + 1))
        dws.down_date.write(date.fromordinal(up + 1),
                            date.fromordinal(down + 1))
        dws.up_ascii.write(str(up), str(down))
        dws.down_unicode.write(unicode(up), unicode(down))
    # verify that we are not allowed to write in the wrong slice without enable_hash_discard
    if not dws.up_checked.hashcheck(0):
        good = True
        for fn, a in (
            ("write", (
                0,
                0,
            )),
            ("write_list", ([0, 0], )),
            ("write_dict", (dict(up=0, down=0), )),
        ):
            try:
                getattr(dws.up_checked, fn)(*a)
                good = False
            except Exception:
                pass
            assert good, "%s allowed writing in wrong slice" % (fn, )
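A note on names used throughout these examples: `unicode`, `izip` and `PY2` are not Python 3 builtins; they look like Python 2/3 compatibility aliases imported at module level (in the Accelerator codebase they come from a compat module). Other names such as `job`, `params`, `options`, `prepare_res` and `subjobs` are supplied by the surrounding framework and are not defined in the snippets. A minimal sketch of equivalent aliases, in case you want to run the snippets standalone:

import sys

PY2 = sys.version_info[0] == 2

if PY2:
    from itertools import izip  # lazy zip on Python 2
else:
    izip = zip      # zip is already lazy on Python 3
    unicode = str   # the text type on Python 3 is just str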
Example #2
 def show(lineno, items):
     if only_matching == 'part':
         items = [filter_item(unicode(item)) for item in items]
     if only_matching == 'columns':
         d = {
             k: v
             for k, v in zip(used_columns, items)
             if filter_item(unicode(v))
         }
     else:
         d = dict(zip(used_columns, items))
     if args.show_lineno:
         prefix['lineno'] = lineno
     if prefix:
         prefix['data'] = d
         d = prefix
     return dumps(d).encode('utf-8', 'surrogatepass') + b'\n'
Example #3
def analysis(sliceno, prepare_res):
    writers, columns, chain = prepare_res
    key_it = chain.iterate(sliceno, options.column)
    # we can't just use chain.iterate because of protections against changing types with copy_mode
    values_it = itertools.chain.from_iterable(
        ds.iterate(sliceno, columns, copy_mode=True, status_reporting=False)
        for ds in chain)
    for key, values in izip(key_it, values_it):
        writers[unicode(key)].write(*values)
Example #4
def quote(s):
	"""Quote s unless it looks fine without"""
	s = unicode(s)
	r = repr(s)
	if PY2:
		# remove leading u
		r = r[1:]
	if s and len(s) + 2 == len(r) and not any(c.isspace() for c in s):
		return s
	else:
		return r
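To make the behaviour of `quote` concrete: a string is returned bare only when it is non-empty, contains no whitespace and `repr` adds nothing beyond the two surrounding quotes; otherwise the repr form is returned. A few illustrative checks (a sketch, assuming Python 3, i.e. `PY2` false and `unicode` aliased to `str` as in the sketch after Example #1):

assert quote('hello') == 'hello'            # repr only adds the two quotes, so it comes back bare
assert quote('has space') == "'has space'"  # whitespace forces quoting
assert quote('') == "''"                    # the empty string is always quoted
assert quote(42) == '42'                    # non-strings are converted to text first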
Example #5
 def show():
     data = list(prefix)
     if args.show_lineno:
         data.append(unicode(lineno))
     if PY2:
         show_items = (v if isinstance(v, unicode) else
                       str(v).decode('utf-8', 'replace')
                       for v in items)
     else:
         show_items = map(str, items)
     show_items = list(show_items)
     lens = (len(item) for item in data + show_items)
     if highlight_matches:
         show_items = list(map(colour_item, show_items))
     if escape_item:
         lens_unesc = (len(item) for item in data + show_items)
         show_items = list(map(escape_item, show_items))
         lens_esc = (len(item) for item in data + show_items)
         lens = (
             l + esc - unesc
             for l, unesc, esc in zip(lens, lens_unesc, lens_esc))
     data.extend(show_items)
     return separate(data, lens).encode('utf-8', errors)
Example #6
 def show(lineno, items):
     data = list(prefix)
     if args.show_lineno:
         data.append(unicode(lineno))
     show_items = map(unicode, items)
     if only_matching:
         if only_matching == 'columns':
             show_items = (item if filter_item(item) else ''
                           for item in show_items)
         else:
             show_items = map(filter_item, show_items)
     show_items = list(show_items)
     lens = (len(item) for item in data + show_items)
     if highlight_matches:
         show_items = list(map(colour_item, show_items))
     if escape_item:
         lens_unesc = (len(item) for item in data + show_items)
         show_items = list(map(escape_item, show_items))
         lens_esc = (len(item) for item in data + show_items)
         lens = (
             l + esc - unesc
             for l, unesc, esc in zip(lens, lens_unesc, lens_esc))
     data.extend(show_items)
     return separate(data, lens).encode('utf-8', errors) + b'\n'
Example #7
def synthesis(job):
    # Test keeping untyped columns of bytes-like types.
    dw = job.datasetwriter(name='a',
                           columns={
                               'a': 'unicode',
                               'b': 'bytes',
                               'c': 'ascii',
                               'd': 'number'
                           })
    write = dw.get_split_write()
    write('a', b'b', 'c', 0)
    a = dw.finish()
    assert a.hashlabel == None
    typed_a = subjobs.build('dataset_type',
                            options=dict(hashlabel='a',
                                         column2type={'a': 'ascii'}),
                            datasets=dict(source=a)).dataset()
    assert typed_a.hashlabel == 'a'
    assert list(typed_a.iterate(None)) == [('a', b'b', 'c')], typed_a

    # Test hashing on a column not explicitly typed.
    dw = job.datasetwriter(name='b',
                           columns={
                               'a': 'unicode',
                               'b': 'ascii',
                               'c': 'bytes',
                               'd': 'unicode'
                           },
                           previous=a)
    write = dw.get_split_write()
    write('A', 'B', b'C', '1')
    b = dw.finish()
    assert b.hashlabel == None
    typed_b = subjobs.build('dataset_type',
                            options=dict(hashlabel='a',
                                         column2type={'b': 'ascii'}),
                            datasets=dict(source=b)).dataset()
    assert typed_b.hashlabel == 'a'
    assert set(typed_b.iterate(None)) == {('a', 'b'), ('A', 'B')}, typed_b

    # Test renaming over the original hashlabel
    dw = job.datasetwriter(name='c',
                           columns={
                               'a': 'unicode',
                               'b': 'ascii',
                               'c': 'bytes',
                               'd': 'unicode'
                           },
                           hashlabel='a')
    write = dw.get_split_write()
    write('\xe5', 'b', b'c', '0')
    c = dw.finish()
    assert c.hashlabel == 'a'
    typed_c = subjobs.build('dataset_type',
                            options=dict(column2type={
                                'a': 'ascii',
                                'd': 'number'
                            },
                                         rename={'c': 'a'}),
                            datasets=dict(source=c)).dataset()
    assert typed_c.hashlabel == None
    assert list(typed_c.iterate(None)) == [('c', 'b', b'c', 0)], typed_c

    # Test using the original names but for different columns (keeping hashlabel under new name)
    dw = job.datasetwriter(name='d',
                           columns={
                               'a': 'unicode',
                               'b': 'ascii',
                               'c': 'bytes',
                               'd': 'unicode'
                           },
                           hashlabel='a')
    write = dw.get_split_write()
    write('\xc5', 'B', B'C', '1')
    d = dw.finish()
    assert d.hashlabel == 'a'
    typed_d = subjobs.build('dataset_type',
                            options=dict(column2type={
                                'a': 'bytes',
                                'b': 'ascii',
                                'c': 'int32_10',
                                'd': 'bytes'
                            },
                                         rename={
                                             'b': 'a',
                                             'c': 'b',
                                             'd': 'c',
                                             'a': 'd'
                                         }),
                            datasets=dict(source=d)).dataset()
    assert typed_d.hashlabel == 'd'
    assert list(typed_d.iterate(None)) == [(b'B', 'C', 1, b'\xc3\x85')], typed_d

    # Test various types for hashing and discarding of bad lines.
    for hl in (None, 'a', 'b', 'c'):
        dw = job.datasetwriter(name='hashed on %s' % (hl, ),
                               columns={
                                   'a': 'unicode',
                                   'b': 'unicode',
                                   'c': 'unicode'
                               },
                               hashlabel=hl)
        w = dw.get_split_write()
        for ix in range(1000):
            w(unicode(ix), '%d.%d' % (ix, ix % 5 == 0),
              ('{"a": %s}' if ix % 3 else '%d is bad') % (ix, ))
        src_ds = dw.finish()
        assert src_ds.hashlabel == hl
        test(
            src_ds,
            dict(column2type={
                'a': 'int32_10',
                'b': 'number:int'
            },
                 filter_bad=True), 800)
        test(
            src_ds,
            dict(column2type={
                'a': 'int64_10',
                'b': 'number',
                'c': 'json'
            },
                 filter_bad=True), 666)
        test(
            src_ds,
            dict(column2type={
                'a': 'floatint32ei',
                'b': 'number:int',
                'c': 'json'
            },
                 filter_bad=True), 533)
        test(
            src_ds,
            dict(column2type={
                'from_a': 'number',
                'from_b': 'float64',
                'from_c': 'ascii'
            },
                 rename=dict(a='from_a', b='from_b', c='from_c')), 1000)
        test(
            src_ds,
            dict(column2type={
                'c': 'bits32_16',
                'a': 'float32',
                'b': 'bytes'
            },
                 rename=dict(a='c', b='a', c='b')), 1000)

    # this doesn't test as many permutations, it's just to test more column types.
    dw = job.datasetwriter(name='more types')
    cols = {
        'floatbooli':
        cycle(['1.42 or so', '0 maybe', '1 (exactly)']),
        'datetime:%Y%m%d %H:%M': [
            '2019%02d%02d 17:%02d' % (t % 12 + 1, t % 28 + 1, t % 60)
            for t in range(1000)
        ],
        'date:%Y%m%d':
        ['2019%02d%02d' % (
            t % 12 + 1,
            t % 28 + 1,
        ) for t in range(1000)],
        'time:%H:%M': ['%02d:%02d' % (t // 60, t % 60) for t in range(1000)],
        'timei:%H:%M': [
            '%02d:%02d%c' % (t // 60, t % 60, chr(t % 26 + 65))
            for t in range(1000)
        ],
    }
    gens = []
    for coltype, gen in cols.items():
        dw.add(coltype.split(':')[0], 'ascii')
        gens.append(iter(gen))
    dw.add('half', 'bytes')
    gens.append(cycle([b'1', b'no']))
    w = dw.get_split_write()
    for _ in range(1000):
        w(*map(next, gens))
    src_ds = dw.finish()
    assert src_ds.hashlabel == None
    column2type = {t.split(':')[0]: t for t in cols}
    for hl in column2type:
        hashed = subjobs.build('dataset_type',
                               options=dict(column2type=column2type,
                                            hashlabel=hl),
                               datasets=dict(source=src_ds)).dataset()
        assert hashed.hashlabel == hl
        unhashed = subjobs.build('dataset_type',
                                 options=dict(column2type=column2type),
                                 datasets=dict(source=src_ds)).dataset()
        assert unhashed.hashlabel == None
        rehashed = subjobs.build('dataset_rehash',
                                 options=dict(hashlabel=hl),
                                 datasets=dict(source=unhashed)).dataset()
        assert rehashed.hashlabel == hl
        assert hashed.lines == rehashed.lines
        assert sum(hashed.lines) == 1000
        assert set(hashed.columns.keys()) == set(
            unhashed.columns.keys()) == set(rehashed.columns.keys())
        # and again with a bad column
        column2type['half'] = 'float32'
        hashed = subjobs.build('dataset_type',
                               options=dict(column2type=column2type,
                                            hashlabel=hl,
                                            filter_bad=True),
                               datasets=dict(source=src_ds)).dataset()
        assert hashed.hashlabel == hl
        unhashed = subjobs.build('dataset_type',
                                 options=dict(column2type=column2type,
                                              filter_bad=True),
                                 datasets=dict(source=src_ds)).dataset()
        assert unhashed.hashlabel == None
        rehashed = subjobs.build('dataset_rehash',
                                 options=dict(hashlabel=hl),
                                 datasets=dict(source=unhashed)).dataset()
        assert rehashed.hashlabel == hl
        del column2type['half']
        assert hashed.lines == rehashed.lines
        assert sum(hashed.lines) == 500
        assert set(hashed.columns.keys()) == set(
            unhashed.columns.keys()) == set(rehashed.columns.keys())
Example #8
def synthesis(job, slices):
    # Test keeping untyped columns.
    dw = job.datasetwriter(name='a',
                           columns={
                               'a': 'unicode',
                               'b': ('bytes', True),
                               'c': ('ascii', True),
                               'd': ('number', True)
                           })
    write = dw.get_split_write()
    write('A', None, None, None)
    write('a', b'b', 'c', 0)
    a = dw.finish()
    assert a.hashlabel == None
    typed_a = subjobs.build('dataset_type',
                            options=dict(hashlabel='a',
                                         column2type={'a': 'ascii'}),
                            datasets=dict(source=a)).dataset()
    assert typed_a.hashlabel == 'a'
    assert set(typed_a.iterate(None)) == {('A', None, None, None),
                                          ('a', b'b', 'c', 0)}, typed_a

    # Test hashing on a column not explicitly typed.
    dw = job.datasetwriter(name='b',
                           columns={
                               'a': 'unicode',
                               'b': 'ascii',
                               'c': 'bytes',
                               'd': 'unicode'
                           },
                           previous=a)
    write = dw.get_split_write()
    write('A', 'B', b'C', '1')
    b = dw.finish()
    assert b.hashlabel == None
    typed_b = subjobs.build('dataset_type',
                            options=dict(hashlabel='a',
                                         column2type={'b': 'ascii'}),
                            datasets=dict(source=b)).dataset()
    assert typed_b.hashlabel == 'a'
    assert set(typed_b.iterate(None)) == {('a', 'b', b'c'), ('A', None, None),
                                          ('A', 'B', b'C')}, typed_b

    # Test renaming over the original hashlabel
    dw = job.datasetwriter(name='c',
                           columns={
                               'a': 'unicode',
                               'b': 'ascii',
                               'c': 'bytes',
                               'd': 'unicode'
                           },
                           hashlabel='a')
    write = dw.get_split_write()
    write('\xe5', 'b', b'c', '0')
    c = dw.finish()
    assert c.hashlabel == 'a'
    typed_c = subjobs.build('dataset_type',
                            options=dict(column2type={
                                'a': 'ascii',
                                'd': 'number'
                            },
                                         rename={'c': 'a'}),
                            datasets=dict(source=c)).dataset()
    assert typed_c.hashlabel == None
    assert list(typed_c.iterate(None)) == [('c', 'b', b'c', 0)], typed_c

    # Test using the original names but for different columns (keeping hashlabel under new name)
    dw = job.datasetwriter(name='d',
                           columns={
                               'a': 'unicode',
                               'b': 'ascii',
                               'c': 'bytes',
                               'd': 'unicode'
                           },
                           hashlabel='a')
    write = dw.get_split_write()
    write('\xc5', 'B', B'C', '1')
    d = dw.finish()
    assert d.hashlabel == 'a'
    typed_d = subjobs.build('dataset_type',
                            options=dict(column2type={
                                'a': 'bytes',
                                'b': 'ascii',
                                'c': 'int32_10',
                                'd': 'bytes'
                            },
                                         rename={
                                             'b': 'a',
                                             'c': 'b',
                                             'd': 'c',
                                             'a': 'd'
                                         }),
                            datasets=dict(source=d)).dataset()
    assert typed_d.hashlabel == 'd'
    assert list(typed_d.iterate(None)) == [(b'B', 'C', 1, b'\xc3\x85')], typed_d

    # Test various types for hashing and discarding of bad lines.
    for hl in (None, 'a', 'b', 'c'):
        dw = job.datasetwriter(name='hashed on %s' % (hl, ),
                               columns={
                                   'a': 'unicode',
                                   'b': 'unicode',
                                   'c': 'unicode'
                               },
                               hashlabel=hl)
        w = dw.get_split_write()
        for ix in range(1000):
            w(unicode(ix), '%d.%d' % (ix, ix % 5 == 0),
              ('{"a": %s}' if ix % 3 else '%d is bad') % (ix, ))
        src_ds = dw.finish()
        assert src_ds.hashlabel == hl
        test(
            src_ds,
            dict(column2type={
                'a': 'int32_10',
                'b': 'number:int'
            },
                 filter_bad=True), 800)
        test(
            src_ds,
            dict(column2type={
                'a': 'int64_10',
                'b': 'number',
                'c': 'json'
            },
                 filter_bad=True), 666)
        test(
            src_ds,
            dict(column2type={
                'a': 'floatint32ei',
                'b': 'number:int',
                'c': 'json'
            },
                 filter_bad=True), 533)
        test(
            src_ds,
            dict(column2type={
                'from_a': 'number',
                'from_b': 'float64',
                'from_c': 'ascii'
            },
                 rename=dict(a='from_a', b='from_b', c='from_c')), 1000)
        test(
            src_ds,
            dict(column2type={
                'c': 'bits32_16',
                'a': 'float32',
                'b': 'bytes'
            },
                 rename=dict(a='c', b='a', c='b')), 1000)

    # this doesn't test as many permutations, it's just to test more column types.
    dw = job.datasetwriter(name='more types')
    cols = {
        'floatbooli':
        cycle(['1.42 or so', '0 maybe', '1 (exactly)']),
        'datetime:%Y%m%d %H:%M': [
            '2019%02d%02d 17:%02d' % (t % 12 + 1, t % 28 + 1, t % 60)
            for t in range(1000)
        ],
        'date:%Y%m%d':
        ['2019%02d%02d' % (
            t % 12 + 1,
            t % 28 + 1,
        ) for t in range(1000)],
        'time:%H:%M': ['%02d:%02d' % (t // 60, t % 60) for t in range(1000)],
        'timei:%H:%M': [
            '%02d:%02d%c' % (t // 60, t % 60, chr(t % 26 + 65))
            for t in range(1000)
        ],
    }
    gens = []
    for coltype, gen in cols.items():
        dw.add(coltype.split(':')[0], 'ascii')
        gens.append(iter(gen))
    dw.add('half', 'bytes')
    gens.append(cycle([b'1', b'no']))
    w = dw.get_split_write()
    for _ in range(1000):
        w(*map(next, gens))
    src_ds = dw.finish()
    assert src_ds.hashlabel == None
    column2type = {t.split(':')[0]: t for t in cols}
    for hl in column2type:
        hashed = subjobs.build('dataset_type',
                               options=dict(column2type=column2type,
                                            hashlabel=hl),
                               datasets=dict(source=src_ds)).dataset()
        assert hashed.hashlabel == hl
        unhashed = subjobs.build('dataset_type',
                                 options=dict(column2type=column2type),
                                 datasets=dict(source=src_ds)).dataset()
        assert unhashed.hashlabel == None
        rehashed = subjobs.build('dataset_hashpart',
                                 options=dict(hashlabel=hl),
                                 datasets=dict(source=unhashed)).dataset()
        assert rehashed.hashlabel == hl
        assert hashed.lines == rehashed.lines
        assert sum(hashed.lines) == 1000
        assert set(hashed.columns.keys()) == set(
            unhashed.columns.keys()) == set(rehashed.columns.keys())
        # and again with a bad column
        column2type['half'] = 'float32'
        hashed = subjobs.build('dataset_type',
                               options=dict(column2type=column2type,
                                            hashlabel=hl,
                                            filter_bad=True),
                               datasets=dict(source=src_ds)).dataset()
        assert hashed.hashlabel == hl
        unhashed = subjobs.build('dataset_type',
                                 options=dict(column2type=column2type,
                                              filter_bad=True),
                                 datasets=dict(source=src_ds)).dataset()
        assert unhashed.hashlabel == None
        rehashed = subjobs.build('dataset_hashpart',
                                 options=dict(hashlabel=hl),
                                 datasets=dict(source=unhashed)).dataset()
        assert rehashed.hashlabel == hl
        del column2type['half']
        assert hashed.lines == rehashed.lines
        assert sum(hashed.lines) == 500
        assert set(hashed.columns.keys()) == set(
            unhashed.columns.keys()) == set(rehashed.columns.keys())

    # test rehashing on a column we don't type, over all types.
    dw = job.datasetwriter(name='rehash all types',
                           columns={
                               '2type': ('ascii', True),
                               'ascii': ('ascii', True),
                               'bits32': ('bits32', False),
                               'bits64': ('bits64', False),
                               'bool': ('bool', True),
                               'bytes': ('bytes', True),
                               'date': ('date', True),
                               'datetime': ('datetime', True),
                               'float32': ('float32', True),
                               'float64': ('float64', True),
                               'int32': ('int32', True),
                               'int64': ('int64', True),
                               'json': ('json', True),
                               'number': ('number', True),
                               'time': ('time', True),
                               'unicode': ('unicode', True),
                           })
    write = dw.get_split_write()
    data = {
        '42':
        ('ascii string', 100, 1000, True, b'bytes string', date(2019, 12, 11),
         datetime(2019, 12, 11, 20, 7, 21), 1.5, 0.00000001, 99, -11, {
             "a": "b"
         }, 1e100, time(20, 7, 21), 'unicode string'),
        None: (None, 0, 0, None, None, None, None, None, None, None, None,
               None, None, None, None),
        '18': ('ASCII STRING', 111, 1111, False, b'BYTES STRING',
               date(1868, 1, 3), datetime(1868, 1, 3, 13, 14,
                                          5), 2.5, -0.0000001, 67, -99,
               [42, ".."], 5e100, time(13, 14, 5), 'UNICODE STRING'),
    }
    write('42', *data['42'])
    write(None, *data[None])
    write('18', *data['18'])
    src_ds = dw.finish()
    data['None'] = data.pop(None)
    type2type = {
        'ascii': 'unicode:ascii',
        'bool': 'unicode:ascii',
        'date': 'unicode:ascii',
        'datetime': 'unicode:ascii',
        'time': 'unicode:ascii',
        'bits32': 'bits32_10',
        'bits64': 'bits64_10',
        'bytes': 'bytes',
        'float32': 'float32',
        'float64': 'float64',
        'int32': 'int32_10',
        'int64': 'int64_10',
        'number': 'number',
        'unicode': 'unicode:ascii',
    }
    for hl, typeas in sorted(type2type.items()):
        ds = subjobs.build('dataset_type',
                           column2type={
                               '2type': typeas
                           },
                           hashlabel=hl,
                           source=src_ds).dataset()
        seen = set()
        hl_hash = typed_writer(hl).hash
        for sliceno in range(slices):
            for line in ds.iterate(sliceno, None):
                key = line[0] or None
                if isinstance(key, float):
                    key = int(key)
                if isinstance(key, bytes):
                    key = key.decode('ascii')
                else:
                    key = unicode(key)
                assert data.get(key) == line[
                    1:], "%s (hl %s) didn't have the right data for line %r" % (
                        ds,
                        hl,
                        line[0],
                    )
                hv = line[sorted(src_ds.columns).index(hl)]
                assert hl_hash(
                    hv
                ) % slices == sliceno, "%s (hl %s) didn't hash %r correctly" % (
                    ds,
                    hl,
                    hv,
                )
                assert key not in seen, "%s (hl %s) repeated line %s" % (
                    ds,
                    hl,
                    line[0],
                )
                seen.add(key)
        assert seen == {'42', 'None',
                        '18'}, "%s didn't have all lines (%r)" % (
                            ds,
                            seen,
                        )
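The hashlabel assertions at the end of this example check a single invariant: a row lives in the slice given by hashing its hashlabel value modulo the number of slices. A toy illustration of that partitioning rule (the hash function below is a stand-in, not the framework's `typed_writer(...).hash`):

SLICES = 3

def slice_for(value, slices=SLICES, hashfn=hash):
    # The slice a row with this hashlabel value belongs in.
    return hashfn(value) % slices

rows = ['42', 'None', '18']
partitions = {s: [v for v in rows if slice_for(v) == s] for s in range(SLICES)}
# Every row lands in exactly one slice.
assert sorted(sum(partitions.values(), [])) == sorted(rows)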
Example #9
 def __reduce__(self):
     return unicode, (unicode(self), )
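This `__reduce__` makes instances of a text subclass pickle back as the plain text type (`str` on Python 3, `unicode` on Python 2) rather than as the subclass itself. The same pattern in isolation (the class name is just illustrative):

import pickle

class Label(str):
    # Pickles as a plain str, not as Label.
    def __reduce__(self):
        return str, (str(self),)

roundtripped = pickle.loads(pickle.dumps(Label('x')))
assert type(roundtripped) is str and roundtripped == 'x'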
Example #10
    def grep(ds, sliceno, out):
        out.start(ds)
        if len(patterns) == 1:
            chk = patterns[0].search
        else:

            def chk(s):
                return any(p.search(s) for p in patterns)

        first = [True]

        def mk_iter(col):
            kw = {}
            if first[0]:
                first[0] = False
                lines = ds.lines[sliceno]
                if lines > status_interval[sliceno]:

                    def cb(n):
                        q_status.put((sliceno, False))
                        out.excite()

                    kw['callback'] = cb
                    kw['callback_interval'] = status_interval[sliceno]
            if ds.columns[col].type == 'ascii':
                kw['_type'] = 'unicode'
            it = ds._column_iterator(sliceno, col, **kw)
            if ds.columns[col].type == 'bytes':
                errors = 'replace' if PY2 else 'surrogateescape'
                if ds.columns[col].none_support:
                    it = (None if v is None else v.decode('utf-8', errors)
                          for v in it)
                else:
                    it = (v.decode('utf-8', errors) for v in it)
            return it

        used_columns = columns_for_ds(ds)
        used_grep_columns = grep_columns and columns_for_ds(ds, grep_columns)
        if grep_columns and set(used_grep_columns) != set(used_columns):
            grep_iter = izip(*(mk_iter(col) for col in used_grep_columns))
        else:
            grep_iter = repeat(None)
        lines_iter = izip(*(mk_iter(col) for col in used_columns))
        if args.before_context:
            before = deque((), args.before_context)
        else:
            before = None
        if args.format == 'json':
            prefix = {}
            if args.show_dataset:
                prefix['dataset'] = ds
            if args.show_sliceno:
                prefix['sliceno'] = sliceno
            show = make_show(prefix, used_columns)
        else:
            prefix = []
            if args.show_dataset:
                prefix.append(ds)
            if args.show_sliceno:
                prefix.append(str(sliceno))
            prefix = tuple(prefix)
            show = make_show(prefix, used_columns)
        if args.invert_match:
            maybe_invert = operator.not_
        else:
            maybe_invert = bool
        to_show = 0
        for lineno, (grep_items,
                     items) in enumerate(izip(grep_iter, lines_iter)):
            if maybe_invert(
                    any(chk(unicode(item)) for item in grep_items or items)):
                if q_list:
                    q_list.put((ds, sliceno))
                    return
                while before:
                    out.put(show(*before.popleft()))
                to_show = 1 + args.after_context
            if to_show:
                out.put(show(lineno, items))
                to_show -= 1
            elif before is not None:
                before.append((lineno, items))
        out.end(ds)
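The before/after-context handling in the loop above hinges on a bounded `deque`: non-matching lines are appended and, because the deque was created with `maxlen=args.before_context`, only the most recent ones survive to be flushed when a match arrives; `to_show` then keeps the match plus its after-context flowing. A standalone sketch of the same mechanism (function and argument names here are illustrative, not part of the original code):

from collections import deque

def grep_with_context(lines, match, before_context=1, after_context=1):
    # Yield (lineno, line) for matches plus surrounding context, grep-style.
    before = deque((), before_context)   # keeps only the last N non-matching lines
    to_show = 0
    for lineno, line in enumerate(lines):
        if match(line):
            while before:                 # flush the pending before-context first
                yield before.popleft()
            to_show = 1 + after_context   # the match itself plus its after-context
        if to_show:
            yield (lineno, line)
            to_show -= 1
        else:
            before.append((lineno, line))

hits = list(grep_with_context(['a', 'b', 'HIT', 'c', 'd'], lambda s: s == 'HIT'))
assert hits == [(1, 'b'), (2, 'HIT'), (3, 'c')]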
Example #11
def synthesis(job):
	def mk(name, types, lines, hashlabel=None, previous=None):
		columns = {chr(ix): typ for ix, typ in enumerate(types, 65)}
		dw = job.datasetwriter(name=name, columns=columns, hashlabel=hashlabel, previous=previous)
		w = dw.get_split_write_list()
		for line in lines:
			w(line)
		return dw.finish()

	def chk(job, colnames, types, ds2lines, previous={}, hashlabel=None):
		have_ds = set(ds.name for ds in job.datasets)
		want_ds = set(ds2lines)
		assert have_ds == want_ds, 'Job %r should have had datasets %r but had %r' % (job, want_ds, have_ds,)
		colnames = sorted(colnames)
		for ds, lines in ds2lines.items():
			ds = job.dataset(ds)
			assert ds.hashlabel == hashlabel, 'Dataset %s should have had hashlabel %s but had %s' % (ds.quoted, hashlabel, ds.hashlabel,)
			assert ds.previous == previous.get(ds.name), 'Dataset %s should have had previous %s but had %s' % (ds.quoted, previous.get(ds.name), ds.previous,)
			ds_colnames = sorted(ds.columns)
			assert ds_colnames == colnames, 'Dataset %s should have had columns %r but had %r' % (ds.quoted, colnames, ds_colnames,)
			ds_types = tuple(col.type for _, col in sorted(ds.columns.items()))
			assert ds_types == types, 'Dataset %s should have had columns with types %r but had %r' % (ds.quoted, types, ds_types,)
			have_lines = sorted(ds.iterate(None))
			want_lines = sorted(lines)
			assert have_lines == want_lines, 'Dataset %s should have contained %r but contained %r' % (ds.quoted, want_lines, have_lines,)

	# just a simple splitting
	a = mk('a', ('unicode', 'ascii', 'int64'), [('a', 'a', 1), ('b', 'b', 2), ('a', 'c', 3)], hashlabel='A')
	j_a_A = subjobs.build('dataset_fanout', source=a, column='A')
	chk(j_a_A, 'BC', ('ascii', 'int64'), {'a': [('a', 1), ('c', 3)], 'b': [('b', 2)]})
	j_a_B = subjobs.build('dataset_fanout', source=a, column='B')
	chk(j_a_B, 'AC', ('unicode', 'int64'), {'a': [('a', 1)], 'b': [('b', 2)], 'c': [('a', 3)]}, hashlabel='A')

	# non-text columns should work too
	j_a_C = subjobs.build('dataset_fanout', source=a, column='C')
	chk(j_a_C, 'AB', ('unicode', 'ascii'), {'1': [('a', 'a')], '2': [('b', 'b')], '3': [('a', 'c')]}, hashlabel='A')

	b = mk('b', ('ascii', 'unicode', 'int32', 'int32'), [('a', 'aa', 11, 111), ('b', 'bb', 12, 112), ('a', 'cc', 13, 113), ('d', 'dd', 14, 114)], previous=a)
	# with previous
	j_b_A = subjobs.build('dataset_fanout', source=b, column='A', previous=j_a_A)
	chk(
		j_b_A,
		'BCD',
		('unicode', 'int32', 'int32'),
		{'a': [('aa', 11, 111), ('cc', 13, 113)], 'b': [('bb', 12, 112)], 'd': [('dd', 14, 114)]},
		previous={'a': j_a_A.dataset('a'), 'b': j_a_A.dataset('b')},
	)

	# without previous, but only getting the data from b because of length=1
	j_b_A_len1 = subjobs.build('dataset_fanout', source=b, column='A', length=1)
	chk(
		j_b_A_len1,
		'BCD',
		('unicode', 'int32', 'int32'),
		{'a': [('aa', 11, 111), ('cc', 13, 113)], 'b': [('bb', 12, 112)], 'd': [('dd', 14, 114)]},
	)

	# with "wrong" previous, inheriting some empty datasets.
	j_b_A_C = subjobs.build('dataset_fanout', source=b, column='A', previous=j_a_C)
	chk(
		j_b_A_C,
		'BCD',
		('unicode', 'int32', 'int32'),
		{'a': [('aa', 11, 111), ('cc', 13, 113)], 'b': [('bb', 12, 112)], 'd': [('dd', 14, 114)], '1': [], '2': [], '3': []},
		previous={'1': j_a_C.dataset('1'), '2': j_a_C.dataset('2'), '3': j_a_C.dataset('3')},
	)

	# without previous, getting data from both a and b and the "widest" type for the columns.
	# (discards the D column since it doesn't exist in a.)
	j_b_A_None = subjobs.build('dataset_fanout', source=b, column='A')
	chk(
		j_b_A_None,
		'BC',
		('unicode', 'int64'),
		{'a': [('a', 1), ('aa', 11), ('c', 3), ('cc', 13)], 'b': [('b', 2), ('bb', 12)], 'd': [('dd', 14)]},
	)

	# test more type combinations, and switching hashlabel (to an included column)
	tt_a = mk(
		'tt_a',
		('ascii', 'int32', 'bits64', 'float32', 'number', 'complex32', 'number'),
		[('a', 1, 2, 2.5, 3, 1+2j, 3.14)],
		hashlabel='B',
	)
	tt_b = mk(
		'tt_b',
		('ascii', 'int64', 'bits32', 'float64', 'int32', 'complex64', 'float64'),
		[('a', 11, 12, 12.5, 13, 11+2j, 13.14)],
		hashlabel='B',
		previous=tt_a,
	)
	tt_c = mk(
		'tt_c',
		('ascii', 'int32', 'bits64', 'int64', 'float64', 'complex32', 'float32'),
		[('a', 111, 112, 112, 113.5, 111+2j, 314.0), ('b', 0, 0, 0, 0, 0, 0)],
		hashlabel='C',
		previous=tt_b,
	)

	# first two, some type changes
	j_tt_b = subjobs.build('dataset_fanout', source=tt_b, column='A')
	chk(
		j_tt_b,
		'BCDEFG',
		('int64', 'bits64', 'float64', 'number', 'complex64', 'number'),
		{'a': [(1, 2, 2.5, 3, 1+2j, 3.14), (11, 12, 12.5, 13, 11+2j, 13.14)]},
		hashlabel='B',
	)

	# all three in one, more types become number
	j_tt_c = subjobs.build('dataset_fanout', source=tt_c, column='A')
	chk(
		j_tt_c,
		'BCDEFG',
		('int64', 'bits64', 'number', 'number', 'complex64', 'number'),
		{'a': [(1, 2, 2.5, 3, 1+2j, 3.14), (11, 12, 12.5, 13, 11+2j, 13.14), (111, 112, 112, 113.5, 111+2j, 314.0)], 'b': [(0, 0, 0, 0, 0, 0)]},
		hashlabel=None,
	)

	# just two (checking that earlier types are not considered)
	j_tt_c_len2 = subjobs.build('dataset_fanout', source=tt_c, column='A', length=2)
	chk(
		j_tt_c_len2,
		'BCDEFG',
		('int64', 'bits64', 'number', 'number', 'complex64', 'float64'),
		{'a': [(11, 12, 12.5, 13, 11+2j, 13.14), (111, 112, 112, 113.5, 111+2j, 314.0)], 'b': [(0, 0, 0, 0, 0, 0)]},
		hashlabel=None,
	)

	# using previous to only get one source dataset, again checking that earlier
	# types are not considered and that only a gets a previous (and b doesn't)
	j_tt_c_b = subjobs.build('dataset_fanout', source=tt_c, column='A', previous=j_tt_b)
	chk(
		j_tt_c_b,
		'BCDEFG',
		('int32', 'bits64', 'int64', 'float64', 'complex32', 'float32'),
		{'a': [(111, 112, 112, 113.5, 111+2j, 314.0)], 'b': [(0, 0, 0, 0, 0, 0)]},
		hashlabel='C',
		previous={'a': j_tt_b.dataset('a')},
	)

	# it generally works, let's make an exhaustive test of compatible types
	# (to check that the values actually are compatible)
	previous = None
	all_types = []
	want_data = []
	for ix, types in enumerate(zip(
		cycle(['ascii']), # this is the split column
		['bits32', 'bits64', 'int32', 'int64', 'float32', 'float64', 'number'],
		cycle(['bits64', 'bits32']),
		cycle(['complex64', 'complex32']),
		cycle(['float64', 'float32']),
		cycle(['int64', 'bits32', 'int32']),
		cycle(['unicode', 'ascii']),
	)):
		data = [('data',) + (ix + 1000,) * 5 + (unicode(ix),)]
		want_data.append(data[0][1:])
		all_types.append(
			mk('all types %d' % (ix,), types, data, previous=previous)
		)
		previous = all_types[-1]

	j_all = subjobs.build('dataset_fanout', source=all_types[-1], column='A')
	chk(
		j_all,
		'BCDEFG',
		('number', 'bits64', 'complex64', 'float64', 'int64', 'unicode'),
		{'data': want_data},
	)

	# the B column doesn't have number any more here, but should still become number.
	j_all_except_number = subjobs.build('dataset_fanout', source=all_types[-2], column='A')
	chk(
		j_all_except_number,
		'BCDEFG',
		('number', 'bits64', 'complex64', 'float64', 'int64', 'unicode'),
		{'data': want_data[:-1]},
	)