Example #1
def test_from_filenames():
    with filetexts({'a1.log': 'A\nB', 'a2.log': 'C\nD'}) as fns:
        assert set(line.strip() for line in db.from_filenames(fns)) == \
                set('ABCD')
        assert set(line.strip() for line in db.from_filenames('a*.log')) == \
                set('ABCD')

    assert raises(ValueError, lambda: db.from_filenames('non-existent-*-path'))
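This test relies on a `filetexts` context manager and a `raises` helper from dask's test utilities; `raises` checks whether the lambda raises the given exception. A minimal sketch of a comparable `filetexts`, written as an assumption about its behavior rather than dask's actual implementation:

import os
from contextlib import contextmanager

@contextmanager
def filetexts(d):
    # Assumed behavior: write each filename -> text pair to disk, yield the
    # filenames, and remove the files again when the with-block exits.
    for fn, text in d.items():
        with open(fn, 'w') as f:
            f.write(text)
    try:
        yield list(d)
    finally:
        for fn in d:
            if os.path.exists(fn):
                os.remove(fn)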
Example #2
def test_from_filenames_large_gzip():
    with tmpfile('gz') as fn:
        f = gzip.open(fn, 'wb')
        f.write(b'Hello, world!\n' * 100)
        f.close()

        b = db.from_filenames(fn, chunkbytes=100)
        c = db.from_filenames(fn)
        assert len(b.dask) > 5
        assert list(b) == list(c)
Example #3
def test_from_filenames_large_gzip():
    with tmpfile('gz') as fn:
        f = gzip.open(fn, 'wb')
        f.write(b'Hello, world!\n' * 100)
        f.close()

        b = db.from_filenames(fn, chunkbytes=100)
        c = db.from_filenames(fn)
        assert len(b.dask) > 5
        assert list(b) == [s.decode() for s in c]
Example #4
def test_from_filenames_large_gzip():
    with tmpfile('gz') as fn:
        f = GzipFile(fn, 'wb')
        f.write(b'Hello, world!\n' * 100)
        f.close()

        b = db.from_filenames(fn, chunkbytes=100, linesep='\n')
        c = db.from_filenames(fn, linesep='\n')
        assert len(b.dask) > 5
        assert list(b) == list(c)
Example #5
def test_from_filenames_large():
    with tmpfile() as fn:
        with open(fn, 'w') as f:
            f.write('Hello, world!\n' * 100)
        b = db.from_filenames(fn, chunkbytes=100)
        c = db.from_filenames(fn)
        assert len(b.dask) > 5
        assert list(b) == list(c)

        d = db.from_filenames([fn], chunkbytes=100)
        assert list(b) == list(d)
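A quick sanity check on the `len(b.dask) > 5` assertion, assuming `chunkbytes=100` splits the file into roughly 100-byte ranges with one task per range:

payload = b'Hello, world!\n' * 100
print(len(payload))         # 1400 bytes in total
print(len(payload) // 100)  # ~14 chunks of 100 bytes, comfortably more than 5 tasks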
Example #6
def test_from_filenames_large():
    with tmpfile() as fn:
        with open(fn, 'wb') as f:
            f.write(('Hello, world!' + os.linesep).encode() * 100)
        b = db.from_filenames(fn, chunkbytes=100)
        c = db.from_filenames(fn)
        assert len(b.dask) > 5
        assert list(map(str, b)) == list(map(str, c))

        d = db.from_filenames([fn], chunkbytes=100)
        assert list(b) == list(d)
Example #7
def test_from_filenames_encoding():
    with tmpfile() as fn:
        with open(fn, 'wb') as f:
            f.write((u'你好!' + os.linesep).encode('gb18030') * 100)
        b = db.from_filenames(fn, chunkbytes=100, encoding='gb18030')
        c = db.from_filenames(fn, encoding='gb18030')
        assert len(b.dask) > 5
        assert list(map(lambda x: x.encode('utf-8'), b)) == list(map(lambda x: x.encode('utf-8'), c))

        d = db.from_filenames([fn], chunkbytes=100, encoding='gb18030')
        assert list(b) == list(d)
Example #8
def test_gh715():
    bin_data = u'\u20ac'.encode('utf-8')
    with tmpfile() as fn:
        with open(fn, 'wb') as f:
            f.write(bin_data)
        a = db.from_filenames(fn)
        assert a.compute()[0] == bin_data.decode('utf-8')
Example #9
def test_from_filenames_bz2():
    b = db.from_filenames(['foo.json.bz2', 'bar.json.bz2'])

    assert (set(b.dask.values()) == set([
        (list, (bz2.BZ2File, os.path.abspath('foo.json.bz2'))),
        (list, (bz2.BZ2File, os.path.abspath('bar.json.bz2')))
    ]))
Example #10
def test_from_filenames_gzip():
    b = db.from_filenames(['foo.json.gz', 'bar.json.gz'])

    assert (set(b.dask.values()) == set([
        (list, (gzip.open, os.path.abspath('foo.json.gz'))),
        (list, (gzip.open, os.path.abspath('bar.json.gz')))
    ]))
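In the two graph tests above, each value of `b.dask` is a dask task: a tuple whose first element is a callable and whose remaining elements are its (possibly nested) arguments. A minimal sketch of how such a tuple would be evaluated, independent of dask's scheduler:

def execute_task(task):
    # A task is (func, *args); arguments may themselves be tasks.
    if isinstance(task, tuple) and task and callable(task[0]):
        func, args = task[0], task[1:]
        return func(*(execute_task(arg) for arg in args))
    return task

# For example, (list, (gzip.open, '/abs/path/foo.json.gz')) opens the gzip
# file and materializes its (byte) lines only when the task is executed.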
Example #11
def analyze(path, parse_timestamps=True, **kwargs):
    """
    Analyze a given directory of either .json or flat text files
    with delimited JSON to get relevant key statistics.

    Parameters
    ----------
    path: string
        Path to directory
    parse_timestamps: boolean, default True
        If True, will attempt to regex-match ISO 8601 formatted timestamps
    kwargs:
        passed into json.loads. Here you can specify encoding, etc.
    """

    stats = {}

    start_time = time.time()
    file_list = [os.path.join(path, f) for f in os.listdir(path)]
    bag = db.from_filenames(file_list).map(json.loads)
    recur_partial = partial(recur_dict, parse_timestamps=parse_timestamps)
    stats = bag.fold(recur_partial, combine_stats, initial={}).compute()
    count = stats["total_records"]
    del stats["total_records"]

    elapsed = time.time() - start_time
    print('Malort run finished: {} JSON blobs analyzed in {} seconds.'
          .format(count, elapsed))
    return MalortResult(stats, count, elapsed)
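A hypothetical call to `analyze`; the directory path is a placeholder, not taken from the original source:

# '/data/events' stands in for a directory of newline-delimited JSON files.
result = analyze('/data/events', parse_timestamps=False)
# `result` is a MalortResult built from the aggregated key stats, the record
# count, and the elapsed wall-clock time, as constructed above.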
Example #12
def analyze(path, parse_timestamps=True, **kwargs):
    """
    Analyze a given directory of either .json or flat text files
    with delimited JSON to get relevant key statistics.

    Parameters
    ----------
    path: string
        Path to directory
    parse_timestamps: boolean, default True
        If True, will attempt to regex-match ISO 8601 formatted timestamps
    kwargs:
        passed into json.loads. Here you can specify encoding, etc.
    """

    stats = {}

    start_time = time.time()
    file_list = [os.path.join(path, f) for f in os.listdir(path)]
    bag = db.from_filenames(file_list).map(json.loads)
    recur_partial = partial(recur_dict, parse_timestamps=parse_timestamps)
    stats = bag.fold(recur_partial, combine_stats, initial={}).compute()
    count = stats["total_records"]
    del stats["total_records"]

    elapsed = time.time() - start_time
    print('Malort run finished: {} JSON blobs analyzed in {} seconds.'.format(
        count, elapsed))
    return MalortResult(stats, count, elapsed)
Example #13
def test_from_filenames_bz2():
    b = db.from_filenames(['foo.json.bz2', 'bar.json.bz2'])

    assert (set(b.dask.values()) == set([
        (list, (decode_sequence, system_encoding,
                (bz2.BZ2File, os.path.abspath('foo.json.bz2'), 'rb'))),
        (list, (decode_sequence, system_encoding,
                (bz2.BZ2File, os.path.abspath('bar.json.bz2'), 'rb')))
    ]))
Example #14
def test_from_filenames_gzip():
    b = db.from_filenames(['foo.json.gz', 'bar.json.gz'])

    assert (set(b.dask.values()) == set([
        (list, (decode_sequence, system_encoding,
                (gzip.open, os.path.abspath('foo.json.gz'), 'rb'))),
        (list, (decode_sequence, system_encoding,
                (gzip.open, os.path.abspath('bar.json.gz'), 'rb')))
    ]))
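The two graph tests above additionally route the raw byte lines through `decode_sequence` with the process's `system_encoding`. A minimal sketch of what such a helper is assumed to do (an assumption about its behavior, not dask's actual implementation):

def decode_sequence(encoding, seq):
    # Assumed behavior: lazily decode each bytes item of `seq` to str.
    for item in seq:
        yield item.decode(encoding)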
Example #15
def test_from_filenames_bz2():
    b = db.from_filenames(['foo.json.bz2', 'bar.json.bz2'])

    assert (set(b.dask.values()) == set([
        (list, (io.TextIOWrapper, (io.BufferedReader,
                                   (open, os.path.abspath('foo.json.bz2'),
                                    'rb', 'bz2')), system_encoding, None,
                os.linesep)),
        (list, (io.TextIOWrapper, (io.BufferedReader,
                                   (open, os.path.abspath('bar.json.bz2'),
                                    'rb', 'bz2')), system_encoding, None,
                os.linesep))
    ]))
Example #16
def test_from_filenames_bz2():
    b = db.from_filenames(['foo.json.bz2', 'bar.json.bz2'])

    assert (set(b.dask.values()) ==
            set([(list,
                  (io.TextIOWrapper,
                   (io.BufferedReader,
                    (open, os.path.abspath('foo.json.bz2'), 'rb', 'bz2')),
                   system_encoding, None, os.linesep)),
                 (list,
                  (io.TextIOWrapper,
                   (io.BufferedReader,
                    (open, os.path.abspath('bar.json.bz2'), 'rb', 'bz2')),
                   system_encoding, None, os.linesep))]))
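In the two graph tests above, the tasks instead build an `io.BufferedReader` over the raw stream and wrap it in `io.TextIOWrapper(buffer, encoding, errors, newline)`, so iterating a partition yields decoded text lines. A standalone sketch of the same decoding pattern on an ordinary file (illustration only; the four-argument `open` in the tasks is presumably a compression-aware opener from the library, not the builtin):

import io
import os

def decoded_lines(path, encoding):
    raw = open(path, 'rb', buffering=0)    # unbuffered binary stream
    buffered = io.BufferedReader(raw)
    # TextIOWrapper(buffer, encoding, errors, newline) decodes on the fly.
    return list(io.TextIOWrapper(buffered, encoding, None, os.linesep))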
Example #17
def test_from_filenames_bz2():
    b = db.from_filenames(['foo.json.bz2', 'bar.json.bz2'])

    assert (set(b.dask.values()) ==
            set([(list, (bz2.BZ2File, os.path.abspath('foo.json.bz2'))),
                 (list, (bz2.BZ2File, os.path.abspath('bar.json.bz2')))]))
Example #18
def test_from_filenames_gzip():
    b = db.from_filenames(['foo.json.gz', 'bar.json.gz'])

    assert (set(b.dask.values()) ==
            set([(list, (gzip.open, os.path.abspath('foo.json.gz'))),
                 (list, (gzip.open, os.path.abspath('bar.json.gz')))]))
Example #19
def test_from_filenames():
    with filetexts({'a1.log': 'A\nB', 'a2.log': 'C\nD'}) as fns:
        assert set(line.strip() for line in db.from_filenames(fns)) == \
                set('ABCD')
        assert set(line.strip() for line in db.from_filenames('a*.log')) == \
                set('ABCD')
Example #20
def bag_to_iterator(x, **kwargs):
    return db.from_filenames([tf.path for tf in x])
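`bag_to_iterator` assumes `x` is a sequence of objects exposing a `.path` attribute (named `tf` in the source, presumably temporary-file wrappers). A hypothetical call with a stand-in type:

from collections import namedtuple

# Hypothetical stand-in for the real temp-file objects used by the caller.
TempFile = namedtuple('TempFile', ['path'])
bag = bag_to_iterator([TempFile('part-0.json'), TempFile('part-1.json')])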
Example #21
def of_files(self, filenames, chunkbytes=None):
    return Dream(*bag.from_filenames(filenames, chunkbytes)._args)
Example #22
def test_from_filenames_bz2():
    b = db.from_filenames(['foo.json.bz2', 'bar.json.bz2'])

    assert (set(b.dask.values()) ==
            set([(list, (decode_sequence, system_encoding, (bz2.BZ2File, os.path.abspath('foo.json.bz2'), 'rb'))),
                 (list, (decode_sequence, system_encoding, (bz2.BZ2File, os.path.abspath('bar.json.bz2'), 'rb')))]))
Example #23
def test_from_filenames_gzip():
    b = db.from_filenames(['foo.json.gz', 'bar.json.gz'])

    assert (set(b.dask.values()) ==
            set([(list, (decode_sequence, system_encoding, (gzip.open, os.path.abspath('foo.json.gz'), 'rb'))),
                 (list, (decode_sequence, system_encoding, (gzip.open, os.path.abspath('bar.json.gz'), 'rb')))]))