def test_from_filenames():
    with filetexts({'a1.log': 'A\nB', 'a2.log': 'C\nD'}) as fns:
        assert set(line.strip() for line in db.from_filenames(fns)) == \
            set('ABCD')
        assert set(line.strip() for line in db.from_filenames('a*.log')) == \
            set('ABCD')

        assert raises(ValueError,
                      lambda: db.from_filenames('non-existent-*-path'))

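# These tests rely on small helpers (filetexts, tmpfile, raises) that are not
# shown in these snippets. A minimal sketch of equivalent fixtures is given
# here for context; the exact behaviour is an assumption, not the library's
# actual implementation.
import os
import tempfile
from contextlib import contextmanager


@contextmanager
def filetexts(d):
    # Write each {filename: text} pair to disk, yield the filenames, and
    # remove the files afterwards.
    for fn, text in d.items():
        with open(fn, 'w') as f:
            f.write(text)
    try:
        yield list(d)
    finally:
        for fn in d:
            if os.path.exists(fn):
                os.remove(fn)


@contextmanager
def tmpfile(extension=''):
    # Yield a temporary filename (optionally with an extension) and delete
    # the file afterwards if the test created it.
    suffix = '.' + extension.lstrip('.') if extension else ''
    fn = tempfile.mktemp(suffix=suffix)
    try:
        yield fn
    finally:
        if os.path.exists(fn):
            os.remove(fn)


def raises(exc, func):
    # Return True if calling func() raises exc.
    try:
        func()
        return False
    except exc:
        return True
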
def test_from_filenames_large_gzip():
    with tmpfile('gz') as fn:
        f = gzip.open(fn, 'wb')
        f.write(b'Hello, world!\n' * 100)
        f.close()

        b = db.from_filenames(fn, chunkbytes=100)
        c = db.from_filenames(fn)
        assert len(b.dask) > 5
        assert list(b) == list(c)

def test_from_filenames_large_gzip():
    with tmpfile('gz') as fn:
        f = gzip.open(fn, 'wb')
        f.write(b'Hello, world!\n' * 100)
        f.close()

        b = db.from_filenames(fn, chunkbytes=100)
        c = db.from_filenames(fn)
        assert len(b.dask) > 5
        assert list(b) == [s.decode() for s in c]

def test_from_filenames_large_gzip():
    with tmpfile('gz') as fn:
        f = GzipFile(fn, 'wb')
        f.write(b'Hello, world!\n' * 100)
        f.close()

        b = db.from_filenames(fn, chunkbytes=100, linesep='\n')
        c = db.from_filenames(fn, linesep='\n')
        assert len(b.dask) > 5
        assert list(b) == list(c)

def test_from_filenames_large():
    with tmpfile() as fn:
        with open(fn, 'w') as f:
            f.write('Hello, world!\n' * 100)
        b = db.from_filenames(fn, chunkbytes=100)
        c = db.from_filenames(fn)
        assert len(b.dask) > 5
        assert list(b) == list(c)

        d = db.from_filenames([fn], chunkbytes=100)
        assert list(b) == list(d)

def test_from_filenames_large():
    with tmpfile() as fn:
        with open(fn, 'wb') as f:
            f.write(('Hello, world!' + os.linesep).encode() * 100)
        b = db.from_filenames(fn, chunkbytes=100)
        c = db.from_filenames(fn)
        assert len(b.dask) > 5
        assert list(map(str, b)) == list(map(str, c))

        d = db.from_filenames([fn], chunkbytes=100)
        assert list(b) == list(d)

def test_from_filenames_encoding():
    with tmpfile() as fn:
        with open(fn, 'wb') as f:
            f.write((u'你好!' + os.linesep).encode('gb18030') * 100)
        b = db.from_filenames(fn, chunkbytes=100, encoding='gb18030')
        c = db.from_filenames(fn, encoding='gb18030')
        assert len(b.dask) > 5
        assert (list(map(lambda x: x.encode('utf-8'), b)) ==
                list(map(lambda x: x.encode('utf-8'), c)))

        d = db.from_filenames([fn], chunkbytes=100, encoding='gb18030')
        assert list(b) == list(d)

def test_gh715():
    bin_data = u'\u20ac'.encode('utf-8')
    with tmpfile() as fn:
        with open(fn, 'wb') as f:
            f.write(bin_data)
        a = db.from_filenames(fn)
        assert a.compute()[0] == bin_data.decode('utf-8')

def test_from_filenames_bz2():
    b = db.from_filenames(['foo.json.bz2', 'bar.json.bz2'])
    assert (set(b.dask.values()) ==
            set([(list, (bz2.BZ2File, os.path.abspath('foo.json.bz2'))),
                 (list, (bz2.BZ2File, os.path.abspath('bar.json.bz2')))]))

def test_from_filenames_gzip():
    b = db.from_filenames(['foo.json.gz', 'bar.json.gz'])
    assert (set(b.dask.values()) ==
            set([(list, (gzip.open, os.path.abspath('foo.json.gz'))),
                 (list, (gzip.open, os.path.abspath('bar.json.gz')))]))

def analyze(path, parse_timestamps=True, **kwargs):
    """
    Analyze a given directory of either .json or flat text files with
    delimited JSON to get relevant key statistics.

    Parameters
    ----------
    path: string
        Path to directory
    parse_timestamps: boolean, default True
        If True, will attempt to regex match ISO8601 formatted timestamps
    kwargs:
        Passed into json.loads. Here you can specify encoding, etc.
    """
    stats = {}
    start_time = time.time()

    file_list = [os.path.join(path, f) for f in os.listdir(path)]
    bag = db.from_filenames(file_list).map(json.loads)

    recur_partial = partial(recur_dict, parse_timestamps=parse_timestamps)
    stats = bag.fold(recur_partial, combine_stats, initial={}).compute()

    count = stats["total_records"]
    del stats["total_records"]

    elapsed = time.time() - start_time
    print('Malort run finished: {} JSON blobs analyzed in {} seconds.'
          .format(count, elapsed))

    return MalortResult(stats, count, elapsed)

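# A hypothetical usage sketch for analyze() above: it lists every file in the
# directory, loads each line as JSON into a dask bag, and folds per-key
# statistics. The directory path below is an illustrative assumption, not a
# real dataset.
result = analyze('/path/to/json_dir', parse_timestamps=True)
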
def test_from_filenames_bz2():
    b = db.from_filenames(['foo.json.bz2', 'bar.json.bz2'])
    assert (set(b.dask.values()) ==
            set([(list, (decode_sequence, system_encoding,
                         (bz2.BZ2File, os.path.abspath('foo.json.bz2'), 'rb'))),
                 (list, (decode_sequence, system_encoding,
                         (bz2.BZ2File, os.path.abspath('bar.json.bz2'), 'rb')))]))

def test_from_filenames_gzip():
    b = db.from_filenames(['foo.json.gz', 'bar.json.gz'])
    assert (set(b.dask.values()) ==
            set([(list, (decode_sequence, system_encoding,
                         (gzip.open, os.path.abspath('foo.json.gz'), 'rb'))),
                 (list, (decode_sequence, system_encoding,
                         (gzip.open, os.path.abspath('bar.json.gz'), 'rb')))]))

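# decode_sequence is not defined in these snippets. Judging from the graph
# tuples above, it is applied as (decode_sequence, system_encoding,
# <open-file task>), so a plausible stand-in is sketched below; this is an
# assumption about its behaviour, not the library's actual helper.
def decode_sequence(encoding, seq):
    # Lazily decode each bytes line coming out of the opened file object.
    for item in seq:
        yield item.decode(encoding)
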
def test_from_filenames_bz2():
    b = db.from_filenames(['foo.json.bz2', 'bar.json.bz2'])
    assert (set(b.dask.values()) ==
            set([(list, (io.TextIOWrapper,
                         (io.BufferedReader,
                          (open, os.path.abspath('foo.json.bz2'), 'rb', 'bz2')),
                         system_encoding, None, os.linesep)),
                 (list, (io.TextIOWrapper,
                         (io.BufferedReader,
                          (open, os.path.abspath('bar.json.bz2'), 'rb', 'bz2')),
                         system_encoding, None, os.linesep))]))

def test_from_filenames():
    with filetexts({'a1.log': 'A\nB', 'a2.log': 'C\nD'}) as fns:
        assert set(line.strip() for line in db.from_filenames(fns)) == \
            set('ABCD')
        assert set(line.strip() for line in db.from_filenames('a*.log')) == \
            set('ABCD')

def bag_to_iterator(x, **kwargs):
    return db.from_filenames([tf.path for tf in x])

def of_files(self, filenames, chunkbytes=None):
    return Dream(*bag.from_filenames(filenames, chunkbytes)._args)
