def test_sort_5():
    table = (('foo', 'bar'),
             (2.3, 2),
             (1.2, 9),
             (2.3, 6),
             (3.2, 1),
             (1.2, 10))

    expectation = (('foo', 'bar'),
                   (1.2, 9),
                   (1.2, 10),
                   (2.3, 2),
                   (2.3, 6),
                   (3.2, 1))

    # can use either field names or indices (from 0) to specify sort key
    result = sort(table, key=('foo', 'bar'))
    ieq(expectation, result)

    result = sort(table, key=(0, 1))
    ieq(expectation, result)

    result = sort(table, key=('foo', 1))
    ieq(expectation, result)

    result = sort(table, key=(0, 'bar'))
    ieq(expectation, result)
def __init__(self, left, right, lkey, rkey, presorted=False, missing=None,
             buffersize=None, tempdir=None, cache=True, lprefix=None,
             rprefix=None):
    if presorted:
        self.left = left
        self.right = right
    else:
        self.left = sort(left, lkey, buffersize=buffersize,
                         tempdir=tempdir, cache=cache)
        self.right = sort(right, rkey, buffersize=buffersize,
                          tempdir=tempdir, cache=cache)
    self.lkey = lkey
    self.rkey = rkey
    self.missing = missing
    self.lprefix = lprefix
    self.rprefix = rprefix
def test_sort_2():
    table = (('foo', 'bar'),
             ('C', '2'),
             ('A', '9'),
             ('A', '6'),
             ('F', '1'),
             ('D', '10'))

    result = sort(table, key=('foo', 'bar'))
    expectation = (('foo', 'bar'),
                   ('A', '6'),
                   ('A', '9'),
                   ('C', '2'),
                   ('D', '10'),
                   ('F', '1'))
    ieq(expectation, result)

    result = sort(table)  # default is lexical sort
    expectation = (('foo', 'bar'),
                   ('A', '6'),
                   ('A', '9'),
                   ('C', '2'),
                   ('D', '10'),
                   ('F', '1'))
    ieq(expectation, result)
def test_mergesort_3():
    table1 = (('foo', 'bar'),
              ('A', 9),
              ('C', 2),
              ('D', 10),
              ('A', 6),
              ('F', 1))
    table2 = (('foo', 'baz'),
              ('B', 3),
              ('D', 10),
              ('A', 10),
              ('F', 4))

    # should be same as concatenate then sort (but more efficient, esp. when
    # presorted)
    expect = sort(cat(table1, table2), key='foo', reverse=True)

    actual = mergesort(table1, table2, key='foo', reverse=True)
    ieq(expect, actual)
    ieq(expect, actual)

    actual = mergesort(sort(table1, key='foo', reverse=True),
                       sort(table2, key='foo', reverse=True),
                       key='foo', reverse=True, presorted=True)
    ieq(expect, actual)
    ieq(expect, actual)
def test_fromhdf5sorted():
    f = NamedTemporaryFile()

    # set up a new hdf5 table to work with
    h5file = tables.open_file(f.name, mode='w', title='Test file')
    h5file.create_group('/', 'testgroup', 'Test Group')
    h5table = h5file.create_table('/testgroup', 'testtable', FooBar,
                                  'Test Table')

    # load some data into the table
    table1 = (('foo', 'bar'),
              (3, b'asdfgh'),
              (2, b'qwerty'),
              (1, b'zxcvbn'))
    for row in table1[1:]:
        # use a distinct loop variable so the temporary file `f` is not
        # shadowed (and garbage collected) while the hdf5 file is in use
        for i, fld in enumerate(table1[0]):
            h5table.row[fld] = row[i]
        h5table.row.append()
    h5table.cols.foo.create_csindex()
    h5file.flush()

    # verify we can get the data back out
    table2 = fromhdf5sorted(h5table, sortby='foo')
    ieq(sort(table1, 'foo'), table2)
    ieq(sort(table1, 'foo'), table2)

    # clean up
    h5file.close()
def __init__(self, a, b, presorted=False, buffersize=None, tempdir=None,
             cache=True):
    if presorted:
        self.a = a
        self.b = b
    else:
        self.a = sort(a, buffersize=buffersize, tempdir=tempdir, cache=cache)
        self.b = sort(b, buffersize=buffersize, tempdir=tempdir, cache=cache)
def __init__(self, left, right, lkey, rkey, presorted=False, buffersize=None,
             tempdir=None, cache=True):
    if presorted:
        self.left = left
        self.right = right
    else:
        self.left = sort(left, lkey, buffersize=buffersize,
                         tempdir=tempdir, cache=cache)
        self.right = sort(right, rkey, buffersize=buffersize,
                          tempdir=tempdir, cache=cache)
    self.lkey = lkey
    self.rkey = rkey
def test_sort_none():
    table = (('foo', 'bar'),
             ('C', 2),
             ('A', 9),
             ('A', None),
             ('F', 1),
             ('D', 10))
    result = sort(table, 'bar')
    expectation = (('foo', 'bar'),
                   ('A', None),
                   ('F', 1),
                   ('C', 2),
                   ('A', 9),
                   ('D', 10))
    ieq(expectation, result)

    dt = datetime.now().replace

    table = (('foo', 'bar'),
             ('C', dt(hour=5)),
             ('A', dt(hour=1)),
             ('A', None),
             ('F', dt(hour=9)),
             ('D', dt(hour=17)))
    result = sort(table, 'bar')
    expectation = (('foo', 'bar'),
                   ('A', None),
                   ('A', dt(hour=1)),
                   ('C', dt(hour=5)),
                   ('F', dt(hour=9)),
                   ('D', dt(hour=17)))
    ieq(expectation, result)
def diff(a, b, presorted=False, buffersize=None, tempdir=None, cache=True):
    """
    Find the difference between rows in two tables. Returns a pair of tables.
    E.g.::

        >>> import petl as etl
        >>> a = [['foo', 'bar', 'baz'],
        ...      ['A', 1, True],
        ...      ['C', 7, False],
        ...      ['B', 2, False],
        ...      ['C', 9, True]]
        >>> b = [['x', 'y', 'z'],
        ...      ['B', 2, False],
        ...      ['A', 9, False],
        ...      ['B', 3, True],
        ...      ['C', 9, True]]
        >>> added, subtracted = etl.diff(a, b)
        >>> # rows in b not in a
        ... added
        +-----+---+-------+
        | x   | y | z     |
        +=====+===+=======+
        | 'A' | 9 | False |
        +-----+---+-------+
        | 'B' | 3 | True  |
        +-----+---+-------+

        >>> # rows in a not in b
        ... subtracted
        +-----+-----+-------+
        | foo | bar | baz   |
        +=====+=====+=======+
        | 'A' | 1   | True  |
        +-----+-----+-------+
        | 'C' | 7   | False |
        +-----+-----+-------+

    Convenient shorthand for ``(complement(b, a), complement(a, b))``. See
    also :func:`petl.transform.setops.complement`.

    If `presorted` is True, it is assumed that the data are already sorted by
    the given key, and the `buffersize`, `tempdir` and `cache` arguments are
    ignored. Otherwise, the data are sorted, see also the discussion of the
    `buffersize`, `tempdir` and `cache` arguments under the
    :func:`petl.transform.sorts.sort` function.

    """

    if not presorted:
        a = sort(a)
        b = sort(b)
    added = complement(b, a, presorted=True, buffersize=buffersize,
                       tempdir=tempdir, cache=cache)
    subtracted = complement(a, b, presorted=True, buffersize=buffersize,
                            tempdir=tempdir, cache=cache)
    return added, subtracted
def test_sort_buffered_tempdir():
    table = (("foo", "bar"),
             ("C", 2),
             ("A", 9),
             ("A", 6),
             ("F", 1),
             ("D", 10))

    # test sort forwards
    expectation = (("foo", "bar"),
                   ("F", 1),
                   ("C", 2),
                   ("A", 6),
                   ("A", 9),
                   ("D", 10))
    result = sort(table, "bar")
    ieq(expectation, result)
    result = sort(table, "bar", buffersize=2, tempdir="/tmp")
    ieq(expectation, result)
def test_sort_2():
    table = (("foo", "bar"),
             ("C", "2"),
             ("A", "9"),
             ("A", "6"),
             ("F", "1"),
             ("D", "10"))

    result = sort(table, key=("foo", "bar"))
    expectation = (("foo", "bar"),
                   ("A", "6"),
                   ("A", "9"),
                   ("C", "2"),
                   ("D", "10"),
                   ("F", "1"))
    ieq(expectation, result)

    result = sort(table)  # default is lexical sort
    expectation = (("foo", "bar"),
                   ("A", "6"),
                   ("A", "9"),
                   ("C", "2"),
                   ("D", "10"),
                   ("F", "1"))
    ieq(expectation, result)
def test_sort_buffered_tempdir():
    table = (('foo', 'bar'),
             ('C', 2),
             ('A', 9),
             ('A', 6),
             ('F', 1),
             ('D', 10))

    # test sort forwards
    expectation = (('foo', 'bar'),
                   ('F', 1),
                   ('C', 2),
                   ('A', 6),
                   ('A', 9),
                   ('D', 10))
    result = sort(table, 'bar')
    ieq(expectation, result)
    result = sort(table, 'bar', buffersize=2, tempdir='/tmp')
    ieq(expectation, result)
def test_sort_buffered_tempdir():
    table = (("foo", "bar"),
             ("C", 2),
             ("A", 9),
             ("A", 6),
             ("F", 1),
             ("D", 10))

    # test sort forwards
    expectation = (("foo", "bar"),
                   ("F", 1),
                   ("C", 2),
                   ("A", 6),
                   ("A", 9),
                   ("D", 10))
    result = sort(table, "bar")
    ieq(expectation, result)

    tempdir = "tmp"
    if not os.path.exists(tempdir):
        os.mkdir(tempdir)
    result = sort(table, "bar", buffersize=2, tempdir=tempdir)
    ieq(expectation, result)
def test_sort_buffered_tempdir():
    table = (('foo', 'bar'),
             ('C', 2),
             ('A', 9),
             ('A', 6),
             ('F', 1),
             ('D', 10))

    # test sort forwards
    expectation = (('foo', 'bar'),
                   ('F', 1),
                   ('C', 2),
                   ('A', 6),
                   ('A', 9),
                   ('D', 10))
    result = sort(table, 'bar')
    ieq(expectation, result)

    tempdir = 'tmp'
    if not os.path.exists(tempdir):
        os.mkdir(tempdir)
    result = sort(table, 'bar', buffersize=2, tempdir=tempdir)
    ieq(expectation, result)
def test_sort_3():
    table = (("foo", "bar"),
             ("C", "2"),
             ("A", "9"),
             ("A", "6"),
             ("F", "1"),
             ("D", "10"))

    result = sort(table, "bar")
    expectation = (("foo", "bar"),
                   ("F", "1"),
                   ("D", "10"),
                   ("C", "2"),
                   ("A", "6"),
                   ("A", "9"))
    ieq(expectation, result)
def __init__(self, table, key=None, count=None, presorted=False,
             buffersize=None, tempdir=None, cache=True):
    if presorted:
        self.table = table
    else:
        self.table = sort(table, key=key, buffersize=buffersize,
                          tempdir=tempdir, cache=cache)
    self.key = key
    self.count = count
def test_sort_4():
    table = (("foo", "bar"),
             ("C", 2),
             ("A", 9),
             ("A", 6),
             ("F", 1),
             ("D", 10))

    result = sort(table, "bar")
    expectation = (("foo", "bar"),
                   ("F", 1),
                   ("C", 2),
                   ("A", 6),
                   ("A", 9),
                   ("D", 10))
    ieq(expectation, result)
def __init__(self, source, key, aggregation=None, presorted=False,
             buffersize=None, tempdir=None, cache=True):
    if presorted:
        self.source = source
    else:
        self.source = sort(source, key, buffersize=buffersize,
                           tempdir=tempdir, cache=cache)
    self.key = key
    if aggregation is None:
        self.aggregation = OrderedDict()
    elif isinstance(aggregation, (list, tuple)):
        self.aggregation = OrderedDict()
        for t in aggregation:
            self.aggregation[t[0]] = t[1:]
    elif isinstance(aggregation, dict):
        self.aggregation = aggregation
    else:
        raise ArgumentError(
            'expected aggregation is None, list, tuple or dict, found %r'
            % aggregation)
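# Illustrative sketch (not part of the original source): the accepted forms of
# the `aggregation` argument in the constructor above normalise to the same
# output-field -> spec mapping; the 'minbar' spec here is a made-up example.
from collections import OrderedDict

spec_as_dict = {'minbar': ('bar', min)}
spec_as_tuples = [('minbar', 'bar', min)]
normalised = OrderedDict()
for t in spec_as_tuples:
    # mirrors the list/tuple branch above: first element is the output field,
    # the remainder is the aggregation spec
    normalised[t[0]] = t[1:]
assert dict(normalised) == spec_as_dict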
def test_sort_missing_cell_numeric():
    """Sorting table with missing values raises IndexError #385"""
    tbl = (('a', 'b'),
           ('4', ),
           ('2', '1'),
           ('1', ))
    expect = (('a', 'b'),
              ('1', ),
              ('2', '1'),
              ('4', ))

    tbl_sorted = sort(tbl)
    ieq(expect, tbl_sorted)
def test_sort_buffered_independent():
    table = (('foo', 'bar'),
             ('C', 2),
             ('A', 9),
             ('A', 6),
             ('F', 1),
             ('D', 10))
    expectation = (('foo', 'bar'),
                   ('F', 1),
                   ('C', 2),
                   ('A', 6),
                   ('A', 9),
                   ('D', 10))

    result = sort(table, 'bar', buffersize=4)
    nrows(result)  # cause data to be cached
    # check that two row iterators are independent, i.e., consuming rows
    # from one does not affect the other
    it1 = iter(result)
    it2 = iter(result)
    eq_(expectation[0], it1.next())
    eq_(expectation[1], it1.next())
    eq_(expectation[0], it2.next())
    eq_(expectation[1], it2.next())
    eq_(expectation[2], it2.next())
    eq_(expectation[2], it1.next())
def test_sort_missing_cell_text():
    """Sorting table with missing values raises IndexError #385"""
    tbl = (('a', 'b', 'c'),
           ('C', ),
           ('A', '4', '5'))
    expect = (('a', 'b', 'c'),
              ('A', '4', '5'),
              ('C', ))

    tbl_sorted = sort(tbl)
    ieq(expect, tbl_sorted)
def test_sort_6():
    table = (("foo", "bar"),
             (2.3, 2),
             (1.2, 9),
             (2.3, 6),
             (3.2, 1),
             (1.2, 10))
    expectation = (("foo", "bar"),
                   (3.2, 1),
                   (2.3, 6),
                   (2.3, 2),
                   (1.2, 10),
                   (1.2, 9))

    result = sort(table, key=("foo", "bar"), reverse=True)
    ieq(expectation, result)
def __init__(self, source, key=None, presorted=False, buffersize=None,
             tempdir=None, cache=True):
    if presorted:
        self.source = source
    else:
        self.source = sort(source, key, buffersize=buffersize,
                           tempdir=tempdir, cache=cache)
    self.key = key
def __init__(self, source, key=None, presorted=False, buffersize=None,
             tempdir=None, cache=True):
    if presorted:
        self.source = source
    else:
        self.source = sort(source, key, buffersize=buffersize,
                           tempdir=tempdir, cache=cache)
    self.key = key  # TODO property
def test_sort_4():
    table = (('foo', 'bar'),
             ('C', 2),
             ('A', 9),
             ('A', 6),
             ('F', 1),
             ('D', 10))

    result = sort(table, 'bar')
    expectation = (('foo', 'bar'),
                   ('F', 1),
                   ('C', 2),
                   ('A', 6),
                   ('A', 9),
                   ('D', 10))
    ieq(expectation, result)
def test_mergesort_2():
    table1 = (("foo", "bar"),
              ("A", 9),
              ("C", 2),
              ("D", 10),
              ("A", 6),
              ("F", 1))
    table2 = (("foo", "baz"),
              ("B", 3),
              ("D", 10),
              ("A", 10),
              ("F", 4))

    # should be same as concatenate then sort (but more efficient, esp. when
    # presorted)
    expect = sort(cat(table1, table2), key="foo")

    actual = mergesort(table1, table2, key="foo")
    ieq(expect, actual)
    ieq(expect, actual)

    actual = mergesort(sort(table1, key="foo"), sort(table2, key="foo"),
                       key="foo", presorted=True)
    ieq(expect, actual)
    ieq(expect, actual)
def test_sort_6():
    table = (('foo', 'bar'),
             (2.3, 2),
             (1.2, 9),
             (2.3, 6),
             (3.2, 1),
             (1.2, 10))
    expectation = (('foo', 'bar'),
                   (3.2, 1),
                   (2.3, 6),
                   (2.3, 2),
                   (1.2, 10),
                   (1.2, 9))

    result = sort(table, key=('foo', 'bar'), reverse=True)
    ieq(expectation, result)
def test_mergesort_1():
    table1 = (('foo', 'bar'),
              ('A', 6),
              ('C', 2),
              ('D', 10),
              ('A', 9),
              ('F', 1))
    table2 = (('foo', 'bar'),
              ('B', 3),
              ('D', 10),
              ('A', 10),
              ('F', 4))

    # should be same as concatenate then sort (but more efficient, esp. when
    # presorted)
    expect = sort(cat(table1, table2))

    actual = mergesort(table1, table2)
    ieq(expect, actual)
    ieq(expect, actual)

    actual = mergesort(sort(table1), sort(table2), presorted=True)
    ieq(expect, actual)
    ieq(expect, actual)
def __init__(self, table, key, aggregation=list, value=None, presorted=False,
             buffersize=None, tempdir=None, cache=True):
    if presorted:
        self.table = table
    else:
        self.table = sort(table, key, buffersize=buffersize,
                          tempdir=tempdir, cache=cache)
    self.key = key
    self.aggregation = aggregation
    self.value = value
def __init__(self, source, key, reducer, fields=None, presorted=False,
             buffersize=None, tempdir=None, cache=True):
    if presorted:
        self.source = source
    else:
        self.source = sort(source, key, buffersize=buffersize,
                           tempdir=tempdir, cache=cache)
    self.key = key
    self.fields = fields
    self.reducer = reducer
def test_sort_3():
    table = (('foo', 'bar'),
             ('C', '2'),
             ('A', '9'),
             ('A', '6'),
             ('F', '1'),
             ('D', '10'))

    result = sort(table, 'bar')
    expectation = (('foo', 'bar'),
                   ('F', '1'),
                   ('D', '10'),
                   ('C', '2'),
                   ('A', '6'),
                   ('A', '9'))
    ieq(expectation, result)
def __init__(self, source, f1, f2, f3, aggfun, missing=None, presorted=False,
             buffersize=None, tempdir=None, cache=True):
    if presorted:
        self.source = source
    else:
        self.source = sort(source, key=(f1, f2), buffersize=buffersize,
                           tempdir=tempdir, cache=cache)
    self.f1, self.f2, self.f3 = f1, f2, f3
    self.aggfun = aggfun
    self.missing = missing
def groupselectmax(table, key, value, presorted=False, buffersize=None,
                   tempdir=None, cache=True):
    """Group by the `key` field then return the row with the maximum of the
    `value` field within each group. N.B., will only return one row for each
    group, even if multiple rows have the same (maximum) value."""

    return groupselectfirst(sort(table, value, reverse=True), key,
                            presorted=presorted, buffersize=buffersize,
                            tempdir=tempdir, cache=cache)
def __init__(self, source, key, mapper, header=None, presorted=False,
             buffersize=None, tempdir=None, cache=True):
    if presorted:
        self.source = source
    else:
        self.source = sort(source, key, buffersize=buffersize,
                           tempdir=tempdir, cache=cache)
    self.key = key
    self.header = header
    self.mapper = mapper
def test_mergesort_4():
    table1 = (("foo", "bar", "baz"),
              (1, "A", True),
              (2, "B", None),
              (4, "C", True))
    table2 = (("bar", "baz", "quux"),
              ("A", True, 42.0),
              ("B", False, 79.3),
              ("C", False, 12.4))

    expect = sort(cat(table1, table2), key="bar")

    actual = mergesort(table1, table2, key="bar")
    ieq(expect, actual)
    ieq(expect, actual)
def collapsedintervals(table, start='start', stop='stop', key=None):
    """
    Utility function to collapse intervals in a table. If no facet `key` is
    given, returns an iterator over `(start, stop)` tuples. If facet `key` is
    given, returns an iterator over `(key, start, stop)` tuples.

    """

    if key is None:
        table = sort(table, key=start)
        for iv in _collapse(values(table, (start, stop))):
            yield iv
    else:
        table = sort(table, key=(key, start))
        for k, g in rowgroupby(table, key=key, value=(start, stop)):
            for iv in _collapse(g):
                yield (k, ) + iv
def __init__(self, source, key, missing=None, exclude=None, include=None,
             presorted=False, buffersize=None, tempdir=None, cache=True):
    if presorted:
        self.source = source
    else:
        self.source = sort(source, key, buffersize=buffersize,
                           tempdir=tempdir, cache=cache)
    self.key = key
    self.missing = missing
    self.exclude = exclude
    self.include = include
def collapsedintervals(table, start='start', stop='stop', key=None):
    """
    Utility function to collapse intervals in a table. If no facet `key` is
    given, returns an iterator over `(start, stop)` tuples. If facet `key` is
    given, returns an iterator over `(key, start, stop)` tuples.

    """

    if key is None:
        table = sort(table, key=start)
        for iv in _collapse(values(table, (start, stop))):
            yield iv
    else:
        table = sort(table, key=(key, start))
        for k, g in rowgroupby(table, key=key, value=(start, stop)):
            for iv in _collapse(g):
                yield (k,) + iv
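# A minimal usage sketch for collapsedintervals above (not from the original
# source); the table and interval values are made up for illustration, and it
# assumes the (start, stop) tuple output described in the docstring.
def test_collapsedintervals_sketch():
    tbl = (('start', 'stop'),
           (1, 3),
           (2, 5),
           (7, 8))
    # the overlapping intervals (1, 3) and (2, 5) collapse into (1, 5)
    assert list(collapsedintervals(tbl)) == [(1, 5), (7, 8)]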
def groupselectmax(table, key, value):
    """
    Group by the `key` field then return the row with the maximum of the
    `value` field within each group. N.B., will only return one row for each
    group, even if multiple rows have the same (maximum) value.

    .. versionadded:: 0.14

    """

    return groupselectfirst(sort(table, value, reverse=True), key)
def test_mergesort_4():
    table1 = (('foo', 'bar', 'baz'),
              (1, 'A', True),
              (2, 'B', None),
              (4, 'C', True))
    table2 = (('bar', 'baz', 'quux'),
              ('A', True, 42.0),
              ('B', False, 79.3),
              ('C', False, 12.4))

    expect = sort(cat(table1, table2), key='bar')

    actual = mergesort(table1, table2, key='bar')
    ieq(expect, actual)
    ieq(expect, actual)
def test_sort_buffered():
    table = (("foo", "bar"),
             ("C", 2),
             ("A", 9),
             ("A", 6),
             ("F", 1),
             ("D", 10))

    # test sort forwards
    expectation = (("foo", "bar"),
                   ("F", 1),
                   ("C", 2),
                   ("A", 6),
                   ("A", 9),
                   ("D", 10))
    result = sort(table, "bar")
    ieq(expectation, result)
    result = sort(table, "bar", buffersize=2)
    # print list(result)
    ieq(expectation, result)

    # sort in reverse
    expectation = (("foo", "bar"),
                   ("D", 10),
                   ("A", 9),
                   ("A", 6),
                   ("C", 2),
                   ("F", 1))
    result = sort(table, "bar", reverse=True)
    ieq(expectation, result)
    result = sort(table, "bar", reverse=True, buffersize=2)
    ieq(expectation, result)

    # no key
    expectation = (("foo", "bar"),
                   ("F", 1),
                   ("D", 10),
                   ("C", 2),
                   ("A", 9),
                   ("A", 6))
    result = sort(table, reverse=True)
    ieq(expectation, result)
    result = sort(table, reverse=True, buffersize=2)
    ieq(expectation, result)
def test_sort_buffered():
    table = (('foo', 'bar'),
             ('C', 2),
             ('A', 9),
             ('A', 6),
             ('F', 1),
             ('D', 10))

    # test sort forwards
    expectation = (('foo', 'bar'),
                   ('F', 1),
                   ('C', 2),
                   ('A', 6),
                   ('A', 9),
                   ('D', 10))
    result = sort(table, 'bar')
    ieq(expectation, result)
    result = sort(table, 'bar', buffersize=2)
    ieq(expectation, result)

    # sort in reverse
    expectation = (('foo', 'bar'),
                   ('D', 10),
                   ('A', 9),
                   ('A', 6),
                   ('C', 2),
                   ('F', 1))
    result = sort(table, 'bar', reverse=True)
    ieq(expectation, result)
    result = sort(table, 'bar', reverse=True, buffersize=2)
    ieq(expectation, result)

    # no key
    expectation = (('foo', 'bar'),
                   ('F', 1),
                   ('D', 10),
                   ('C', 2),
                   ('A', 9),
                   ('A', 6))
    result = sort(table, reverse=True)
    ieq(expectation, result)
    result = sort(table, reverse=True, buffersize=2)
    ieq(expectation, result)
def groupselectmax(table, key, value, presorted=False, buffersize=None,
                   tempdir=None, cache=True):
    """Group by the `key` field then return the row with the maximum of the
    `value` field within each group. N.B., will only return one row for each
    group, even if multiple rows have the same (maximum) value."""

    return groupselectfirst(sort(table, value, reverse=True), key,
                            presorted=presorted, buffersize=buffersize,
                            tempdir=tempdir, cache=cache)
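# A minimal usage sketch for groupselectmax above (not from the original
# source), reusing the ieq helper from the surrounding tests; the table
# values are made up for illustration.
def test_groupselectmax_sketch():
    table = (('foo', 'bar'),
             ('A', 9),
             ('A', 6),
             ('B', 3),
             ('B', 7))
    # one row per group, carrying the maximum 'bar' within each 'foo' group
    expect = (('foo', 'bar'),
              ('A', 9),
              ('B', 7))
    actual = groupselectmax(table, key='foo', value='bar')
    ieq(expect, actual)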
def test_sort_buffered_independent():
    table = (('foo', 'bar'),
             ('C', 2),
             ('A', 9),
             ('A', 6),
             ('F', 1),
             ('D', 10))
    expectation = (('foo', 'bar'),
                   ('F', 1),
                   ('C', 2),
                   ('A', 6),
                   ('A', 9),
                   ('D', 10))

    result = sort(table, 'bar', buffersize=4)
    nrows(result)  # cause data to be cached
    # check that two row iterators are independent, i.e., consuming rows
    # from one does not affect the other
    it1 = iter(result)
    it2 = iter(result)
    eq_(expectation[0], next(it1))
    eq_(expectation[1], next(it1))
    eq_(expectation[0], next(it2))
    eq_(expectation[1], next(it2))
    eq_(expectation[2], next(it2))
    eq_(expectation[2], next(it1))
def test_sort_buffered_cleanup():
    table = (('foo', 'bar'),
             ('C', 2),
             ('A', 9),
             ('A', 6),
             ('F', 1),
             ('D', 10))
    result = sort(table, 'bar', buffersize=2)

    debug('initially filecache should be empty')
    eq_(None, result._filecache)

    debug('pull rows through, should populate file cache')
    eq_(5, nrows(result))
    eq_(3, len(result._filecache))

    debug('check all files exist')
    filenames = _get_names(result._filecache)
    for fn in filenames:
        assert os.path.exists(fn), fn

    debug('delete object and garbage collect')
    del result
    gc.collect()

    debug('check all files have been deleted')
    for fn in filenames:
        assert not os.path.exists(fn), fn
def people_list(request, uuid):
    try:
        csvdownload = CSVDownload.objects.get(uuid=uuid)
    except CSVDownload.DoesNotExist:
        return HttpResponseNotFound("Not found.")

    fname = '{0}.csv'.format(csvdownload.uuid)
    full_fname = os.path.join(settings.CSV_DIR, fname)
    people = fromcsv(full_fname)

    sortby = request.GET.get('sortby', 'name')
    ordering = request.GET.get('ordering', 'asc')
    count_str = request.GET.get('count', '10')

    if sortby not in header(people):
        return HttpResponseBadRequest('Bad request.')
    if ordering not in ('asc', 'desc'):
        return HttpResponseBadRequest('Bad request.')
    try:
        count = int(count_str)
    except ValueError:
        return HttpResponseBadRequest('Bad request.')
    if count < 1:
        return HttpResponseBadRequest('Bad request.')

    people = sort(people, sortby, reverse=ordering == 'desc')
    people = head(people, count)

    return render(
        request, 'people_list.html', {
            'csvdownload': csvdownload,
            'headers': header(people),
            'people': data(people),
            'has_more': len(people) > count,
            'queryparams': {
                'sortby': sortby,
                'ordering': ordering,
                'count': str(count + 10)
            }
        })
def test_sort_buffered_cleanup_open_iterator():
    table = (('foo', 'bar'),
             ('C', 2),
             ('A', 9),
             ('A', 6),
             ('F', 1),
             ('D', 10))

    # check if cleanup is robust against open iterators
    result = sort(table, 'bar', buffersize=2)

    debug('pull rows through, should populate file cache')
    eq_(5, nrows(result))
    eq_(3, len(result._filecache))

    debug('check all files exist')
    filenames = _get_names(result._filecache)
    for fn in filenames:
        assert os.path.exists(fn), fn
    debug(filenames)

    debug('open an iterator')
    it = iter(result)
    next(it)
    next(it)

    debug('delete objects and garbage collect')
    del result
    del it
    gc.collect()

    for fn in filenames:
        assert not os.path.exists(fn), fn