def unique_index(self):
    """Return the sort-index entries that start each run of equal values.

    With no ``bincounts`` the whole sorted sequence is run-length encoded
    (on its mask and its values) and ``self.sort_index`` is indexed at the
    first position of every run.

    With ``bincounts`` the runs are additionally broken at the boundaries
    of the outermost bins, and the result is a new ``Atom`` whose outermost
    bin counts hold the number of runs found in each bin.

    NOTE(review): ``rl.encode`` is assumed to return the lengths of
    consecutive runs of equal rows across its arguments -- confirm against
    the ``run_length`` module.
    """
    # Kept from the original: the name is unused here, but the import itself
    # may matter for module initialisation order.
    from element import Element

    if len(self.bincounts) == 0:
        ordered = self.sorted
        run_lengths = rl.encode(ordered.mask, ordered)
        # cumsum - length == offset of the first element of every run.
        run_starts = np.cumsum(run_lengths) - run_lengths
        return self.sort_index[run_starts]

    outer = self.bincounts[0]
    # ``np.int`` was only an alias of the builtin ``int`` and no longer
    # exists in current NumPy releases; the builtin is equivalent.
    outer_counts = np.asarray(outer).astype(int)
    # Bins whose mask entry is False contribute no elements.
    outer_counts[~outer.mask] = 0
    # Bin number of every element, reordered the same way as the sorted data.
    binid = np.repeat(
        np.arange(outer_counts.size), outer_counts
    )[self.sort_index.flattened]

    ordered = self.sorted.flattened
    array = ordered[:]
    # Give every entry whose mask is False the same placeholder value.
    array[~ordered.mask] = 0

    # Runs of equal (bin, mask, value) triples.
    run_lengths = rl.encode(binid, ordered.mask, array)
    run_starts = np.cumsum(run_lengths) - run_lengths

    # How many runs fall in each bin that actually occurs.
    runs_per_bin = rl.encode(binid[run_starts])
    last_run_of_bin = np.cumsum(runs_per_bin) - 1

    # Scatter those counts back to a full-length vector; bins that never
    # occur keep a count of zero.
    new_outer = np.zeros(outer.size, int)
    new_outer[binid[run_starts][last_run_of_bin]] = runs_per_bin

    new_bincounts = [Atom(new_outer, mask=outer.mask)]
    new_bincounts.extend(self.bincounts[1:])
    return Atom(
        self.sort_index.flattened[run_starts],
        mask=ordered.mask[run_starts],
        bincounts=new_bincounts,
    )
class WordCountTests(unittest.TestCase):
    """Fixed-vector and round-trip checks for run-length ``encode``/``decode``.

    Runs of length one are written without a count (``'WB'``, not ``'1W1B'``)
    in every expected value below.
    """

    def test_encode(self):
        self.assertMultiLineEqual('2A3B4C', encode('AABBBCCCC'))

    def test_decode(self):
        self.assertMultiLineEqual('AABBBCCCC', decode('2A3B4C'))

    def test_encode_with_single(self):
        # Mixes long runs with single characters.
        self.assertMultiLineEqual(
            '12WB12W3B24WB',
            encode('WWWWWWWWWWWWBWWWWWWWWWWWWBBBWWWWWWWWWWWWWWWWWWWWWWWWB'))

    def test_decode_with_single(self):
        self.assertMultiLineEqual(
            'WWWWWWWWWWWWBWWWWWWWWWWWWBBBWWWWWWWWWWWWWWWWWWWWWWWWB',
            decode('12WB12W3B24WB'))

    def test_combination(self):
        # Round trip: decoding an encoded string must give the input back.
        # (A second, unasserted encode() call used to follow; it repeated
        # the call made here and checked nothing.)
        self.assertMultiLineEqual('zzz ZZ  zZ', decode(encode('zzz ZZ  zZ')))

    def test_encode_unicode_s(self):
        self.assertMultiLineEqual('⏰3⚽2⭐⏰', encode('⏰⚽⚽⚽⭐⭐⏰'))

    def test_decode_unicode(self):
        self.assertMultiLineEqual('⏰⚽⚽⚽⭐⭐⏰', decode('⏰3⚽2⭐⏰'))
def group_index(self, *a):
    """Build an ``Atom`` of sort positions grouped by the given columns.

    ``a`` is resolved to column names through ``by``.  The container is
    sorted on those columns, the sorted key columns are run-length encoded,
    and the resulting counts are attached as the single ``bincounts`` level
    of an ``Atom`` wrapping ``sort_index(self, *names)``.
    """
    from atom import Atom
    import run_length as rl

    key_names = by(self, *a)
    key_columns = getcolumns(sort_by(self, *key_names), *key_names)
    order = sort_index(self, *key_names)
    group_sizes = rl.encode(*key_columns)
    return Atom(order, bincounts=[group_sizes])
def __init__(self, join, container, outer, axis, *keys):
    """Index one side of a join on its key columns.

    Stores the owning ``join``, the ``axis`` and ``outer`` flags and the
    container length, then sorts ``container`` on ``keys`` and records:

    * ``columns``        -- the key columns in sorted order
    * ``bin_counts``     -- run-length encoding of those columns
    * ``unique``         -- ``.first`` of each key column binned by
                            ``bin_counts`` (presumably one value per
                            distinct key -- confirm against ``Atom.first``)
    * ``inverted_index`` -- the sort permutation binned the same way
    """
    self.size = len(container)
    self.join = join
    self.axis = axis
    self.outer = outer

    order = sort_index(container, *keys)
    self.columns = getcolumns(container[order], *keys)
    self.bin_counts = rl.encode(*self.columns)

    self.unique = [
        Atom(key_column, bincounts=[self.bin_counts]).first
        for key_column in self.columns
    ]
    self.inverted_index = Atom(order, bincounts=[self.bin_counts])
def __init__(self, left, right, keys, left_outer=False, right_outer=False):
    """Set up a join between ``left`` and ``right`` on paired key columns.

    Parameters:
        left, right: the two containers being joined.
        keys: mapping whose *values* name the key columns of ``left`` and
            whose *keys* name the matching key columns of ``right``.
        left_outer, right_outer: flags forwarded to the respective
            ``join.Side``.

    After construction ``self.left`` / ``self.right`` are the two sides
    (each pointing at the other through ``.other``) and
    ``self.side_indexes`` is an ``Atom`` over the concatenated per-side
    unique-key positions, sorted on the key columns and binned by the
    run-length encoding of the sorted keys.
    """
    self.left = join.Side(self, left, left_outer, 0, *keys.values())
    self.right = join.Side(self, right, right_outer, 1, *keys.keys())
    self.left.other, self.right.other = self.right, self.left

    # ``range`` instead of ``xrange`` so the constructor also runs on
    # Python 3; the iteration is identical on Python 2.
    joined_columns = [
        np.concatenate([self.left.unique[i], self.right.unique[i]])
        for i in range(len(keys))
    ]

    _sort_index = np.lexsort(joined_columns)
    _bin_counts = rl.encode(*[x[_sort_index] for x in joined_columns])

    # Position of every unique key inside its own side, left side first,
    # reordered to follow the combined key sort.
    positions = np.concatenate([
        np.arange(self.left.bin_counts.size),
        np.arange(self.right.bin_counts.size),
    ])
    self.side_indexes = Atom(positions[_sort_index], bincounts=[_bin_counts])
def sort_index(self):
    """Return an ``Atom`` holding the permutation that sorts this object.

    Without ``bincounts`` the values are sorted globally.  With
    ``bincounts`` the outermost bin number is the primary sort key, so
    elements are only reordered inside their own bin, and the original
    ``bincounts`` are carried over to the result.  In both cases the mask
    is the least significant key and the returned mask is the original
    mask in sorted order.
    """
    array = self.asarray()

    # NOTE(review): this equality test may not match unit-qualified
    # datetime dtypes such as ``datetime64[ns]`` -- confirm which dtypes
    # are expected to take this branch.
    if array.dtype == np.datetime64:
        # Replace each datetime by an integer built from the digits of its
        # string form, converting every distinct value only once.
        sort_index = np.argsort(array)
        inverse_sort_index = np.empty(len(array), dtype=int)
        inverse_sort_index[sort_index] = np.arange(len(array))
        sorted_array = array[sort_index]
        bincounts = rl.encode(sorted_array)
        # cumsum - length == offset of the first element of every run.
        first_of_run = sorted_array[bincounts.cumsum() - bincounts]
        as_integers = np.array(
            [int(re.sub('[-: ]', '', str(x))) for x in first_of_run]
        )
        # Expand back to one entry per element and undo the sort.
        array = np.repeat(as_integers, bincounts)[inverse_sort_index]

    if len(self.bincounts) == 0:
        # np.lexsort treats the LAST key as the primary one.
        sort_index = np.lexsort([self.mask, array])
        return Atom(sort_index, mask=self.mask[sort_index])

    # ``np.int`` was only an alias of the builtin ``int`` and no longer
    # exists in current NumPy releases; the builtin is equivalent.
    bincounts = np.asarray(self.bincounts[0]).astype(int)
    # Bins whose mask entry is False contribute no elements.
    bincounts[~self.bincounts[0].mask] = 0
    binid = np.repeat(np.arange(bincounts.size), bincounts)
    sort_index = np.lexsort([self.mask, array, binid])
    return Atom(
        sort_index, mask=self.mask[sort_index], bincounts=self.bincounts
    )
def test_encode(self):
    # Three runs of increasing length, each written as <count><char>.
    expected = "2A3B4C"
    actual = encode("AABBBCCCC")
    self.assertMultiLineEqual(expected, actual)
def test_encode_unicode_s(self):
    # Non-ASCII symbols; the single clocks at either end carry no count.
    expected = '⏰3⚽2⭐⏰'
    actual = encode('⏰⚽⚽⚽⭐⭐⏰')
    self.assertMultiLineEqual(expected, actual)
def test_combination(self):
    # Round trip: decoding the encoded text must give the input back.
    encoded = encode('zzz ZZ  zZ')
    round_tripped = decode(encoded)
    self.assertMultiLineEqual('zzz ZZ  zZ', round_tripped)
def test_encode_with_single(self):
    # Long runs interleaved with single characters, which carry no count.
    expected = '12WB12W3B24WB'
    actual = encode('WWWWWWWWWWWWBWWWWWWWWWWWWBBBWWWWWWWWWWWWWWWWWWWWWWWWB')
    self.assertMultiLineEqual(expected, actual)
def test_encode_unicode_s(self):
    # Non-ASCII symbols; the single clocks at either end carry no count.
    expected = "⏰3⚽2⭐⏰"
    actual = encode("⏰⚽⚽⚽⭐⭐⏰")
    self.assertMultiLineEqual(expected, actual)
def test_combination(self):
    # Round trip: decoding the encoded text must give the input back.
    encoded = encode("zzz ZZ  zZ")
    round_tripped = decode(encoded)
    self.assertMultiLineEqual("zzz ZZ  zZ", round_tripped)
def test_encode_with_single(self):
    # Long runs interleaved with single characters, which carry no count.
    expected = "12WB12W3B24WB"
    actual = encode("WWWWWWWWWWWWBWWWWWWWWWWWWBBBWWWWWWWWWWWWWWWWWWWWWWWWB")
    self.assertMultiLineEqual(expected, actual)
def unique_counts(self):
    """Return the run-length encoding of ``self.sorted``.

    NOTE(review): presumably one count per distinct value, since equal
    values are adjacent after sorting -- confirm against ``rl.encode``.
    """
    ordered = self.sorted
    return rl.encode(ordered)
from run_length import encode, decode

# Manual smoke check: every computed result is printed directly above a
# hand-written literal (presumably the expected output) so the two lines
# can be compared by eye.
_ENCODE_CASES = (
    ("AABBCCDDEEE", "2A2B2C2D3E"),
    ("AAABCCDDDDAAB", "3A1B2C4D2A1B"),
)
_DECODE_CASES = (
    ("3F2A1B4D1C", "FFFAABDDDDC"),
    ("1A1B1A1B1A2B2A", "ABABABBAA"),
)

for _plain, _reference in _ENCODE_CASES:
    print(encode(_plain))
    print(_reference)

# Blank line between the encode and decode sections.
print()

for _packed, _reference in _DECODE_CASES:
    print(decode(_packed))
    print(_reference)
def test_encode(self):
    # This encoder writes the count after the character and keeps
    # counts of 1.
    actual = run_length.encode('HHHeellloWooorrrrlld!!')
    expected = 'H3e2l3o1W1o3r4l2d1!2'
    self.assertEqual(actual, expected)
def test_encode(self):
    # Three runs of increasing length, each written as <count><char>.
    expected = '2A3B4C'
    actual = encode('AABBBCCCC')
    self.assertMultiLineEqual(expected, actual)