def test_splitter_datetime():
    """Test that structures containing datetime instances can be pushed in.

    Three rows mixing ``datetime.datetime`` and ``datetime.date`` values are
    split with ``max_items`` set to 0, read back from the produced files and
    compared, position by position, with the rows that were pushed in.
    """
    elements = [
        [
            '1',
            datetime.datetime(2004, 1, 22, 10, 0, 0),
            datetime.date(2010, 12, 24),
        ],
        [
            '1',
            datetime.datetime(2005, 2, 23, 11, 27, 32),
            datetime.date(2015, 12, 24),
        ],
        [
            '1',
            datetime.datetime(2014, 8, 22, 9, 54, 0),
            datetime.date(2001, 11, 6),
        ],
    ]
    fnames = get_splitter(elements, 0).split()
    count = 0
    for fname in fnames:
        # The context manager closes the file even when an assertion fails.
        with open(fname, 'rb') as f:
            for index, item in enumerate(get_input_item_flow(f)):
                # Expected value first, actual value second, to match the
                # wording of the message.
                assert item == elements[index], (
                    "Expected: '%s' got '%s'" % (elements[index], item))
                count += 1
    # Guard against a vacuous pass when nothing at all is read back.
    assert count == len(elements), (
        "We got %s items after splitting, instead of the %s we had before"
        % (count, len(elements)))
def test_simple_splitter_eleven_max3(self):
    """Check the simple splitter when max_items is below the item count.

    The items are split with max_items=3; four files are expected, and
    reading those files back one after the other must yield the items in
    their original order.
    """
    # NOTE(review): the name says "eleven" but ten items are built here
    # (one item plus range(1, 10)) -- confirm which was intended.
    input_items = [TestItem('ref0', 'value0')]
    input_items.extend(
        [TestItem('ref1', 'value%s' % i) for i in range(1, 10)])

    # list of items with max_items < nb items
    self.splitter = get_splitter(input_items, 3)
    result = self.splitter.split()
    assert len(result) == 4, (
        "The splitter returned %s files instead of 4" % len(result))

    count = 0
    for filename in result:
        # The context manager closes the file even when an assertion fails.
        with open(filename, 'rb') as f:
            for item in get_input_item_flow(f):
                assert item == input_items[count], (
                    "[File %s][Item %s] Got %s instead of %s"
                    % (f.name, count, item, input_items[count]))
                count += 1
    assert count == len(input_items), (
        "We got %s items after splitting, instead of the %s we had before"
        % (count, len(input_items)))
def test_simple_splitter_one_nomax(self):
    """Check the simple splitter with a single item and no max_items.

    One file is expected, and it must contain exactly the item that was
    pushed in.
    """
    # only one item with no max_items
    input_items = [TestItem('ref0', 'value0')]

    self.splitter = get_splitter(input_items, 0)
    result = self.splitter.split()
    assert len(result) == 1, (
        "The splitter returned %s files instead of 1" % len(result))

    count = 0
    # The context manager closes the file even when an assertion fails.
    with open(result[0], 'rb') as f:
        for item in get_input_item_flow(f):
            assert item == input_items[count], (
                "[File %s][Item %s] Got %s, %s instead of %s, %s" % (
                    f.name,
                    count,
                    item,
                    type(item),
                    input_items[count],
                    type(input_items[count]),
                ))
            count += 1
    # Without this check an empty file would let the test pass vacuously.
    assert count == len(input_items), (
        "We got %s items after splitting, instead of the %s we had before"
        % (count, len(input_items)))
def test_splitter_unicode():
    """Test that unicode attributes are preserved after splitting.

    The same single item is pushed through the simple splitter, the
    splitter by attribute, and the splitter by attribute with
    ``force_split=True``; each must return one file whose items still
    carry the original unicode value in ``field2``.
    """
    max_items = 2
    unicode_field = u'$£ø'
    split_attribute = 'field1'
    input_items = [TestItem('ref1', unicode_field)]

    # (label used in the failure message, positional args, keyword args)
    scenarios = [
        # test the simple splitter
        ("Simple splitter",
         (input_items, max_items), {}),
        # test the splitter by attribute
        ("Splitter by attribute",
         (input_items, max_items, split_attribute), {}),
        # test the splitter by attribute, forcing it to split
        ("Splitter by attribute (forcing to split)",
         (input_items, max_items, split_attribute), {'force_split': True}),
    ]

    for label, args, kwargs in scenarios:
        result = get_splitter(*args, **kwargs).split()
        assert len(result) == 1, (
            "The splitter returned %s files instead of 1" % len(result))
        # The context manager closes the file even when an assertion fails.
        with open(result[0], 'rb') as f:
            for item in get_input_item_flow(f):
                assert item.field2 == unicode_field, (
                    "%s didn't preserve unicode value: "
                    "got %s instead of %s"
                    % (label, item.field2, unicode_field))
def retrieve(self, shelves=None, auto_clean=False):
    """
    Yield, as a data flow, the items stored on the requested shelves.

    @param shelves: shelves to read the items from; when it is None,
                    every shelve listed in self.shelves is read
    @type shelves: List of filename
    @param auto_clean: when true, each shelve file is removed with
                       os.unlink once all of its items have been yielded
    @type auto_clean: bool
    """
    targets = self.shelves if shelves is None else shelves
    for path in targets:
        with open(path, 'r') as handle:
            for entry in get_input_item_flow(handle):
                yield entry
        # Reached only after the file above has been fully consumed
        # and closed.
        if auto_clean:
            os.unlink(path)
def test_chaining_splitters():
    """Test chaining different splitters.

    Twenty items are pushed through two chained splitters. In both
    scenarios every produced file must hold a single value of the second
    split attribute, and no item may be lost. The second scenario also
    checks that a (first attribute, second attribute) pair never shows up
    in more than one file.
    """
    first_split_attribute = 'field1'
    second_split_attribute = 'field2'
    input_items = [
        TestItem('ref%s' % (i % 5), 'value%s' % (i % 3))
        for i in range(1, 21)
    ]

    # chain two splitters without max_items
    # the first one won't do anything, so this is strictly
    # equivalent to having only the second one
    params_splitters = [
        {'split_attribute': first_split_attribute},
        {'split_attribute': second_split_attribute, 'force_split': True},
    ]
    result = chain_splitters(input_items, params_splitters)
    assert len(result) == 3, (
        "The splitter returned %s files instead of 3" % len(result))
    count = 0
    for filename in result:
        bucket_value = None
        # The context manager closes the file even when an assertion fails.
        with open(filename, 'rb') as f:
            for item in get_input_item_flow(f):
                value = getattr(item, second_split_attribute)
                # "is None" so that a falsy first value is still recorded.
                if bucket_value is None:
                    bucket_value = value
                assert value == bucket_value, (
                    "We got an item with %s for its split_attribute "
                    "in a file of items with %s" % (value, bucket_value))
                count += 1
    assert count == len(input_items), (
        "We got %s items after splitting, instead of the "
        "%s we had before" % (count, len(input_items)))

    # chain two splitters with first max_items < nb items
    params_splitters = [
        {'max_items': 8, 'split_attribute': first_split_attribute},
        {'split_attribute': second_split_attribute, 'force_split': True},
    ]
    result = chain_splitters(input_items, params_splitters)
    assert len(result) == 9, (
        "The splitter returned %s files instead of 9" % len(result))
    count = 0
    # Pairs found in the files already read; a set keeps the membership
    # test cheap.
    already_seen = set()
    for filename in result:
        bucket_value = None
        ref_values = set()
        with open(filename, 'rb') as f:
            for item in get_input_item_flow(f):
                value = getattr(item, second_split_attribute)
                if bucket_value is None:
                    bucket_value = value
                assert value == bucket_value, (
                    "We got an item with %s for its split_attribute in a "
                    "file of items with %s" % (value, bucket_value))
                ref_value = (getattr(item, first_split_attribute), value)
                ref_values.add(ref_value)
                assert ref_value not in already_seen, (
                    "The value %s should have been in a previous file"
                    % (ref_value, ))
                count += 1
        # Only record the pairs once the whole file has been read, so that
        # repeats inside one file are allowed.
        already_seen.update(ref_values)
    assert count == len(input_items), (
        "We got %s items after splitting, instead of the "
        "%s we had before" % (count, len(input_items)))
def test_chaining_splitters_single():
    """Make sure that chaining only one splitter doesn't change the behavior.

    Ten items are pushed through a chain made of a single splitter, once
    for each splitter configuration (simple, by attribute, by attribute
    with forced split; each with and without max_items). The number of
    files and their content are checked every time.
    """
    split_attribute = 'field1'
    input_items = [
        TestItem('ref%s' % (i % 5), 'value%s' % (i % 3))
        for i in range(1, 11)
    ]

    def check_file_count(result, expected):
        """Fail unless the splitter produced `expected` files."""
        assert len(result) == expected, (
            "The splitter returned %s files instead of %s"
            % (len(result), expected))

    def check_item_count(count):
        """Fail unless as many items were read back as were pushed in."""
        assert count == len(input_items), (
            "We got %s items after splitting, instead of the "
            "%s we had before" % (count, len(input_items)))

    def check_same_order(result):
        """Items read file after file must match the input order."""
        count = 0
        for filename in result:
            # The context manager closes the file even on failure.
            with open(filename, 'rb') as f:
                for item in get_input_item_flow(f):
                    assert item == input_items[count], (
                        "[File %s][Item %s] Got %s instead of %s"
                        % (f.name, count, item, input_items[count]))
                    count += 1
        check_item_count(count)

    def check_values_not_spread(result):
        """A split_attribute value must not appear in two files."""
        count = 0
        already_seen = set()
        for filename in result:
            ref_values = set()
            with open(filename, 'rb') as f:
                for item in get_input_item_flow(f):
                    ref_value = getattr(item, split_attribute)
                    ref_values.add(ref_value)
                    assert ref_value not in already_seen, (
                        "The value %s should have been in "
                        "a previous file" % (ref_value, ))
                    count += 1
            # Record the values only once the file is fully read, so that
            # repeats inside one file are allowed.
            already_seen.update(ref_values)
        check_item_count(count)

    def check_one_value_per_file(result):
        """Every file must hold a single split_attribute value."""
        count = 0
        for filename in result:
            bucket_value = None
            with open(filename, 'rb') as f:
                for item in get_input_item_flow(f):
                    value = getattr(item, split_attribute)
                    # "is None" so that a falsy first value is recorded.
                    if bucket_value is None:
                        bucket_value = value
                    assert value == bucket_value, (
                        "We got an item with %s for its split_attribute "
                        "in a file of items with %s"
                        % (value, bucket_value))
                    count += 1
        check_item_count(count)

    # simple splitter without max_items
    params_splitters = [{}]
    result = chain_splitters(input_items, params_splitters)
    check_file_count(result, 1)
    check_same_order(result)

    # simple splitter with max_items < nb items
    params_splitters = [{'max_items': 4}]
    result = chain_splitters(input_items, params_splitters)
    check_file_count(result, 3)
    check_same_order(result)

    # split by attribute without max_items
    params_splitters = [{'split_attribute': split_attribute}]
    result = chain_splitters(input_items, params_splitters)
    check_file_count(result, 1)
    check_values_not_spread(result)

    # split by attribute with max_items < nb items
    params_splitters = [{'max_items': 8, 'split_attribute': split_attribute}]
    result = chain_splitters(input_items, params_splitters)
    check_file_count(result, 2)
    check_values_not_spread(result)

    # split by attribute forcing split without max_items
    params_splitters = [{
        'split_attribute': split_attribute,
        'force_split': True,
    }]
    result = chain_splitters(input_items, params_splitters)
    check_file_count(result, 5)
    check_one_value_per_file(result)

    # split by attribute forcing split with max_items < nb items
    params_splitters = [{
        'max_items': 6,
        'split_attribute': split_attribute,
        'force_split': True,
    }]
    result = chain_splitters(input_items, params_splitters)
    check_file_count(result, 5)
    check_one_value_per_file(result)
def test_splitter_force_attr():
    """Test splitting by attribute (forcing split) in various scenarios.

    Covers an empty input, a single item and a list of five items, each
    with several max_items values. With force_split every produced file
    must hold a single value of the split attribute.
    """
    split_attribute = 'field1'

    def split(items, max_items):
        """Split `items` by attribute with force_split enabled."""
        return get_splitter(items, max_items, split_attribute,
                            force_split=True).split()

    def check_no_file(result):
        """Fail unless the splitter produced no file at all."""
        assert len(result) == 0, (
            "The splitter returned %s files instead of none" % len(result))

    def check_item_count(count, items):
        """Fail unless as many items were read back as were pushed in."""
        assert count == len(items), (
            "We got %s items after splitting, instead of the "
            "%s we had before" % (count, len(items)))

    def check_single_file(result, items):
        """One file is expected, holding `items` in their input order."""
        assert len(result) == 1, (
            "The splitter returned %s files instead of 1" % len(result))
        count = 0
        # The context manager closes the file even on failure.
        with open(result[0], 'rb') as f:
            for item in get_input_item_flow(f):
                # Actual value first, expected value second, to match
                # the wording of the message.
                assert item == items[count], (
                    "Got %s instead of %s" % (item, items[count]))
                count += 1
        check_item_count(count, items)

    def check_one_value_per_file(result, items):
        """Two files are expected, each with one split_attribute value."""
        assert len(result) == 2, (
            "The splitter returned %s files instead of 2" % len(result))
        count = 0
        for filename in result:
            bucket_value = None
            with open(filename, 'rb') as f:
                for item in get_input_item_flow(f):
                    value = getattr(item, split_attribute)
                    # "is None" so that a falsy first value is recorded.
                    if bucket_value is None:
                        bucket_value = value
                    assert value == bucket_value, (
                        "We got an item with %s for its split_attribute "
                        "in a file of items with %s"
                        % (value, bucket_value))
                    count += 1
        check_item_count(count, items)

    input_items = list()

    # empty input with no max_items
    check_no_file(split(input_items, 0))
    # empty input with max_items
    check_no_file(split(input_items, 1))

    input_items.append(TestItem('ref0', 'value0'))

    # only one item with no max_items
    check_single_file(split(input_items, 0), input_items)
    # only one item with max_items = 1
    check_single_file(split(input_items, 1), input_items)
    # only one item with max_items > 1
    check_single_file(split(input_items, 10), input_items)

    input_items.append(TestItem('ref0', 'value1'))
    input_items.extend([TestItem('ref1', 'value%s' % i) for i in range(3)])

    # list of items with no max_items
    check_one_value_per_file(split(input_items, 0), input_items)
    # list of items with max_items < nb items
    check_one_value_per_file(split(input_items, 3), input_items)
    # list of items with max_items = nb items
    check_one_value_per_file(split(input_items, 5), input_items)
    # list of items with max_items > nb items
    check_one_value_per_file(split(input_items, 10), input_items)
def test_splitter_attr():
    """Test splitting by attribute in various scenarios.

    Covers an empty input, a single item and a list of five items, each
    with several max_items values. A value of the split attribute must
    never be spread over more than one file.
    """
    split_attribute = 'field1'

    def split(items, max_items):
        """Split `items` by attribute, without forcing the split."""
        return get_splitter(items, max_items, split_attribute).split()

    def check_no_file(result):
        """Fail unless the splitter produced no file at all."""
        assert len(result) == 0, (
            "The splitter returned %s files instead of none" % len(result))

    def check_item_count(count, items):
        """Fail unless as many items were read back as were pushed in."""
        assert count == len(items), (
            "We got %s items after splitting, instead of the "
            "%s we had before" % (count, len(items)))

    def check_single_file(result, items):
        """One file is expected, holding `items` in their input order."""
        assert len(result) == 1, (
            "The splitter returned %s files instead of 1" % len(result))
        count = 0
        # The context manager closes the file even on failure.
        with open(result[0], 'rb') as f:
            for item in get_input_item_flow(f):
                # Actual value first, expected value second, to match
                # the wording of the message.
                assert item == items[count], (
                    "Got %s instead of %s" % (item, items[count]))
                count += 1
        check_item_count(count, items)

    def check_values_not_spread(result, expected_files, items):
        """`expected_files` files are expected, sharing no value."""
        assert len(result) == expected_files, (
            "The splitter returned %s files instead of %s"
            % (len(result), expected_files))
        count = 0
        already_seen = set()
        for filename in result:
            ref_values = set()
            with open(filename, 'rb') as f:
                for item in get_input_item_flow(f):
                    ref_value = getattr(item, split_attribute)
                    ref_values.add(ref_value)
                    assert ref_value not in already_seen, (
                        "The value %s should have been in a previous file"
                        % (ref_value, ))
                    count += 1
            # Record the values only once the file is fully read, so that
            # repeats inside one file are allowed.
            already_seen.update(ref_values)
        check_item_count(count, items)

    input_items = list()

    # empty input with no max_items
    check_no_file(split(input_items, 0))
    # empty input with max_items
    check_no_file(split(input_items, 1))

    input_items.append(TestItem('ref0', 'value0'))

    # only one item with no max_items
    check_single_file(split(input_items, 0), input_items)
    # only one item with max_items = 1
    check_single_file(split(input_items, 1), input_items)
    # only one item with max_items > 1
    check_single_file(split(input_items, 10), input_items)

    input_items.append(TestItem('ref0', 'value1'))
    input_items.extend([TestItem('ref1', 'value%s' % i) for i in range(3)])

    # list of items with no max_items
    check_values_not_spread(split(input_items, 0), 1, input_items)
    # list of items with max_items < nb items
    check_values_not_spread(split(input_items, 3), 2, input_items)
    # list of items with max_items = nb items
    check_values_not_spread(split(input_items, 5), 1, input_items)
    # list of items with max_items > nb items
    check_values_not_spread(split(input_items, 10), 1, input_items)