def test_dedupe(self):
    """Dedupe records using the ``id`` field as the key.

    Only the first appearance of each key survives: the third input record
    repeats ``id == 'A'`` and is expected to be absent from the output.
    """
    dedupe_args = Transform.args_from_dict({
        'operation': 'dedupe',
        'field': 'id',
    })
    records = [
        {'name': 'Office_A', 'id': 'A', 'description': 'Office'},
        {'name': 'Office_B', 'id': 'B', 'description': 'Office'},
        # Duplicate of the first record (same id).
        {'name': 'Office_A', 'id': 'A', 'description': 'Office'},
    ]

    actual = list(Transform().run(records, dedupe_args))

    expected = [
        {'name': 'Office_A', 'id': 'A', 'description': 'Office'},
        {'name': 'Office_B', 'id': 'B', 'description': 'Office'},
    ]
    self.assertEqual(actual, expected)
def test_fixed_value(self):
    """Set a constant value on every record.

    The ``field`` argument names the key to write (``chain``) and the
    ``target`` argument carries the value to store (``'USPS'``).
    """
    fixed_args = Transform.args_from_dict({
        'operation': 'fixedvalue',
        'field': 'chain',
        'target': 'USPS',
    })
    records = [
        {'name': 'Office A', 'description': 'Headquarter'},
        {'name': 'Office B', 'description': 'Office'},
    ]

    actual = list(Transform().run(records, fixed_args))

    expected = [
        {'name': 'Office A', 'chain': 'USPS', 'description': 'Headquarter'},
        {'name': 'Office B', 'chain': 'USPS', 'description': 'Office'},
    ]
    self.assertEqual(actual, expected)
def test_plaintext(self):
    """Convert the ``id`` field to its plain-text form, in place.

    Exercised here: letters are lowercased and the space is replaced by an
    underscore (``'Office A'`` -> ``'office_a'``); other fields are left
    untouched.

    NOTE(review): the operation is also described as replacing hyphens
    with ``_`` and stripping other special characters; neither is covered
    by this dataset -- confirm against the implementation.
    """
    plaintext_args = Transform.args_from_dict({
        'operation': 'plaintext',
        'field': 'id',
    })
    records = [
        {'name': 'Office A', 'id': 'Office A', 'description': 'Office'},
        {'name': 'Office B', 'id': 'Office B', 'description': 'Office'},
    ]

    actual = list(Transform().run(records, plaintext_args))

    expected = [
        {'name': 'Office A', 'id': 'office_a', 'description': 'Office'},
        {'name': 'Office B', 'id': 'office_b', 'description': 'Office'},
    ]
    self.assertEqual(actual, expected)
def test_template(self):
    """Fill the target field from a ``str.format()``-style template.

    The template is passed in the ``field`` argument and references other
    keys of the record; the rendered text is stored under ``target``
    (``name``), leaving the source fields in place.
    """
    template_args = Transform.args_from_dict({
        'operation': 'template',
        'field': '{description}_{id}',
        'target': 'name',
    })
    records = [
        {'id': 'A', 'description': 'Office'},
        {'id': 'B', 'description': 'Office'},
    ]

    actual = list(Transform().run(records, template_args))

    expected = [
        {'name': 'Office_A', 'description': 'Office', 'id': 'A'},
        {'name': 'Office_B', 'description': 'Office', 'id': 'B'},
    ]
    self.assertEqual(actual, expected)
def test_extract_ii(self):
    """Extract several regex groups and store them joined together.

    The pattern has two capture groups; their matches from ``name`` are
    concatenated with no separator and written to ``id``
    (``'Office_A'`` -> ``'OffA'``). The ``I`` flag makes the match
    case-insensitive.
    """
    extract_args = Transform.args_from_dict({
        'operation': 'extract',
        'field': 'name',
        'target': 'id',
        'regex': r'(off)ice_(\w+)',
        'regex_flags': ['I'],
    })
    records = [
        {'name': 'Office_A', 'description': 'Headquarter'},
        {'name': 'Office_B', 'description': 'Office'},
    ]

    actual = list(Transform().run(records, extract_args))

    expected = [
        {'name': 'Office_A', 'description': 'Headquarter', 'id': 'OffA'},
        {'name': 'Office_B', 'description': 'Office', 'id': 'OffB'},
    ]
    self.assertEqual(actual, expected)
def test_extract(self):
    """Extract a single regex group from ``name`` into ``id``.

    Also checks that ``regex_flags`` is honoured: the lowercase pattern
    only matches the capitalised names because of the ``I`` flag.
    """
    extract_args = Transform.args_from_dict({
        'operation': 'extract',
        'field': 'name',
        'target': 'id',
        'regex': r'office_(\w+)',
        'regex_flags': ['I'],
    })
    records = [
        {'name': 'Office_A', 'description': 'Headquarter'},
        {'name': 'Office_B', 'description': 'Office'},
    ]

    actual = list(Transform().run(records, extract_args))

    expected = [
        {'name': 'Office_A', 'description': 'Headquarter', 'id': 'A'},
        {'name': 'Office_B', 'description': 'Office', 'id': 'B'},
    ]
    self.assertEqual(actual, expected)
class MyTransform(Transform):
    """``Transform`` subclass that declares a two-step pipeline.

    ``PIPELINE`` maps the key ``'usps.com'`` to an ordered list of operation
    arguments: dedupe on ``id`` first, then remove the ``id`` field.

    NOTE(review): how ``Transform`` looks up and runs ``PIPELINE`` entries
    is not visible from here -- confirm against the base class.
    """

    PIPELINE = {
        'usps.com': [
            # Step 1: drop records whose id was already seen.
            Transform.args_from_dict({
                'operation': 'dedupe',
                'field': 'id',
            }),
            # Step 2: strip the id field from the remaining records.
            Transform.args_from_dict({
                'operation': 'remove_fields',
                'field': 'id',
            }),
        ],
    }
def test_filter_regex(self):
    """Drop records whose field does not match the given regex.

    The pattern requires digits after ``office_``, so ``'Office_A'`` is
    filtered out and ``'Office_1'`` is kept. ``regex_flags`` is accepted
    too; the ``I`` flag makes the match case-insensitive.
    """
    filter_args = Transform.args_from_dict({
        'operation': 'filter_regex',
        'field': 'name',
        'regex': r'office_\d+',
        'regex_flags': ['I'],
    })
    records = [
        {'name': 'Office_A'},
        {'name': 'Office_1'},
    ]

    actual = list(Transform().run(records, filter_args))

    expected = [{'name': 'Office_1'}]
    self.assertEqual(actual, expected)
def test_filter_not_exists(self):
    """Drop records that do not have the given field.

    The second record has no ``description`` key and must not appear in
    the output.
    """
    filter_args = Transform.args_from_dict({
        'operation': 'filter_not_exists',
        'field': 'description',
    })
    records = [
        {'name': 'Office_A', 'description': 'Headquarter'},
        {'name': 'Office_B'},
    ]

    actual = list(Transform().run(records, filter_args))

    expected = [{'name': 'Office_A', 'description': 'Headquarter'}]
    self.assertEqual(actual, expected)
def test_cross_filter(self):
    """Drop records whose field does not match the value of another field.

    ``field`` (``name``) is checked against the record's own ``target``
    value (``description``). Here ``'Office_B'`` / ``'Office'`` is kept and
    ``'Office_A'`` / ``'Headquarter'`` is dropped.

    NOTE(review): the exact matching rule (substring vs. regex) is not
    visible from this test -- confirm against the implementation.
    """
    cross_args = Transform.args_from_dict({
        'operation': 'cross_filter',
        'field': 'name',
        'target': 'description',
    })
    records = [
        {'name': 'Office_A', 'description': 'Headquarter'},
        {'name': 'Office_B', 'description': 'Office'},
    ]

    actual = list(Transform().run(records, cross_args))

    expected = [{'name': 'Office_B', 'description': 'Office'}]
    self.assertEqual(actual, expected)
def test_remove_fields_list(self):
    """Remove several fields at once from every record.

    ``field`` is given as a list of keys; both ``description`` and ``id``
    are expected to be gone, leaving only ``name``.
    """
    remove_args = Transform.args_from_dict({
        'operation': 'remove_fields',
        'field': ['description', 'id'],
    })
    records = [
        {'name': 'Office_A', 'id': 'A', 'description': 'Office'},
        {'name': 'Office_B', 'id': 'B', 'description': 'Office'},
    ]

    actual = list(Transform().run(records, remove_args))

    expected = [
        {'name': 'Office_A'},
        {'name': 'Office_B'},
    ]
    self.assertEqual(actual, expected)
def test_rename_field(self):
    """Rename the given field to the target name.

    Each record's ``description`` key is expected to reappear as ``title``
    with its value unchanged.
    """
    rename_args = Transform.args_from_dict({
        'operation': 'rename_field',
        'field': 'description',
        'target': 'title',
    })
    records = [
        {'name': 'Office_A', 'description': 'Headquarter'},
        {'name': 'Office_B', 'description': 'Office'},
    ]

    actual = list(Transform().run(records, rename_args))

    expected = [
        {'name': 'Office_A', 'title': 'Headquarter'},
        {'name': 'Office_B', 'title': 'Office'},
    ]
    self.assertEqual(actual, expected)
def test_function(self):
    """Apply a user-supplied function to each record.

    The function is provided in the ``field`` argument (here
    ``dict_to_text``, defined outside this test). The expected output shows
    the ``open_hours`` dict turned into ``'key: value'`` pairs joined by
    the ``separator`` argument.

    NOTE(review): the operation is described as taking an absolute Python
    path, and the function as receiving a record plus the operation
    arguments object and returning the modified record; neither is visible
    from this test -- confirm against the implementation.
    """
    function_args = Transform.args_from_dict({
        'operation': 'function',
        'field': dict_to_text,
        'target': 'open_hours',
        'separator': ', ',
    })
    records = [
        {
            'name': 'Office A',
            'open_hours': {
                'Monday': '9:00-18:00',
                'Tuesday': '8:00-18:00',
            },
        },
        {
            'name': 'Office B',
            'open_hours': {
                'Monday-Friday': '8:00-20:00',
                'Saturday': '10:00-18:00',
            },
        },
    ]

    actual = list(Transform().run(records, function_args))

    expected = [
        {
            'name': 'Office A',
            'open_hours': 'Monday: 9:00-18:00, Tuesday: 8:00-18:00',
        },
        {
            'name': 'Office B',
            'open_hours': 'Monday-Friday: 8:00-20:00, Saturday: 10:00-18:00',
        },
    ]
    self.assertEqual(actual, expected)