def transform_resource(self, resource):
    target = resource
    source = self.get("resource")
    field_names = self.get("fieldNames")
    ignore_fields = self.get("ignoreFields")
    sort_by_field = self.get("sortByField")
    if isinstance(source, str):
        source = target.package.get_resource(source)
    source.infer()
    view1 = target.to_petl()
    view2 = source.to_petl()

    # Ignore fields
    if ignore_fields:
        for field in source.schema.fields[len(target.schema.fields):]:
            target.schema.add_field(field)
        resource.data = petl.stack(view1, view2)

    # Default
    else:
        for field in source.schema.fields:
            if field.name not in target.schema.field_names:
                target.schema.add_field(field)
        if field_names:
            for field in list(target.schema.fields):
                if field.name not in field_names:
                    target.schema.remove_field(field.name)
        if sort_by_field:
            key = sort_by_field
            resource.data = petl.mergesort(view1, view2, key=key, header=field_names)
        else:
            resource.data = petl.cat(view1, view2, header=field_names)
def transform_resource(self, source, target):
    if isinstance(self.__resource, str):
        self.__resource = source.package.get_resource(self.__resource)
    self.__resource.infer(only_sample=True)
    view1 = source.to_petl()
    view2 = self.__resource.to_petl()

    # Ignore fields
    if self.__ignore_fields:
        target.data = petl.stack(view1, view2)
        for field in self.__resource.schema.fields[len(target.schema.fields):]:
            target.schema.add_field(field)

    # Default
    else:
        if self.__sort:
            target.data = petl.mergesort(view1, view2, key=self.__sort, header=self.__field_names)
        else:
            target.data = petl.cat(view1, view2, header=self.__field_names)
        for field in self.__resource.schema.fields:
            if field.name not in target.schema.field_names:
                target.schema.add_field(field)
        if self.__field_names:
            for field in list(target.schema.fields):
                if field.name not in self.__field_names:
                    target.schema.remove_field(field.name)
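Both transform_resource variants above hinge on the difference between petl.stack, petl.cat, and petl.mergesort. A minimal sketch of that difference on throwaway tables (the data and header values here are invented; the real steps operate on the framework's Resource views):

import petl

view1 = [['id', 'name'],
         [1, 'england'],
         [3, 'france']]
view2 = [['id', 'name', 'population'],
         [2, 'germany', 83],
         [4, 'italy', 59]]

# stack(): positional concatenation under the first table's header, no field matching
print(petl.look(petl.stack(view1, view2)))

# cat(): concatenation by field name; fields missing from a table are padded
print(petl.look(petl.cat(view1, view2, header=['id', 'name', 'population'])))

# mergesort(): like cat(), but the combined rows come out sorted by the key
print(petl.look(petl.mergesort(view1, view2, key='id', header=['id', 'name', 'population'])))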
def transform_xls(hires_and_promotions_excel, separations_excel,
                  exempt_roster_excel, output_file):
    hires_and_promotions = petl.io.xls \
        .fromxls(hires_and_promotions_excel, sheet='Data') \
        .rename(column_map_shared)

    separations = petl.io.xls \
        .fromxls(separations_excel, sheet='Data') \
        .rename({**column_map_shared, **column_map_separations})

    def dedup_separations(payroll_number, rows):
        rows_sorted = sorted(rows, key=lambda x: x['termination_date'])
        return rows_sorted[-1]

    separations_deduped = petl.rowreduce(
        separations, 'payroll_number', dedup_separations)

    exempt_roster = petl.io.xls \
        .fromxls(exempt_roster_excel, sheet='Data') \
        .rename(column_map_roster)

    merged = petl.mergesort(
        hires_and_promotions, separations_deduped, exempt_roster,
        key='payroll_number')

    def dedup_merged(payroll_number, rows):
        rows_sorted = sorted(rows, key=lambda x: x['latest_start_date'])
        if len(rows_sorted) == 1:
            return rows_sorted[-1]
        merged_row = []
        for i in range(0, len(rows_sorted[0]) - 1):
            if (rows_sorted[0][i] == '' or rows_sorted[0][i] is None) \
                    and rows_sorted[1][i] != '' and rows_sorted[1][i] is not None:
                merged_row.append(rows_sorted[1][i])
            elif (rows_sorted[1][i] == '' or rows_sorted[1][i] is None) \
                    and rows_sorted[0][i] != '' and rows_sorted[0][i] is not None:
                merged_row.append(rows_sorted[0][i])
            elif rows_sorted[0][i] == rows_sorted[1][i]:
                merged_row.append(rows_sorted[0][i])
            else:
                # take the latest value by start date
                merged_row.append(rows_sorted[1][i])
        return merged_row

    merged_deduped = petl.rowreduce(merged, 'payroll_number', dedup_merged)

    petl.tocsv(merged_deduped, source=output_file)
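The deduplication in transform_xls relies on petl.rowreduce, which groups rows by a key and lets a reducer collapse each group into one row. A small self-contained sketch of the same pattern, with invented table data and field names:

import petl

table = [['payroll_number', 'termination_date'],
         [101, '2019-03-01'],
         [101, '2020-07-15'],
         [102, '2018-11-30']]

def keep_latest(key, rows):
    # keep only the most recent termination_date for each payroll_number
    rows_sorted = sorted(rows, key=lambda r: r[1])
    return rows_sorted[-1]

deduped = petl.rowreduce(table, 'payroll_number', keep_latest,
                         header=['payroll_number', 'termination_date'])
print(petl.look(deduped))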
def createFacts(events, users):
    try:
        events_uid = etl.cutout(events, 'tracking_id', 'utm_medium', 'utm_campaign')
        events_tui = etl.cutout(events, 'user_id')

        stage_uid = etl.join(users, events_uid, key='user_id')
        stage_tui = etl.join(users, events_tui, key='tracking_id')
        stage_utm = etl.cut(stage_tui, 'user_id', 'utm_medium', 'utm_campaign')
        stage_uid_utm = etl.join(stage_uid, stage_utm, key='user_id')
        stage_m_s = etl.mergesort(stage_uid_utm, stage_tui, key=['created_at', 'email'])

        mappings = OrderedDict()
        mappings['tid'] = 'tracking_id'
        mappings['uid'] = 'user_id'
        mappings['utm_medium'] = 'utm_medium'
        mappings['utm_campaign'] = 'utm_campaign', {'audio': 'none', 'social': 'none'}
        mappings['utm_campaigntype'] = 'utm_campaign'
        mappings['email'] = 'email'
        mappings['subscription'] = 'type'
        mappings['sub_order'] = 'type', {'Signup Completed': '1',
                                         'Trial Started': '2',
                                         'Subscription Started': '3',
                                         'Subscription Ended': '4'}
        mappings['created_at'] = 'created_at'

        # Mapping
        stage_mapping = etl.fieldmap(stage_m_s, mappings)

        # Sort
        stage_mapping_ordered = etl.sort(stage_mapping, key=['created_at', 'email', 'sub_order'])

        # Datetime split
        t1 = etl.split(stage_mapping_ordered, 'created_at', 'T', ['date', 'time'], include_original=True)
        t2 = etl.split(t1, 'date', '-', ['year', 'month', 'day'])
        stage_ready = etl.split(t2, 'time', ':', ['hour', 'minute', 'second'])

        # Export as csv to load folder
        etl.tocsv(stage_ready, 'load/facts.csv')
    except Exception as e:
        print("Something went wrong. Error {0}".format(e))
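Two details of createFacts are easy to miss: etl.fieldmap accepts a mapping value of (source field, dict), which renames the field and translates its values in one go, and etl.split breaks one text field into several new fields. A short sketch of both on invented event rows:

import petl as etl
from collections import OrderedDict

events = [['type', 'created_at'],
          ['Trial Started', '2021-04-01T09:30:00'],
          ['Signup Completed', '2021-03-28T17:05:12']]

mappings = OrderedDict()
mappings['subscription'] = 'type'                       # plain rename
mappings['sub_order'] = 'type', {'Signup Completed': '1',
                                 'Trial Started': '2'}  # rename plus value translation
mappings['created_at'] = 'created_at'

stage = etl.fieldmap(events, mappings)

# split the ISO timestamp into separate date and time fields,
# then the date into year/month/day
stage = etl.split(stage, 'created_at', 'T', ['date', 'time'], include_original=True)
stage = etl.split(stage, 'date', '-', ['year', 'month', 'day'])
print(etl.look(stage))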
table1 = (('foo', 'bar'),
          ('A', 9),
          ('C', 2),
          ('D', 10),
          ('A', 6),
          ('F', 1))
table2 = (('foo', 'bar'),
          ('B', 3),
          ('D', 10),
          ('A', 10),
          ('F', 4))

from petl import mergesort, look
look(table1)
look(table2)
table3 = mergesort(table1, table2, key='foo')
look(table3)

# mergesort - heterogeneous tables
table4 = (('foo', 'bar'),
          ('A', 9),
          ('C', 2),
          ('D', 10),
          ('A', 6),
          ('F', 1))
table5 = (('foo', 'baz'),
          ('B', 3),
          ('D', 10),
          # the original snippet breaks off here; the remaining rows and the
          # merge call are assumed to mirror the homogeneous example above
          ('A', 10),
          ('F', 4))
table6 = mergesort(table4, table5, key='foo')
look(table6)
import petl as etl

table1 = [['foo', 'bar'],
          ['C', 2],
          ['A', 9],
          ['A', 6],
          ['F', 1],
          ['D', 10]]
table2 = etl.sort(table1, 'foo')
table2
# sorting by compound key is supported
table3 = etl.sort(table1, key=['foo', 'bar'])
table3
# if no key is specified, the default is a lexical sort
table4 = etl.sort(table1)
table4


# mergesort()
#############

import petl as etl

table1 = [['foo', 'bar'],
          ['A', 9],
          ['C', 2],
          ['D', 10],
          ['A', 6],
          ['F', 1]]
table2 = [['foo', 'bar'],
          ['B', 3],
          ['D', 10],
          ['A', 10],
          ['F', 4]]
table3 = etl.mergesort(table1, table2, key='foo')
table3.lookall()


# issorted()
############

import petl as etl

table1 = [['foo', 'bar', 'baz'],
          ['a', 1, True],
          ['b', 3, True],
          ['b', 2]]
etl.issorted(table1, key='foo')
etl.issorted(table1, key='bar')
etl.issorted(table1, key='foo', strict=True)
etl.issorted(table1, key='foo', reverse=True)
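When the inputs to mergesort() are already ordered by the key, passing presorted=True skips the per-table sort and performs only the streaming merge. A small variation on the tables above (presorted=True is an optimisation hint; it assumes both inputs really are sorted by the key):

import petl as etl

table1 = [['foo', 'bar'],
          ['A', 6],
          ['A', 9],
          ['C', 2],
          ['D', 10],
          ['F', 1]]
table2 = [['foo', 'bar'],
          ['B', 3],
          ['A', 10],
          ['D', 10],
          ['F', 4]]

# table1 is already ordered by 'foo'; sort table2 first, then merge without re-sorting
table2_sorted = etl.sort(table2, key='foo')
table3 = etl.mergesort(table1, table2_sorted, key='foo', presorted=True)
table3.lookall()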
mappedTable = etl.fieldmap(dataTable, mappings)
cleansedTable = mappedTable

# add rules to clean the table - reversed to give priority to the top attributes
for x in reversed(range(length)):
    attr = data['attibutes'][x]['attrName']
    rules = data['attibutes'][x]['rules']
    rulesListSize = len(rules)
    for y in range(rulesListSize):
        if rules[y] == "Remove Null Value Rows":
            cleansedTable = etl.select(cleansedTable, attr, lambda v: v != '')
        if rules[y] == "Remove Duplicates":
            cleansedTable = etl.aggregate(cleansedTable, attr)
        if rules[y] == "Sort":
            cleansedTable = etl.mergesort(cleansedTable, key=attr)
        if rules[y] == "Number Validation":
            cleansedTable = etl.select(cleansedTable, attr)
        if rules[y] == "Fill Missing Values":
            cleansedTable = etl.filldown(cleansedTable, attr)

etl.tocsv(cleansedTable, 'src/etl/outputs/cleansed.csv')

# Create rawData table
dataTable = cleansedTable
rawDataTable = cleansedTable

reasonUniqueValues = etl.aggregate(dataTable, dataTable[0][20])

mappings = OrderedDict()
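Some of the rule-to-petl mappings above are indirect; for instance "Remove Duplicates" is implemented with etl.aggregate and "Number Validation" with a bare etl.select. As a point of comparison only (not the original project's code, and with invented field names), the same rules could be expressed with more direct petl calls:

import petl as etl

table = [['reason', 'amount'],
         ['late', 10],
         ['late', 10],
         ['', 5],
         ['absent', None]]

cleaned = etl.select(table, 'reason', lambda v: v != '')      # Remove Null Value Rows
cleaned = etl.distinct(cleaned, key='reason')                 # Remove Duplicates
cleaned = etl.sort(cleaned, key='reason')                     # Sort
cleaned = etl.select(cleaned, 'amount',
                     lambda v: isinstance(v, (int, float)))   # Number Validation
cleaned = etl.filldown(cleaned, 'amount')                     # Fill Missing Values
print(etl.look(cleaned))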
# Facts
# This facts table will be the staging table with all the info needed to quickly
# update the dimension keys and load into the facts table.
# It will have columns matching each column on the dim Time table, to make it
# easier to get the reference key.

events_uid = etl.cutout(events, 'tracking_id', 'utm_medium', 'utm_campaign')
events_tui = etl.cutout(events, 'user_id')

stage_uid = etl.join(users, events_uid, key='user_id')
stage_tui = etl.join(users, events_tui, key='tracking_id')
stage_utm = etl.cut(stage_tui, 'user_id', 'utm_medium', 'utm_campaign')
stage_uid_utm = etl.join(stage_uid, stage_utm, key='user_id')
stage_m_s = etl.mergesort(stage_uid_utm, stage_tui, key=['created_at', 'email'])

# Mapping definitions
mappings = OrderedDict()
mappings['tid'] = 'tracking_id'
mappings['uid'] = 'user_id'
mappings['utm_medium'] = 'utm_medium'
mappings['utm_campaign'] = 'utm_campaign', {'audio': 'none', 'social': 'none'}
mappings['utm_campaign_type'] = 'utm_campaign'
mappings['email'] = 'email'
mappings['subscription'] = 'type'
mappings['sub_order'] = 'type', {
    'Signup Completed': '1',
    'Trial Started': '2',
    'Subscription Started': '3',
    'Subscription Ended': '4'}  # final entry assumed from the createFacts version above
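Both this snippet and createFacts above stitch events back to users twice, once by user_id and once by tracking_id, using etl.cutout to drop whichever column is not the join key in each branch. A self-contained sketch of that double-join pattern with invented rows:

import petl as etl

users = [['user_id', 'tracking_id', 'email'],
         ['u1', 't-100', 'a@example.com'],
         ['u2', 't-200', 'b@example.com']]

events = [['user_id', 'tracking_id', 'type', 'utm_medium', 'utm_campaign'],
          ['u1', 't-100', 'Signup Completed', 'social', 'spring'],
          ['', 't-200', 'Trial Started', 'audio', 'spring']]

# drop the column that is not the join key in each branch, then join twice so
# events carrying only a tracking_id are still matched to a user
events_uid = etl.cutout(events, 'tracking_id', 'utm_medium', 'utm_campaign')
events_tui = etl.cutout(events, 'user_id')

stage_uid = etl.join(users, events_uid, key='user_id')
stage_tui = etl.join(users, events_tui, key='tracking_id')

# mergesort keeps the combined stream ordered by email for later grouping
stage = etl.mergesort(stage_uid, stage_tui, key='email')
print(etl.look(stage))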