def transform_xls(hires_and_promotions_excel, separations_excel,
                  exempt_roster_excel, output_file):
    """Merge the hires/promotions, separations, and exempt-roster Excel
    workbooks into one CSV, deduplicated on ``payroll_number``.

    Each workbook is read from its ``Data`` sheet and renamed via the
    module-level column maps. Separations are collapsed to the latest
    record per employee; after merging all three sources, duplicate
    payroll numbers are collapsed column-by-column, with the row having
    the latest ``latest_start_date`` winning for any conflicting value.

    `Args:`
        hires_and_promotions_excel: path/source of the hires & promotions workbook
        separations_excel: path/source of the separations workbook
        exempt_roster_excel: path/source of the exempt roster workbook
        output_file: destination for the merged CSV
    """
    hires_and_promotions = petl.io.xls \
        .fromxls(hires_and_promotions_excel, sheet='Data') \
        .rename(column_map_shared)

    separations = petl.io.xls \
        .fromxls(separations_excel, sheet='Data') \
        .rename({**column_map_shared, **column_map_separations})

    def dedup_separations(payroll_number, rows):
        # Keep only the most recent separation record for the employee.
        return sorted(rows, key=lambda x: x['termination_date'])[-1]

    separations_deduped = petl.rowreduce(
        separations, 'payroll_number', dedup_separations)

    exempt_roster = petl.io.xls \
        .fromxls(exempt_roster_excel, sheet='Data') \
        .rename(column_map_roster)

    merged = petl.mergesort(
        hires_and_promotions, separations_deduped, exempt_roster,
        key='payroll_number')

    def dedup_merged(payroll_number, rows):
        rows_sorted = sorted(rows, key=lambda x: x['latest_start_date'])
        if len(rows_sorted) == 1:
            return rows_sorted[0]
        # Overlay rows from earliest to latest start date: a later row's
        # non-empty value wins; empty cells fall back to earlier values.
        # BUG FIX: the original iterated range(len(row) - 1), silently
        # dropping the last column of every merged row, and only ever
        # compared the first two rows of a group, ignoring any others.
        merged_row = list(rows_sorted[0])
        for row in rows_sorted[1:]:
            for i, value in enumerate(row):
                if value != '' and value is not None:
                    merged_row[i] = value
        return merged_row

    merged_deduped = petl.rowreduce(merged, 'payroll_number', dedup_merged)

    petl.tocsv(merged_deduped, source=output_file)
# --- rowreduce example -------------------------------------------------
table1 = [['foo', 'bar'],
          ['a', 3],
          ['a', 7],
          ['b', 2],
          ['b', 1],
          ['b', 9],
          ['c', 4]]
from petl import rowreduce, look
look(table1)

def sumbar(key, rows):
    """Reduce a group of rows to [key, total of the 'bar' column]."""
    total = 0
    for row in rows:
        total += row[1]
    return [key, total]

table2 = rowreduce(table1, key='foo', reducer=sumbar, fields=['foo', 'barsum'])
look(table2)

# --- recordreduce example ----------------------------------------------
table1 = [['foo', 'bar'],
          ['a', 3],
          ['a', 7],
          ['b', 2],
          ['b', 1],
          ['b', 9],
          ['c', 4]]
from petl import recordreduce, look
look(table1)
def reduce_rows(self, columns, reduce_func, headers, presorted=False,
                **kwargs):
    """
    Group rows by a column or columns, then reduce each group to a single
    row. Based on the
    `rowreduce petl function <https://petl.readthedocs.io/en/stable/transform.html#petl.transform.reductions.rowreduce>`_.

    For example, the output from the query to get a table's definition is
    returned as one component per row. The `reduce_rows` method can be used
    to reduce all those to a single row containing the entire query.

    .. code-block:: python

        >>> ddl = rs.query(sql_to_get_table_ddl)
        >>> ddl.table

        +--------------+--------------+----------------------------------------------------+
        | schemaname   | tablename    | ddl                                                |
        +==============+==============+====================================================+
        | 'db_scratch' | 'state_fips' | '--DROP TABLE db_scratch.state_fips;'              |
        +--------------+--------------+----------------------------------------------------+
        | 'db_scratch' | 'state_fips' | 'CREATE TABLE IF NOT EXISTS db_scratch.state_fips' |
        +--------------+--------------+----------------------------------------------------+
        | 'db_scratch' | 'state_fips' | '('                                                |
        +--------------+--------------+----------------------------------------------------+
        | 'db_scratch' | 'state_fips' | '\\tstate VARCHAR(1024)   ENCODE RAW'              |
        +--------------+--------------+----------------------------------------------------+
        | 'db_scratch' | 'state_fips' | '\\t,stusab VARCHAR(1024)   ENCODE RAW'            |
        +--------------+--------------+----------------------------------------------------+

        >>> reducer_fn = lambda columns, rows: [
        ...     f"{columns[0]}.{columns[1]}",
        ...     '\\n'.join([row[2] for row in rows])]
        >>> ddl.reduce_rows(
        ...     ['schemaname', 'tablename'],
        ...     reducer_fn,
        ...     ['tablename', 'ddl'],
        ...     presorted=True)
        >>> ddl.table

        +-------------------------+-----------------------------------------------------------------------+
        | tablename               | ddl                                                                   |
        +=========================+=======================================================================+
        | 'db_scratch.state_fips' | '--DROP TABLE db_scratch.state_fips;\\nCREATE TABLE IF NOT EXISTS     |
        |                         | db_scratch.state_fips\\n(\\n\\tstate VARCHAR(1024)   ENCODE RAW\\n\\t |
        |                         | ,db_scratch.state_fips\\n(\\n\\tstate VARCHAR(1024)   ENCODE RAW      |
        |                         | \\n\\t,stusab VARCHAR(1024)   ENCODE RAW\\n\\t,state_name             |
        |                         | VARCHAR(1024)   ENCODE RAW\\n\\t,statens VARCHAR(1024)   ENCODE       |
        |                         | RAW\\n)\\nDISTSTYLE EVEN\\n;'                                         |
        +-------------------------+-----------------------------------------------------------------------+

    `Args:`
        columns: list
            The column(s) by which to group the rows.
        reduce_func: fun
            The function by which to reduce the rows. Should take the 2
            arguments, the columns list and the rows list and return a list.
            `reducer(columns: list, rows: list) -> list;`
        headers: list
            The list of headers for modified table. The length of `headers`
            should match the length of the list returned by the reduce
            function.
        presorted: bool
            If false, the row will be sorted.

    `Returns:`
        `Parsons Table` and also updates self
    """  # noqa: E501,E261
    # Delegate the grouping/reduction to petl, then swap in the result.
    reduced = petl.rowreduce(
        self.table,
        columns,
        reduce_func,
        header=headers,
        presorted=presorted,
        **kwargs,
    )
    self.table = reduced
    return self
def reducer_execute(c, **kwargs):
    """Materialize the table from callable ``c``, optionally append fields,
    then apply ``etl.rowreduce`` using only the reduce-related kwargs
    (``key``, ``reducer``, ``header``)."""
    table = c()
    if 'addfields' in kwargs:
        table = etl.addfields(table, kwargs['addfields'])
    reduce_kwargs = filter_keys(kwargs, ("key", "reducer", "header"))
    return etl.rowreduce(table, **reduce_kwargs)