def get_aggregate_data(self, aggregation_keys: Sequence) -> etl.Table:
    agg = OrderedDict()
    agg['count'] = len
    return etl.aggregate(
        self.get_table(),
        key=aggregation_keys if len(aggregation_keys) > 1 else aggregation_keys[0],
        aggregation=agg,
    ).convert('count', lambda v: str(v))

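# A minimal sketch (invented table, not from the original source) of the
# behaviour get_aggregate_data relies on: petl.aggregate accepts either a
# single key field name or a tuple of names, and len as a whole-row
# aggregation counts the rows in each group.
import petl as etl
from collections import OrderedDict

example = [['dept', 'city', 'name'],
           ['sales', 'NYC', 'ann'],
           ['sales', 'NYC', 'bob'],
           ['ops', 'LA', 'cam']]
agg = OrderedDict()
agg['count'] = len
print(etl.aggregate(example, key='dept', aggregation=agg))            # single field
print(etl.aggregate(example, key=('dept', 'city'), aggregation=agg))  # composite key
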
def load_grouped_data(csv_file, fields) -> TableData:
    table = etl.fromcsv(csv_file)
    # petl expects a bare field name rather than a 1-tuple for a single key
    if len(fields) == 1:
        fields = fields[0]
    return TableData(
        header=etl.header(table),
        data=etl.aggregate(table, key=fields, aggregation=len),
        next_limit=None,
    )

def aggregate_characters_table(csv_path, filters):
    if len(filters) == 1:
        filters = filters[0]
    characters_table, headers, total_characters = get_characters_table(
        csv_path)
    data = etl.data(
        etl.aggregate(characters_table, key=filters, aggregation=len))
    return headers, data, total_characters

def xls_tidy(xls, qvalue):
    d = etl.fromtsv(xls)
    sd = etl.select(d, lambda x: float(x.PepQValue) <= float(qvalue))
    psmsummary = sd

    ssd = etl.cut(sd, 'Peptide', 'Protein', 'PepQValue')
    # remove the modification info from the peptide sequence
    ssd = etl.transform.regex.sub(ssd, 'Peptide', r'^[\w-]\.(.+)\.[\w-]$', r'\1')
    ssd = etl.transform.regex.sub(ssd, 'Peptide', r'[\d\.\+]+', r'')

    # spectral count per peptide
    aggregation = OrderedDict()
    aggregation['SpecCount'] = len
    cssd = etl.aggregate(ssd, 'Peptide', aggregation)

    # collapse to one row per (Peptide, Protein, PepQValue), then join the
    # protein and q-value columns per peptide
    fssd = etl.groupselectfirst(ssd, key=('Peptide', 'Protein', 'PepQValue'))
    aggregation = OrderedDict()
    aggregation['Protein'] = 'Protein', etl.strjoin(';')
    aggregation['PepQValue'] = 'PepQValue', etl.strjoin(';')
    assd = etl.aggregate(fssd, 'Peptide', aggregation)

    pepsummary = etl.join(assd, cssd, key='Peptide')
    return (psmsummary, pepsummary)

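# A hedged sketch of the two-pass pattern above, on made-up rows: one
# aggregate counts spectra per peptide, another collapses the Protein column
# with strjoin, and a join stitches the two summaries back together.
import petl as etl
from collections import OrderedDict

toy = [['Peptide', 'Protein', 'PepQValue'],
       ['PEPTIDE', 'P1', '0.01'],
       ['PEPTIDE', 'P2', '0.02'],
       ['OTHER', 'P3', '0.00']]
counts = OrderedDict()
counts['SpecCount'] = len
joins = OrderedDict()
joins['Protein'] = 'Protein', etl.strjoin(';')
summary = etl.join(etl.aggregate(toy, 'Peptide', joins),
                   etl.aggregate(toy, 'Peptide', counts),
                   key='Peptide')
print(summary)
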
def group_entries_by_day(inp):
    hdr = petl.header(inp)
    agg = OrderedDict()
    for field in hdr:
        # using first found value
        agg[field] = field, next
    agg['dur'] = 'dur', lambda durs: sum(durs, timedelta())
    agg['start'] = 'start', min
    with_day = petl.addfield(inp, 'start_date',
                             lambda row: row.get('start').date())
    index_keys = ('start_date', 'description')
    result = petl.aggregate(with_day, index_keys, agg)
    return petl.cutout(result, 'start_date')

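# A minimal sketch (invented rows) of the (source_field, function) spec used
# above: each output field maps to a 2-tuple of an input field and a function
# applied to that field's values within each group, so custom reducers like
# the timedelta sum drop straight in.
import petl
from collections import OrderedDict
from datetime import timedelta

entries = [['description', 'dur'],
           ['standup', timedelta(minutes=15)],
           ['standup', timedelta(minutes=10)],
           ['review', timedelta(minutes=30)]]
spec = OrderedDict()
spec['dur'] = 'dur', lambda durs: sum(durs, timedelta())
print(petl.aggregate(entries, 'description', spec))
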
def get_context_data(self, **kwargs):
    context = super().get_context_data(**kwargs)
    context["filename"] = self.object.downloaded_file.name.split(os.path.sep)[-1]
    context["columns_query_kwarg"] = self.columns_query_kwarg
    table = petl.fromcsv(self.object.downloaded_file)
    full_table_header = list(petl.header(table))
    context["column_options"] = full_table_header
    selected_columns = [
        c for c in self.request.GET.getlist(self.columns_query_kwarg)
        if c in full_table_header
    ]
    context["selected_columns"] = selected_columns
    if selected_columns:
        context["header"] = selected_columns + ["Count"]
        context["rows"] = petl.records(
            petl.aggregate(
                table,
                selected_columns[0] if len(selected_columns) == 1 else selected_columns,
                len,
            )
        )
    return context

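# A quick sketch (toy table, hypothetical field names) of what the view above
# hands to the template: petl.records yields one row object per aggregated
# group, with the group count in a field named 'value'.
import petl

toy = [['colour', 'n'], ['red', 1], ['red', 2], ['blue', 3]]
for rec in petl.records(petl.aggregate(toy, 'colour', len)):
    print(rec.colour, rec.value)  # group key plus row count
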
table3 = merge(table1, table2, key='bar')
look(table3)


# aggregate

table1 = [['foo', 'bar', 'baz'],
          ['a', 3, True],
          ['a', 7, False],
          ['b', 2, True],
          ['b', 2, False],
          ['b', 9, False],
          ['c', 4, True]]

from petl import aggregate, look
look(table1)

# aggregate whole rows
table2 = aggregate(table1, 'foo', len)
look(table2)

# aggregate single field
table3 = aggregate(table1, 'foo', sum, 'bar')
look(table3)

# alternative signature for single field aggregation using keyword args
table4 = aggregate(table1, key=('foo', 'bar'), aggregation=list,
                   value=('bar', 'baz'))
look(table4)

# aggregate multiple fields
from collections import OrderedDict
from petl import strjoin

aggregation = OrderedDict()
aggregation['count'] = len
aggregation['minbar'] = 'bar', min
aggregation['maxbar'] = 'bar', max
aggregation['sumbar'] = 'bar', sum

          ['b', 1],
          ['b', 9],
          ['c', 4],
          ['d', 3],
          ['d'],
          ['e']]

from petl import aggregate, look
look(table1)

from collections import OrderedDict
aggregators = OrderedDict()
aggregators['minbar'] = 'bar', min
aggregators['maxbar'] = 'bar', max
aggregators['sumbar'] = 'bar', sum
aggregators['listbar'] = 'bar', list
table2 = aggregate(table1, 'foo', aggregators)
look(table2)

# aggregation functions can also be added and/or updated using the suffix
# notation on the returned table object, e.g.::
table3 = aggregate(table1, 'foo')
table3['minbar'] = 'bar', min
table3['maxbar'] = 'bar', max
table3['sumbar'] = 'bar', sum
table3['listbar'] = 'bar'  # default aggregation is list
look(table3)


# rangerowreduce

table1 = [['foo', 'bar'],
          ['a', 3],

try:
    # SQLAlchemy URL format: postgresql://user:password@host:port/dbname
    conn_target = create_engine(
        'postgresql://username:password@hostname:5432/dbname')
except Exception:
    logger.error(
        "ERROR: Unexpected error: Could not connect to PostgreSQL instance.")
    sys.exit()
logger.info("SUCCESS: Connection to RDS PostgreSQL instance succeeded")

# Source
table = etl.fromdb(
    conn,
    """select res_company.name, sum(product_qty) as qty,
              sum(price_total) as total
       from report_pos_order
       inner join res_company
               on res_company.id = report_pos_order.company_id
       where date(report_pos_order.date AT TIME ZONE 'GMT +7') = current_date
       group by res_company.name
       order by sum(price_total) desc""")

# Transformation: grouping with aggregation
aggregation = OrderedDict()
aggregation['qty'] = 'qty', sum
aggregation['total'] = 'total', sum
table1 = etl.aggregate(table, 'name', aggregation)
dfsum = etl.todataframe(table1)

# Target
dfsum.to_sql('GMV Warung', conn_target, if_exists='replace', index=None)

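# A hedged sketch of just the transformation step, with fabricated rows: both
# output fields reuse their source field names, so the aggregate keeps the
# original header while collapsing to one row per company name.
import petl as etl
from collections import OrderedDict

sales = [['name', 'qty', 'total'],
         ['Warung A', 2, 50.0],
         ['Warung A', 1, 25.0],
         ['Warung B', 5, 80.0]]
agg = OrderedDict()
agg['qty'] = 'qty', sum
agg['total'] = 'total', sum
print(etl.aggregate(sales, 'name', agg))  # Warung A -> qty 3, total 75.0
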
print("FACT TABLE:") print(fact) #OLAP ----> MIN & MAX mins, maxs = etl.limits(fact, 'sales') print( "Minimum Sales:",mins) print("Maximum Sales:",maxs) #OLAP ---> PIVOT table1 = etl.pivot(product, 'category', 'subcategory','quantity', sum) print("PIVOT:") print(table1) #OLAP OPERATIONS ---> ROLL UP table2 = etl.aggregate(customer, 'state', len) table3 = etl.aggregate(customer, 'city', len) print("ROLL UP:") print(table2) print(table3) #OLAP OPERATIONS ---> SLICING print("SLICING:") table4= etl.rowslice(table3,3) print(table4) except Error as e : print ("Error while connecting to MySQL", e) finally: #closing database connection.
    matchingField = data['attibutes'][x]['matchingField']  # 'attibutes' spelling matches the input JSON key
    mappings[attr] = matchingField

mappedTable = etl.fieldmap(dataTable, mappings)
cleansedTable = mappedTable
# apply cleansing rules to the table; reversed so the top attributes get priority
for x in reversed(range(length)):
    attr = data['attibutes'][x]['attrName']
    rules = data['attibutes'][x]['rules']
    rulesListSize = len(rules)
    for y in range(rulesListSize):
        if rules[y] == "Remove Null Value Rows":
            cleansedTable = etl.select(cleansedTable, attr, lambda v: v != '')
        if rules[y] == "Remove Duplicates":
            cleansedTable = etl.aggregate(cleansedTable, attr)
        if rules[y] == "Sort":
            cleansedTable = etl.mergesort(cleansedTable, key=attr)
        if rules[y] == "Number Validation":
            cleansedTable = etl.select(cleansedTable, attr)
        if rules[y] == "Fill Missing Values":
            cleansedTable = etl.filldown(cleansedTable, attr)
etl.tocsv(cleansedTable, 'src/etl/outputs/cleansed.csv')

# Create rawData Table
dataTable = cleansedTable
rawDataTable = cleansedTable
reasonUniqueValues = etl.aggregate(dataTable, dataTable[0][20])

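# Side note, a hedged sketch with toy rows: etl.aggregate(table, attr) with no
# aggregation spec appears to reduce the table to one row per distinct attr
# value, so it de-duplicates but keeps only the key column; petl.distinct
# keeps whole rows and would be the more direct call here.
import petl as etl

dup = [['a', 'b'], [1, 'x'], [1, 'x'], [2, 'z']]
print(etl.aggregate(dup, 'a'))     # one row per distinct value of 'a'
print(etl.distinct(dup))           # drops exact duplicate rows
print(etl.distinct(dup, key='a'))  # one full row per distinct 'a'
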
table2 = etl.rowreduce(table1, key='foo', reducer=sumbar,
                       fields=['foo', 'barsum'])
table2


# aggregate()
#############

import petl as etl

table1 = [['foo', 'bar', 'baz'],
          ['a', 3, True],
          ['a', 7, False],
          ['b', 2, True],
          ['b', 2, False],
          ['b', 9, False],
          ['c', 4, True]]
# aggregate whole rows
table2 = etl.aggregate(table1, 'foo', len)
table2
# aggregate single field
table3 = etl.aggregate(table1, 'foo', sum, 'bar')
table3
# alternative signature using keyword args
table4 = etl.aggregate(table1, key=('foo', 'bar'),
                       aggregation=list, value=('bar', 'baz'))
table4
# aggregate multiple fields
from collections import OrderedDict
import petl as etl

aggregation = OrderedDict()

        target_list.append(target)

    counter = collections.Counter(target_list)
    calls_counter = dict(counter)
    calls_per_office = [{"name": key, "num_calls": value}
                        for key, value in calls_counter.items()]
    return Table(calls_per_office)


if __name__ == "__main__":
    # Get all outreaches for given tool id
    outreaches = newmode.get_outreaches(TOOL_ID)

    # Transform raw outreach data for spreadsheet
    transformed_outreaches = transform_outreaches(outreaches)

    # Set up tables for Google Sheets
    calls_per_day = Table(
        petl.aggregate(
            transformed_outreaches.table,
            key="created_date",
            aggregation=len
        )
    )
    leaderboard = petl.aggregate(
        transformed_outreaches.table,
        key="name",
        aggregation=len
    )
    calls_per_office = get_calls_per_office(transformed_outreaches)

    # Rename columns for spreadsheet
    calls_per_day = calls_per_day.rename_column('value', 'num_calls')
    calls_per_day = calls_per_day.rename_column('created_date', 'day')
    calls_per_office = calls_per_office.rename_column('name', 'office')

    # Sort leaderboard by num calls per person
    leaderboard_ranked = Table(petl.sort(leaderboard, 'value', reverse=True))

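# A sketch with fabricated rows of why the renames above target a column
# literally named 'value': petl.aggregate with a bare callable emits the key
# column plus a single 'value' column holding each group's aggregate.
import petl

toy = [['created_date', 'name'],
       ['2021-01-01', 'ann'],
       ['2021-01-01', 'bob'],
       ['2021-01-02', 'cam']]
print(petl.aggregate(toy, key='created_date', aggregation=len))
# header is ('created_date', 'value'); counts are 2 and 1
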
    lambda r: r['txt']['callType'])
converted_data = etl.addfield(
    converted_data, 'participants',
    lambda r: modified_participants(r['txt']['participants']))
converted_data = etl.addfield(converted_data, 'timestamp_ms',
                              lambda r: r['timestamp'] / 1000)

aggregations = OrderedDict()
aggregations['summary'] = (
    'participants', 'timestamp_ms'), grouping_summary_by_room_id
aggregations['initiated_time'] = (
    'participants', 'timestamp_ms'), get_initiated_time_interval
aggregations['creation_data'] = 'participants', get_room_creation_info
aggregated_summary = etl.aggregate(converted_data, key='room_id',
                                   aggregation=aggregations)

external_ids = etl.fromdicts(
    [{
        'id': '3979',
        'external_id': '95109151-af77-11e9-94fa-a860b6030e49'
    }, {
        'id': '3980',
        'external_id': '95d8c92e-af77-11e9-99b7-a860b6030e49'
    }, {
        'id': '3982',
        'external_id': '97163c4a-af77-11e9-bdf9-a860b6030e49'
    }],
    header=['id', 'external_id'])
aggregated_summary = etl.unpackdict(aggregated_summary,

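# A hedged sketch (toy data) of the multi-field spec used above: when the
# source is a tuple of fields, petl passes the aggregation function a sequence
# of value tuples, one per row in the group.
import petl as etl
from collections import OrderedDict

calls = [['room_id', 'participants', 'timestamp_ms'],
         ['r1', 2, 1000],
         ['r1', 3, 2000],
         ['r2', 1, 1500]]
spec = OrderedDict()
spec['first_seen'] = (('participants', 'timestamp_ms'),
                      lambda vals: min(ts for _, ts in vals))
print(etl.aggregate(calls, key='room_id', aggregation=spec))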