Example #1
from collections import OrderedDict
from typing import Sequence

import petl as etl


# excerpted from a class; self.get_table() returns a petl table
def get_aggregate_data(self, aggregation_keys: Sequence) -> etl.Table:
    agg = OrderedDict()
    agg['count'] = len
    # petl takes either a bare field name or a tuple of names as the key,
    # so a one-element sequence is unwrapped to its single field
    return etl.aggregate(self.get_table(),
                         key=aggregation_keys if len(aggregation_keys) > 1
                         else aggregation_keys[0],
                         aggregation=agg).convert('count', str)
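The key handling in this example is worth isolating: petl accepts either a bare field name or a sequence of names as the key, and unwrapping a one-element sequence keeps the key column a plain field. A runnable sketch with made-up data:

import petl as etl

table = [['foo', 'bar'],
         ['a', 1],
         ['a', 2],
         ['b', 3]]
keys = ['foo']  # imagine this arrived as a list from calling code
key = keys[0] if len(keys) == 1 else tuple(keys)
counts = etl.aggregate(table, key=key, aggregation=len)
print(counts)   # petl tables print as a text grid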
Example #2
def load_grouped_data(csv_file, fields) -> TableData:
    table = etl.fromcsv(csv_file)
    # unwrap a single grouping field to petl's bare-field-name form
    if len(fields) == 1:
        fields = fields[0]
    return TableData(
        header=etl.header(table),
        data=etl.aggregate(table, key=fields, aggregation=len),
        next_limit=None,
    )
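A note on both snippets above: when aggregate is given a bare callable such as len, petl names the resulting column 'value', so a friendlier header needs an explicit rename. A small sketch with invented data:

import petl as etl

table = [['city', 'n'],
         ['Lisbon', 1],
         ['Lisbon', 2],
         ['Porto', 3]]
grouped = etl.aggregate(table, key='city', aggregation=len)
print(etl.header(grouped))                               # ('city', 'value')
print(etl.header(etl.rename(grouped, 'value', 'count')))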
Example #3
def aggregate_characters_table(csv_path, filters):
    if len(filters) == 1:
        filters = filters[0]

    characters_table, headers, total_characters = get_characters_table(
        csv_path)
    data = etl.data(
        etl.aggregate(characters_table, key=filters, aggregation=len))

    return headers, data, total_characters
Example #4
def xls_tidy(xls, qvalue):
    d = etl.fromtsv(xls)
    sd = etl.select(d, lambda x: float(x.PepQValue) <= float(qvalue))
    psmsummary = sd

    ssd = etl.cut(sd, 'Peptide', 'Protein', 'PepQValue')
    # remove the modification info from the peptide:
    # strip flanking residues (X.PEPTIDE.X), then any mass annotations
    ssd = etl.sub(ssd, 'Peptide', r'^[\w-]\.(.+)\.[\w-]$', r'\1')
    ssd = etl.sub(ssd, 'Peptide', r'[\d\.\+]+', '')

    aggregation = OrderedDict()
    aggregation['SpecCount'] = len
    cssd = etl.aggregate(ssd, 'Peptide', aggregation)

    fssd = etl.groupselectfirst(ssd, key=('Peptide', 'Protein', 'PepQValue'))
    aggregation = OrderedDict()
    aggregation['Protein'] = 'Protein', etl.strjoin(';')
    aggregation['PepQValue'] = 'PepQValue', etl.strjoin(';')
    assd = etl.aggregate(fssd, 'Peptide', aggregation)
    pepsummary = etl.join(assd, cssd, key='Peptide')

    return (psmsummary, pepsummary)
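The two regex substitutions are easier to follow on a toy value. A hedged sketch (the peptide strings are invented, in the X.SEQUENCE.X style the function appears to target): first the flanking residues are stripped, then any modification masses, leaving a bare sequence to aggregate on.

import petl as etl

peptides = [['Peptide'],
            ['R.VLDTM+15.995AHR.K'],
            ['K.AGLTK.S']]
stripped = etl.sub(peptides, 'Peptide', r'^[\w-]\.(.+)\.[\w-]$', r'\1')
stripped = etl.sub(stripped, 'Peptide', r'[\d\.\+]+', '')
print(stripped)   # VLDTMAHR and AGLTK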
Example #5
from collections import OrderedDict
from datetime import timedelta

import petl


def group_entries_by_day(inp):
    hdr = petl.header(inp)

    agg = OrderedDict()
    for field in hdr:
        # using first found value
        agg[field] = field, next

    agg['dur'] = 'dur', lambda durs: sum(durs, timedelta())
    agg['start'] = 'start', min

    with_day = petl.addfield(inp, 'start_date',
                             lambda row: row.get('start').date())
    index_keys = ('start_date', 'description')
    result = petl.aggregate(with_day, index_keys, agg)
    return petl.cutout(result, 'start_date')
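The 'dur' aggregator above passes a start value to sum because summing timedeltas would otherwise begin from the integer 0 and raise TypeError. In isolation:

from datetime import timedelta

durs = [timedelta(minutes=30), timedelta(hours=1, minutes=15)]
total = sum(durs, timedelta())   # start value must itself be a timedelta
print(total)                     # 1:45:00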
Example #6
    def get_context_data(self, **kwargs):
        context = super().get_context_data(**kwargs)
        context["filename"] = self.object.downloaded_file.name.split(os.path.sep)[-1]
        context["columns_query_kwarg"] = self.columns_query_kwarg
        table = petl.fromcsv(self.object.downloaded_file)

        full_table_header = list(petl.header(table))
        context["column_options"] = full_table_header

        selected_columns = [c for c in self.request.GET.getlist(self.columns_query_kwarg) if c in full_table_header]
        context["selected_columns"] = selected_columns

        if selected_columns:
            context["header"] = selected_columns + ["Count"]
            context["rows"] = petl.records(
                petl.aggregate(table, selected_columns[0] if len(selected_columns) == 1 else selected_columns, len)
            )

        return context
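A detail worth copying from this view: the user-supplied column names are filtered against the real CSV header before reaching aggregate, so a crafted query string cannot name nonexistent fields. The same guard reduced to a sketch (data and input invented):

import petl as etl

table = [['foo', 'bar'], ['a', 1], ['a', 2], ['b', 3]]
header = list(etl.header(table))
requested = ['foo', 'no_such_column']   # hypothetical query-string values
selected = [c for c in requested if c in header]
if selected:
    key = selected[0] if len(selected) == 1 else selected
    print(etl.aggregate(table, key, len))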
Example #7
table3 = merge(table1, table2, key='bar')
look(table3)


# aggregate
table1 = [['foo', 'bar', 'baz'],
          ['a', 3, True],
          ['a', 7, False],
          ['b', 2, True],
          ['b', 2, False],
          ['b', 9, False],
          ['c', 4, True]]
from petl import aggregate, look
look(table1)
# aggregate whole rows
table2 = aggregate(table1, 'foo', len)
look(table2)
# aggregate single field
table3 = aggregate(table1, 'foo', sum, 'bar')
look(table3)
# alternative signature for single field aggregation using keyword args
table4 = aggregate(table1, key=('foo', 'bar'), aggregation=list, value=('bar', 'baz'))
look(table4)
# aggregate multiple fields
from collections import OrderedDict
from petl import strjoin
aggregation = OrderedDict()
aggregation['count'] = len
aggregation['minbar'] = 'bar', min
aggregation['maxbar'] = 'bar', max
aggregation['sumbar'] = 'bar', sum
table5 = aggregate(table1, 'foo', aggregation)
look(table5)
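Since the output columns follow the insertion order of the mapping, these examples use OrderedDict; on Python 3.7+ a plain dict preserves insertion order as well, so the same multi-field aggregation should work written more compactly (reusing table1 from above):

aggregation = {'count': len,
               'minbar': ('bar', min),
               'maxbar': ('bar', max),
               'sumbar': ('bar', sum)}
look(aggregate(table1, 'foo', aggregation))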
Example #8
table1 = [['foo', 'bar'],
          ['a', 3],
          ['a', 7],
          ['b', 2],
          ['b', 1],
          ['b', 9],
          ['c', 4],
          ['d', 3],
          ['d'],
          ['e']]

from petl import aggregate, look
look(table1)
from collections import OrderedDict
aggregators = OrderedDict()
aggregators['minbar'] = 'bar', min
aggregators['maxbar'] = 'bar', max
aggregators['sumbar'] = 'bar', sum
aggregators['listbar'] = 'bar', list
table2 = aggregate(table1, 'foo', aggregators)
look(table2)
# aggregation functions can also be added and/or updated using the suffix
# notation on the returned table object, e.g.::
table3 = aggregate(table1, 'foo')
table3['minbar'] = 'bar', min
table3['maxbar'] = 'bar', max
table3['sumbar'] = 'bar', sum
table3['listbar'] = 'bar' # default aggregation is list
look(table3)


# rangerowreduce

table1 = [['foo', 'bar'],
          ['a', 3],
Example #9
try:
    conn_target = create_engine(
        'postgresql://username:password@hostname:5432/database')
except Exception:
    logger.error(
        "ERROR: Unexpected error: Could not connect to PostgreSQL instance.")
    sys.exit()

logger.info("SUCCESS: Connection to RDS PostgreSQL instance succeeded")

# Source
table = etl.fromdb(
    conn, """select 
                 res_company.name, 
                 sum(product_qty) as qty, 
                 sum(price_total) as total 
                 from report_pos_order 
                 inner join res_company on res_company.id = report_pos_order.company_id 
                 where date(report_pos_order.date AT TIME ZONE 'GMT +7') = current_date 
                 group by res_company.name
                 order by sum(price_total) desc""")

# Transformation
# grouping with aggregation
aggregation = OrderedDict()
aggregation['qty'] = 'qty', sum
aggregation['total'] = 'total', sum
table1 = etl.aggregate(table, 'name', aggregation)
dfsum = etl.todataframe(table1)

# Target
dfsum.to_sql('GMV Warung', conn_target, if_exists='replace', index=False)
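The same fromdb / aggregate / todataframe pipeline can be exercised without a PostgreSQL instance. A self-contained sketch against in-memory SQLite (schema and figures invented):

import sqlite3
from collections import OrderedDict

import petl as etl

conn = sqlite3.connect(':memory:')
conn.execute('create table sales (name text, qty int, total real)')
conn.executemany('insert into sales values (?, ?, ?)',
                 [('A', 1, 10.0), ('A', 2, 20.0), ('B', 5, 7.5)])

table = etl.fromdb(conn, 'select name, qty, total from sales')
aggregation = OrderedDict()
aggregation['qty'] = 'qty', sum
aggregation['total'] = 'total', sum
dfsum = etl.todataframe(etl.aggregate(table, 'name', aggregation))
print(dfsum)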
Example #10
        print("FACT TABLE:")
        print(fact)
        
        #OLAP ----> MIN & MAX
        mins, maxs = etl.limits(fact, 'sales')
        print("Minimum Sales:", mins)
        print("Maximum Sales:", maxs)
        
        #OLAP ---> PIVOT
        table1 = etl.pivot(product, 'category', 'subcategory', 'quantity', sum)
        print("PIVOT:")
        print(table1)
        
        
        #OLAP OPERATIONS ---> ROLL UP
        table2 = etl.aggregate(customer, 'state', len)
        table3 = etl.aggregate(customer, 'city', len) 
        print("ROLL UP:")
        print(table2)
        print(table3)
        
        #OLAP OPERATIONS ---> SLICING
        print("SLICING:")
        table4 = etl.rowslice(table3, 3)
        print(table4)
        
        
except Error as e:
    print("Error while connecting to MySQL", e)
finally:
    # closing database connection.
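The roll-up here is ordinary aggregation applied at two granularities. A toy sketch (data invented) showing a coarse state-level count next to a finer state-and-city count:

import petl as etl

customer = [['state', 'city'],
            ['CA', 'Los Angeles'],
            ['CA', 'San Francisco'],
            ['CA', 'San Francisco'],
            ['NY', 'New York']]
print(etl.aggregate(customer, 'state', len))            # coarser level
print(etl.aggregate(customer, ('state', 'city'), len))  # finer level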
Example #11
    matchingField = data['attibutes'][x]['matchingField']
    mappings[attr] = matchingField
    
mappedTable = etl.fieldmap(dataTable, mappings)

cleansedTable = mappedTable
# add rules to clean the table; reversed to give priority to the top attributes
for x in reversed(range(length)):
    attr = data['attibutes'][x]['attrName']
    rules = data['attibutes'][x]['rules']
    rulesListSize = len(rules)
    for y in range(rulesListSize):
        if rules[y] == "Remove Null Value Rows":
            cleansedTable = etl.select(cleansedTable, attr, lambda v: v != '')
        if rules[y] == "Remove Duplicates":
            cleansedTable = etl.aggregate(cleansedTable, attr)
        if rules[y] == "Sort":
            cleansedTable = etl.mergesort(cleansedTable, key=attr)
        if rules[y] == "Number Validation":
            cleansedTable = etl.select(cleansedTable, attr)
        if rules[y] == "Fill Missing Values":
            cleansedTable = etl.filldown(cleansedTable, attr)

etl.tocsv(cleansedTable, 'src/etl/outputs/cleansed.csv')

#Create rawData Table
dataTable = cleansedTable
rawDataTable = cleansedTable

reasonUniqueValues = etl.aggregate(dataTable, dataTable[0][20])
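A caveat on the "Remove Duplicates" rule above: aggregate with a key and no aggregation functions only groups on that key, so non-key columns are not carried through. If the intent is to drop duplicate rows while keeping every field, petl's distinct is the usual tool; a sketch with invented data:

import petl as etl

table = [['id', 'name'], [1, 'a'], [1, 'a'], [2, 'b']]
print(etl.distinct(table))            # drop exact duplicate rows
print(etl.distinct(table, key='id'))  # keep one row per id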
Example #12
table2 = etl.rowreduce(table1,
                       key='foo',
                       reducer=sumbar,
                       fields=['foo', 'barsum'])
table2
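The reducer sumbar is used above but not shown. With rowreduce the reducer receives each key and an iterable of that key's rows and returns one output row; a hypothetical definition compatible with the call:

def sumbar(key, rows):
    # field index 1 is 'bar'; emit one (foo, barsum) row per group
    return [key, sum(row[1] for row in rows)]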

# aggregate()
#############

import petl as etl

table1 = [['foo', 'bar', 'baz'], ['a', 3, True], ['a', 7, False],
          ['b', 2, True], ['b', 2, False], ['b', 9, False], ['c', 4, True]]
# aggregate whole rows
table2 = etl.aggregate(table1, 'foo', len)
table2
# aggregate single field
table3 = etl.aggregate(table1, 'foo', sum, 'bar')
table3
# alternative signature using keyword args
table4 = etl.aggregate(table1,
                       key=('foo', 'bar'),
                       aggregation=list,
                       value=('bar', 'baz'))
table4
# aggregate multiple fields
from collections import OrderedDict
import petl as etl

aggregation = OrderedDict()
aggregation['count'] = len
aggregation['minbar'] = 'bar', min
aggregation['maxbar'] = 'bar', max
aggregation['sumbar'] = 'bar', sum
table5 = etl.aggregate(table1, 'foo', aggregation)
table5
Example #13
            target_list.append(target)

    counter = collections.Counter(target_list)
    calls_counter = dict(counter)
    calls_per_office = [{"name" : key, "num_calls": value} for key, value in calls_counter.items()]
    return Table(calls_per_office)

if __name__ == "__main__":
    # Get all outreaches for given tool id
    outreaches = newmode.get_outreaches(TOOL_ID)
    # Transform raw outreach data for spreadsheet
    transformed_outreaches = transform_outreaches(outreaches)
    # Set up tables for Google Sheets
    calls_per_day = Table(
        petl.aggregate(
            transformed_outreaches.table, key="created_date", aggregation=len
        )
    )
    leaderboard = petl.aggregate(
        transformed_outreaches.table, key="name", aggregation=len
    )

    calls_per_office = get_calls_per_office(transformed_outreaches)

    # rename columns for spreadsheet
    calls_per_day.rename_column('value', 'num_calls')
    calls_per_day = calls_per_day.rename_column('created_date', 'day')

    calls_per_office = calls_per_office.rename_column('name', 'office')
    # Sort leaderboard by num calls per person
    leaderboard_ranked = Table(petl.sort(leaderboard, 'value', reverse=True))
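The office tally assembled with collections.Counter earlier in this example could also be expressed with the same petl aggregate call used for calls_per_day and leaderboard; a sketch of the equivalence (field name hypothetical):

import collections

import petl as etl

table = [['target'], ['office_a'], ['office_a'], ['office_b']]
via_counter = collections.Counter(r.target for r in etl.records(table))
via_petl = etl.aggregate(table, 'target', len)
print(via_counter)   # Counter({'office_a': 2, 'office_b': 1})
print(via_petl)      # the same counts, as a table with a 'value' column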
Example #14
                                      lambda r: r['txt']['callType'])
        converted_data = etl.addfield(
            converted_data, 'participants',
            lambda r: modified_participants(r['txt']['participants']))
        converted_data = etl.addfield(converted_data, 'timestamp_ms',
                                      lambda r: r['timestamp'] / 1000)

        aggregations = OrderedDict()
        aggregations['summary'] = ('participants',
                                   'timestamp_ms'), grouping_summary_by_room_id
        aggregations['initiated_time'] = (
            'participants', 'timestamp_ms'), get_initiated_time_interval
        aggregations['creation_data'] = 'participants', get_room_creation_info
        aggregated_summary = etl.aggregate(converted_data,
                                           key='room_id',
                                           aggregation=aggregations)

        external_ids = etl.fromdicts(
            [{
                'id': '3979',
                'external_id': '95109151-af77-11e9-94fa-a860b6030e49'
            }, {
                'id': '3980',
                'external_id': '95d8c92e-af77-11e9-99b7-a860b6030e49'
            }, {
                'id': '3982',
                'external_id': '97163c4a-af77-11e9-bdf9-a860b6030e49'
            }],
            header=['id', 'external_id'])
        aggregated_summary = etl.unpackdict(aggregated_summary,
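The aggregations above pass a tuple of source fields, in which case petl hands the aggregation function one tuple of values per row in the group. A minimal sketch of that calling convention (names and data invented):

import petl as etl

calls = [['room_id', 'participants', 'timestamp_ms'],
         ['r1', 2, 1000],
         ['r1', 3, 2000]]

def span(pairs):
    # pairs iterates over (participants, timestamp_ms) tuples for one room
    ts = [t for _, t in pairs]
    return max(ts) - min(ts)

agg = {'span_ms': (('participants', 'timestamp_ms'), span)}
print(etl.aggregate(calls, 'room_id', agg))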
Example #15

# aggregate()
#############

import petl as etl

table1 = [['foo', 'bar', 'baz'],
          ['a', 3, True],
          ['a', 7, False],
          ['b', 2, True],
          ['b', 2, False],
          ['b', 9, False],
          ['c', 4, True]]
# aggregate whole rows
table2 = etl.aggregate(table1, 'foo', len)
table2
# aggregate single field
table3 = etl.aggregate(table1, 'foo', sum, 'bar')
table3
# alternative signature using keyword args
table4 = etl.aggregate(table1, key=('foo', 'bar'),
                       aggregation=list, value=('bar', 'baz'))
table4
# aggregate multiple fields
from collections import OrderedDict
import petl as etl

aggregation = OrderedDict()
aggregation['count'] = len
aggregation['minbar'] = 'bar', min