Example #1
    def transform_resource(self, resource):
        target = resource
        source = self.get("resource")
        field_names = self.get("fieldNames")
        ignore_fields = self.get("ignoreFields")
        sort_by_field = self.get("sortByField")
        if isinstance(source, str):
            source = target.package.get_resource(source)
        source.infer()
        view1 = target.to_petl()
        view2 = source.to_petl()

        # Ignore fields
        if ignore_fields:
            for field in source.schema.fields[len(target.schema.fields):]:
                target.schema.add_field(field)
            resource.data = petl.stack(view1, view2)

        # Default
        else:
            for field in source.schema.fields:
                if field.name not in target.schema.field_names:
                    target.schema.add_field(field)
            if field_names:
                for field in list(target.schema.fields):
                    if field.name not in field_names:
                        target.schema.remove_field(field.name)
            if sort_by_field:
                key = sort_by_field
                resource.data = petl.mergesort(view1,
                                               view2,
                                               key=key,
                                               header=field_names)
            else:
                resource.data = petl.cat(view1, view2, header=field_names)
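
A minimal standalone sketch of what the default branch above does, using throwaway tables (the names here are illustrative, not part of the step):

import petl

# two tables already ordered by "id"
t1 = [['id', 'name'], [1, 'a'], [3, 'c']]
t2 = [['id', 'name'], [2, 'b'], [4, 'd']]

# mergesort interleaves rows from both tables in key order; header= pins
# the output columns, much as the step does with fieldNames
merged = petl.mergesort(t1, t2, key='id', header=['id', 'name'])
print(petl.lookall(merged))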
Example #2
    def transform_resource(self, source, target):
        if isinstance(self.__resource, str):
            self.__resource = source.package.get_resource(self.__resource)
        self.__resource.infer(only_sample=True)
        view1 = source.to_petl()
        view2 = self.__resource.to_petl()

        # Ignore fields
        if self.__ignore_fields:
            target.data = petl.stack(view1, view2)
            for field in self.__resource.schema.fields[len(target.schema.fields):]:
                target.schema.add_field(field)

        # Default
        else:
            if self.__sort:
                target.data = petl.mergesort(view1,
                                             view2,
                                             key=self.__sort,
                                             header=self.__field_names)
            else:
                target.data = petl.cat(view1, view2, header=self.__field_names)
            for field in self.__resource.schema.fields:
                if field.name not in target.schema.field_names:
                    target.schema.add_field(field)
            if self.__field_names:
                for field in list(target.schema.fields):
                    if field.name not in self.__field_names:
                        target.schema.remove_field(field.name)
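
The ignore-fields branch above relies on the difference between petl's stack() and cat(); a quick sketch with throwaway tables:

import petl

t1 = [['id', 'name'], [1, 'a']]
t2 = [['code', 'label'], [2, 'b']]

# stack() appends rows positionally under the first table's header,
# which is why the ignore-fields branch can merge mismatched schemas
print(petl.lookall(petl.stack(t1, t2)))

# cat() matches columns by name instead, padding missing fields with None
print(petl.lookall(petl.cat(t1, t2)))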
Example #3
def transform_xls(hires_and_promotions_excel, separations_excel,
                  exempt_roster_excel, output_file):
    hires_and_promotions = petl.io.xls \
                            .fromxls(hires_and_promotions_excel, sheet='Data') \
                            .rename(column_map_shared)

    separations = petl.io.xls \
                    .fromxls(separations_excel, sheet='Data') \
                    .rename({**column_map_shared, **column_map_separations})

    def dedup_separations(payroll_number, rows):
        rows_sorted = sorted(rows, key=lambda x: x['termination_date'])
        return rows_sorted[-1]

    separations_deduped = petl.rowreduce(separations, 'payroll_number',
                                         dedup_separations)

    exempt_roster = petl.io.xls \
                        .fromxls(exempt_roster_excel, sheet='Data') \
                        .rename(column_map_roster)

    merged = petl.mergesort(hires_and_promotions,
                            separations_deduped,
                            exempt_roster,
                            key='payroll_number')

    def dedup_merged(payroll_number, rows):
        rows_sorted = sorted(rows, key=lambda x: x['latest_start_date'])

        if len(rows_sorted) == 1:
            return rows_sorted[-1]

        merged_row = []
        for i in range(0, len(rows_sorted[0]) - 1):
            first, second = rows_sorted[0][i], rows_sorted[1][i]
            if first in ('', None) and second not in ('', None):
                merged_row.append(second)
            elif second in ('', None) and first not in ('', None):
                merged_row.append(first)
            elif first == second:
                merged_row.append(first)
            else:
                merged_row.append(second)  # take the latest value by start date

        return merged_row

    merged_deduped = petl.rowreduce(merged, 'payroll_number', dedup_merged)

    petl.tocsv(merged_deduped, source=output_file)
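
Both deduplication passes above go through petl.rowreduce, which hands the reducer each key together with its group of rows; a minimal sketch with invented data:

import petl

table = [['payroll_number', 'termination_date'],
         [1, '2020-01-01'],
         [1, '2021-06-30'],
         [2, '2019-12-31']]

def latest(key, rows):
    # keep the most recent row in each group
    return max(rows, key=lambda row: row[1])

# rowreduce groups rows by the key (sorting first unless presorted=True)
deduped = petl.rowreduce(table, 'payroll_number', latest,
                         header=['payroll_number', 'termination_date'])
print(petl.lookall(deduped))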
Example #4
import petl as etl
from collections import OrderedDict

def createFacts(events, users):
    try:
        events_uid = etl.cutout(events, 'tracking_id', 'utm_medium', 'utm_campaign')
        events_tui = etl.cutout(events, 'user_id')

        stage_uid = etl.join(users, events_uid, key='user_id')
        stage_tui = etl.join(users, events_tui, key='tracking_id')

        stage_utm = etl.cut(stage_tui, 'user_id', 'utm_medium', 'utm_campaign')
        stage_uid_utm = etl.join(stage_uid, stage_utm, key='user_id')
        stage_m_s = etl.mergesort(stage_uid_utm, stage_tui, key=['created_at', 'email'])

        mappings = OrderedDict()
        mappings['tid'] = 'tracking_id'
        mappings['uid'] = 'user_id'
        mappings['utm_medium'] = 'utm_medium'
        mappings['utm_campaign'] = 'utm_campaign', {'audio': 'none', 'social': 'none'}
        mappings['utm_campaigntype'] = 'utm_campaign'
        mappings['email'] = 'email'
        mappings['subscription'] = 'type'
        mappings['sub_order'] = 'type', {'Signup Completed': '1', 'Trial Started': '2', 'Subscription Started': '3', 'Subscription Ended': '4'}
        mappings['created_at'] = 'created_at'

        # Mapping
        stage_mapping = etl.fieldmap(stage_m_s, mappings)

        # Sort
        stage_mapping_ordered = etl.sort(stage_mapping, key=['created_at', 'email', 'sub_order'])

        # Datetime split
        t1 = etl.split(stage_mapping_ordered, 'created_at', 'T', ['date', 'time'], include_original=True)
        t2 = etl.split(t1, 'date', '-', ['year', 'month', 'day'])
        stage_ready = etl.split(t2, 'time', ':', ['hour', 'minute', 'second'])

        # Export as csv to load folder
        etl.tocsv(stage_ready, 'load/facts.csv')

    except Exception as e:
        print("Something went wrong. Error {0}".format(e))
Example #5
File: examples.py Project: datamade/petl
table1 = (('foo', 'bar'),
          ('A', 9),
          ('C', 2),
          ('D', 10),
          ('A', 6),
          ('F', 1))
table2 = (('foo', 'bar'),
          ('B', 3),
          ('D', 10),
          ('A', 10),
          ('F', 4))

from petl import mergesort, look
look(table1)
look(table2)
table3 = mergesort(table1, table2, key='foo')
look(table3)


# mergesort - heterogeneous tables

table4 = (('foo', 'bar'),
          ('A', 9),
          ('C', 2),
          ('D', 10),
          ('A', 6),
          ('F', 1))

table5 = (('foo', 'baz'),
          ('B', 3),
          ('D', 10),
          ('A', 10),   # rows from here on assumed, mirroring table2,
          ('F', 4))    # since the source listing is truncated

table6 = mergesort(table4, table5, key='foo')
look(table6)
Example #7
File: sorts.py Project: zli69/petl
import petl as etl
table1 = [['foo', 'bar'], ['C', 2], ['A', 9], ['A', 6], ['F', 1], ['D', 10]]
table2 = etl.sort(table1, 'foo')
table2
# sorting by compound key is supported
table3 = etl.sort(table1, key=['foo', 'bar'])
table3
# if no key is specified, the default is a lexical sort
table4 = etl.sort(table1)
table4

# mergesort()
#############

import petl as etl
table1 = [['foo', 'bar'], ['A', 9], ['C', 2], ['D', 10], ['A', 6], ['F', 1]]
table2 = [['foo', 'bar'], ['B', 3], ['D', 10], ['A', 10], ['F', 4]]
table3 = etl.mergesort(table1, table2, key='foo')
table3.lookall()

# issorted()
############

import petl as etl
table1 = [['foo', 'bar', 'baz'], ['a', 1, True], ['b', 3, True], ['b', 2]]
etl.issorted(table1, key='foo')
etl.issorted(table1, key='bar')
etl.issorted(table1, key='foo', strict=True)
etl.issorted(table1, key='foo', reverse=True)
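
For reference, mergesort() yields the same rows as concatenating and re-sorting, but can avoid a full sort when its inputs are already key-ordered; a quick check with the tables above:

import petl as etl

table1 = [['foo', 'bar'], ['A', 9], ['C', 2], ['D', 10], ['A', 6], ['F', 1]]
table2 = [['foo', 'bar'], ['B', 3], ['D', 10], ['A', 10], ['F', 4]]

# same multiset of rows either way; pass presorted=True to skip the
# per-table sort when the inputs are already ordered by the key
merged = etl.mergesort(table1, table2, key='foo')
resorted = etl.sort(etl.cat(table1, table2), key='foo')
assert sorted(map(tuple, merged)) == sorted(map(tuple, resorted))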
Example #8
mappedTable = etl.fieldmap(dataTable, mappings)

cleansedTable = mappedTable
# add rules to clean the table - reversed to give priority to the top attributes
for x in reversed(range(length)):
    attr = data['attibutes'][x]['attrName']
    rules = data['attibutes'][x]['rules']
    for rule in rules:
        if rule == "Remove Null Value Rows":
            cleansedTable = etl.select(cleansedTable, attr, lambda v: v != '')
        if rule == "Remove Duplicates":
            cleansedTable = etl.aggregate(cleansedTable, attr)
        if rule == "Sort":
            cleansedTable = etl.mergesort(cleansedTable, key=attr)
        if rule == "Number Validation":
            cleansedTable = etl.select(cleansedTable, attr)
        if rule == "Fill Missing Values":
            cleansedTable = etl.filldown(cleansedTable, attr)

etl.tocsv(cleansedTable,'src/etl/outputs/cleansed.csv')
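
For the "Remove Duplicates" rule, note that petl also ships a dedicated primitive, distinct(); a minimal sketch on throwaway data:

import petl as etl

table = [['id', 'value'], [1, 'a'], [1, 'b'], [2, 'c']]

# distinct() with a key keeps one row per key value
print(etl.lookall(etl.distinct(table, key='id')))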

#Create rawData Table
dataTable = cleansedTable
rawDataTable = cleansedTable

reasonUniqueValues = etl.aggregate(dataTable, dataTable[0][20])

mappings = OrderedDict()
Example #9
#  Facts

# This facts table is the staging table, holding all the info needed to
# quickly look up the dimension keys and load into the facts table.
# It has columns matching each column on the dim Time table, to make it
# easier to get the reference key.

events_uid = etl.cutout(events, 'tracking_id', 'utm_medium', 'utm_campaign')
events_tui = etl.cutout(events, 'user_id')

stage_uid = etl.join(users, events_uid, key='user_id')
stage_tui = etl.join(users, events_tui, key='tracking_id')
stage_utm = etl.cut(stage_tui, 'user_id', 'utm_medium', 'utm_campaign')
stage_uid_utm = etl.join(stage_uid, stage_utm, key='user_id')
stage_m_s = etl.mergesort(stage_uid_utm,
                          stage_tui,
                          key=['created_at', 'email'])

# Mapping definitions
mappings = OrderedDict()
mappings['tid'] = 'tracking_id'
mappings['uid'] = 'user_id'
mappings['utm_medium'] = 'utm_medium'
mappings['utm_campaign'] = 'utm_campaign', {'audio': 'none', 'social': 'none'}
mappings['utm_campaign_type'] = 'utm_campaign'
mappings['email'] = 'email'
mappings['subscription'] = 'type'
mappings['sub_order'] = 'type', {
    'Signup Completed': '1',
    'Trial Started': '2',
    'Subscription Started': '3',
    'Subscription Ended': '4',
}
Example #10
File: sorts.py Project: DeanWay/petl
# mergesort()
#############

import petl as etl
table1 = [['foo', 'bar'],
          ['A', 9],
          ['C', 2],
          ['D', 10],
          ['A', 6],
          ['F', 1]]
table2 = [['foo', 'bar'],
          ['B', 3],
          ['D', 10],
          ['A', 10],
          ['F', 4]]
table3 = etl.mergesort(table1, table2, key='foo')
table3.lookall()


# issorted()
############

import petl as etl
table1 = [['foo', 'bar', 'baz'],
          ['a', 1, True],
          ['b', 3, True],
          ['b', 2]]
etl.issorted(table1, key='foo')
etl.issorted(table1, key='bar')
etl.issorted(table1, key='foo', strict=True)
etl.issorted(table1, key='foo', reverse=True)
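
With table1 above, these four calls return True, False, False and False; restated as a quick self-test:

import petl as etl

table1 = [['foo', 'bar', 'baz'],
          ['a', 1, True],
          ['b', 3, True],
          ['b', 2]]

assert etl.issorted(table1, key='foo')                    # 'a', 'b', 'b' is non-decreasing
assert not etl.issorted(table1, key='bar')                # 1, 3, 2 is not sorted
assert not etl.issorted(table1, key='foo', strict=True)   # the repeated 'b' fails a strict check
assert not etl.issorted(table1, key='foo', reverse=True)  # values are not descending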