Example #1
def validate(cursor, table, constraints, task_name):
    header = etl.header(table)
    problems = etl.validate(table, constraints=constraints, header=header)
    problems = etl.addfield(problems, 'task_name', task_name)
    problems = etl.addfield(problems, 'create_date', datetime.now())

    # etl.todb(problems, cursor, 'etl_logs')
    etl.appenddb(problems, cursor, 'tetl_logs')
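For context, petl.validate() takes a list of constraint dicts and returns a table of problems; a minimal, self-contained sketch (the field names and constraints below are invented for illustration, not taken from the project above):

import petl as etl

header = ['id', 'amount']
constraints = [
    {'name': 'id_int', 'field': 'id', 'test': int},                                    # id must parse as int
    {'name': 'amount_pos', 'field': 'amount', 'assertion': lambda v: float(v) >= 0},   # amount must be non-negative
]
example = [['id', 'amount'], ['1', '9.99'], ['x', '-3']]
problems = etl.validate(example, constraints=constraints, header=header)
print(etl.lookall(problems))  # one row per violation: name, row, field, value, error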
Example #2
def openFileTable(years, quarters):
    #print "In openFileTable"
    table = []
    #traverse through the list of years provided via filters
    for year in years:
        #traverse through the list of quarters provided via filters
        for quarter in quarters:
            fileString = year + '_' + quarter + '.csv'
            #print fileString
            table1 = etl.fromcsv(fileString)

            #TODO--migrate transformations accordingly
            #Q2
            #0        1             2          3           4          5         6         7           8            9        10         11              12         13         14       15       16         17     18   19     20   21     22        23       24   25      26   27    28         29          30         31           32     33      34         35          36           37              38           39               40             41            42             43           44
            #APN	Entitlement	BESTSTAT	BESTDATE	NAMEADDR	Alias	PLN_CASENO	BP_APPLNO	UNITS	NET_UNITS	AFF_UNITS	NET_AFF_UNITS	SECTION415	TENURE_TYPE	COST	PROPUSE	TOTAL_GSF	NET_GSF	CIE	NET_CIE	MED	NET_MED	MIPS	NET_MIPS	PDR	NET_PDR	RET	NET_RET	VISIT	NET_VISIT	FirstFiled	PLN_DESC	BP_DESC	PLANNER	SPONSOR	SP_CONTACT	SP_CONTACTPH	NEIGHBORHOOD	PLAN_AREA	PLAN_DISTRICT	HEIGHT_DISTRICT	ZONING_SIM	ZONING_DISTRICT	SUPE_DISTRICT	Location

            #Q3
            #     0             1          2       3            4           5         6           7            8        9         10      11      12            13         14              15           16           17       18     19    20    21  22       23       24        25   26     27   28      29       30          31          32       33       34       35            36   37    38           39             40             41           42              43           44               45              46
            #PROJECT_TYPE	NAMEADDR	Alias	BESTSTAT	BESTDATE	Entitlement	PLN_CASENO	PLN_DESC	PROPUSE	BP_APPLNO	BP_DESC	UNITS	NET_UNITS	AFF_UNITS	NET_AFF_UNITS	SECTION415	TENURE_TYPE	TOTAL_GSF	NET_GSF	CIE	NET_CIE	MED	NET_MED	MIPS	NET_MIPS	PDR	NET_PDR	RET	NET_RET	VISIT	NET_VISIT	APPLICANT	CONTACT	CONTACTPH	COST	FirstFiled	PLANNER	APN	PLAN_AREA	PLN_DISTRICT	SUPE_DISTRICT	ZONING_SIM	ZONING_DISTRICT	ZONING_GEN	NEIGHBORHOOD	HEIGHT_DISTRICT	Location

            #Q4
            # 0        1           2            3        4        5       6            7              8               9          10           11        12         13   14        15    16           17               18           19        20      21    22   23   24      25       26         27   28     29   30     31        32         33          34       35             36          37      38           39        40        41              42              43              44          45         46        47             48             49
            #APN	BESTSTAT	BESTDATE	NAMEADDR	Alias	UNITS	NET_UNITS	AFF_UNITS	NET_AFF_UNITS	SECTION415	TENURE_TYPE	PLN_CASENO	BP_APPLNO	PROPUSE	COST	BP_DESC	PLN_DESC	PROJECT_TYPE	ENTITLEMENT	TOTAL_GSF	NET_GSF	CIE	NET_CIE	MED	NET_MED	MIPS	NET_MIPS	PDR	NET_PDR	RET	NET_RET	VISIT	NET_VISIT	APPLICANT	CONTACT	CONTACTPHONE	FirstFiled	PLANNER	PLAN_AREA	PUBLICREALM	STATUS	PLAN_DISTRICT	SUPE_DISTRICT	ZONING_SIM	ZONING_DISTRICT	ZONING_GEN	ZONING	HEIGHT_DISTRICT	NEIGHBORHOOD	Location
            if quarter == 'Q1':
                #attrList = [0,2,4,5,12,32,40]
                table1 = etl.cut(table1, 'PROJECT_TYPE', 'NAMEADDR',
                                 'BESTDATE', 'BESTSTAT', 'PROPUSE',
                                 'NEIGHBORHOOD', 'LOCATION')

            if quarter == 'Q2':
                # print quarter
                #attrList = [2,3,4,15,37,42,44]
                table1 = etl.cut(table1, 'BESTSTAT', 'BESTDATE', 'NAMEADDR',
                                 'PROPUSE', 'NEIGHBORHOOD', 'ZONING_GEN',
                                 'Location')

            if quarter == 'Q3':
                #attrList = [0,1,3,4,8,44,46]
                table1 = etl.cut(table1, 'PROJECT_TYPE', 'NAMEADDR',
                                 'BESTSTAT', 'BESTDATE', 'PROPUSE',
                                 'NEIGHBORHOOD', 'LOCATION')

            if quarter == 'Q4':
                #attrList = [1,2,3,13,17,48,49]
                table1 = etl.cut(table1, 'BESTSTAT', 'BESTDATE', 'NAMEADDR',
                                 'PROPUSE', 'PROJECT_TYPE', 'NEIGHBORHOOD',
                                 'Location')

            #adding the quarter and year to identify the table
            table1 = etl.addfield(table1, 'Quarter', quarter)
            table1 = etl.addfield(table1, 'Year', year)

            #print tempTable

            table.append(table1)

    return table
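openFileTable() returns a plain Python list of petl tables, so a caller would typically concatenate them afterwards; a hedged usage sketch (the years, quarters, and output path are assumptions, and the matching <year>_<quarter>.csv files must exist):

import petl as etl

tables = openFileTable(['2019', '2020'], ['Q2', 'Q3'])
combined = etl.cat(*tables)  # cat() unions the headers, filling missing fields with None
etl.tocsv(combined, 'combined_quarters.csv')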
Example #3
def ETL_MMS_NOW_schema(connection, tables, schema, system_name):
    '''Import all the data from the specified schema and tables.'''
    for destination, source in tables.items():
        try:
            current_table = etl.fromdb(connection,
                                       f'SELECT * from {schema}.{source}')
            print(f'    {destination}:{etl.nrows(current_table)}')

            if (source == 'application'):
                # add originating source
                table_plus_os = etl.addfield(current_table,
                                             'originating_system', system_name)

                table_plus_os_guid = join_mine_guids(connection, table_plus_os)

                etl.appenddb(table_plus_os_guid,
                             connection,
                             destination,
                             schema='now_submissions',
                             commit=False)
            else:

                etl.appenddb(current_table,
                             connection,
                             destination,
                             schema='now_submissions',
                             commit=False)

        except Exception as err:
            print(f'ETL Parsing error: {err}')
            raise
Example #4
File: convert.py  Project: bazenkov/pytb
def transform_fields_254(tbl, ts_kv_dict):
    """The input is a dump of ts_kv table for TB version 2.5.4:
    +----------------------------------+---------------+---------------+--------+-------+--------+-------+
    |  entity_id                       | key           | ts            | bool_v | str_v | long_v | dbl_v |
    +==================================+===============+===============+========+=======+========+=======+
    |  1ea47494dc14d40bd76a73c738b665f | 25            | 1583010011665 |        |       |        | -1.8  |
    +----------------------------------+---------------+---------------+--------+-------+--------+-------+
    |  1ea47494dc14d40bd76a73c738b665f | 36            | 1583010000692 |        |       | 227    |       |
    +----------------------------------+---------------+---------------+--------+-------+--------+-------+  
    
    The output:
    +---------------------------------+---------------+---------------+--------+
    | entity_id                       | key           | ts            | value  |
    +=================================+===============+===============+========+
    | 1ea47494dc14d40bd76a73c738b665f | Temperature   | 1583010011665 |  -1.8  |
    +---------------------------------+---------------+---------------+--------+
    | 1ea47494dc14d40bd76a73c738b665f | WindDirection | 1583010000692 |   227  |
    +---------------------------------+---------------+---------------+--------+

    ts_kv_dict is a dict like {25:'Temperature', 36:'WindDirection'}
    """
    ts_kv_table = petl.transform.conversions.convert(tbl,
                                                     {'ts': int,
                                                      'key': lambda k: ts_kv_dict[k]})
    ts_kv_table = petl.addfield(ts_kv_table, 'value', lambda row: get_value(row))
    ts_kv_table = petl.cutout(ts_kv_table, 'bool_v', 'str_v', 'long_v', 'dbl_v')
    return ts_kv_table
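get_value() is defined elsewhere in that project; assuming it simply returns whichever of the four typed value columns is populated, a self-contained sketch of calling the function looks like this (the stub and sample rows are assumptions for illustration):

import petl

def get_value(row):
    # assumed behaviour: pick the first non-empty typed value column
    for field in ('bool_v', 'str_v', 'long_v', 'dbl_v'):
        if row[field] not in (None, ''):
            return row[field]
    return None

tbl = [['entity_id', 'key', 'ts', 'bool_v', 'str_v', 'long_v', 'dbl_v'],
       ['1ea47494dc14d40bd76a73c738b665f', 25, '1583010011665', None, None, None, -1.8],
       ['1ea47494dc14d40bd76a73c738b665f', 36, '1583010000692', None, None, 227, None]]
ts_kv_dict = {25: 'Temperature', 36: 'WindDirection'}
print(petl.lookall(transform_fields_254(tbl, ts_kv_dict)))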
Example #5
def drained_entries(ctx: typer.Context, issues, entries, project):
    config = ctx.meta['config']
    empty_entries, unset_entries = petl.biselect(
        entries, lambda row: row['issue_id'] is None)

    drain_issues = list(
        petl.dicts(
            transform.select_drain_issues(
                issues,
                assignee_id=ctx.meta['rdm_user']['id'],
                drain_cf_id=get_proj_attr(config, project,
                                          'rdm_drain_cf_id'))))

    if not len(drain_issues):
        log.error('No drain issues found')
        return petl.head(unset_entries, 0), entries

    if len(drain_issues) > 1:
        log.warning(
            f'Found {len(drain_issues)} drain issues. Will use only first one')

    drain_issue = drain_issues[0]
    drained = petl.addfield(petl.cutout(empty_entries, 'issue_id'), 'issue_id',
                            drain_issue['id'])
    return drained, unset_entries
Example #6
def add_issue_id_from_description(inp):
    def m(row):
        try:
            match = re.search(r'#(\d+)', row['description'])
            return int(match.group(1))
        except AttributeError:
            return None

    return etl.addfield(inp, 'issue_id', m)
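A quick usage sketch with an in-memory table (the sample descriptions are made up):

import re
import petl as etl

entries = [['description'], ['Fix login bug #123'], ['Weekly meeting']]
print(etl.lookall(add_issue_id_from_description(entries)))
# issue_id is 123 for the first row and None where no '#<number>' token is present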
Example #7
def main(argv):

    parser = argparse.ArgumentParser()
    parser.add_argument("--pledges-filename", required=True, help="Input UTF8 CSV with pledges data "
        "dumped from Servant Keeper")
    parser.add_argument("--output-filename", required=True, help="Output CSV filename which will be loaded with "
        "pledges data in CCB import format ")
    parser.add_argument('--trace', action='store_true', help="If specified, prints tracing/progress messages to "
        "stdout")
    args = parser.parse_args()

    assert os.path.isfile(args.pledges_filename), "Error: cannot open file '" + args.pledges_filename + "'"

    table = petl.fromcsv(args.pledges_filename)

    table = petl.rename(table, {
        'Frequency': 'SK Frequency',
        'Individual ID': 'SK Individual ID'
        })

    table = petl.addfield(table, 'Individual ID', lambda rec: rec['SK Individual ID'])
    table = petl.addfield(table, 'Campus', '')
    table = petl.addfield(table, 'Category Pledged To', lambda rec: rec['Account'])
    table = petl.addfield(table, 'Amount Pledged', convert_amount)
    table = petl.addfield(table, 'Total Amount Pledged', lambda rec: rec['Pledged'])
    table = petl.addfield(table, 'Frequency', lambda rec: {1:'Yearly', 4:'Yearly', 12:'Monthly',
        52:'Weekly', 24:'Monthly', 2:'Yearly'}[int(rec['Payments'])])
    table = petl.addfield(table, 'Number of Gifts', lambda rec: {'Yearly':1, 'Monthly':12,
        'Weekly':52}[rec['Frequency']])
    table = petl.addfield(table, 'Length Multiplier', lambda rec: {'Yearly':'Years', 'Monthly':'Months',
        'Weekly':'Weeks'}[rec['Frequency']])
    table = petl.addfield(table, 'Start Date', lambda rec: {'Operating Income':'2013-01-01',
        'Mortgage Principal':'2013-01-01', 'Operating Income 2015':'2015-01-01'}[rec['Account']])
    table = petl.addfield(table, 'End Date', lambda rec: {'Operating Income':'2013-12-31',
        'Mortgage Principal':'2013-12-31', 'Operating Income 2015':'2015-12-31'}[rec['Account']])

    trace('CONVERTING AND THEN EMITTING TO CSV FILE...', args.trace, banner=True)

    table.progress(200).tocsv(args.output_filename)

    trace('OUTPUT TO CSV COMPLETE.', args.trace, banner=True)

    trace('DONE!', args.trace, banner=True)
Example #8
def test_tograph():
    """
    Test converting a table to a Graph.
    """
    table2 = etl.addfield(table1, 'a', 'foaf:Person')
    g = table2.tograph(ctx)
    nt = g.serialize(format='nt')
    assert '<http://example.org/people/n2>' in nt
    assert '<http://xmlns.com/foaf/0.1/Person>' in nt

    #read back in as string
    ng = Graph().parse(data=nt, format='nt')
Example #9
 def load(self):
     self.table = etl.fromcsv(self.InputTable)
     for Column in self.Columns:
         if "Type" in Column:
             print(Column)
             self.table = etl.convert(self.table, Column["Column"], eval(Column["Type"]) )     
         if "Convert" in Column:
             print(Column)
             self.table = etl.convert(self.table, Column["Column"], Column["Convert"])
         if "AddField" in Column:
             print(Column)
             self.table = etl.addfield(self.table, Column["Column"], eval(Column["AddField"]))
Example #10
def test_tojsonld():
    """
    Test serializing as JSON LD.
    """
    #serialize as jsonld
    table2 = etl.addfield(table1, 'a', 'foaf:Person')
    table2.tojsonld(ctx, source=tmpfile)

    with open(tmpfile, 'rb') as inf:
        data = json.load(inf)
        #Check context
        assert data['@context']['foaf'].find('foaf') > -1
        assert data['@graph'][0].get('name') is not None
Example #11
def group_entries_by_day(inp):
    hdr = petl.header(inp)

    agg = OrderedDict()
    for field in hdr:
        # using first found value
        agg[field] = field, next

    agg['dur'] = 'dur', lambda durs: sum(durs, timedelta())
    agg['start'] = 'start', min

    with_day = petl.addfield(inp, 'start_date',
                             lambda row: row.get('start').date())
    index_keys = ('start_date', 'description')
    result = petl.aggregate(with_day, index_keys, agg)
    return petl.cutout(result, 'start_date')
Example #12
def extract_named_objects_to_columns(inp, named_object_columns: List[str]):
    res = inp

    def m(column1: str, prop1: str):
        def mod(row):
            if row.get(column1):
                return row[column1].get(prop1, None)
            return None

        return mod

    for column in named_object_columns:
        for prop in ('id', 'name'):
            res = etl.addfield(res, f'{column}_{prop}', m(column, prop))

    return res
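A usage sketch, assuming (as the accessor above implies) that each named-object column holds a dict with 'id' and 'name' keys:

import petl as etl

rows = [['issue'], [{'id': 7, 'name': 'Drain'}], [None]]
flattened = extract_named_objects_to_columns(rows, ['issue'])
print(etl.lookall(flattened))  # adds issue_id and issue_name columns, leaving None where the object is missing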
Example #13
def perform_people_table_transformations(
        petl_table: petl.Table,
        session: Optional[requests.Session] = None) -> petl.Table:
    # Add a date column (%Y-%m-%d) based on edited date
    new_table = petl.addfield(
        petl_table,
        "date",
        lambda row: row["edited"]
        [:10
         ],  # Seems more efficient than parsing and formatting back to isoformat
    )

    # Resolve the homeworld field into the homeworld's name (/planets/1/ -> Tatooine)
    homeworld_name_retriever = RelatedResourceAttributeRetriever(
        "name", session=session)
    new_table = petl.convert(
        new_table, "homeworld",
        lambda v: homeworld_name_retriever.fetch_from_url(v))

    # Fields referencing different resources and date fields other than date/birth_year can be dropped
    return petl.transform.basics.cutout(new_table, *FIELDS_TO_REMOVE)
Example #14
 def get_relationships(self):
     "Parses a list of `Relationship` objects."
     core_file = _find_loinc_table_core_file(self.uri.path)
     core = etl.fromcsv(core_file, delimiter=',')
     core = etl.cut(core, ['LOINC_NUM', 'LONG_COMMON_NAME'])
     hierarchy_file = _find_multi_axial_hierarchy_file(self.uri.path)
     hierarchy = etl.fromcsv(hierarchy_file, delimiter=',')
     hierarchy = etl.leftjoin(hierarchy, core, lkey='CODE', rkey='LOINC_NUM')
     hierarchy = etl.cut(hierarchy, ['IMMEDIATE_PARENT', 'CODE', 'CODE_TEXT', 'LONG_COMMON_NAME'])
     hierarchy = etl.fillright(hierarchy)
     hierarchy = etl.cut(hierarchy, ['IMMEDIATE_PARENT', 'CODE', 'LONG_COMMON_NAME'])
     hierarchy = etl.rename(hierarchy, 'LONG_COMMON_NAME', 'CODE_TEXT')
     parents = etl.cut(hierarchy, ['CODE', 'CODE_TEXT'])
     hierarchy = etl.selectne(hierarchy, 'IMMEDIATE_PARENT', '')
     hierarchy = etl.leftjoin(hierarchy, parents, lkey='IMMEDIATE_PARENT', rkey='CODE', lprefix='source.', rprefix='target.')
     hierarchy = etl.distinct(hierarchy)
     if self.versioned:
         version = _parse_version(hierarchy_file)
         hierarchy = etl.addfield(hierarchy, 'version', version)
     hierarchy = etl.rowmapmany(hierarchy, _to_json, ['relationship'])
     return hierarchy
Example #15
def order_by_constraint(base_path, table, schema, self_dep_set):
    file_name = base_path + "/content/data/" + table + ".tsv"
    tempfile = NamedTemporaryFile(mode='w',
                                  dir=base_path + "/content/data/",
                                  delete=False)
    table = etl.fromcsv(file_name,
                        delimiter='\t',
                        skipinitialspace=True,
                        quoting=csv.QUOTE_NONE,
                        quotechar='',
                        escapechar='')

    key_dep_dict = {}

    # print(file_name)
    for constraint in self_dep_set:
        child_dep, parent_dep = constraint.split(':')
        data = etl.values(table, child_dep, parent_dep)
        for d in data:
            key_dep_set = {d[1]}
            key_dep_dict.update({d[0]: key_dep_set})

    key_dep_list = toposort_flatten(key_dep_dict)
    table = etl.addfield(table, 'pwb_index',
                         lambda rec: int(key_dep_list.index(rec[child_dep])))
    table = etl.sort(table, 'pwb_index')
    table = etl.cutout(table, 'pwb_index')

    writer = csv.writer(tempfile,
                        delimiter='\t',
                        quoting=csv.QUOTE_NONE,
                        quotechar='',
                        lineterminator='\n',
                        escapechar='')

    writer.writerows(table)
    shutil.move(tempfile.name, file_name)
Example #16
File: convert.py  Project: bazenkov/pytb
def transform_fields_old(tbl):
    """The input is a dump of ts_kv table for TB version <= 2.5.4:
    +-------------+---------------------------------+---------------+---------------+--------+-------+--------+-------+
    | entity_type | entity_id                       | key           | ts            | bool_v | str_v | long_v | dbl_v |
    +=============+=================================+===============+===============+========+=======+========+=======+
    | DEVICE      | 1ea47494dc14d40bd76a73c738b665f | Temperature   | 1583010011665 |        |       |        | -1.8  |
    +-------------+---------------------------------+---------------+---------------+--------+-------+--------+-------+
    | DEVICE      | 1ea47494dc14d40bd76a73c738b665f | WindDirection | 1583010000692 |        |       | 227    |       |
    +-------------+---------------------------------+---------------+---------------+--------+-------+--------+-------+  
    
    The output:
    +-------------+---------------------------------+---------------+---------------+--------+
    | entity_type | entity_id                       | key           | ts            | value  |
    +=============+=================================+===============+===============+========+
    | DEVICE      | 1ea47494dc14d40bd76a73c738b665f | Temperature   | 1583010011665 |  -1.8  |
    +-------------+---------------------------------+---------------+---------------+--------+
    | DEVICE      | 1ea47494dc14d40bd76a73c738b665f | WindDirection | 1583010000692 |   227  |
    +-------------+---------------------------------+---------------+---------------+--------+
    
    """
    ts_kv_table = petl.transform.conversions.convert(tbl, 'ts', int)
    ts_kv_table = petl.addfield(ts_kv_table, 'value', lambda row: get_value(row))
    ts_kv_table = petl.cutout(ts_kv_table, 'bool_v', 'str_v', 'long_v', 'dbl_v')
    return ts_kv_table
Example #17
example_data = """foo,bar,baz
a,1,3.4
b,2,7.4
c,6,2.2
d,9,8.1
"""
with open('example.csv', 'w') as f:
    f.write(example_data)

import petl as etl
table1 = etl.fromcsv('example.csv')
table2 = etl.convert(table1, 'foo', 'upper')
table3 = etl.convert(table2, 'bar', int)
table4 = etl.convert(table3, 'baz', float)
table5 = etl.addfield(table4, 'quux', lambda row: row.bar * row.baz)
table5

table = (
    etl
    .fromcsv('example.csv')
    .convert('foo', 'upper')
    .convert('bar', int)
    .convert('baz', float)
    .addfield('quux', lambda row: row.bar * row.baz)
)
table

l = [['foo', 'bar'], ['a', 1], ['b', 2], ['c', 2]]
table = etl.wrap(l)
table.look()
Example #18
    if table_h == 'quote':
        table_c['t'] = str(datetime.datetime.fromtimestamp(table_c['t']))
        price = table_c
    if table_h == 'stock/insider-transactions':
        for transaction in table_c['data']:
            insiders.append(transaction)
    if table_h == 'news-sentiment':
        sentiment = table_c
    if table_h == 'stock/recommendation':
        for recommend in table_c:
            recommendation.append(recommend)
    if table_h == 'stock/metric':
        metrics = table_c['metric']

table_price = etl.fromdicts([price])
table_price = etl.addfield(table_price, 'symbol', stock)
table_recommendation = etl.movefield(etl.fromdicts(recommendation), 'period',
                                     6)
table_insiders = etl.movefield(etl.fromdicts(insiders), 'transactionDate', 6)
table_insiders = etl.addfield(table_insiders, 'symbol', stock)

list_of_metrics = [
    '10DayAverageTradingVolume', '13WeekPriceReturnDaily',
    '26WeekPriceReturnDaily', '3MonthAverageTradingVolume', '52WeekHigh',
    '52WeekHighDate', '52WeekLow', '52WeekLowDate', '52WeekPriceReturnDaily',
    '5DayPriceReturnDaily'
]
important_metrics = {}
for metric in list_of_metrics:
    important_metrics[metric] = metrics[metric]
important_metrics['_date'] = str(datetime.datetime.now())[:19]
Example #19
def concat_columns(table, new_column, column_left, column_right):
    new_table = etl.addfield(
        table, new_column,
        lambda rec: rec[column_left] + ', ' + rec[column_right], 1)
    return etl.cutout(new_table, column_left, column_right)
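A short usage sketch (the sample table is made up):

import petl as etl

people = [['first', 'last'], ['Sally', 'Jones'], ['Bob', 'Smith']]
names = concat_columns(people, 'name', 'last', 'first')
print(etl.lookall(names))  # a single 'name' column: 'Jones, Sally' and 'Smith, Bob'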
Example #20
    def match_columns(self,
                      desired_columns,
                      fuzzy_match=True,
                      if_extra_columns='remove',
                      if_missing_columns='add'):
        """
        Changes the column names and ordering in this Table to match a list of desired column
        names.

        `Args:`
            desired_columns: list
                Ordered list of desired column names
            fuzzy_match: bool
                Whether to normalize column names when matching against the desired column names,
                removing whitespace and non-alphanumeric characters, and lowercasing everything.
                Eg. With this flag set, "FIRST NAME" would match "first_name".
                If the Table has two columns that normalize to the same string (eg. "FIRST NAME"
                and "first_name"), the latter will be considered an extra column.
            if_extra_columns: string
                If the Table has columns that don't match any desired columns, either 'remove'
                them, 'ignore' them, or 'fail' (raising an error).
            if_missing_columns: string
                If the Table is missing some of the desired columns, either 'add' them (with a
                value of None), 'ignore' them, or 'fail' (raising an error).

        `Returns:`
            `Parsons Table` and also updates self
        """

        from parsons.etl import Table  # Just trying to avoid recursive imports.

        normalize_fn = Table.get_normalized_column_name if fuzzy_match else (
            lambda s: s)

        # Create a mapping of our "normalized" name to the original column name
        current_columns_normalized = {
            normalize_fn(col): col
            for col in self.columns
        }

        # Track any columns we need to add to our current table from our desired columns
        columns_to_add = []
        # We are going to do a "cut" later to trim our table and re-order the columns, but
        # we won't have renamed our columns yet, so we need to remember their un-normalized
        # form
        cut_columns = []
        # We are going to also rename our columns AFTER we cut, so we want to remember their
        # normalized names
        final_header = []

        # Loop through our desired columns -- the columns we want to see in our final table
        for desired_column in desired_columns:
            normalized_desired = normalize_fn(desired_column)
            # Try to find our desired column in our Table
            if normalized_desired not in current_columns_normalized:
                # If we can't find our desired column in our current columns, then it's "missing"
                if if_missing_columns == 'fail':
                    # If our missing strategy is to fail, raise an exception
                    raise TypeError(
                        f"Table is missing column {desired_column}")
                elif if_missing_columns == 'add':
                    # We have to add to our table
                    columns_to_add.append(desired_column)
                    # We will need to remember this column when we cut down to desired columns
                    cut_columns.append(desired_column)
                    # This will be in the final table
                    final_header.append(desired_column)
                elif if_missing_columns != 'ignore':
                    # If it's not ignore, add, or fail, then it's not a valid strategy
                    raise TypeError(f"Invalid option {if_missing_columns} for "
                                    "argument `if_missing_columns`")
            else:
                # We have found this in our current columns, so take it out of our list to search
                current_column = current_columns_normalized.pop(
                    normalized_desired)
                # Add the column to our intermediate table as the old column name
                cut_columns.append(current_column)
                # Add to our final header list as the "desired" name
                final_header.append(desired_column)

        # Look for any "extra" columns from our current table that aren't in our desired columns
        for current_column in current_columns_normalized.values():
            # Figure out what to do with our "extra" columns
            if if_extra_columns == 'fail':
                # If our missing strategy is to fail, raise an exception
                raise TypeError(f"Table has extra column {current_column}")
            elif if_extra_columns == 'ignore':
                # If we're "ignore"ing our extra columns, we should keep them by adding them to
                # our intermediate and final columns list
                cut_columns.append(current_column)
                final_header.append(current_column)
            elif if_extra_columns != 'remove':
                # If it's not ignore, add, or fail, then it's not a valid strategy
                raise TypeError(f"Invalid option {if_extra_columns} for "
                                "argument `if_extra_columns`")

        # Add any columns we need to add
        for column in columns_to_add:
            self.table = petl.addfield(self.table, column, None)

        # Cut down to just the columns we care about
        self.table = petl.cut(self.table, *cut_columns)

        # Rename any columns
        self.table = petl.setheader(self.table, final_header)

        return self
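A hedged usage sketch of the method above on a Parsons Table (the sample row and desired columns are made up):

from parsons import Table

tbl = Table([{'FIRST NAME': 'Sally', 'LAST NAME': 'Jones', 'Phone': '555-1234'}])
# with the defaults, fuzzy matching maps 'FIRST NAME' -> 'first_name', the extra 'Phone'
# column is removed, and the missing 'email' column is added with None values
tbl.match_columns(['first_name', 'last_name', 'email'])
print(tbl.columns)  # ['first_name', 'last_name', 'email']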
Example #21

# Load Master.csv from the Lahman database.
table = etl.fromcsv(sys.argv[1])

# Use US births only
table2 = etl.select(table, lambda rec: rec.birthCountry == 'USA')

# Only use these fields
table3 = etl.cut(table2, 'nameFirst', 'nameLast', 'debut', 'bbrefID', 'weight', 'height', 'finalGame', 'birthCity', 'birthState', 'birthYear')

# Remove null birth city and birth year
table4 = etl.select(table3, lambda rec: rec.birthCity != "" and rec.birthYear != "")

# Add Baseball Reference URL
table5 = etl.addfield(table4, 'baseball_ref_url', add_bbreflink)
# Remove unnecessary bbrefid
table6 = etl.cutout(table5, "bbrefID")

# Load city,state lat long table.
city = etl.fromcsv(sys.argv[2])
# Only use these fields
city2 = etl.cut(city, "city", "state", "lat", "long")

# Join tables by two keys
lat_table = etl.leftjoin(table6, city2, lkey=["birthCity", "birthState"], rkey=["city", "state"])

# Output merged file to csv
lat_table.tocsv(sys.argv[3])

Example #22
import petl as etl
import petlld

# set up a petl table  to demonstrate
table1 = [["uri", "name"], ["n1", "Smith, Bob"], ["n2", "Jones, Sally"], ["n3", "Adams, Bill"]]

# use petl utilities to add a column with our data type - foaf:Person
table2 = etl.addfield(table1, "a", "foaf:Person")

# a JSON-LD context for our data
ctx = {
    "@base": "http://example.org/people/",
    "a": "@type",
    "uri": "@id",
    "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
    "foaf": "http://xmlns.com/foaf/0.1/",
    "name": "rdfs:label",
}

# serialize the data as JSON-LD
table2.tojsonld(ctx, indent=2)

graph = table2.tograph(ctx)

print
print "-" * 10
print

print graph.serialize(format="turtle")
Example #23
def procesar_fuente(path, nombre):
    try: 
        # Process the confirmed cases first
        tabla = etl.fromcsv(path)

        # Rename the headers
        tabla = etl.rename(tabla, {'Country/Region': 'Country'})

        # Adjust the data types
        # From column 5 onwards the data type is integer (the number of people/cases)
        # We also take the opportunity to change the header date format from 1/23/20 to 2020-01-23
        headers = etl.fieldnames(tabla)
        i=0
        for header in headers:
            if i>=4:
                tabla = etl.convert(tabla, header, int)        # fix the data type
                fecha = datetime.datetime.strptime(header, '%m/%d/%y')    # compute the date in the correct format
                tabla = etl.rename(tabla, header, fecha.strftime('%Y-%m-%d'))   
            i = i + 1

        # Drop the Province/State, Lat and Lon columns that we won't use
        tabla = etl.cutout(tabla, 0, 2, 3)

        # Adjust some country names so we can later assign them a region/continent
        tabla = etl.convert(tabla, 'Country', 'replace', 'Congo (Brazzaville)', 'Congo')
        tabla = etl.convert(tabla, 'Country', 'replace', 'Congo (Kinshasa)', 'Democratic Republic of the Congo')
        tabla = etl.convert(tabla, 'Country', 'replace', 'Cote d\'Ivoire', 'Ivory Coast')
        tabla = etl.convert(tabla, 'Country', 'replace', 'Korea, South', 'South Korea')
        tabla = etl.convert(tabla, 'Country', 'replace', 'West Bank and Gaza', 'Palestine')
        tabla = etl.convert(tabla, 'Country', 'replace', 'Burma', 'Myanmar')
        tabla = etl.convert(tabla, 'Country', 'replace', 'US', 'USA')
        tabla = etl.convert(tabla, 'Country', 'replace', 'Taiwan*', 'Taiwan')

        # Then group and accumulate the results by country
        df_confirmed = etl.todataframe(tabla)
        df = df_confirmed.groupby(['Country']).sum()
        tabla = etl.fromdataframe(df, include_index=True)

        # Rename the Country field again
        tabla = etl.rename(tabla, {'index': 'Country'})

        # Then melt the date columns into rows and rename the new columns
        tabla = etl.melt(tabla, 'Country')
        tabla = etl.rename(tabla, {'variable': 'Date'})
        tabla = etl.rename(tabla, {'value': 'Cases'})

        # Then add the continent for grouping
        tabla = etl.addfield(tabla, 'Continent', lambda rec: get_continent_code(rec['Country']))

        # And again make sure the fields are of the data types they should be.
        tabla = etl.convert(tabla, 'Cases', int)
        tabla = etl.convert(tabla, 'Date', lambda v: datetime.datetime.strptime(v, '%Y-%m-%d') )

        # Finally, load the data into the data repository
        conn = pymysql.connect(password='******', database='covid', user='******')
        conn.cursor().execute('SET SQL_MODE=ANSI_QUOTES')
        etl.todb(tabla, conn, nombre, create=True, drop=True)
        conn.close()
    except:
        print('An error has occurred! ', sys.exc_info()[0])
        raise
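A hedged usage sketch; the path below assumes the Johns Hopkins CSSE time-series CSV layout that the function expects, and the exact filename is an assumption:

# loads the confirmed-cases time series and writes it to the 'confirmed' table in MySQL
procesar_fuente('time_series_covid19_confirmed_global.csv', 'confirmed')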
Example #24
def append_tailings_reports_to_code_required_reports(connection, commit=False):
    src_table = etl.fromdb(
        connection,
        'SELECT exp_doc.mine_guid, exp_doc.exp_document_guid, req_doc.req_document_name, exp_doc.due_date, exp_doc.exp_document_status_code, exp_doc.received_date, exp_doc.active_ind, exp_doc_x.mine_document_guid, exp_doc.create_user, exp_doc.create_timestamp, exp_doc.update_user, exp_doc.update_timestamp from mine_expected_document exp_doc \
        inner join mine_expected_document_xref exp_doc_x on exp_doc.exp_document_guid = exp_doc_x.exp_document_guid\
        inner join mds_required_document req_doc on req_doc.req_document_guid = exp_doc.req_document_guid'
    )

    req_document_crr_defintion_map = [
        ['req_document_name', 'mine_report_definition_id'],
        ['Summary of TSF and Dam Safety Recommendations', 28],
        ['ITRB Activities Report', 27],
        ['Register of Tailings Storage Facilities and Dams', 47],
        ['Dam Safety Inspection (DSI) Report', 26],
        ['Dam Safety Review (DSR) Report', 31],
        ['“As-built” Reports', 32],
        ['Annual Reclamation', 25],
        ['MERP Record of Testing', 3],
        #['Annual Manager\'s Report', __________________ ], no mapping or data, ignore.
        ['OMS Manual', 33],
        ['Annual reconciliation of water balance and water management plans', 44],
        ['TSF risk assessment', 46],
        ['Mine Emergency Preparedness and Response Plan (MERP)', 24],
        ['Performance of high risk dumps', 29]
    ]

    table1 = etl.join(src_table, req_document_crr_defintion_map, 'req_document_name')
    mine_report = etl.cutout(table1, 'req_document_name')

    #to be inserted into db
    mine_report = etl.addfield(mine_report, 'submission_year', 2019)
    mine_report = etl.rename(mine_report, 'exp_document_status_code',
                             'mine_report_submission_status_code')
    mine_report = etl.addfield(mine_report, 'deleted_ind', lambda x: not x.active_ind)
    mine_report = etl.cutout(mine_report, 'active_ind')
    #to determine what FK's will be so can insert into related tables
    max_report_id = etl.fromdb(connection,
                               'select last_value from public.mine_report_mine_report_id_seq')[1][0]
    max_report_submission_id = etl.fromdb(
        connection,
        'select last_value from public.mine_report_submission_mine_report_submission_id_seq')[1][0]

    #if sequence hasn't been used yet, fix off by one
    if max_report_id == 1:
        max_report_id = 0
    if max_report_submission_id == 1:
        max_report_submission_id = 0

    #get one-to-many
    mine_report, mine_report_submission_documents = etl.unjoin(mine_report,
                                                               'mine_document_guid',
                                                               key='exp_document_guid')

    #add PK's for mappings
    mine_report_with_ids = etl.addrownumbers(mine_report,
                                             start=max_report_id + 1,
                                             step=1,
                                             field='mine_report_id')
    mine_report_with_ids = etl.addrownumbers(mine_report_with_ids,
                                             start=max_report_submission_id + 1,
                                             step=1,
                                             field='mine_report_submission_id')
    print(f'max_report_id= {max_report_id}, max_report_submission_id={max_report_submission_id}')
    #copy out fields for submission tables
    mine_report_submissions = etl.cut(mine_report_with_ids, [
        'mine_report_id', 'exp_document_guid', 'mine_report_submission_status_code', 'create_user',
        'create_timestamp', 'update_user', 'update_timestamp'
    ])
    mine_report_submissions = etl.addfield(mine_report_submissions,
                                           'submission_date', lambda x: x.create_timestamp)
    #remove fields not in mine_report
    mine_report = etl.cutout(mine_report, 'mine_report_submission_status_code')

    #replace exp_document_guid FK with mine_report_submission FK
    submission_id_lookup = etl.cut(mine_report_with_ids,
                                   ['mine_report_submission_id', 'exp_document_guid'])
    mine_report_submission_documents = etl.join(submission_id_lookup,
                                                mine_report_submission_documents,
                                                key='exp_document_guid')
    mine_report_submission_documents = etl.cutout(mine_report_submission_documents,
                                                  'exp_document_guid')

    #removed original PK
    mine_report = etl.cutout(mine_report, 'exp_document_guid')
    mine_report_submissions = etl.cutout(mine_report_submissions, 'exp_document_guid')

    print(etl.valuecounter(etl.distinct(table1, key='exp_document_guid'), 'req_document_name'))
    print(etl.valuecounter(mine_report, 'mine_report_definition_id'))
    print(table1)
    print(mine_report)
    print(mine_report_submissions)
    print(mine_report_submission_documents)

 
    etl.appenddb(mine_report, connection, 'mine_report', commit=False)
    print('INSERT mine_report staged')
    etl.appenddb(mine_report_submissions, connection, 'mine_report_submission', commit=False)
    print('INSERT mine_report_submission staged')
    etl.appenddb(mine_report_submission_documents,
                    connection,
                    'mine_report_document_xref',
                    commit=False)
    print('INSERT mine_report_document_xref staged')
    if commit:  
        connection.commit()
        print('DATA CREATION COMPLETE')
    else:
        connection.rollback()
        print('NO DATA CREATED: add --commit=true to insert report rows')
Example #25
def sales_summary(start_dt=None, end_dt=None):
    """tally up gross (sale over list) profits
    TODO: tally up net profites (gross profit vs inventory purchase total)

    TODO: Keyword Arguments:
        start_dt {[type]} -- datetime for start of query (default: {None})
        end_dt {[type]} -- datetime for start of query [description] (default: {None})

    Returns:
        [dict] -- various types of sales information, stored in a dictionary.
    """

    # products = db.session.query(Product).all()
    # sales = db.session.query(Sale).all()

    # retrieve existing tables
    products_records = etl.fromdb(db.engine, 'SELECT * FROM product')
    sales_records = etl.fromdb(db.engine, 'SELECT * FROM sale')

    # join product info to sales data
    sales_data = etl.join(sales_records,
                          products_records,
                          lkey='product_id',
                          rkey='id')

    # prep joined sales data for tabulation
    sales_data = etl.convert(sales_data, 'date', lambda dt: format_date(dt))
    sales_data = etl.sort(sales_data, 'date')
    sales_data = etl.convert(sales_data, 'quantity',
                             lambda q: handle_none(q, replace_with=1))
    sales_data = etl.addfield(sales_data, 'profit',
                              lambda rec: calculate_profit(rec))
    sales_data = etl.addfield(sales_data, 'gross_sales',
                              lambda rec: calculate_gross_sales(rec))

    # summarize data into charting-friendly data structures
    chart_count = etl.fold(sales_data,
                           'date',
                           operator.add,
                           'quantity',
                           presorted=True)
    chart_count = etl.rename(chart_count, {'key': 'x', 'value': 'y'})
    chart_count, chart_count_missing_date = etl.biselect(
        chart_count, lambda rec: rec.x is not None)
    # print(chart_count)
    # etl.lookall(chart_count)

    chart_gross = etl.fold(sales_data,
                           'date',
                           operator.add,
                           'gross_sales',
                           presorted=True)
    chart_gross = etl.rename(chart_gross, {'key': 'x', 'value': 'y'})
    chart_gross, chart_gross_missing_date = etl.biselect(
        chart_gross, lambda rec: rec.x is not None)
    # print(chart_gross)
    # etl.lookall(chart_gross)

    chart_profit = etl.fold(sales_data,
                            'date',
                            operator.add,
                            'profit',
                            presorted=True)
    chart_profit = etl.rename(chart_profit, {'key': 'x', 'value': 'y'})
    chart_profit, chart_profit_missing_date = etl.biselect(
        chart_profit, lambda rec: rec.x is not None)

    # tabulate some figures
    gross_sales = 0
    profits = 0
    for sale in etl.dicts(sales_data):
        profits += calculate_profit(sale)
        gross_sales += calculate_gross_sales(sale)

    # for i in etl.dicts(chart_count):
    #     print(i)
    # for i in etl.dicts(chart_gross):
    #     print(i)

    return {
        'gross_sales': gross_sales,
        'profits': profits,
        'chart_gross': list(etl.dicts(chart_gross)),
        'chart_gross_missing_date': list(etl.dicts(chart_gross_missing_date)),
        'chart_profit': list(etl.dicts(chart_profit)),
        'chart_profit_missing_date':
        list(etl.dicts(chart_profit_missing_date)),
        'chart_count': list(etl.dicts(chart_count)),
        'chart_count_missing_date': list(etl.dicts(chart_count_missing_date))
    }
Example #26
def lee_convert(filepath):
    """
    Takes the file path to a csv in the format used by Kansas City proper
    converts to universal format 
    outputs csv.
    """
    kclee = etl.fromcsv(filepath)

    kclee = etl.addfield(kclee, 'PoleID', lambda x: 'KCLEE' + x['OBJECTID'])
    kclee = etl.addfield(kclee, 'Longitude', lambda x: x['POINT_X'])
    kclee = etl.addfield(kclee, 'Latitude', lambda x: x['POINT_Y'])
    kclee = etl.addfield(kclee, 'LightbulbType', lambda x: x['LAMPTYPE'])
    kclee = etl.addfield(kclee, 'Wattage', lambda x: x['WATTS'])
    kclee = etl.addfield(kclee, 'Lumens', lambda x: x['LUMENS'])
    kclee = etl.addfield(kclee, 'AttachedTech', False)
    kclee = etl.addfield(kclee, 'LightAttributes', lambda x: x['FIXTURETYP'])
    kclee = etl.addfield(kclee, 'FiberWiFiEnable', False)
    kclee = etl.addfield(kclee, 'PoleType', None)
    kclee = etl.addfield(kclee, 'PoleOwner', 'Lee Summit')
    kclee = etl.addfield(kclee, 'DataSource', 'Lee Summit')
    kclee = etl.cut(kclee, 'PoleID', 'Longitude', 'Latitude', 'LightbulbType',
                    'Wattage', 'Lumens', 'AttachedTech', 'LightAttributes',
                    'FiberWiFiEnable', 'PoleType', 'PoleOwner', 'DataSource')
    etl.tocsv(kclee, 'data/kcleesummit_clean.csv')
Example #27
def kcmo_convert(filepath, xtrapath):
    """
    Takes the file path to a csv in the format used by Kansas City proper
    converts to universal format 
    outputs csv.
    """
    kcmo = etl.fromcsv(filepath)
    kcx = etl.fromxlsx(xtrapath)
    kcjoin = etl.join(kcmo, kcx, lkey='POLEID', rkey='IDNumber')
    del kcmo
    del kcx

    kcjoin = etl.addfield(kcjoin, 'PoleID', lambda x: x['POLEID'])
    kcjoin = etl.addfield(kcjoin, 'Longitude',
                          lambda x: geom_to_tuple(x['the_geom'])[0])
    kcjoin = etl.addfield(kcjoin, 'Latitude',
                          lambda x: geom_to_tuple(x['the_geom'])[1])
    kcjoin = etl.addfield(kcjoin, 'LightbulbType',
                          lambda x: x['LUMINAIRE TYPE'])
    kcjoin = etl.addfield(kcjoin, 'Wattage', lambda x: x['WATTS'])
    kcjoin = etl.addfield(kcjoin, 'Lumens', None)
    kcjoin = etl.addfield(
        kcjoin, 'LightAttributes', lambda x: make_a_list(
            x['ATTACHMENT 10'], x['ATTACHMENT 9'], x['ATTACHMENT 8'], x[
                'ATTACHMENT 7'], x['ATTACHMENT 6'], x['ATTACHMENT 5'], x[
                    'ATTACHMENT 4'], x['ATTACHMENT 3'], x['ATTACHMENT 2'], x[
                        'ATTACHMENT 1'], x['SPECIAL_N2'], x['SPECIAL_NO']))
    kcjoin = etl.addfield(kcjoin, 'AttachedTech',
                          lambda x: bool(x['LightAttributes']))
    kcjoin = etl.addfield(
        kcjoin, 'FiberWiFiEnable', lambda x: find_wifi(*x[
            'LightAttributes'], x['SPECIAL_N2'], x['SPECIAL_NO']))
    kcjoin = etl.addfield(kcjoin, 'PoleType', lambda x: x['POLE TYPE'])
    kcjoin = etl.addfield(kcjoin, 'PoleOwner', lambda x: x['POLE OWNER'])
    kcjoin = etl.addfield(kcjoin, 'DataSource', 'Kansas City')
    kcjoin = etl.cut(kcjoin, 'PoleID', 'Longitude', 'Latitude',
                     'LightbulbType', 'Wattage', 'Lumens', 'AttachedTech',
                     'LightAttributes', 'FiberWiFiEnable', 'PoleType',
                     'PoleOwner', 'DataSource')
    etl.tocsv(kcjoin, 'data/kcmo_clean.csv')
Example #28
import petl as etl

table_header = [
    "Fixed Acidity", "Volatile Acidity", "Citric Acid", "Sugar", "Chlorides",
    "Free SO2", "Total SO2", "Density", "pH", "Sulfates", "Alcohol", "Quality"
]

table1 = etl.addfield(
    etl.convertnumbers(
        etl.setheader(etl.fromcsv('winequality-red.csv'), table_header)),
    "Type", "Red")
table2 = etl.addfield(
    etl.convertnumbers(
        etl.setheader(etl.fromcsv('winequality-white.csv'), table_header)),
    "Type", "White")

#print(etl.head(table1))
#print(etl.head(table2))

table1_filtered = etl.select(table1, "Quality", lambda v: v > 6)
table2_filtered = etl.select(table2, "Quality", lambda v: v > 4)

good_wines = etl.cat(table1_filtered, table2_filtered)

good_wines_enhanced = etl.addfields(
    good_wines,
    [("Max Acidity",
      lambda rec: rec["Fixed Acidity"] + rec["Volatile Acidity"]),
     ("Locked SO2", lambda rec: rec["Total SO2"] - rec["Free SO2"])])
#print(etl.head(good_wines_enhanced))
#print(etl.tail(good_wines_enhanced))
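The addfields() call above is equivalent to chaining addfield() twice in petl's fluent style, shown here only as an alternative formulation:

good_wines_enhanced = (
    good_wines
    .addfield('Max Acidity', lambda rec: rec['Fixed Acidity'] + rec['Volatile Acidity'])
    .addfield('Locked SO2', lambda rec: rec['Total SO2'] - rec['Free SO2'])
)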
Example #29
import petl as etl

#Extracting data from example csv file
table1 = etl.fromcsv('example.csv')
print table1
#etl.look(table1)

#Transformation function to be applied on extracted data
table2 = etl.convert(table1,'foo','upper')
table3 = etl.convert(table2,'bar',int)
table4 = etl.convert(table3,'baz',float)
table5 = etl.addfield(table4, 'finally', lambda row: row.bar * row.baz)
print table5
#etl.look(table5)

#Writing above ETL pipeline in a functional style
table = (etl
         .fromcsv('example.csv')
         .convert('foo', 'upper')
         .convert('bar', int)
         .convert('baz', float)
         .addfield('finally', lambda row: row.bar * row.baz)
       )

table.look() #look function only displays five rows.
print table

#OOP style programming
l = [['foo','bar'], ['a', 1], ['b', 2], ['c', 2]]
table6 = etl.wrap(l)
print table6
Example #30
look(table1)
table2 = convertnumbers(table1)
look(table2)


# addfield

table1 = [['foo', 'bar'],
          ['M', 12],
          ['F', 34],
          ['-', 56]]

from petl import addfield, look
look(table1)
# using a fixed value
table2 = addfield(table1, 'baz', 42)
look(table2)
# calculating the value
table2 = addfield(table1, 'baz', lambda rec: rec['bar'] * 2)
look(table2)
# an expression string can also be used via expr
from petl import expr
table3 = addfield(table1, 'baz', expr('{bar} * 2'))
look(table3)
    

# rowslice

table1 = [['foo', 'bar'],
          ['a', 1],
          ['b', 2],
Example #31
import petl as etl
readFile = etl.fromtsv("donedeal_data_sample.tsv")
tmpTable = etl.addfield(readFile, 'InKms', lambda rec: rec['mileage'])
tmpTable2File = etl.convert(tmpTable,
                            'InKms',
                            lambda v: int(float(v) * 1.6),
                            where=lambda r: r.mileageType == 'miles')
etl.totsv(tmpTable2File, 'donedeal_inKms.tsv')
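The same pipeline can also be written in petl's fluent style (purely an alternative formulation of the code above):

import petl as etl

tmpTable2File = (
    etl
    .fromtsv("donedeal_data_sample.tsv")
    .addfield('InKms', lambda rec: rec['mileage'])
    .convert('InKms', lambda v: int(float(v) * 1.6),
             where=lambda r: r.mileageType == 'miles')
)
etl.totsv(tmpTable2File, 'donedeal_inKms.tsv')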
Example #32
def transform(mmj_employees, organization_id, debug, fake_email, source_db):
    """
    Load the transformed data into the destination(s)
    """
    # source data table
    source_dt = utils.view_to_list(mmj_employees)
    cut_data = [
        'id', 'email', 'first_name', 'organization_id', 'last_name',
        'created_at', 'updated_at', 'login'
    ]

    employee_data = etl.cut(source_dt, cut_data)

    employees = (etl.addfield(
        employee_data,
        'keys').addfield('name').addfield('role').addfield('dateOfBirth'))

    mappings = OrderedDict()
    mappings['id'] = 'id'
    mappings['name'] = \
        lambda name: _set_name(name.first_name, name.last_name, name.login)
    """
    Roles:
        1 = site-admin
        2 = site-admin
        3 = store-manager
        4 = budtender
    """
    mappings['role'] = lambda x: _assign_role(x.id, source_db)

    mappings['createdAt'] = 'created_at'
    mappings['updatedAt'] = 'updated_at'
    mappings['dateOfBirth'] = \
        lambda _: datetime.datetime(year=1970, month=1,
                                    day=1, hour=2, minute=30)
    mappings['organization_id'] = 'organization_id'  # keep mmj org
    mappings['accountStatus'] = lambda x: _active(x.id, source_db)

    fields = etl.fieldmap(employees, mappings)
    merged_employees = etl.merge(employees, fields, key='id')

    mapped_employees = []
    for item in etl.dicts(merged_employees):
        item['keys'] = {
            'id': item['id'],
            'organization_id': item['organization_id']
        }

        # remove any item['keys'] tuples with None values
        for key in item['keys'].keys():
            if not item['keys'][key]:
                del item['keys'][key]

        item['email'] = _set_email(item['email'], fake_email, debug)

        del item['login']
        del item['first_name']
        del item['last_name']
        del item['created_at']
        del item['id']
        del item['organization_id']
        # set up final structure for API
        mapped_employees.append(item)

    if debug:
        result = json.dumps(mapped_employees,
                            sort_keys=True,
                            indent=4,
                            default=utils.json_serial)
        print(result)

    return mapped_employees
Example #33
# introduction
example_data = """foo,bar,baz
a,1,2.3
b,4,5.6
c,7,8.9
"""
with open('example.csv', 'w') as f:
    f.write(example_data)

import petl as etl

table1 = etl.fromcsv('example.csv')
print(table1)
table2 = etl.convert(table1, 'foo', 'upper')
print(table2)
table3 = etl.convert(table2, {'bar': int, 'baz': float})
print(table3)
table4 = etl.addfield(table3, 'bar*baz', lambda row: row.baz * row.bar)
print(table4.look())

# pipeline (fluent) style
table=(
    etl
    .fromcsv('example.csv')
    .convert({'foo':'upper','bar':int,'baz':float})
    .addfield('bar*bar_new',lambda r:r.baz* r.bar)
    .convert('baz', lambda v, row: v * float(row.bar),pass_row=True)
)
print(table.look())

Example #34
def transform(mmj_menu_items, mmj_categories, prices, organization_id,
              source_db, debug):
    """
    Transform data
    """
    # source data table
    source_dt = utils.view_to_list(mmj_menu_items)

    cut_menu_data = [
        'id', 'vendor_id', 'menu_id', 'dispensary_id', 'strain_id',
        'created_at', 'updated_at', 'category_id', 'name', 'sativa', 'indica',
        'on_hold', 'product_type', 'image_file_name', 'medicine_amount',
        'product_type'
    ]

    cut_prices = [
        'menu_item_id', 'dispensary_id', 'price_half_gram', 'price_gram',
        'price_two_gram', 'price_eigth', 'price_quarter', 'price_half',
        'price_ounce'
    ]

    # Cut out all the fields we don't need to load
    menu_items = etl.cut(source_dt, cut_menu_data)
    prices_data = etl.cut(prices, cut_prices)

    menu_items = (etl.addfield(
        menu_items, 'createdAtEpoch').addfield('unitOfMeasure').addfield(
            'locationProductDetails').addfield('keys').addfield('restockLevel')
                  )

    # Two-step transform and cut. First we need to cut the name
    # and id from the source data to map to.
    cut_source_cats = etl.cut(mmj_categories, 'name', 'id', 'measurement')
    source_values = etl.values(cut_source_cats, 'name', 'id')

    # Then we nede a dict of categories to compare against.
    # id is stored to match against when transforming and mapping categories
    mmj_categories = dict([(value, id) for (value, id) in source_values])

    mappings = OrderedDict()
    mappings['id'] = 'id'
    mappings['createdAt'] = 'created_at'
    mappings['updatedAt'] = 'updated_at'
    mappings['createdAtEpoch'] = lambda x: utils.create_epoch(x.created_at)
    mappings['name'] = 'name'
    mappings['shareOnWM'] = lambda x: _wm_integration(x.id, source_db)
    """
    1 = Units
    2 = Grams (weight)
    """
    mappings['unitOfMeasure'] = \
        lambda x: _map_uom(x.category_id, source_db)

    fields = etl.fieldmap(menu_items, mappings)
    data = etl.merge(menu_items, fields, key='id')

    items = []
    for item in etl.dicts(data):

        breakpoint_pricing = (etl.select(
            prices_data,
            lambda x: x.dispensary_id == item['dispensary_id']).rename({
                'price_eigth':
                'price_eighth'
            }).cutout('menu_item_id'))
        # Set image url for load to download
        url = None
        if debug and item['image_file_name'] is not None:
            url = ("https://wm-mmjmenu-images-development.s3."
                   "amazonaws.com/menu_items/images/{0}/large/"
                   "{1}").format(item['id'], item['image_file_name'])
        elif item['image_file_name'] is not None:
            url = ("https://wm-mmjmenu-images-production.s3."
                   "amazonaws.com/menu_items/images/{0}/large/"
                   "{1}").format(item['id'], item['image_file_name'])

        item['image_file_name'] = url

        item['categoryId'] = _map_categories(item['category_id'],
                                             item['sativa'], item['indica'],
                                             mmj_categories, menu_items)
        item['keys'] = {
            'dispensary_id': item['dispensary_id'],
            'id': item['id'],
            'menu_id': item['menu_id'],
            'vendor_id': item['vendor_id'],
            'strain_id': item['strain_id'],
            'category_id': item['category_id']
        }

        # set a default netMJ value if the menu item is a unit product
        if item['unitOfMeasure'] == 2:
            item['netMarijuana'] = int(item['medicine_amount'])

        for key in item['keys'].keys():
            if not item['keys'][key]:
                del item['keys'][key]

        item['locationProductDetails'] = {
            'id': item['id'],
            'active': _active(item['on_hold'])
        }

        item['restockLevel'] = _restock_level(item['dispensary_id'],
                                              item['product_type'], source_db)

        if item['shareOnWM'] is None:
            item['shareOnWM'] = False

        for price in etl.dicts(breakpoint_pricing):
            try:
                price_two_gram = price['price_two_gram']
            except KeyError:
                price_two_gram = 0.0

            item['locationProductDetails']['weightPricing'] = {
                'price_half_gram':
                utils.dollars_to_cents(price['price_half_gram']),
                'price_two_gram': utils.dollars_to_cents(price_two_gram),
                'price_gram': utils.dollars_to_cents(price['price_gram']),
                'price_eighth': utils.dollars_to_cents(price['price_eighth']),
                'price_quarter':
                utils.dollars_to_cents(price['price_quarter']),
                'price_half': utils.dollars_to_cents(price['price_half']),
                'price_ounce': utils.dollars_to_cents(price['price_ounce'])
            }

        del item['vendor_id']
        del item['indica']
        del item['dispensary_id']
        del item['id']
        del item['strain_id']
        del item['on_hold']
        del item['menu_id']
        del item['sativa']
        del item['category_id']
        del item['updated_at']
        del item['created_at']
        del item['product_type']

        if item['image_file_name'] is None:
            del item['image_file_name']

        # set up final structure for API
        items.append(item)

    # Remove inactive items
    for item in items:
        if item['locationProductDetails']['active'] is False:
            items.remove(item)

    if debug:
        result = json.dumps(items,
                            sort_keys=True,
                            indent=4,
                            default=utils.json_serial)
        print(result)

    return items
Example #35
# Then group and accumulate the results by country
df_confirmed = etl.todataframe(t_confirmed)
df = df_confirmed.groupby(['Country']).sum()
t_confirmed = etl.fromdataframe(df, include_index=True)

# Rename the Country field again
t_confirmed = etl.rename(t_confirmed, {'index': 'Country'})

# Then melt the date columns into rows and rename the new columns
t_confirmed = etl.melt(t_confirmed, 'Country')
t_confirmed = etl.rename(t_confirmed, {'variable': 'Date'})
t_confirmed = etl.rename(t_confirmed, {'value': 'Cases'})

# Then add the continent for grouping
t_confirmed = etl.addfield(t_confirmed, 'Continent',
                           lambda rec: get_continent_code(rec['Country']))

# And again make sure the fields are of the data types they should be.
t_confirmed = etl.convert(t_confirmed, 'Cases', int)
t_confirmed = etl.convert(t_confirmed, 'Date',
                          lambda v: datetime.datetime.strptime(v, '%Y-%m-%d'))

# Finally, load the data into the data repository
conn = pymysql.connect(password='******', database='covid', user='******')
conn.cursor().execute('SET SQL_MODE=ANSI_QUOTES')
etl.todb(t_confirmed, conn, 'confirmed', create=True, drop=True)
conn.close()

# Visualization examples for debugging
#print(etl.header(t_confirmed))
#print(etl.records(t_confirmed))
Example #36
def main(argv):

    parser = argparse.ArgumentParser()
    parser.add_argument("--pledges-filename",
                        required=True,
                        help="Input UTF8 CSV with pledges data "
                        "dumped from Servant Keeper")
    parser.add_argument("--output-filename",
                        required=True,
                        help="Output CSV filename which will be loaded with "
                        "pledges data in CCB import format ")
    parser.add_argument(
        '--trace',
        action='store_true',
        help="If specified, prints tracing/progress messages to "
        "stdout")
    args = parser.parse_args()

    assert os.path.isfile(
        args.pledges_filename
    ), "Error: cannot open file '" + args.pledges_filename + "'"

    table = petl.fromcsv(args.pledges_filename)

    table = petl.rename(table, {
        'Frequency': 'SK Frequency',
        'Individual ID': 'SK Individual ID'
    })

    table = petl.addfield(table, 'Individual ID',
                          lambda rec: rec['SK Individual ID'])
    table = petl.addfield(table, 'Campus', '')
    table = petl.addfield(table, 'Category Pledged To',
                          lambda rec: rec['Account'])
    table = petl.addfield(table, 'Amount Pledged', convert_amount)
    table = petl.addfield(table, 'Total Amount Pledged',
                          lambda rec: rec['Pledged'])
    table = petl.addfield(
        table, 'Frequency', lambda rec: {
            1: 'Yearly',
            4: 'Yearly',
            12: 'Monthly',
            52: 'Weekly',
            24: 'Monthly',
            2: 'Yearly'
        }[int(rec['Payments'])])
    table = petl.addfield(
        table, 'Number of Gifts', lambda rec: {
            'Yearly': 1,
            'Monthly': 12,
            'Weekly': 52
        }[rec['Frequency']])
    table = petl.addfield(
        table, 'Length Multiplier', lambda rec: {
            'Yearly': 'Years',
            'Monthly': 'Months',
            'Weekly': 'Weeks'
        }[rec['Frequency']])
    table = petl.addfield(
        table, 'Start Date', lambda rec: {
            'Operating Income': '2013-01-01',
            'Mortgage Principal': '2013-01-01',
            'Operating Income 2015': '2015-01-01'
        }[rec['Account']])
    table = petl.addfield(
        table, 'End Date', lambda rec: {
            'Operating Income': '2013-12-31',
            'Mortgage Principal': '2013-12-31',
            'Operating Income 2015': '2015-12-31'
        }[rec['Account']])

    trace('CONVERTING AND THEN EMITTING TO CSV FILE...',
          args.trace,
          banner=True)

    table.progress(200).tocsv(args.output_filename)

    trace('OUTPUT TO CSV COMPLETE.', args.trace, banner=True)

    trace('DONE!', args.trace, banner=True)
Example #37
File: examples.py  Project: datamade/petl
look(table1)
table2 = convertnumbers(table1)
look(table2)


# addfield

table1 = [['foo', 'bar'],
          ['M', 12],
          ['F', 34],
          ['-', 56]]

from petl import addfield, look
look(table1)
# using a fixed value
table2 = addfield(table1, 'baz', 42)
look(table2)
# calculating the value
table2 = addfield(table1, 'baz', lambda rec: rec['bar'] * 2)
look(table2)
# an expression string can also be used via expr
from petl import expr
table3 = addfield(table1, 'baz', expr('{bar} * 2'))
look(table3)
    

# rowslice

table1 = [['foo', 'bar'],
          ['a', 1],
          ['b', 2],
Example #38
File: basics.py  Project: DeanWay/petl
          ['C', True],
          ['D', False]]
table10 = etl.cat(table8, table9, header=['A', 'foo', 'B', 'bar', 'C'])
table10


# addfield()
############

import petl as etl
table1 = [['foo', 'bar'],
          ['M', 12],
          ['F', 34],
          ['-', 56]]
# using a fixed value
table2 = etl.addfield(table1, 'baz', 42)
table2
# calculating the value
table2 = etl.addfield(table1, 'baz', lambda rec: rec['bar'] * 2)
table2


# rowslice()
############

import petl as etl
table1 = [['foo', 'bar'],
          ['a', 1],
          ['b', 2],
          ['c', 5],
          ['d', 7],