def main(argv):
    global g

    parser = argparse.ArgumentParser()
    parser.add_argument("--contributions-filename", required=True, help="Input UTF8 CSV with contributions data "
        "dumped from Servant Keeper")
    parser.add_argument("--split-detail-files", required=False, nargs='*', default=argparse.SUPPRESS,
        help="List of CSV files which have records that can be used to replace top-level 'Split Transaction' "
        "records in the main contributions file.")
    parser.add_argument("--chart-of-accounts-filename", required=True, help="Input UTF8 CSV with Chart of Accounts "
        "data from Servant Keeper")
    parser.add_argument("--output-filename", required=True, help="Output CSV filename which will be loaded with "
        "contributions data in CCB import format ")
    parser.add_argument('--trace', action='store_true', help="If specified, prints tracing/progress messages to "
        "stdout")
    args = parser.parse_args()

    assert os.path.isfile(args.contributions_filename), "Error: cannot open file '" + args.contributions_filename + "'"

    dict_split_transaction_details = load_split_transaction_details(args.split_detail_files)

    table = petl.fromcsv(args.contributions_filename)

    table = petl.rename(table, {
        'Individual ID': 'SK Individual ID',
        'Amount': 'SK Amount'
    })

    trace('REMOVING SPLIT TRANSACTIONS...', args.trace, banner=True)
    table = replace_split_transactions(table, dict_split_transaction_details)

    table_coa = petl.fromcsv(args.chart_of_accounts_filename)
    table = petl.leftjoin(table, table_coa, lkey='Account', rkey='SK Account')

    table = petl.addfield(table, 'Individual ID', lambda rec: rec['SK Individual ID'])
    table = petl.addfield(table, 'Date of Contribution', lambda rec: rec['Batch Date'])
    table = petl.addfield(table, 'Amount', lambda rec: rec['SK Amount'])
    table = petl.addfield(table, 'Type of Gift', lambda rec: rec['Type'])
    table = petl.addfield(table, 'Check Number', lambda rec: rec['Check #'])
    table = petl.addfield(table, 'Fund', convert_fund)
    table = petl.addfield(table, 'Sub Fund', convert_sub_fund)
    table = petl.addfield(table, 'Campus', '')
    table = petl.addfield(table, 'Transaction Grouping', '')
    table = petl.addfield(table, 'Batch Number/Name', '')
    table = petl.addfield(table, 'Tax Deductible', lambda rec: rec['Tax'])
    table = petl.addfield(table, 'Memo', convert_notes)

    trace('CONVERTING AND THEN EMITTING TO CSV FILE...', args.trace, banner=True)
    table.progress(200).tocsv(args.output_filename)
    trace('OUTPUT TO CSV COMPLETE.', args.trace, banner=True)

    if len(g.set_unfound_accounts) > 0:
        trace('UNMATCHED SK ACCOUNTS!', args.trace, banner=True)
        for acct in g.set_unfound_accounts:
            trace(acct, args.trace)

    trace('DONE!', args.trace, banner=True)
def _petl_transform(self, record_set):
    if "transform" in self.task:
        transform = self.task["transform"]
        if "convert" in transform:
            conversions = {}
            for field, func in transform["convert"]:
                conversions[field] = func
            record_set = etl.convert(record_set, conversions)
        if "filter" in transform:
            record_set = etl.select(record_set, transform["filter"])
        if "remove" in transform:
            cuts = []
            for field in transform["remove"]:
                cuts.append(field)
            record_set = etl.cutout(record_set, cuts)
        if "rename" in transform:
            names = {}
            for old, new_one in transform["rename"]:
                names[old] = new_one
            record_set = etl.rename(record_set, names)
    return record_set
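The method above is driven entirely by a declarative "transform" specification stored on self.task. A minimal sketch of what such a task might look like, inferred from how each key is consumed above (the field names and callables are illustrative assumptions, not taken from the original project):

task = {
    "transform": {
        # (field, callable) pairs fed to etl.convert
        "convert": [("age", int), ("amount", float)],
        # a petl select expression string
        "filter": "{age} >= 18",
        # field names dropped via etl.cutout
        "remove": ["internal_id"],
        # (old, new) pairs fed to etl.rename
        "rename": [("fname", "first_name")],
    }
}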
def transform(data, data_set):
    data = data['observations']
    data = etl.fromdicts(data, header=['value', 'realtime_start', 'realtime_end', 'date'])
    data = etl.cut(data, 'date', 'value')
    data = etl.rename(data, {'date': 'date', 'value': data_set.lower()})
    data = etl.convert(data, data_set.lower(), lambda val: 0 if val == '.' else val)
    return data
def clean(table):
    """Do some cleanup of TABLE.

    TABLE is a petl table."""

    # Rename column to expand name
    table = etl.rename(table, {'WVRSTATE': 'waiverstate'})

    # More conversions
    table = etl.convert(
        table,
        {
            'EXCLTYPE': lambda f: f.strip(),  # Trim extra spaces
            'EXCLDATE': munge_date,  # Arrange date for sqlite
            'REINDATE': munge_date,  # Arrange date for sqlite
            'WAIVERDATE': munge_date  # Arrange date for sqlite
        })

    # Do some cleanup conversions on individual data
    table = etl.convert(
        table,
        {
            'DOB': munge_date,
            'MIDNAME': lambda f: f if f != " " else ""  # no spaces as middle names
        })

    table = etl.convertall(table, lambda f: None if f.strip() == '' else f)

    return table
def transform_to_petl(data):
    isodate = etl.dateparser("%Y-%m-%d")
    data = etl.fromdataframe(data)
    data = etl.rename(data, {"index": "Date", "VALUE": "Value"})
    data = etl.convert(data, {"Date": lambda d: d[:10]})
    data = etl.convert(data, {"Date": lambda d: isodate(d)})
    return data
def _shape_data(self, raw_planets: etl.Table, raw_people: etl.Table) -> etl.Table:
    planets = etl.cut(raw_planets, (
        Planet.Columns.NAME,
        Planet.Columns.URL,
    ))
    people = etl.cut(
        raw_people,
        (
            Person.Columns.NAME,
            Person.Columns.HEIGHT,
            Person.Columns.MASS,
            Person.Columns.HAIR_COLOR,
            Person.Columns.SKIN_COLOR,
            Person.Columns.EYE_COLOR,
            Person.Columns.BIRTH_YEAR,
            Person.Columns.GENDER,
            Person.Columns.HOMEWORLD,
            Person.Columns.EDITED,
        ),
    )
    combined = etl.join(
        planets,
        people,
        lkey=Planet.Columns.URL,
        rkey=Person.Columns.HOMEWORLD,
        lprefix=Planet.PREFIX,
    )
    renamed = etl.rename(
        combined,
        {
            Person.Columns.EDITED: Person.RenamedColumns.DATE,
            Planet.prefix_value(Planet.Columns.NAME): Person.Columns.HOMEWORLD,
        },
    )
    converted = etl.convert(
        renamed,
        {
            Person.RenamedColumns.DATE: lambda v: parse(v).date(),
        })
    return etl.cut(
        converted,
        (
            Person.Columns.NAME,
            Person.Columns.HEIGHT,
            Person.Columns.MASS,
            Person.Columns.HAIR_COLOR,
            Person.Columns.SKIN_COLOR,
            Person.Columns.EYE_COLOR,
            Person.Columns.BIRTH_YEAR,
            Person.Columns.GENDER,
            Person.Columns.HOMEWORLD,
            Person.RenamedColumns.DATE,
        ),
    )
def unpackcall(tbl, *keys, **kwargs): """ Unpack the call column. E.g.:: >>> from petlx.vcf import fromvcf, unpackinfo, meltsamples, unpackcall >>> from petl import look, cutout >>> t1 = fromvcf('../fixture/sample.vcf') >>> t2 = meltsamples(t1) >>> t3 = unpackcall(t2) >>> t4 = cutout(t3, 'INFO') >>> look(t4) +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+ | 'CHROM' | 'POS' | 'ID' | 'REF' | 'ALT' | 'QUAL' | 'FILTER' | 'SAMPLE' | 'GT' | 'GQ' | 'DP' | 'HQ' | +=========+=======+=============+=======+=======+========+==========+===========+=======+======+======+==============+ | '19' | 111 | None | 'A' | [C] | 9.6 | [] | 'NA00001' | '0|0' | None | None | [10, 10] | +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+ | '19' | 111 | None | 'A' | [C] | 9.6 | [] | 'NA00002' | '0|0' | None | None | [10, 10] | +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+ | '19' | 111 | None | 'A' | [C] | 9.6 | [] | 'NA00003' | '0/1' | None | None | [3, 3] | +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+ | '19' | 112 | None | 'A' | [G] | 10 | [] | 'NA00001' | '0|0' | None | None | [10, 10] | +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+ | '19' | 112 | None | 'A' | [G] | 10 | [] | 'NA00002' | '0|0' | None | None | [10, 10] | +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+ | '19' | 112 | None | 'A' | [G] | 10 | [] | 'NA00003' | '0/1' | None | None | [3, 3] | +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+ | '20' | 14370 | 'rs6054257' | 'G' | [A] | 29 | [] | 'NA00001' | '0|0' | 48 | 1 | [51, 51] | +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+ | '20' | 14370 | 'rs6054257' | 'G' | [A] | 29 | [] | 'NA00002' | '1|0' | 48 | 8 | [51, 51] | +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+ | '20' | 14370 | 'rs6054257' | 'G' | [A] | 29 | [] | 'NA00003' | '1/1' | 43 | 5 | [None, None] | +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+ | '20' | 17330 | None | 'T' | [A] | 3 | ['q10'] | 'NA00001' | '0|0' | 49 | 3 | [58, 50] | +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+ .. versionadded:: 0.5 """ if not keys: if hasattr(tbl, 'filename'): try: import vcf as pyvcf except ImportError as e: raise UnsatisfiedDependency(e, dep_message) reader = pyvcf.Reader(filename=tbl.filename) # all FORMAT keys = reader.formats.keys() else: tbl = convert(tbl, 'CALL', lambda v: v.data._asdict() ) # enable sampling of keys from data result = unpackdict(tbl, 'CALL', keys=keys) if 'prefix' in kwargs: result = rename(result, {k: kwargs['prefix'] + k for k in keys}) if hasattr(tbl, 'filename'): return VCFWrapper(result, tbl.filename) else: return result
def createDimSubscriptions(events):
    try:
        dim_subscriptions_cut = etl.cut(events, 'type')
        dim_subscriptions_rename = etl.rename(dim_subscriptions_cut, {'type': 'subscription_name'})
        dim_subscriptions = etl.distinct(dim_subscriptions_rename)
        # Export as csv to load folder
        etl.tocsv(dim_subscriptions, 'load/dim_subscriptions.csv')
    except Exception as e:
        print("Something went wrong. Error {0}".format(e))
def createDimMedium(events):
    try:
        dim_medium_cut = etl.cut(events, 'utm_medium')
        dim_medium_rename = etl.rename(dim_medium_cut, {'utm_medium': 'medium'})
        dim_medium = etl.distinct(dim_medium_rename)
        # Export as csv to load folder
        etl.tocsv(dim_medium, 'load/dim_medium.csv')
    except Exception as e:
        print("Something went wrong. Error {0}".format(e))
def createDimCampaignType(events):
    try:
        dim_campaigntype_cut = etl.cut(events, 'utm_campaign')
        dim_campaigntype_rename = etl.rename(dim_campaigntype_cut, {'utm_campaign': 'campaign_type'})
        dim_campaigntype = etl.distinct(dim_campaigntype_rename)
        # Export as csv to load folder
        etl.tocsv(dim_campaigntype, 'load/dim_campaigntype.csv')
    except Exception as e:
        print("Something went wrong. Error {0}".format(e))
def transform_data(data):
    tbl_data = petl.fromdicts(data)
    tbl_data = petl.convert(
        tbl_data, {k: v['value'] for k, v in fields_to_transform.items()})
    tbl_data = petl.rename(
        tbl_data, {k: v['key'] for k, v in fields_to_transform.items()})
    tbl_data_allowed = petl.cut(tbl_data, *allowed_fields)
    return tbl_data_allowed
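This function relies on two module-level objects, fields_to_transform and allowed_fields, which are not shown in the snippet. A plausible shape for them, inferred from how they are used above (the names and conversions are illustrative assumptions):

fields_to_transform = {
    # source field -> {'value': conversion for petl.convert, 'key': new name for petl.rename}
    'createdAt': {'value': lambda v: v[:10], 'key': 'created_date'},
    'userId': {'value': int, 'key': 'user_id'},
}
allowed_fields = ['created_date', 'user_id']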
def clean_and_separate(table):
    """Do some cleanup of TABLE and split into individual and business tables.

    TABLE is a petl table."""

    # Rename column to expand name
    table = etl.rename(table, {'WVRSTATE': 'waiverstate'})

    # More conversions
    table = etl.convert(
        table,
        {
            'EXCLTYPE': lambda f: f.strip(),  # Trim extra spaces
            'EXCLDATE': munge_date,  # Arrange date for sqlite
            'REINDATE': munge_date,  # Arrange date for sqlite
            'WAIVERDATE': munge_date  # Arrange date for sqlite
        })

    # Separate into two tables, as this is actually two different data sets
    individual = etl.select(table, "{LASTNAME} != '' and {FIRSTNAME} != ''")
    business = etl.select(table, "{LASTNAME} == '' and {FIRSTNAME} == ''")

    # Sanity check: Make sure we split the rows without dupes or
    # missing any. The +1 is to account for the second header row
    # that gets counted when we have two tables.
    if len(business) + len(individual) != len(table) + 1:
        fatal("Separating business and individual exclusions came up with the wrong number of rows!")

    # Remove unused columns
    individual = etl.transform.basics.cutout(individual, "BUSNAME")
    business = etl.transform.basics.cutout(business, "LASTNAME", "FIRSTNAME", "MIDNAME", "DOB")

    # Do some cleanup conversions on individual data
    individual = etl.convert(
        individual,
        {
            'DOB': munge_date,
            'MIDNAME': lambda f: f if f != " " else ""  # no spaces as middle names
        })

    return individual, business
def main(argv):
    parser = argparse.ArgumentParser()
    parser.add_argument("--pledges-filename", required=True, help="Input UTF8 CSV with pledges data "
        "dumped from Servant Keeper")
    parser.add_argument("--output-filename", required=True, help="Output CSV filename which will be loaded with "
        "pledges data in CCB import format ")
    parser.add_argument('--trace', action='store_true', help="If specified, prints tracing/progress messages to "
        "stdout")
    args = parser.parse_args()

    assert os.path.isfile(args.pledges_filename), "Error: cannot open file '" + args.pledges_filename + "'"

    table = petl.fromcsv(args.pledges_filename)
    table = petl.rename(table, {
        'Frequency': 'SK Frequency',
        'Individual ID': 'SK Individual ID'
    })

    table = petl.addfield(table, 'Individual ID', lambda rec: rec['SK Individual ID'])
    table = petl.addfield(table, 'Campus', '')
    table = petl.addfield(table, 'Category Pledged To', lambda rec: rec['Account'])
    table = petl.addfield(table, 'Amount Pledged', convert_amount)
    table = petl.addfield(table, 'Total Amount Pledged', lambda rec: rec['Pledged'])
    table = petl.addfield(table, 'Frequency',
        lambda rec: {1: 'Yearly', 4: 'Yearly', 12: 'Monthly', 52: 'Weekly', 24: 'Monthly', 2: 'Yearly'}[int(rec['Payments'])])
    table = petl.addfield(table, 'Number of Gifts',
        lambda rec: {'Yearly': 1, 'Monthly': 12, 'Weekly': 52}[rec['Frequency']])
    table = petl.addfield(table, 'Length Multiplier',
        lambda rec: {'Yearly': 'Years', 'Monthly': 'Months', 'Weekly': 'Weeks'}[rec['Frequency']])
    table = petl.addfield(table, 'Start Date',
        lambda rec: {'Operating Income': '2013-01-01', 'Mortgage Principal': '2013-01-01',
                     'Operating Income 2015': '2015-01-01'}[rec['Account']])
    table = petl.addfield(table, 'End Date',
        lambda rec: {'Operating Income': '2013-12-31', 'Mortgage Principal': '2013-12-31',
                     'Operating Income 2015': '2015-12-31'}[rec['Account']])

    trace('CONVERTING AND THEN EMITTING TO CSV FILE...', args.trace, banner=True)
    table.progress(200).tocsv(args.output_filename)
    trace('OUTPUT TO CSV COMPLETE.', args.trace, banner=True)

    trace('DONE!', args.trace, banner=True)
def rename_column(self, column_name, new_column_name):
    """
    Rename a column

    `Args:`
        column_name: str
            The current column name
        new_column_name: str
            The new column name

    `Returns:`
        `Parsons Table` and also updates self
    """

    if new_column_name in self.columns:
        raise ValueError(f"Column {new_column_name} already exists")

    self.table = petl.rename(self.table, column_name, new_column_name)

    return self
def xref_symbol_reports():
    symbol_reports = [
        f for f in os.listdir() if re.match('OCLC Datasync Unresolved.*\.csv', f)
    ]

    today = str(date.today())

    for report in symbol_reports:
        symbol_split = re.split('^.*processing.(M[A-Z]{2}).*$', report)
        symbol = symbol_split[1]
        xlsx_outfile = symbol + '_datasync_unresolved_' + today + '.xlsx'
        xls_outfile = symbol + '_datasync_unresolved_' + today + '.xls'
        txt_outfile = symbol + '_staging_OCNs_' + today + '.txt'

        symbol_table_raw = etl.fromcsv(report, encoding='utf-8')
        symbol_table = etl.rename(symbol_table_raw, '\ufeffMMS Id', 'MMS ID')
        symbol_table2 = etl.select(symbol_table, "{MMS ID} is not None")
        symbol_table_sorted = etl.sort(symbol_table2, 'MMS ID')

        xref_table = etl.fromcsv('unresxref.csv')
        xref_table2 = etl.select(xref_table, "{MMS ID} is not None")
        xref_table_sorted = etl.sort(xref_table2, 'MMS ID')

        symbol_xref_table = etl.join(symbol_table_sorted, xref_table_sorted,
                                     presorted=True, lkey="MMS ID", rkey="MMS ID")

        try:
            etl.toxlsx(symbol_xref_table, xlsx_outfile, encoding='utf-8')
        except TypeError:
            etl.toxls(symbol_xref_table, xls_outfile, 'Sheet1', encoding='utf-8')

        staging_ocns_table = etl.cut(symbol_xref_table, 'Staging OCN')
        template = '{Staging OCN}\n'
        etl.totext(staging_ocns_table, txt_outfile, template=template)
def get_relationships(self):
    "Parses a list of `Relationship` objects."
    core_file = _find_loinc_table_core_file(self.uri.path)
    core = etl.fromcsv(core_file, delimiter=',')
    core = etl.cut(core, ['LOINC_NUM', 'LONG_COMMON_NAME'])

    hierarchy_file = _find_multi_axial_hierarchy_file(self.uri.path)
    hierarchy = etl.fromcsv(hierarchy_file, delimiter=',')
    hierarchy = etl.leftjoin(hierarchy, core, lkey='CODE', rkey='LOINC_NUM')
    hierarchy = etl.cut(hierarchy, ['IMMEDIATE_PARENT', 'CODE', 'CODE_TEXT', 'LONG_COMMON_NAME'])
    hierarchy = etl.fillright(hierarchy)
    hierarchy = etl.cut(hierarchy, ['IMMEDIATE_PARENT', 'CODE', 'LONG_COMMON_NAME'])
    hierarchy = etl.rename(hierarchy, 'LONG_COMMON_NAME', 'CODE_TEXT')
    parents = etl.cut(hierarchy, ['CODE', 'CODE_TEXT'])
    hierarchy = etl.selectne(hierarchy, 'IMMEDIATE_PARENT', '')
    hierarchy = etl.leftjoin(hierarchy, parents, lkey='IMMEDIATE_PARENT', rkey='CODE',
                             lprefix='source.', rprefix='target.')
    hierarchy = etl.distinct(hierarchy)

    if self.versioned:
        version = _parse_version(hierarchy_file)
        hierarchy = etl.addfield(hierarchy, 'version', version)

    hierarchy = etl.rowmapmany(hierarchy, _to_json, ['relationship'])
    return hierarchy
def get_table(characters_data):
    characters_headers = [
        'name',
        'height',
        'mass',
        'hair_color',
        'skin_color',
        'eye_color',
        'birth_year',
        'gender',
        'homeworld',
        'edited',
    ]
    characters_table = etl.fromdicts(characters_data, characters_headers)
    characters_table = etl.rename(characters_table, 'edited', 'date')
    characters_table = etl.convert(
        characters_table, 'date',
        lambda date_field: datetime.strptime(
            date_field, '%Y-%m-%dT%H:%M:%S.%fZ').strftime('%Y-%m-%d'))
    characters_table = replace_homeworld_planet_name(characters_table)
    return characters_table
def tsv_fix(base_path, new_file_name, pk_list, illegal_columns_lower_case, tsv_process):
    if tsv_process:
        pwb_replace_in_file(new_file_name, '\0', '')  # Remove null bytes

    table = etl.fromcsv(new_file_name, delimiter='\t', skipinitialspace=True,
                        quoting=csv.QUOTE_NONE, quotechar='', escapechar='')
    row_count = etl.nrows(table)

    if tsv_process:
        tempfile = NamedTemporaryFile(mode='w', dir=base_path + "/content/data/", delete=False)
        table = pwb_lower_case_header(table)
        table = etl.rename(table, illegal_columns_lower_case, strict=False)
        print(new_file_name)

        for pk in pk_list:
            table = etl.convert(table, pk.lower(), lambda a: a if len(str(a)) > 0 else '-')

        writer = csv.writer(tempfile, delimiter='\t', quoting=csv.QUOTE_NONE, quotechar='',
                            escapechar='', lineterminator='\n')
        writer.writerows(table)

        shutil.move(tempfile.name, new_file_name)

    return row_count
# Load the files
users = etl.fromcsv('data/users.csv')
events = etl.fromjson('data/events2.json')

# Transform

# Dim Customers
# Filter necessary data only
dim_customers = etl.cut(users, 'user_id', 'email')
# Export as csv to load folder
etl.tocsv(dim_customers, 'load/dim_customers.csv')

# Dim Subscriptions
# Use the distinct values present in the type column to load into the dim subscription table
dim_subscriptions_cut = etl.cut(events, 'type')
dim_subscriptions_rename = etl.rename(dim_subscriptions_cut, {'type': 'subscription_name'})
dim_subscriptions = etl.distinct(dim_subscriptions_rename)
# Export as csv to load folder
etl.tocsv(dim_subscriptions, 'load/dim_subscriptions.csv')

# Dim Medium
# Use the distinct values present in the utm_medium column to load into the dim medium table
dim_medium_cut = etl.cut(events, 'utm_medium')
dim_medium_rename = etl.rename(dim_medium_cut, {'utm_medium': 'medium'})
dim_medium = etl.distinct(dim_medium_rename)
# Export as csv to load folder
etl.tocsv(dim_medium, 'load/dim_medium.csv')

# Dim Campaign Type
# Use the distinct values present in the utm_campaign column to load into the dim campaign table
# Note:
from __future__ import division, print_function, absolute_import


# rename()
##########

import petl as etl
table1 = [['sex', 'age'],
          ['m', 12],
          ['f', 34],
          ['-', 56]]
# rename a single field
table2 = etl.rename(table1, 'sex', 'gender')
table2
# rename multiple fields by passing a dictionary as the second argument
table3 = etl.rename(table1, {'sex': 'gender', 'age': 'age_years'})
table3


# setheader()
#############

import petl as etl
table1 = [['foo', 'bar'],
          ['a', 1],
          ['b', 2]]
table2 = etl.setheader(table1, ['foofoo', 'barbar'])
table2


# extendheader()
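###############

# A minimal usage sketch in the same style as the examples above; the table
# literal here is illustrative. extendheader() appends names for unlabelled
# value columns to an existing header:
import petl as etl
table1 = [['foo'],
          ['a', 1, True],
          ['b', 2, False]]
table2 = etl.extendheader(table1, ['bar', 'baz'])
table2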
def transform_and_aggregate_datetimes(query_results, rollup): """transform datetime to the correct TZ; aggregate the values in the query results based on the datetime rollup args. Aggregation is performed for: * hourly or daily time intervals * total NOTE: in order to handle potential No-Data values in the DB during aggregation, we convert them to 0. The `src` field then indicates if any values in the rollup were N/D. Then, if the value field in the aggregated row still shows 0 after summation, *and* the src field shows N/D, we turn that zero into None. If there was a partial reading (e.g., the sensor has values for the first half hour but N/D for the second, and we are doing an hourly rollup), then the values will stay there, but the source field will indicate both N/D and whatever the source was for the workable sensor values. TODO: move this work over to the database query """ t1 = etl\ .fromdicts(query_results)\ .convert('xts', lambda v: v.astimezone(TZ).isoformat(), failonerror=True) #.rename('xts', 'ts') # print("t1") # print(t1) # print("rollup", rollup) if rollup in [INTERVAL_DAILY, INTERVAL_HOURLY]: petl_aggs = OrderedDict( val=('val', _sumround), # sum the rainfall vales src=('src', _listset) # create a list of all rainfall sources included in the rollup ) t2 = etl\ .convert( t1, 'xts', lambda v: _rollup_date(v, rollup), # convert datetimes to their rolled-up value in iso-format failonerror=True )\ .convert( 'val', lambda v: 0 if v is None else v, # convert rainfall values to 0 if no-data failonerror=True )\ .aggregate( ('xts', 'sid'), petl_aggs # aggregate rainfall values (sum) and sources (list) for each timestamp+ID combo, )\ .convert( 'val', lambda v, r: None if ('N/D' in r.src and v == 0) else v, # replace 0 values with no data if aggregated source says its N/D pass_row=True, failonerror=True )\ .sort('sid') # .convert( # 'xts', # lambda v: TZ.localize(parse(v)).isoformat(), # convert that datetime to iso format w/ timezone # failonerror=True # ) # print("t2 time rollup") elif rollup in [INTERVAL_SUM]: petl_aggs = OrderedDict( val=('val', _sumround), # sum the rainfall vales src=('src', _listset), # create a list of all rainfall sources included in the rollup xts=('xts', _minmax) # create a iso datetime range string from the min and max datetimes found ) t2 = etl\ .aggregate( t1, 'sid', petl_aggs # aggregate rainfall values (sum) and sources (list), and datetimes (str) for each ID, )\ .convert( 'val', lambda v, r: None if ('N/D' in r.src and v == 0) else v, # replace 0 values with no data if aggregated source says its N/D pass_row=True )\ .sort('sid')\ # print("t2 sum") else: t2 = t1 # print("t2 = t1") # print(t2) # h = etl.header(t2) # rename the timestamp and sensor id fields, # print("t2 header:", list(etl.header(t2))) # rename_kw = {} # for h1, h0 in [('xts', 'ts'), ('sid', 'id')]: # if h1 in h: # rename_kw[h1] = h0 # if len(rename_kw.items()) > 0: # {'xts':'ts', 'sid':'id'} t3 = etl.rename(t2, {'xts':'ts', 'sid':'id'}, strict=False) # else: # t3 = t2 # print("t3") # print(t3) # convert to list of dicts and return return list(etl.dicts(t3))
    elif (country == 'Kosovo') or (country == 'Holy See'):
        return 'EU'
    else:
        return 'N/A'


# Source data we are going to read
uri_confirmed = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
uri_death = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv'
uri_recovered = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv'

# Process confirmed cases first
t_confirmed = etl.fromcsv(uri_confirmed)

# Rename the headers
t_confirmed = etl.rename(t_confirmed, {'Country/Region': 'Country'})

# Adjust the data types
# From column 5 onward the data type is integer (the number of people/cases)
# We also take the opportunity to change the header date format from 1/23/20 to 2020-01-23
headers = etl.fieldnames(t_confirmed)
i = 0
for header in headers:
    if i >= 4:
        t_confirmed = etl.convert(t_confirmed, header, int)  # fix the data type
        fecha = datetime.datetime.strptime(header, '%m/%d/%y')  # compute the date in the correct format
        t_confirmed = etl.rename(t_confirmed, header, fecha.strftime('%Y-%m-%d'))
    i = i + 1
def unpackinfo(tbl, *keys, **kwargs): """ Unpack the INFO field into separate fields. E.g.:: >>> from petlx.vcf import fromvcf, unpackinfo >>> from petl import look >>> t1 = fromvcf('../fixture/sample.vcf', samples=False) >>> look(t1) +---------+---------+-------------+-------+-----------+--------+----------+-----------------------------------------------------------------------------------------+ | 'CHROM' | 'POS' | 'ID' | 'REF' | 'ALT' | 'QUAL' | 'FILTER' | 'INFO' | +=========+=========+=============+=======+===========+========+==========+=========================================================================================+ | '19' | 111 | None | 'A' | [C] | 9.6 | [] | {} | +---------+---------+-------------+-------+-----------+--------+----------+-----------------------------------------------------------------------------------------+ | '19' | 112 | None | 'A' | [G] | 10 | [] | {} | +---------+---------+-------------+-------+-----------+--------+----------+-----------------------------------------------------------------------------------------+ | '20' | 14370 | 'rs6054257' | 'G' | [A] | 29 | [] | OrderedDict([('NS', 3), ('DP', 14), ('AF', [0.5]), ('DB', True), ('H2', True)]) | +---------+---------+-------------+-------+-----------+--------+----------+-----------------------------------------------------------------------------------------+ | '20' | 17330 | None | 'T' | [A] | 3 | ['q10'] | OrderedDict([('NS', 3), ('DP', 11), ('AF', [0.017])]) | +---------+---------+-------------+-------+-----------+--------+----------+-----------------------------------------------------------------------------------------+ | '20' | 1110696 | 'rs6040355' | 'A' | [G, T] | 67 | [] | OrderedDict([('NS', 2), ('DP', 10), ('AF', [0.333, 0.667]), ('AA', 'T'), ('DB', True)]) | +---------+---------+-------------+-------+-----------+--------+----------+-----------------------------------------------------------------------------------------+ | '20' | 1230237 | None | 'T' | [None] | 47 | [] | OrderedDict([('NS', 3), ('DP', 13), ('AA', 'T')]) | +---------+---------+-------------+-------+-----------+--------+----------+-----------------------------------------------------------------------------------------+ | '20' | 1234567 | 'microsat1' | 'G' | [GA, GAC] | 50 | [] | OrderedDict([('NS', 3), ('DP', 9), ('AA', 'G'), ('AN', 6), ('AC', [3, 1])]) | +---------+---------+-------------+-------+-----------+--------+----------+-----------------------------------------------------------------------------------------+ | '20' | 1235237 | None | 'T' | [None] | None | [] | {} | +---------+---------+-------------+-------+-----------+--------+----------+-----------------------------------------------------------------------------------------+ | 'X' | 10 | 'rsTest' | 'AC' | [A, ATG] | 10 | [] | {} | +---------+---------+-------------+-------+-----------+--------+----------+-----------------------------------------------------------------------------------------+ >>> t2 = unpackinfo(t1) >>> look(t2) +---------+---------+-------------+-------+-----------+--------+----------+------+------+--------+------+----------------+------+------+------+ | 'CHROM' | 'POS' | 'ID' | 'REF' | 'ALT' | 'QUAL' | 'FILTER' | 'NS' | 'AN' | 'AC' | 'DP' | 'AF' | 'AA' | 'DB' | 'H2' | +=========+=========+=============+=======+===========+========+==========+======+======+========+======+================+======+======+======+ | '19' | 111 | None | 'A' | [C] | 9.6 | [] | None | None | None | None | None | None | None | None | 
+---------+---------+-------------+-------+-----------+--------+----------+------+------+--------+------+----------------+------+------+------+ | '19' | 112 | None | 'A' | [G] | 10 | [] | None | None | None | None | None | None | None | None | +---------+---------+-------------+-------+-----------+--------+----------+------+------+--------+------+----------------+------+------+------+ | '20' | 14370 | 'rs6054257' | 'G' | [A] | 29 | [] | 3 | None | None | 14 | [0.5] | None | True | True | +---------+---------+-------------+-------+-----------+--------+----------+------+------+--------+------+----------------+------+------+------+ | '20' | 17330 | None | 'T' | [A] | 3 | ['q10'] | 3 | None | None | 11 | [0.017] | None | None | None | +---------+---------+-------------+-------+-----------+--------+----------+------+------+--------+------+----------------+------+------+------+ | '20' | 1110696 | 'rs6040355' | 'A' | [G, T] | 67 | [] | 2 | None | None | 10 | [0.333, 0.667] | 'T' | True | None | +---------+---------+-------------+-------+-----------+--------+----------+------+------+--------+------+----------------+------+------+------+ | '20' | 1230237 | None | 'T' | [None] | 47 | [] | 3 | None | None | 13 | None | 'T' | None | None | +---------+---------+-------------+-------+-----------+--------+----------+------+------+--------+------+----------------+------+------+------+ | '20' | 1234567 | 'microsat1' | 'G' | [GA, GAC] | 50 | [] | 3 | 6 | [3, 1] | 9 | None | 'G' | None | None | +---------+---------+-------------+-------+-----------+--------+----------+------+------+--------+------+----------------+------+------+------+ | '20' | 1235237 | None | 'T' | [None] | None | [] | None | None | None | None | None | None | None | None | +---------+---------+-------------+-------+-----------+--------+----------+------+------+--------+------+----------------+------+------+------+ | 'X' | 10 | 'rsTest' | 'AC' | [A, ATG] | 10 | [] | None | None | None | None | None | None | None | None | +---------+---------+-------------+-------+-----------+--------+----------+------+------+--------+------+----------------+------+------+------+ .. versionadded:: 0.5 """ if not keys: if hasattr(tbl, 'filename'): try: import vcf as pyvcf except ImportError as e: raise UnsatisfiedDependency(e, dep_message) reader = pyvcf.Reader(filename=tbl.filename) # all INFO keys = reader.infos.keys() result = unpackdict(tbl, 'INFO', keys=keys) if 'prefix' in kwargs: result = rename(result, {k: kwargs['prefix'] + k for k in keys}) if hasattr(tbl, 'filename'): return VCFWrapper(result, tbl.filename) else: return result
def sales_summary(start_dt=None, end_dt=None): """tally up gross (sale over list) profits TODO: tally up net profites (gross profit vs inventory purchase total) TODO: Keyword Arguments: start_dt {[type]} -- datetime for start of query (default: {None}) end_dt {[type]} -- datetime for start of query [description] (default: {None}) Returns: [dict] -- various types of sales information, stored in a dictionary. """ # products = db.session.query(Product).all() # sales = db.session.query(Sale).all() # retrieve existing tables products_records = etl.fromdb(db.engine, 'SELECT * FROM product') sales_records = etl.fromdb(db.engine, 'SELECT * FROM sale') # join product info to sales data sales_data = etl.join(sales_records, products_records, lkey='product_id', rkey='id') # prep joined sales data for tabulation sales_data = etl.convert(sales_data, 'date', lambda dt: format_date(dt)) sales_data = etl.sort(sales_data, 'date') sales_data = etl.convert(sales_data, 'quantity', lambda q: handle_none(q, replace_with=1)) sales_data = etl.addfield(sales_data, 'profit', lambda rec: calculate_profit(rec)) sales_data = etl.addfield(sales_data, 'gross_sales', lambda rec: calculate_gross_sales(rec)) # summarize data into charting-friendly data structures chart_count = etl.fold(sales_data, 'date', operator.add, 'quantity', presorted=True) chart_count = etl.rename(chart_count, {'key': 'x', 'value': 'y'}) chart_count, chart_count_missing_date = etl.biselect( chart_count, lambda rec: rec.x is not None) # print(chart_count) # etl.lookall(chart_count) chart_gross = etl.fold(sales_data, 'date', operator.add, 'gross_sales', presorted=True) chart_gross = etl.rename(chart_gross, {'key': 'x', 'value': 'y'}) chart_gross, chart_gross_missing_date = etl.biselect( chart_gross, lambda rec: rec.x is not None) # print(chart_gross) # etl.lookall(chart_gross) chart_profit = etl.fold(sales_data, 'date', operator.add, 'profit', presorted=True) chart_profit = etl.rename(chart_profit, {'key': 'x', 'value': 'y'}) chart_profit, chart_profit_missing_date = etl.biselect( chart_profit, lambda rec: rec.x is not None) # tabulate some figures gross_sales = 0 profits = 0 for sale in etl.dicts(sales_data): profits += calculate_profit(sale) gross_sales += calculate_gross_sales(sale) # for i in etl.dicts(chart_count): # print(i) # for i in etl.dicts(chart_gross): # print(i) return { 'gross_sales': gross_sales, 'profits': profits, 'chart_gross': list(etl.dicts(chart_gross)), 'chart_gross_missing_date': list(etl.dicts(chart_gross_missing_date)), 'chart_profit': list(etl.dicts(chart_profit)), 'chart_profit_missing_date': list(etl.dicts(chart_profit_missing_date)), 'chart_count': list(etl.dicts(chart_count)), 'chart_count_missing_date': list(etl.dicts(chart_count_missing_date)) }
                                     lkey='id', rkey='director_id')

# print(directorsAndGenresAndMovies)
# print(actorsAndRoles)
# print(moviesAndGenresAndDirectorsAndRoles)

# Delete unnecessary columns from all tables
ranks = etl.cut(moviesAndDirectorsAndRoles, 'movie_id', 'rank', 'director_id', 'actor_id')
movies = etl.cut(moviesAndDirectorsAndRoles, 'movie_id', 'name')
directors = etl.cut(directorsAndMovies, 'id', 'full_name')
actors = etl.cut(actors, 'id', 'full_name')

# Rename id to include table name
directors = etl.rename(directors, 'id', 'director_id')
actors = etl.rename(actors, 'id', 'actor_id')

# Remove rows with NULL ranks
ranks = etl.distinct(ranks)
ranks = etl.selectnotnone(ranks, 'rank')

# Remove duplicates after cutting columns
movies = etl.distinct(movies)
directors = etl.distinct(directors)
actors = etl.distinct(actors)

# Insert final tables into data warehouse
etl.todb(ranks, imdbWarehouse, 'ranks')
etl.todb(movies, imdbWarehouse, 'movies')
etl.todb(actors, imdbWarehouse, 'actors')
########## JSON extraction and mapping
tableJ = etl.fromjson('cust_data.json', header=['id', 'gender', 'first_name', 'last_name', 'email', 'ville'])
tableJ = etl.movefield(tableJ, 'gender', 4)

########## CSV extraction and conversion
tableCSV = etl.fromcsv('week_cust.csv')
tableCSV = etl.convert(tableCSV, 'id', int)

########### SQL Server connection and extraction
connectionSqlServer = pyodbc.connect("Driver={SQL Server Native Client 11.0};"
                                     "Server=81_64_msdn;"
                                     "Database=BD4client;"
                                     "Trusted_Connection=yes;"
                                     "convert_unicode =True;")
cursor = connectionSqlServer.cursor()
cursor.execute('SELECT id, first_name, last_name, email, gender, ville FROM client_DATA')
tableSqlServer = cursor.fetchall()
tableSqlServer = [('id', 'first_name', 'last_name', 'email', 'gender', 'ville')] + tableSqlServer
cursor.close()
connectionSqlServer.close()

######### Staging area transforming and concatenation
StagingArea = etl.cat(tableCSV, tableJ, tableSqlServer)
StagingArea = etl.convert(StagingArea, 'gender',
                          {'Male': 'M', 'Female': 'F', 'male': 'M', 'female': 'F', None: 'N'})
StagingArea = etl.rename(StagingArea, 'ville', 'city')

######## MySQL
connection = mysql.connect(host="localhost", user="******", passwd="", db="customerdatabase")
curseur = connection.cursor()
curseur.execute('SET SQL_MODE=ANSI_QUOTES')
#### load data, assuming table "CustomerData" already exists in the database
etl.appenddb(StagingArea, connection, 'customerdata', schema='customerdatabase', commit='commit')
curseur.close()
connection.close()
def procesar_fuente(path, nombre):
    try:
        # Process confirmed cases first
        tabla = etl.fromcsv(path)

        # Rename the headers
        tabla = etl.rename(tabla, {'Country/Region': 'Country'})

        # Adjust the data types
        # From column 5 onward the data type is integer (the number of people/cases)
        # We also take the opportunity to change the header date format from 1/23/20 to 2020-01-23
        headers = etl.fieldnames(tabla)
        i = 0
        for header in headers:
            if i >= 4:
                tabla = etl.convert(tabla, header, int)  # fix the data type
                fecha = datetime.datetime.strptime(header, '%m/%d/%y')  # compute the date in the correct format
                tabla = etl.rename(tabla, header, fecha.strftime('%Y-%m-%d'))
            i = i + 1

        # Drop the Province/State, Lat and Lon columns, which we are not going to use
        tabla = etl.cutout(tabla, 0, 2, 3)

        # Adjust some country names so we can later assign them a region/continent
        tabla = etl.convert(tabla, 'Country', 'replace', 'Congo (Brazzaville)', 'Congo')
        tabla = etl.convert(tabla, 'Country', 'replace', 'Congo (Kinshasa)', 'Democratic Republic of the Congo')
        tabla = etl.convert(tabla, 'Country', 'replace', 'Cote d\'Ivoire', 'Ivory Coast')
        tabla = etl.convert(tabla, 'Country', 'replace', 'Korea, South', 'South Korea')
        tabla = etl.convert(tabla, 'Country', 'replace', 'West Bank and Gaza', 'Palestine')
        tabla = etl.convert(tabla, 'Country', 'replace', 'Burma', 'Myanmar')
        tabla = etl.convert(tabla, 'Country', 'replace', 'US', 'USA')
        tabla = etl.convert(tabla, 'Country', 'replace', 'Taiwan*', 'Taiwan')

        # Then group and accumulate the results by country
        df_confirmed = etl.todataframe(tabla)
        df = df_confirmed.groupby(['Country']).sum()
        tabla = etl.fromdataframe(df, include_index=True)

        # Rename the Country field again
        tabla = etl.rename(tabla, {'index': 'Country'})

        # Then add the date columns as data and rename the new columns
        tabla = etl.melt(tabla, 'Country')
        tabla = etl.rename(tabla, {'variable': 'Date'})
        tabla = etl.rename(tabla, {'value': 'Cases'})

        # Then add the continent for grouping
        tabla = etl.addfield(tabla, 'Continent', lambda rec: get_continent_code(rec['Country']))

        # And again make sure the fields have the data types they should have
        tabla = etl.convert(tabla, 'Cases', int)
        tabla = etl.convert(tabla, 'Date', lambda v: datetime.datetime.strptime(v, '%Y-%m-%d'))

        # Finally, upload the table to the data repository
        conn = pymysql.connect(password='******', database='covid', user='******')
        conn.cursor().execute('SET SQL_MODE=ANSI_QUOTES')
        etl.todb(tabla, conn, nombre, create=True, drop=True)
        conn.close()
    except:
        print('An error has occurred! ', sys.exc_info()[0])
        raise
def main(): mort = fromcsv('mortality.csv') mort = skip(mort,2) mort = cut(mort,"Country Name","2006","2007","2008","2009","2010","2011","2012") mort = petl.rename(mort, 'Country Name', 'country') mort_rates = facetcolumns(mort, 'country') def getmort(c,y): try: return float(mort_rates[c][y][0]) except: return None ''' faceting allows us to do for example: pprint(mort_rates['Malaysia']['2009']) pprint(mort_rates['Denmark']) ''' f = open('mnhc.ids') base_url = 'http://localhost:7000/cube/cida/' fact_url = base_url+'facts?cut=project_number:'#A033033007 aggregate_url = base_url+ 'aggregate?cut=project_number:'#A033033007 bads = ['Africa MC','Americas MC','Asia MC','Europe MC'] renamethis={'Afghanistan TIS':'Afghanistan', 'Tanzania,Un Rep':'Tanzania', 'Lao, Dem. Rep.':'Lao', 'Congo, Dem Rep.':'Congo'} ''' use project browser instead,since there seems to be nothing that uniquely identifies historical activities, for example there are 6 entries for #A033033007 ''' mashup_file=open('mashup.json','w') mashup_data=[] for id in list(f)[0:-1]: project_data = json.load(urllib2.urlopen(fact_url + id)) # now we can append aggregates aggr = json.load(urllib2.urlopen(aggregate_url + id))['summary'] # just grab the first record p=project_data[0] c= p['country_region_name'] country = renamethis[c] if c in renamethis.keys() else c if country not in bads: mashup_data.append({ 'id':p['id'], 'project':p['project_number'], 'year':p['fiscal_year'], 'continent':p['continent_name'], 'country':country, 'max':int(round(p['maximum_cida_contribution'])), 'spent':int(round(aggr['amount_spent_sum'])), 'mortality_rate':getmort(c,p['fiscal_year'])}) mashup=fromdicts(mashup_data) tocsv(mashup,'raw-mnhc.csv') key_fields = ['project', 'max', 'year', 'country', 'mortality_rate', 'id', 'continent'] value_field = 'spent' '''' tbl_out = ( tbl_in .aggregate(key=key_fields, aggregation=sum, value=value_field) .unpack('key', key_fields) .rename('value', value_field) ) ''' tbl_out = aggregate(mashup, ['country','year','mortality_rate'], sum, 'spent') tbl_out = unpack(tbl_out, 'key',['country','year','mortality_rate']) tbl_out = rename(tbl_out, 'value','spent') tbl_out = cut(tbl_out, 'country','year','spent','mortality_rate') #cut to reorder print look(tbl_out) tojson(tbl_out,"mashup.json") tocsv(tbl_out,"mnhc-report.csv")
    '26WeekPriceReturnDaily', '3MonthAverageTradingVolume', '52WeekHigh',
    '52WeekHighDate', '52WeekLow', '52WeekLowDate', '52WeekPriceReturnDaily',
    '5DayPriceReturnDaily'
]

important_metrics = {}
for metric in list_of_metrics:
    important_metrics[metric] = metrics[metric]
important_metrics['_date'] = str(datetime.datetime.now())[:19]

table_metrics = etl.fromdicts([important_metrics])
table_financial_metrics = etl.rename(
    table_metrics, {
        '10DayAverageTradingVolume': '_10DayAverageTradingVolume',
        '13WeekPriceReturnDaily': '_13WeekPriceReturnDaily',
        '26WeekPriceReturnDaily': '_26WeekPriceReturnDaily',
        '3MonthAverageTradingVolume': '_3MonthAverageTradingVolume',
        '52WeekHigh': '_52WeekHigh',
        '52WeekHighDate': '_52WeekHighDate',
        '52WeekLow': '_52WeekLow',
        '52WeekLowDate': '_52WeekLowDate',
        '52WeekPriceReturnDaily': '_52WeekPriceReturnDaily',
        '5DayPriceReturnDaily': '_5DayPriceReturnDaily'
    })
table_financial_metrics = etl.addfield(table_financial_metrics, 'symbol', stock)

list_of_sentiments = ['bearishPercent', 'bullishPercent']
important_sentiments = {}
for sentiments in list_of_sentiments:
    important_sentiments[sentiments] = sentiment['sentiment'][sentiments]
important_sentiments['date_and_time'] = str(datetime.datetime.now())[:19]

table_sentiment = etl.fromdicts([important_sentiments])
fc['b']['bar']
fc['b']['baz']
fc['c']


# rename

table1 = [['sex', 'age'],
          ['m', 12],
          ['f', 34],
          ['-', 56]]

from petl import look, rename
look(table1)
# rename a single field
table2 = rename(table1, 'sex', 'gender')
look(table2)
# rename multiple fields by passing a dictionary as the second argument
table3 = rename(table1, {'sex': 'gender', 'age': 'age_years'})
look(table3)
# the returned table object can also be used to modify the field mapping using the suffix notation
table4 = rename(table1)
table4['sex'] = 'gender'
table4['age'] = 'age_years'
look(table4)


# cut

table1 = [['foo', 'bar', 'baz'],
          ['A', 1, 2.7],
def dimension_values(): connection = psycopg2.connect(dbname='voyager', user='******', password='******', host='172.16.0.45') engine = create_engine('postgresql://*****:*****@172.16.0.45:5432/voyager') com = 'select id as id_component, name as component from dim_com' table_com = etl.fromdb(connection, com) loc = 'select id as id_location, name as name from dim_loc' table_loc = etl.fromdb(connection, loc) tim = 'select id as id_time, time as timestamp from dim_time' table_time = etl.fromdb(connection, tim) print(table_com) print(table_loc) print(table_time) for ran in range(0, 65424, 1000): sql = "select * from KNMI_station_data kk " \ "RIGHT JOIN weatherstations w ON " \ " CAST (kk.weather_station_id AS INTEGER) = CAST (w.station_number AS INTEGER) " \ "WHERE w.station_number NOT LIKE \'NL%%\' AND date > 20190901 LIMIT 1000 OFFSET %s" % ran print(sql) table = etl.fromdb(connection, sql) print('knmi') print(table) table.log_progress() table = etl.convert(table, 'date', str) table = etl.convert(table, 'hour', str) table = etl.convert(table, 'temperature', int) table = etl.convert(table, 'temperature_dew', int) table = etl.convert(table, 'temperature_min', int) table = etl.convert(table, 'wind_speed_avg', int) table = etl.convert(table, 'wind_speed', int) table = etl.convert(table, 'wind_speed_max', int) table = etl.convert(table, 'temperature', lambda v: v / 10) table = etl.convert(table, 'temperature_dew', lambda v: v / 10) table = etl.convert(table, 'temperature_min', lambda v: v / 10) table = etl.convert(table, 'wind_speed_avg', lambda v: v / 10) table = etl.convert(table, 'wind_speed', lambda v: v / 10) table = etl.convert(table, 'wind_speed_max', lambda v: v / 10) df = pd.DataFrame(table) df.columns = df.iloc[0] df = df.drop(0) df['timestamp'] = df['date'] + df['hour'] df['weather_station_id'] = df['weather_station_id'].astype(str) df['timestamp'] = df['timestamp'].apply(custom_to_datetime) df['timestamp'] = df['timestamp'].astype(str) df = df.drop(columns=['date', 'hour'], axis=1) final_knmi_table = etl.fromdataframe(df) final_knmi_table = etl.melt(final_knmi_table, key=[ 'weather_station_id', 'timestamp', 'id', 'latitude', 'longitude', 'name', 'station_number', 'data_source_id', 'altitude' ]) final_knmi_table = etl.rename(final_knmi_table, 'variable', 'component') print(final_knmi_table) final_knmi_table2 = etl.join(final_knmi_table, table_com, key='component') final_knmi_table2 = etl.join(final_knmi_table2, table_loc, key='name') final_knmi_table2 = etl.join(final_knmi_table2, table_time, key='timestamp') print('dos') print(final_knmi_table2) df = pd.DataFrame(final_knmi_table2) df.columns = df.iloc[0] df = df.drop(0) fact_source = df[[ 'id_component', 'id_location', 'id_time', 'value', 'data_source_id', 'weather_station_id' ]] print(fact_source) fact_source.to_sql('fact_source', engine, if_exists='append', index=False, method='multi') for rn in range(0, 1148, 1000): print('lmn') final_lmn_table = etl.fromdb( connection, "select ld.id, ld.station_number, ld.value, ld.timestamp, ls.name as component, " "ws.id as lid, ws.latitude, ws.longitude, ws.data_source_id, ws.altitude, ws.name as name" " from luchtmeetnet_data ld " "right join luchtmeetnet_sensors ls on ld.formula = ls.formula " " join weatherstations ws on ld.station_number = ws.station_number " "where ws.station_number like \'NL%%\' AND timestamp > '2019-09-01' " "LIMIT 1000 OFFSET %s" % rn) final_lmn_table = etl.rename(final_lmn_table, {'station_number': 'weather_station_id'}) final_lmn_table = etl.movefield(final_lmn_table, 
'timestamp', 1) # print(final_lmn_table) # print(final_lmn_table) # print(table_com) final_lmn_table2 = etl.join(final_lmn_table, table_com, key='component') # print(final_lmn_table2) final_lmn_table2 = etl.join(final_lmn_table2, table_loc, key='name') # print(final_lmn_table2) df = pd.DataFrame(final_lmn_table2) df.columns = df.iloc[0] df = df.drop(0) df['timestamp'] = df['timestamp'].str[:-6] # print(df) final_lmn_table2 = etl.fromdataframe(df) final_lmn_table2 = etl.join(final_lmn_table2, table_time, key='timestamp') # print(final_lmn_table2) print(final_lmn_table2) final_lmn_df = pd.DataFrame(final_lmn_table2) final_lmn_df.columns = final_lmn_df.iloc[0] final_lmn_df = final_lmn_df.drop(0) fact_source = final_lmn_df[[ 'id_component', 'id_location', 'id_time', 'value', 'data_source_id', 'weather_station_id' ]] print(fact_source) fact_source.to_sql('fact_source', engine, if_exists='append', index=False, method='multi')
def rename_headers(table, mappings):
    return etl.rename(table, mappings)
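A brief usage sketch of this thin wrapper, following the dictionary form of petl.rename shown elsewhere on this page (the table literal is illustrative):

table1 = [['sex', 'age'],
          ['m', 12],
          ['f', 34]]
# rename both fields in one pass via the mappings dict
table2 = rename_headers(table1, {'sex': 'gender', 'age': 'age_years'})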
def append_tailings_reports_to_code_required_reports(connection, commit=False): src_table = etl.fromdb( connection, 'SELECT exp_doc.mine_guid, exp_doc.exp_document_guid, req_doc.req_document_name, exp_doc.due_date, exp_doc.exp_document_status_code, exp_doc.received_date, exp_doc.active_ind, exp_doc_x.mine_document_guid, exp_doc.create_user, exp_doc.create_timestamp, exp_doc.update_user, exp_doc.update_timestamp from mine_expected_document exp_doc \ inner join mine_expected_document_xref exp_doc_x on exp_doc.exp_document_guid = exp_doc_x.exp_document_guid\ inner join mds_required_document req_doc on req_doc.req_document_guid = exp_doc.req_document_guid' ) req_document_crr_defintion_map = [ ['req_document_name', 'mine_report_definition_id'], ['Summary of TSF and Dam Safety Recommendations', 28], ['ITRB Activities Report', 27], ['Register of Tailings Storage Facilities and Dams', 47], ['Dam Safety Inspection (DSI) Report', 26], ['Dam Safety Review (DSR) Report', 31], ['“As-built” Reports', 32], ['Annual Reclamation', 25], ['MERP Record of Testing', 3], #['Annual Manager\'s Report', __________________ ], no mapping or data, ignore. ['OMS Manual', 33], ['Annual reconciliation of water balance and water management plans', 44], ['TSF risk assessment', 46], ['Mine Emergency Preparedness and Response Plan (MERP)', 24], ['Performance of high risk dumps', 29] ] table1 = etl.join(src_table, req_document_crr_defintion_map, 'req_document_name') mine_report = etl.cutout(table1, 'req_document_name') #to be inserted into db mine_report = etl.addfield(mine_report, 'submission_year', 2019) mine_report = etl.rename(mine_report, 'exp_document_status_code', 'mine_report_submission_status_code') mine_report = etl.addfield(mine_report, 'deleted_ind', lambda x: not x.active_ind) mine_report = etl.cutout(mine_report, 'active_ind') #to determine what FK's will be so can insert into related tables max_report_id = etl.fromdb(connection, 'select last_value from public.mine_report_mine_report_id_seq')[1][0] max_report_submission_id = etl.fromdb( connection, 'select last_value from public.mine_report_submission_mine_report_submission_id_seq')[1][0] #if sequence hasn't been used yet, fix off by one if max_report_id == 1: max_report_id = 0 if max_report_submission_id == 1: max_report_submission_id = 0 #get one-to-many mine_report, mine_report_submission_documents = etl.unjoin(mine_report, 'mine_document_guid', key='exp_document_guid') #add PK's for mappings mine_report_with_ids = etl.addrownumbers(mine_report, start=max_report_id + 1, step=1, field='mine_report_id') mine_report_with_ids = etl.addrownumbers(mine_report_with_ids, start=max_report_submission_id + 1, step=1, field='mine_report_submission_id') print(f'max_report_id= {max_report_id}, max_report_submission_id={max_report_submission_id}') #copy out fields for submission tables mine_report_submissions = etl.cut(mine_report_with_ids, [ 'mine_report_id', 'exp_document_guid', 'mine_report_submission_status_code', 'create_user', 'create_timestamp', 'update_user', 'update_timestamp' ]) mine_report_submissions = etl.addfield(mine_report_submissions, 'submission_date', lambda x: x.create_timestamp) #remove fields not in mine_report mine_report = etl.cutout(mine_report, 'mine_report_submission_status_code') #replace exp_document_guid FK with mine_report_submission FK submission_id_lookup = etl.cut(mine_report_with_ids, ['mine_report_submission_id', 'exp_document_guid']) mine_report_submission_documents = etl.join(submission_id_lookup, mine_report_submission_documents, 
key='exp_document_guid') mine_report_submission_documents = etl.cutout(mine_report_submission_documents, 'exp_document_guid') #removed original PK mine_report = etl.cutout(mine_report, 'exp_document_guid') mine_report_submissions = etl.cutout(mine_report_submissions, 'exp_document_guid') print(etl.valuecounter(etl.distinct(table1, key='exp_document_guid'), 'req_document_name')) print(etl.valuecounter(mine_report, 'mine_report_definition_id')) print(table1) print(mine_report) print(mine_report_submissions) print(mine_report_submission_documents) etl.appenddb(mine_report, connection, 'mine_report', commit=False) print('INSERT mine_report staged') etl.appenddb(mine_report_submissions, connection, 'mine_report_submission', commit=False) print('INSERT mine_report_submission staged') etl.appenddb(mine_report_submission_documents, connection, 'mine_report_document_xref', commit=False) print('INSERT mine_report_document_xref staged') if commit: connection.commit() print('DATA CREATION COMPLETE') else: connection.rollback() print('NO DATA CREATED: add --commit=true to insert report rows')