def validate(cursor, table, constraints, task_name):
    header = etl.header(table)
    problems = etl.validate(table, constraints=constraints, header=header)
    problems = etl.addfield(problems, 'task_name', task_name)
    problems = etl.addfield(problems, 'create_date', datetime.now())
    # etl.todb(problems, cursor, 'etl_logs')
    etl.appenddb(problems, cursor, 'tetl_logs')
def openFileTable(years, quarters):
    # print("In openFileTable")
    table = []
    # traverse through the list of years provided via filters
    for year in years:
        # traverse through the list of quarters provided via filters
        for quarter in quarters:
            fileString = year + '_' + quarter + '.csv'
            # print(fileString)
            table1 = etl.fromcsv(fileString)
            # TODO -- migrate transformations accordingly
            # Column layouts per quarter, kept for reference:
            # Q2 (indices 0-44):
            #   APN Entitlement BESTSTAT BESTDATE NAMEADDR Alias PLN_CASENO BP_APPLNO
            #   UNITS NET_UNITS AFF_UNITS NET_AFF_UNITS SECTION415 TENURE_TYPE COST
            #   PROPUSE TOTAL_GSF NET_GSF CIE NET_CIE MED NET_MED MIPS NET_MIPS PDR
            #   NET_PDR RET NET_RET VISIT NET_VISIT FirstFiled PLN_DESC BP_DESC PLANNER
            #   SPONSOR SP_CONTACT SP_CONTACTPH NEIGHBORHOOD PLAN_AREA PLAN_DISTRICT
            #   HEIGHT_DISTRICT ZONING_SIM ZONING_DISTRICT SUPE_DISTRICT Location
            # Q3 (indices 0-46):
            #   PROJECT_TYPE NAMEADDR Alias BESTSTAT BESTDATE Entitlement PLN_CASENO
            #   PLN_DESC PROPUSE BP_APPLNO BP_DESC UNITS NET_UNITS AFF_UNITS
            #   NET_AFF_UNITS SECTION415 TENURE_TYPE TOTAL_GSF NET_GSF CIE NET_CIE MED
            #   NET_MED MIPS NET_MIPS PDR NET_PDR RET NET_RET VISIT NET_VISIT APPLICANT
            #   CONTACT CONTACTPH COST FirstFiled PLANNER APN PLAN_AREA PLN_DISTRICT
            #   SUPE_DISTRICT ZONING_SIM ZONING_DISTRICT ZONING_GEN NEIGHBORHOOD
            #   HEIGHT_DISTRICT Location
            # Q4 (indices 0-49):
            #   APN BESTSTAT BESTDATE NAMEADDR Alias UNITS NET_UNITS AFF_UNITS
            #   NET_AFF_UNITS SECTION415 TENURE_TYPE PLN_CASENO BP_APPLNO PROPUSE COST
            #   BP_DESC PLN_DESC PROJECT_TYPE ENTITLEMENT TOTAL_GSF NET_GSF CIE NET_CIE
            #   MED NET_MED MIPS NET_MIPS PDR NET_PDR RET NET_RET VISIT NET_VISIT
            #   APPLICANT CONTACT CONTACTPHONE FirstFiled PLANNER PLAN_AREA PUBLICREALM
            #   STATUS PLAN_DISTRICT SUPE_DISTRICT ZONING_SIM ZONING_DISTRICT ZONING_GEN
            #   ZONING HEIGHT_DISTRICT NEIGHBORHOOD Location
            if quarter == 'Q1':
                # attrList = [0, 2, 4, 5, 12, 32, 40]
                table1 = etl.cut(table1, 'PROJECT_TYPE', 'NAMEADDR', 'BESTDATE',
                                 'BESTSTAT', 'PROPUSE', 'NEIGHBORHOOD', 'LOCATION')
            if quarter == 'Q2':
                # print(quarter)
                # attrList = [2, 3, 4, 15, 37, 42, 44]
                table1 = etl.cut(table1, 'BESTSTAT', 'BESTDATE', 'NAMEADDR',
                                 'PROPUSE', 'NEIGHBORHOOD', 'ZONING_GEN', 'Location')
            if quarter == 'Q3':
                # attrList = [0, 1, 3, 4, 8, 44, 46]
                table1 = etl.cut(table1, 'PROJECT_TYPE', 'NAMEADDR', 'BESTSTAT',
                                 'BESTDATE', 'PROPUSE', 'NEIGHBORHOOD', 'LOCATION')
            if quarter == 'Q4':
                # attrList = [1, 2, 3, 13, 17, 48, 49]
                table1 = etl.cut(table1, 'BESTSTAT', 'BESTDATE', 'NAMEADDR',
                                 'PROPUSE', 'PROJECT_TYPE', 'NEIGHBORHOOD', 'Location')
            # add the quarter and year to identify the table
            table1 = etl.addfield(table1, 'Quarter', quarter)
            table1 = etl.addfield(table1, 'Year', year)
            # print(tempTable)
            table.append(table1)
    return table
def ETL_MMS_NOW_schema(connection, tables, schema, system_name):
    '''Import all the data from the specified schema and tables.'''
    for destination, source in tables.items():
        try:
            current_table = etl.fromdb(connection, f'SELECT * from {schema}.{source}')
            print(f'    {destination}:{etl.nrows(current_table)}')
            if source == 'application':
                # add originating source
                table_plus_os = etl.addfield(current_table, 'originating_system',
                                             system_name)
                table_plus_os_guid = join_mine_guids(connection, table_plus_os)
                etl.appenddb(table_plus_os_guid, connection, destination,
                             schema='now_submissions', commit=False)
            else:
                etl.appenddb(current_table, connection, destination,
                             schema='now_submissions', commit=False)
        except Exception as err:
            print(f'ETL Parsing error: {err}')
            raise
def transform_fields_254(tbl, ts_kv_dict):
    """The input is a dump of the ts_kv table for TB version 2.5.4:

    +---------------------------------+-----+---------------+--------+-------+--------+-------+
    | entity_id                       | key | ts            | bool_v | str_v | long_v | dbl_v |
    +=================================+=====+===============+========+=======+========+=======+
    | 1ea47494dc14d40bd76a73c738b665f | 25  | 1583010011665 |        |       |        | -1.8  |
    +---------------------------------+-----+---------------+--------+-------+--------+-------+
    | 1ea47494dc14d40bd76a73c738b665f | 36  | 1583010000692 |        |       | 227    |       |
    +---------------------------------+-----+---------------+--------+-------+--------+-------+

    The output:

    +---------------------------------+---------------+---------------+-------+
    | entity_id                       | key           | ts            | value |
    +=================================+===============+===============+=======+
    | 1ea47494dc14d40bd76a73c738b665f | Temperature   | 1583010011665 | -1.8  |
    +---------------------------------+---------------+---------------+-------+
    | 1ea47494dc14d40bd76a73c738b665f | WindDirection | 1583010000692 | 227   |
    +---------------------------------+---------------+---------------+-------+

    ts_kv_dict is a dict like {25: 'Temperature', 36: 'WindDirection'}.
    """
    ts_kv_table = petl.transform.conversions.convert(
        tbl, {'ts': int, 'key': lambda k: ts_kv_dict[k]})
    ts_kv_table = petl.addfield(ts_kv_table, 'value', lambda row: get_value(row))
    ts_kv_table = petl.cutout(ts_kv_table, 'bool_v', 'str_v', 'long_v', 'dbl_v')
    return ts_kv_table
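The `get_value` helper is not shown alongside this snippet. A minimal sketch, assuming it simply coalesces the four typed value columns (the dump populates exactly one of them per row):

def get_value(row):
    # hypothetical helper: return the first non-empty typed value, else None
    for field in ('bool_v', 'str_v', 'long_v', 'dbl_v'):
        v = row[field]
        if v not in (None, ''):
            return v
    return None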
def drained_entries(ctx: typer.Context, issues, entries, project):
    config = ctx.meta['config']
    empty_entries, unset_entries = petl.biselect(
        entries, lambda row: row['issue_id'] is None)
    drain_issues = list(
        petl.dicts(
            transform.select_drain_issues(
                issues,
                assignee_id=ctx.meta['rdm_user']['id'],
                drain_cf_id=get_proj_attr(config, project, 'rdm_drain_cf_id'))))
    if not len(drain_issues):
        log.error('No drain issues found')
        return petl.head(unset_entries, 0), entries
    if len(drain_issues) > 1:
        log.warning(
            f'Found {len(drain_issues)} drain issues. Will use only first one')
    drain_issue = drain_issues[0]
    drained = petl.addfield(
        petl.cutout(empty_entries, 'issue_id'), 'issue_id', drain_issue['id'])
    return drained, unset_entries
def add_issue_id_from_description(inp):
    def m(row):
        try:
            match = re.search(r'#(\d+)', row['description'])
            return int(match.group(1))
        except AttributeError:
            return None
    return etl.addfield(inp, 'issue_id', m)
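A quick usage sketch on hypothetical input: rows whose description mentions an issue like `#123` gain `issue_id=123`, rows without a match get `None`:

entries = [['description'],
           ['fix login bug #123'],
           ['standup meeting']]
print(etl.look(add_issue_id_from_description(entries)))
# issue_id is 123 for the first row, None for the second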
def main(argv):
    parser = argparse.ArgumentParser()
    parser.add_argument("--pledges-filename", required=True,
                        help="Input UTF8 CSV with pledges data "
                             "dumped from Servant Keeper")
    parser.add_argument("--output-filename", required=True,
                        help="Output CSV filename which will be loaded with "
                             "pledges data in CCB import format")
    parser.add_argument('--trace', action='store_true',
                        help="If specified, prints tracing/progress messages "
                             "to stdout")
    args = parser.parse_args()

    assert os.path.isfile(args.pledges_filename), \
        "Error: cannot open file '" + args.pledges_filename + "'"

    table = petl.fromcsv(args.pledges_filename)
    table = petl.rename(table, {
        'Frequency': 'SK Frequency',
        'Individual ID': 'SK Individual ID'
    })
    table = petl.addfield(table, 'Individual ID',
                          lambda rec: rec['SK Individual ID'])
    table = petl.addfield(table, 'Campus', '')
    table = petl.addfield(table, 'Category Pledged To',
                          lambda rec: rec['Account'])
    table = petl.addfield(table, 'Amount Pledged', convert_amount)
    table = petl.addfield(table, 'Total Amount Pledged',
                          lambda rec: rec['Pledged'])
    table = petl.addfield(table, 'Frequency',
                          lambda rec: {1: 'Yearly', 4: 'Yearly', 12: 'Monthly',
                                       52: 'Weekly', 24: 'Monthly',
                                       2: 'Yearly'}[int(rec['Payments'])])
    table = petl.addfield(table, 'Number of Gifts',
                          lambda rec: {'Yearly': 1, 'Monthly': 12,
                                       'Weekly': 52}[rec['Frequency']])
    table = petl.addfield(table, 'Length Multiplier',
                          lambda rec: {'Yearly': 'Years', 'Monthly': 'Months',
                                       'Weekly': 'Weeks'}[rec['Frequency']])
    table = petl.addfield(table, 'Start Date',
                          lambda rec: {'Operating Income': '2013-01-01',
                                       'Mortgage Principal': '2013-01-01',
                                       'Operating Income 2015': '2015-01-01'
                                       }[rec['Account']])
    table = petl.addfield(table, 'End Date',
                          lambda rec: {'Operating Income': '2013-12-31',
                                       'Mortgage Principal': '2013-12-31',
                                       'Operating Income 2015': '2015-12-31'
                                       }[rec['Account']])

    trace('CONVERTING AND THEN EMITTING TO CSV FILE...', args.trace, banner=True)
    table.progress(200).tocsv(args.output_filename)
    trace('OUTPUT TO CSV COMPLETE.', args.trace, banner=True)
    trace('DONE!', args.trace, banner=True)
def test_tograph():
    """Test converting a table to a Graph."""
    table2 = etl.addfield(table1, 'a', 'foaf:Person')
    g = table2.tograph(ctx)
    nt = g.serialize(format='nt')
    assert '<http://example.org/people/n2>' in nt
    assert '<http://xmlns.com/foaf/0.1/Person>' in nt
    # read back in as string
    ng = Graph().parse(data=nt, format='nt')
def load(self):
    self.table = etl.fromcsv(self.InputTable)
    for Column in o.Columns:
        if "Type" in Column:
            print(Column)
            self.table = etl.convert(self.table, Column["Column"],
                                     eval(Column["Type"]))
        if "Convert" in Column:
            print(Column)
            self.table = etl.convert(self.table, Column["Column"],
                                     Column["Convert"])
        if "AddField" in Column:
            print(Column)
            self.table = etl.addfield(self.table, Column["Column"],
                                      eval(Column["AddField"]))
def test_tojsonld():
    """Test serializing as JSON-LD."""
    # serialize as jsonld
    table2 = etl.addfield(table1, 'a', 'foaf:Person')
    table2.tojsonld(ctx, source=tmpfile)
    with open(tmpfile, 'rb') as inf:
        data = json.load(inf)
    # check context
    assert data['@context']['foaf'].find('foaf') > -1
    assert data['@graph'][0].get('name') is not None
def group_entries_by_day(inp):
    hdr = petl.header(inp)
    agg = OrderedDict()
    for field in hdr:
        # using first found value
        agg[field] = field, next
    agg['dur'] = 'dur', lambda durs: sum(durs, timedelta())
    agg['start'] = 'start', min
    with_day = petl.addfield(inp, 'start_date',
                             lambda row: row.get('start').date())
    index_keys = ('start_date', 'description')
    result = petl.aggregate(with_day, index_keys, agg)
    return petl.cutout(result, 'start_date')
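An illustrative run, assuming input rows carry a datetime `start` and a timedelta `dur` as the aggregation above expects; the two same-day 'standup' entries collapse into one row with a summed `dur` of 0:25:00:

from datetime import datetime, timedelta

entries = [['start', 'description', 'dur'],
           [datetime(2021, 3, 1, 9, 0), 'standup', timedelta(minutes=15)],
           [datetime(2021, 3, 1, 14, 0), 'standup', timedelta(minutes=10)]]
print(petl.look(group_entries_by_day(entries)))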
def extract_named_objects_to_columns(inp, named_object_columns: List[str]):
    res = inp

    def m(column1: str, prop1: str):
        def mod(row):
            if row.get(column1):
                return row[column1].get(prop1, None)
            return None
        return mod

    for column in named_object_columns:
        for prop in ('id', 'name'):
            res = etl.addfield(res, f'{column}_{prop}', m(column, prop))
    return res
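A hypothetical usage sketch with dict-shaped cells, e.g. a `project` column holding `{'id': ..., 'name': ...}` objects; `project_id` and `project_name` columns are added, with `None` where the cell is empty:

rows = [['project'],
        [{'id': 7, 'name': 'ETL'}],
        [None]]
print(etl.look(extract_named_objects_to_columns(rows, ['project'])))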
def perform_people_table_transformations(
        petl_table: petl.Table,
        session: Optional[requests.Session] = None) -> petl.Table:
    # Add a date column (%Y-%m-%d) based on the edited date
    new_table = petl.addfield(
        petl_table,
        "date",
        # Slicing seems more efficient than parsing and formatting back to isoformat
        lambda row: row["edited"][:10],
    )
    # Resolve the homeworld field into the homeworld's name (/planets/1/ -> Tatooine)
    homeworld_name_retriever = RelatedResourceAttributeRetriever(
        "name", session=session)
    new_table = petl.convert(
        new_table, "homeworld",
        lambda v: homeworld_name_retriever.fetch_from_url(v))
    # Fields referencing other resources and date fields other than
    # date/birth_year can be dropped
    return petl.transform.basics.cutout(new_table, *FIELDS_TO_REMOVE)
def get_relationships(self):
    """Parses a list of `Relationship` objects."""
    core_file = _find_loinc_table_core_file(self.uri.path)
    core = etl.fromcsv(core_file, delimiter=',')
    core = etl.cut(core, ['LOINC_NUM', 'LONG_COMMON_NAME'])

    hierarchy_file = _find_multi_axial_hierarchy_file(self.uri.path)
    hierarchy = etl.fromcsv(hierarchy_file, delimiter=',')
    hierarchy = etl.leftjoin(hierarchy, core, lkey='CODE', rkey='LOINC_NUM')
    hierarchy = etl.cut(hierarchy, ['IMMEDIATE_PARENT', 'CODE', 'CODE_TEXT',
                                    'LONG_COMMON_NAME'])
    hierarchy = etl.fillright(hierarchy)
    hierarchy = etl.cut(hierarchy, ['IMMEDIATE_PARENT', 'CODE',
                                    'LONG_COMMON_NAME'])
    hierarchy = etl.rename(hierarchy, 'LONG_COMMON_NAME', 'CODE_TEXT')
    parents = etl.cut(hierarchy, ['CODE', 'CODE_TEXT'])
    hierarchy = etl.selectne(hierarchy, 'IMMEDIATE_PARENT', '')
    hierarchy = etl.leftjoin(hierarchy, parents, lkey='IMMEDIATE_PARENT',
                             rkey='CODE', lprefix='source.', rprefix='target.')
    hierarchy = etl.distinct(hierarchy)

    if self.versioned:
        version = _parse_version(hierarchy_file)
        hierarchy = etl.addfield(hierarchy, 'version', version)

    hierarchy = etl.rowmapmany(hierarchy, _to_json, ['relationship'])
    return hierarchy
def order_by_constraint(base_path, table, schema, self_dep_set):
    file_name = base_path + "/content/data/" + table + ".tsv"
    tempfile = NamedTemporaryFile(mode='w',
                                  dir=base_path + "/content/data/",
                                  delete=False)
    table = etl.fromcsv(file_name,
                        delimiter='\t',
                        skipinitialspace=True,
                        quoting=csv.QUOTE_NONE,
                        quotechar='',
                        escapechar='')
    key_dep_dict = {}
    # print(file_name)
    for constraint in self_dep_set:
        child_dep, parent_dep = constraint.split(':')
        data = etl.values(table, child_dep, parent_dep)
        for d in data:
            key_dep_set = {d[1]}
            key_dep_dict.update({d[0]: key_dep_set})

    key_dep_list = toposort_flatten(key_dep_dict)
    table = etl.addfield(table, 'pwb_index',
                         lambda rec: int(key_dep_list.index(rec[child_dep])))
    table = etl.sort(table, 'pwb_index')
    table = etl.cutout(table, 'pwb_index')

    writer = csv.writer(tempfile,
                        delimiter='\t',
                        quoting=csv.QUOTE_NONE,
                        quotechar='',
                        lineterminator='\n',
                        escapechar='')
    writer.writerows(table)
    shutil.move(tempfile.name, file_name)
def transform_fields_old(tbl):
    """The input is a dump of the ts_kv table for TB version <= 2.5.4:

    +-------------+---------------------------------+---------------+---------------+--------+-------+--------+-------+
    | entity_type | entity_id                       | key           | ts            | bool_v | str_v | long_v | dbl_v |
    +=============+=================================+===============+===============+========+=======+========+=======+
    | DEVICE      | 1ea47494dc14d40bd76a73c738b665f | Temperature   | 1583010011665 |        |       |        | -1.8  |
    +-------------+---------------------------------+---------------+---------------+--------+-------+--------+-------+
    | DEVICE      | 1ea47494dc14d40bd76a73c738b665f | WindDirection | 1583010000692 |        |       | 227    |       |
    +-------------+---------------------------------+---------------+---------------+--------+-------+--------+-------+

    The output:

    +-------------+---------------------------------+---------------+---------------+-------+
    | entity_type | entity_id                       | key           | ts            | value |
    +=============+=================================+===============+===============+=======+
    | DEVICE      | 1ea47494dc14d40bd76a73c738b665f | Temperature   | 1583010011665 | -1.8  |
    +-------------+---------------------------------+---------------+---------------+-------+
    | DEVICE      | 1ea47494dc14d40bd76a73c738b665f | WindDirection | 1583010000692 | 227   |
    +-------------+---------------------------------+---------------+---------------+-------+
    """
    ts_kv_table = petl.transform.conversions.convert(tbl, 'ts', int)
    ts_kv_table = petl.addfield(ts_kv_table, 'value', lambda row: get_value(row))
    ts_kv_table = petl.cutout(ts_kv_table, 'bool_v', 'str_v', 'long_v', 'dbl_v')
    return ts_kv_table
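A small end-to-end illustration on an inline table shaped like the dump above (it relies on the `get_value` sketch given earlier):

dump = [['entity_type', 'entity_id', 'key', 'ts',
         'bool_v', 'str_v', 'long_v', 'dbl_v'],
        ['DEVICE', '1ea47494dc14d40bd76a73c738b665f', 'Temperature',
         '1583010011665', None, None, None, -1.8],
        ['DEVICE', '1ea47494dc14d40bd76a73c738b665f', 'WindDirection',
         '1583010000692', None, None, 227, None]]
print(petl.look(transform_fields_old(dump)))
# -> a single 'value' column holding -1.8 and 227; typed columns dropped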
example_data = """foo,bar,baz a,1,3.4 b,2,7.4 c,6,2.2 d,9,8.1 """ with open('example.csv', 'w') as f: f.write(example_data) import petl as etl table1 = etl.fromcsv('example.csv') table2 = etl.convert(table1, 'foo', 'upper') table3 = etl.convert(table2, 'bar', int) table4 = etl.convert(table3, 'baz', float) table5 = etl.addfield(table4, 'quux', lambda row: row.bar * row.baz) table5 table = ( etl .fromcsv('example.csv') .convert('foo', 'upper') .convert('bar', int) .convert('baz', float) .addfield('quux', lambda row: row.bar * row.baz) ) table l = [['foo', 'bar'], ['a', 1], ['b', 2], ['c', 2]] table = etl.wrap(l) table.look()
if table_h == 'quote':
    table_c['t'] = str(datetime.datetime.fromtimestamp(table_c['t']))
    price = table_c
if table_h == 'stock/insider-transactions':
    for transaction in table_c['data']:
        insiders.append(transaction)
if table_h == 'news-sentiment':
    sentiment = table_c
if table_h == 'stock/recommendation':
    for recommend in table_c:
        recommendation.append(recommend)
if table_h == 'stock/metric':
    metrics = table_c['metric']

table_price = etl.fromdicts([price])
table_price = etl.addfield(table_price, 'symbol', stock)
table_recommendation = etl.movefield(etl.fromdicts(recommendation), 'period', 6)
table_insiders = etl.movefield(etl.fromdicts(insiders), 'transactionDate', 6)
table_insiders = etl.addfield(table_insiders, 'symbol', stock)

list_of_metrics = [
    '10DayAverageTradingVolume', '13WeekPriceReturnDaily',
    '26WeekPriceReturnDaily', '3MonthAverageTradingVolume', '52WeekHigh',
    '52WeekHighDate', '52WeekLow', '52WeekLowDate', '52WeekPriceReturnDaily',
    '5DayPriceReturnDaily'
]
important_metrics = {}
for metric in list_of_metrics:
    important_metrics[metric] = metrics[metric]
important_metrics['_date'] = str(datetime.datetime.now())[:19]
def concat_columns(table, new_column, column_left, column_right):
    new_table = etl.addfield(
        table, new_column,
        lambda rec: rec[column_left] + ', ' + rec[column_right], 1)
    return etl.cutout(new_table, column_left, column_right)
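A quick usage sketch: fold `last` and `first` into a single `name` column inserted at position 1, dropping the originals:

people = [['id', 'last', 'first'],
          [1, 'Smith', 'Bob'],
          [2, 'Jones', 'Sally']]
print(etl.look(concat_columns(people, 'name', 'last', 'first')))
# -> columns: id, name ('Smith, Bob' / 'Jones, Sally')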
def match_columns(self, desired_columns, fuzzy_match=True,
                  if_extra_columns='remove', if_missing_columns='add'):
    """
    Changes the column names and ordering in this Table to match a list of
    desired column names.

    `Args:`
        desired_columns: list
            Ordered list of desired column names
        fuzzy_match: bool
            Whether to normalize column names when matching against the
            desired column names, removing whitespace and non-alphanumeric
            characters and lowercasing everything. E.g. with this flag set,
            "FIRST NAME" would match "first_name". If the Table has two
            columns that normalize to the same string (e.g. "FIRST NAME" and
            "first_name"), the latter will be considered an extra column.
        if_extra_columns: string
            If the Table has columns that don't match any desired columns,
            either 'remove' them, 'ignore' them, or 'fail' (raising an error).
        if_missing_columns: string
            If the Table is missing some of the desired columns, either 'add'
            them (with a value of None), 'ignore' them, or 'fail' (raising an
            error).

    `Returns:`
        `Parsons Table` and also updates self
    """
    from parsons.etl import Table  # Just trying to avoid recursive imports.

    normalize_fn = Table.get_normalized_column_name if fuzzy_match else (lambda s: s)

    # Create a mapping of our "normalized" name to the original column name
    current_columns_normalized = {normalize_fn(col): col for col in self.columns}

    # Track any columns we need to add to our current table from our desired columns
    columns_to_add = []
    # We are going to do a "cut" later to trim our table and re-order the
    # columns, but we won't have renamed our columns yet, so we need to
    # remember their un-normalized form
    cut_columns = []
    # We are going to also rename our columns AFTER we cut, so we want to
    # remember their normalized names
    final_header = []

    # Loop through our desired columns -- the columns we want to see in our final table
    for desired_column in desired_columns:
        normalized_desired = normalize_fn(desired_column)
        # Try to find our desired column in our Table
        if normalized_desired not in current_columns_normalized:
            # If we can't find our desired column in our current columns,
            # then it's "missing"
            if if_missing_columns == 'fail':
                # If our missing strategy is to fail, raise an exception
                raise TypeError(f"Table is missing column {desired_column}")
            elif if_missing_columns == 'add':
                # We have to add it to our table
                columns_to_add.append(desired_column)
                # We will need to remember this column when we cut down to
                # desired columns
                cut_columns.append(desired_column)
                # This will be in the final table
                final_header.append(desired_column)
            elif if_missing_columns != 'ignore':
                # If it's not ignore, add, or fail, then it's not a valid strategy
                raise TypeError(f"Invalid option {if_missing_columns} for "
                                "argument `if_missing_columns`")
        else:
            # We have found this in our current columns, so take it out of
            # our list to search
            current_column = current_columns_normalized.pop(normalized_desired)
            # Add the column to our intermediate table as the old column name
            cut_columns.append(current_column)
            # Add to our final header list as the "desired" name
            final_header.append(desired_column)

    # Look for any "extra" columns from our current table that aren't in our
    # desired columns
    for current_column in current_columns_normalized.values():
        # Figure out what to do with our "extra" columns
        if if_extra_columns == 'fail':
            # If our extra strategy is to fail, raise an exception
            raise TypeError(f"Table has extra column {current_column}")
        elif if_extra_columns == 'ignore':
            # If we're "ignore"-ing our extra columns, we should keep them by
            # adding them to our intermediate and final columns list
            cut_columns.append(current_column)
            final_header.append(current_column)
        elif if_extra_columns != 'remove':
            # If it's not ignore, remove, or fail, then it's not a valid strategy
            raise TypeError(f"Invalid option {if_extra_columns} for "
                            "argument `if_extra_columns`")

    # Add any columns we need to add
    for column in columns_to_add:
        self.table = petl.addfield(self.table, column, None)

    # Cut down to just the columns we care about
    self.table = petl.cut(self.table, *cut_columns)

    # Rename any columns
    self.table = petl.setheader(self.table, final_header)

    return self
# Load Master.csv from the Lahman database.
table = etl.fromcsv(sys.argv[1])

# Use US births only
table2 = etl.select(table, lambda rec: rec.birthCountry == 'USA')

# Only use these fields
table3 = etl.cut(table2, 'nameFirst', 'nameLast', 'debut', 'bbrefID', 'weight',
                 'height', 'finalGame', 'birthCity', 'birthState', 'birthYear')

# Remove null birth city and birth year
table4 = etl.select(table3,
                    lambda rec: rec.birthCity != "" and rec.birthYear != "")

# Add Baseball Reference URL
table5 = etl.addfield(table4, 'baseball_ref_url', add_bbreflink)

# Remove unnecessary bbrefID
table6 = etl.cutout(table5, "bbrefID")

# Load city/state lat-long table.
city = etl.fromcsv(sys.argv[2])

# Only use these fields
city2 = etl.cut(city, "city", "state", "lat", "long")

# Join tables by two keys
lat_table = etl.leftjoin(table6, city2, lkey=["birthCity", "birthState"],
                         rkey=["city", "state"])

# Output merged file to csv
lat_table.tocsv(sys.argv[3])
import petl as etl
import petlld

# set up a petl table to demonstrate
table1 = [["uri", "name"],
          ["n1", "Smith, Bob"],
          ["n2", "Jones, Sally"],
          ["n3", "Adams, Bill"]]

# use petl utilities to add a column with our data type - foaf:Person
table2 = etl.addfield(table1, "a", "foaf:Person")

# a JSON-LD context for our data
ctx = {
    "@base": "http://example.org/people/",
    "a": "@type",
    "uri": "@id",
    "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
    "foaf": "http://xmlns.com/foaf/0.1/",
    "name": "rdfs:label",
}

# serialize the data as JSON-LD
table2.tojsonld(ctx, indent=2)

graph = table2.tograph(ctx)
print()
print("-" * 10)
print()
print(graph.serialize(format="turtle"))
def procesar_fuente(path, nombre):
    try:
        # Process the confirmed cases first
        tabla = etl.fromcsv(path)

        # Rename the headers
        tabla = etl.rename(tabla, {'Country/Region': 'Country'})

        # Fix the data types: from column 5 onward the values are integers
        # (the number of people/cases). While at it, change the date format
        # in the header from 1/23/20 to 2020-01-23.
        headers = etl.fieldnames(tabla)
        i = 0
        for header in headers:
            if i >= 4:
                tabla = etl.convert(tabla, header, int)  # fix the data type
                fecha = datetime.datetime.strptime(header, '%m/%d/%y')  # parse the date
                tabla = etl.rename(tabla, header, fecha.strftime('%Y-%m-%d'))
            i = i + 1

        # Drop the Province/State, Lat and Lon columns, which we will not use
        tabla = etl.cutout(tabla, 0, 2, 3)

        # Adjust some country names so we can later assign them a region/continent
        tabla = etl.convert(tabla, 'Country', 'replace', 'Congo (Brazzaville)', 'Congo')
        tabla = etl.convert(tabla, 'Country', 'replace', 'Congo (Kinshasa)',
                            'Democratic Republic of the Congo')
        tabla = etl.convert(tabla, 'Country', 'replace', 'Cote d\'Ivoire', 'Ivory Coast')
        tabla = etl.convert(tabla, 'Country', 'replace', 'Korea, South', 'South Korea')
        tabla = etl.convert(tabla, 'Country', 'replace', 'West Bank and Gaza', 'Palestine')
        tabla = etl.convert(tabla, 'Country', 'replace', 'Burma', 'Myanmar')
        tabla = etl.convert(tabla, 'Country', 'replace', 'US', 'USA')
        tabla = etl.convert(tabla, 'Country', 'replace', 'Taiwan*', 'Taiwan')

        # Then group and accumulate the results by country
        df_confirmed = etl.todataframe(tabla)
        df = df_confirmed.groupby(['Country']).sum()
        tabla = etl.fromdataframe(df, include_index=True)

        # Rename the Country field again
        tabla = etl.rename(tabla, {'index': 'Country'})

        # Melt the date columns into rows and rename the new columns
        tabla = etl.melt(tabla, 'Country')
        tabla = etl.rename(tabla, {'variable': 'Date'})
        tabla = etl.rename(tabla, {'value': 'Cases'})

        # Add the continent for grouping
        tabla = etl.addfield(tabla, 'Continent',
                             lambda rec: get_continent_code(rec['Country']))

        # And again make sure the fields have the data types they should have
        tabla = etl.convert(tabla, 'Cases', int)
        tabla = etl.convert(tabla, 'Date',
                            lambda v: datetime.datetime.strptime(v, '%Y-%m-%d'))

        # Finally, upload the table to the data repository
        conn = pymysql.connect(password='******', database='covid', user='******')
        conn.cursor().execute('SET SQL_MODE=ANSI_QUOTES')
        etl.todb(tabla, conn, nombre, create=True, drop=True)
        conn.close()
    except:
        print('An error has occurred! ', sys.exc_info()[0])
        raise
def append_tailings_reports_to_code_required_reports(connection, commit=False):
    src_table = etl.fromdb(
        connection,
        'SELECT exp_doc.mine_guid, exp_doc.exp_document_guid, req_doc.req_document_name, '
        'exp_doc.due_date, exp_doc.exp_document_status_code, exp_doc.received_date, '
        'exp_doc.active_ind, exp_doc_x.mine_document_guid, exp_doc.create_user, '
        'exp_doc.create_timestamp, exp_doc.update_user, exp_doc.update_timestamp '
        'from mine_expected_document exp_doc '
        'inner join mine_expected_document_xref exp_doc_x '
        'on exp_doc.exp_document_guid = exp_doc_x.exp_document_guid '
        'inner join mds_required_document req_doc '
        'on req_doc.req_document_guid = exp_doc.req_document_guid')

    req_document_crr_defintion_map = [
        ['req_document_name', 'mine_report_definition_id'],
        ['Summary of TSF and Dam Safety Recommendations', 28],
        ['ITRB Activities Report', 27],
        ['Register of Tailings Storage Facilities and Dams', 47],
        ['Dam Safety Inspection (DSI) Report', 26],
        ['Dam Safety Review (DSR) Report', 31],
        ['“As-built” Reports', 32],
        ['Annual Reclamation', 25],
        ['MERP Record of Testing', 3],
        # ['Annual Manager\'s Report', __________________ ], no mapping or data, ignore.
        ['OMS Manual', 33],
        ['Annual reconciliation of water balance and water management plans', 44],
        ['TSF risk assessment', 46],
        ['Mine Emergency Preparedness and Response Plan (MERP)', 24],
        ['Performance of high risk dumps', 29]
    ]

    table1 = etl.join(src_table, req_document_crr_defintion_map, 'req_document_name')
    mine_report = etl.cutout(table1, 'req_document_name')

    # to be inserted into db
    mine_report = etl.addfield(mine_report, 'submission_year', 2019)
    mine_report = etl.rename(mine_report, 'exp_document_status_code',
                             'mine_report_submission_status_code')
    mine_report = etl.addfield(mine_report, 'deleted_ind', lambda x: not x.active_ind)
    mine_report = etl.cutout(mine_report, 'active_ind')

    # to determine what the FK's will be so we can insert into related tables
    max_report_id = etl.fromdb(
        connection,
        'select last_value from public.mine_report_mine_report_id_seq')[1][0]
    max_report_submission_id = etl.fromdb(
        connection,
        'select last_value from public.mine_report_submission_mine_report_submission_id_seq'
    )[1][0]

    # if the sequence hasn't been used yet, fix the off-by-one
    if max_report_id == 1:
        max_report_id = 0
    if max_report_submission_id == 1:
        max_report_submission_id = 0

    # get one-to-many
    mine_report, mine_report_submission_documents = etl.unjoin(
        mine_report, 'mine_document_guid', key='exp_document_guid')

    # add PK's for mappings
    mine_report_with_ids = etl.addrownumbers(mine_report,
                                             start=max_report_id + 1,
                                             step=1,
                                             field='mine_report_id')
    mine_report_with_ids = etl.addrownumbers(mine_report_with_ids,
                                             start=max_report_submission_id + 1,
                                             step=1,
                                             field='mine_report_submission_id')
    print(f'max_report_id= {max_report_id}, max_report_submission_id={max_report_submission_id}')

    # copy out fields for submission tables
    mine_report_submissions = etl.cut(mine_report_with_ids, [
        'mine_report_id', 'exp_document_guid', 'mine_report_submission_status_code',
        'create_user', 'create_timestamp', 'update_user', 'update_timestamp'
    ])
    mine_report_submissions = etl.addfield(mine_report_submissions,
                                           'submission_date',
                                           lambda x: x.create_timestamp)

    # remove fields not in mine_report
    mine_report = etl.cutout(mine_report, 'mine_report_submission_status_code')

    # replace exp_document_guid FK with mine_report_submission FK
    submission_id_lookup = etl.cut(mine_report_with_ids,
                                   ['mine_report_submission_id', 'exp_document_guid'])
    mine_report_submission_documents = etl.join(submission_id_lookup,
                                                mine_report_submission_documents,
                                                key='exp_document_guid')
    mine_report_submission_documents = etl.cutout(mine_report_submission_documents,
                                                  'exp_document_guid')

    # remove original PK
    mine_report = etl.cutout(mine_report, 'exp_document_guid')
    mine_report_submissions = etl.cutout(mine_report_submissions, 'exp_document_guid')

    print(etl.valuecounter(etl.distinct(table1, key='exp_document_guid'),
                           'req_document_name'))
    print(etl.valuecounter(mine_report, 'mine_report_definition_id'))
    print(table1)
    print(mine_report)
    print(mine_report_submissions)
    print(mine_report_submission_documents)

    etl.appenddb(mine_report, connection, 'mine_report', commit=False)
    print('INSERT mine_report staged')
    etl.appenddb(mine_report_submissions, connection, 'mine_report_submission',
                 commit=False)
    print('INSERT mine_report_submission staged')
    etl.appenddb(mine_report_submission_documents, connection,
                 'mine_report_document_xref', commit=False)
    print('INSERT mine_report_document_xref staged')

    if commit:
        connection.commit()
        print('DATA CREATION COMPLETE')
    else:
        connection.rollback()
        print('NO DATA CREATED: add --commit=true to insert report rows')
def sales_summary(start_dt=None, end_dt=None):
    """Tally up gross (sale over list) profits.

    TODO: tally up net profits (gross profit vs inventory purchase total)

    Keyword Arguments:
        start_dt {[type]} -- datetime for start of query (default: {None})
        end_dt {[type]} -- datetime for end of query (default: {None})

    Returns:
        [dict] -- various types of sales information, stored in a dictionary.
    """
    # products = db.session.query(Product).all()
    # sales = db.session.query(Sale).all()

    # retrieve existing tables
    products_records = etl.fromdb(db.engine, 'SELECT * FROM product')
    sales_records = etl.fromdb(db.engine, 'SELECT * FROM sale')

    # join product info to sales data
    sales_data = etl.join(sales_records, products_records,
                          lkey='product_id', rkey='id')

    # prep joined sales data for tabulation
    sales_data = etl.convert(sales_data, 'date', lambda dt: format_date(dt))
    sales_data = etl.sort(sales_data, 'date')
    sales_data = etl.convert(sales_data, 'quantity',
                             lambda q: handle_none(q, replace_with=1))
    sales_data = etl.addfield(sales_data, 'profit',
                              lambda rec: calculate_profit(rec))
    sales_data = etl.addfield(sales_data, 'gross_sales',
                              lambda rec: calculate_gross_sales(rec))

    # summarize data into charting-friendly data structures
    chart_count = etl.fold(sales_data, 'date', operator.add, 'quantity',
                           presorted=True)
    chart_count = etl.rename(chart_count, {'key': 'x', 'value': 'y'})
    chart_count, chart_count_missing_date = etl.biselect(
        chart_count, lambda rec: rec.x is not None)
    # print(chart_count)
    # etl.lookall(chart_count)

    chart_gross = etl.fold(sales_data, 'date', operator.add, 'gross_sales',
                           presorted=True)
    chart_gross = etl.rename(chart_gross, {'key': 'x', 'value': 'y'})
    chart_gross, chart_gross_missing_date = etl.biselect(
        chart_gross, lambda rec: rec.x is not None)
    # print(chart_gross)
    # etl.lookall(chart_gross)

    chart_profit = etl.fold(sales_data, 'date', operator.add, 'profit',
                            presorted=True)
    chart_profit = etl.rename(chart_profit, {'key': 'x', 'value': 'y'})
    chart_profit, chart_profit_missing_date = etl.biselect(
        chart_profit, lambda rec: rec.x is not None)

    # tabulate some figures
    gross_sales = 0
    profits = 0
    for sale in etl.dicts(sales_data):
        profits += calculate_profit(sale)
        gross_sales += calculate_gross_sales(sale)

    # for i in etl.dicts(chart_count):
    #     print(i)
    # for i in etl.dicts(chart_gross):
    #     print(i)

    return {
        'gross_sales': gross_sales,
        'profits': profits,
        'chart_gross': list(etl.dicts(chart_gross)),
        'chart_gross_missing_date': list(etl.dicts(chart_gross_missing_date)),
        'chart_profit': list(etl.dicts(chart_profit)),
        'chart_profit_missing_date': list(etl.dicts(chart_profit_missing_date)),
        'chart_count': list(etl.dicts(chart_count)),
        'chart_count_missing_date': list(etl.dicts(chart_count_missing_date))
    }
def lee_convert(filepath):
    """
    Takes the file path to a csv in the format used by Kansas City proper,
    converts it to the universal format, and outputs a csv.
    """
    kclee = etl.fromcsv(filepath)
    kclee = etl.addfield(kclee, 'PoleID', lambda x: 'KCLEE' + x['OBJECTID'])
    kclee = etl.addfield(kclee, 'Longitude', lambda x: x['POINT_X'])
    kclee = etl.addfield(kclee, 'Latitude', lambda x: x['POINT_Y'])
    kclee = etl.addfield(kclee, 'LightbulbType', lambda x: x['LAMPTYPE'])
    kclee = etl.addfield(kclee, 'Wattage', lambda x: x['WATTS'])
    kclee = etl.addfield(kclee, 'Lumens', lambda x: x['LUMENS'])
    kclee = etl.addfield(kclee, 'AttachedTech', False)
    kclee = etl.addfield(kclee, 'LightAttributes', lambda x: x['FIXTURETYP'])
    kclee = etl.addfield(kclee, 'FiberWiFiEnable', False)
    kclee = etl.addfield(kclee, 'PoleType', None)
    kclee = etl.addfield(kclee, 'PoleOwner', 'Lee Summit')
    kclee = etl.addfield(kclee, 'DataSource', 'Lee Summit')
    kclee = etl.cut(kclee, 'PoleID', 'Longitude', 'Latitude', 'LightbulbType',
                    'Wattage', 'Lumens', 'AttachedTech', 'LightAttributes',
                    'FiberWiFiEnable', 'PoleType', 'PoleOwner', 'DataSource')
    etl.tocsv(kclee, 'data/kcleesummit_clean.csv')
def kcmo_convert(filepath, xtrapath):
    """
    Takes the file path to a csv in the format used by Kansas City proper,
    converts it to the universal format, and outputs a csv.
    """
    kcmo = etl.fromcsv(filepath)
    kcx = etl.fromxlsx(xtrapath)
    kcjoin = etl.join(kcmo, kcx, lkey='POLEID', rkey='IDNumber')
    del kcmo
    del kcx

    kcjoin = etl.addfield(kcjoin, 'PoleID', lambda x: x['POLEID'])
    kcjoin = etl.addfield(kcjoin, 'Longitude',
                          lambda x: geom_to_tuple(x['the_geom'])[0])
    kcjoin = etl.addfield(kcjoin, 'Latitude',
                          lambda x: geom_to_tuple(x['the_geom'])[1])
    kcjoin = etl.addfield(kcjoin, 'LightbulbType',
                          lambda x: x['LUMINAIRE TYPE'])
    kcjoin = etl.addfield(kcjoin, 'Wattage', lambda x: x['WATTS'])
    kcjoin = etl.addfield(kcjoin, 'Lumens', None)
    kcjoin = etl.addfield(
        kcjoin, 'LightAttributes',
        lambda x: make_a_list(x['ATTACHMENT 10'], x['ATTACHMENT 9'],
                              x['ATTACHMENT 8'], x['ATTACHMENT 7'],
                              x['ATTACHMENT 6'], x['ATTACHMENT 5'],
                              x['ATTACHMENT 4'], x['ATTACHMENT 3'],
                              x['ATTACHMENT 2'], x['ATTACHMENT 1'],
                              x['SPECIAL_N2'], x['SPECIAL_NO']))
    kcjoin = etl.addfield(kcjoin, 'AttachedTech',
                          lambda x: bool(x['LightAttributes']))
    kcjoin = etl.addfield(
        kcjoin, 'FiberWiFiEnable',
        lambda x: find_wifi(*x['LightAttributes'], x['SPECIAL_N2'],
                            x['SPECIAL_NO']))
    kcjoin = etl.addfield(kcjoin, 'PoleType', lambda x: x['POLE TYPE'])
    kcjoin = etl.addfield(kcjoin, 'PoleOwner', lambda x: x['POLE OWNER'])
    kcjoin = etl.addfield(kcjoin, 'DataSource', 'Kansas City')
    kcjoin = etl.cut(kcjoin, 'PoleID', 'Longitude', 'Latitude', 'LightbulbType',
                     'Wattage', 'Lumens', 'AttachedTech', 'LightAttributes',
                     'FiberWiFiEnable', 'PoleType', 'PoleOwner', 'DataSource')
    etl.tocsv(kcjoin, 'data/kcmo_clean.csv')
import petl as etl

table_header = [
    "Fixed Acidity", "Volatile Acidity", "Citric Acid", "Sugar", "Chlorides",
    "Free SO2", "Total SO2", "Density", "pH", "Sulfates", "Alcohol", "Quality"
]

table1 = etl.addfield(
    etl.convertnumbers(
        etl.setheader(etl.fromcsv('winequality-red.csv'), table_header)),
    "Type", "Red")
table2 = etl.addfield(
    etl.convertnumbers(
        etl.setheader(etl.fromcsv('winequality-white.csv'), table_header)),
    "Type", "White")

# print(etl.head(table1))
# print(etl.head(table2))

table1_filtered = etl.select(table1, "Quality", lambda v: v > 6)
table2_filtered = etl.select(table2, "Quality", lambda v: v > 4)

good_wines = etl.cat(table1_filtered, table2_filtered)

good_wines_enhanced = etl.addfields(
    good_wines,
    [("Max Acidity",
      lambda rec: rec["Fixed Acidity"] + rec["Volatile Acidity"]),
     ("Locked SO2", lambda rec: rec["Total SO2"] - rec["Free SO2"])])

# print(etl.head(good_wines_enhanced))
# print(etl.tail(good_wines_enhanced))
import petl as etl

# Extracting data from the example csv file
table1 = etl.fromcsv('example.csv')
print(table1)
# etl.look(table1)

# Transformation functions to be applied to the extracted data
table2 = etl.convert(table1, 'foo', 'upper')
table3 = etl.convert(table2, 'bar', int)
table4 = etl.convert(table3, 'baz', float)
table5 = etl.addfield(table4, 'finally', lambda row: row.bar * row.baz)
print(table5)
# etl.look(table5)

# Writing the above ETL pipeline in a functional style
table = (etl
         .fromcsv('example.csv')
         .convert('foo', 'upper')
         .convert('bar', int)
         .convert('baz', float)
         .addfield('finally', lambda row: row.bar * row.baz))
table.look()  # look() only displays five rows.
print(table)

# OOP-style programming
l = [['foo', 'bar'], ['a', 1], ['b', 2], ['c', 2]]
table6 = etl.wrap(l)
print(table6)
look(table1)
table2 = convertnumbers(table1)
look(table2)

# addfield
table1 = [['foo', 'bar'],
          ['M', 12],
          ['F', 34],
          ['-', 56]]

from petl import addfield, look
look(table1)
# using a fixed value
table2 = addfield(table1, 'baz', 42)
look(table2)
# calculating the value
table2 = addfield(table1, 'baz', lambda rec: rec['bar'] * 2)
look(table2)
# an expression string can also be used via expr
from petl import expr
table3 = addfield(table1, 'baz', expr('{bar} * 2'))
look(table3)

# rowslice
table1 = [['foo', 'bar'],
          ['a', 1],
          ['b', 2],
import petl as etl

readFile = etl.fromtsv("donedeal_data_sample.tsv")
tmpTable = etl.addfield(readFile, 'InKms', lambda rec: rec['mileage'])
tmpTable2File = etl.convert(tmpTable, 'InKms',
                            lambda v: int(float(v) * 1.6),
                            where=lambda r: r.mileageType == 'miles')
etl.totsv(tmpTable2File, 'donedeal_inKms.tsv')
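The `where=` keyword makes the conversion selective; a tiny sketch of the same pattern on inline data (field names follow the snippet above):

sample = [['mileage', 'mileageType'],
          ['10', 'miles'],
          ['10', 'km']]
t = etl.addfield(sample, 'InKms', lambda rec: rec['mileage'])
t = etl.convert(t, 'InKms', lambda v: int(float(v) * 1.6),
                where=lambda r: r.mileageType == 'miles')
print(etl.look(t))  # -> 16 for the miles row; the km row keeps '10'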
def transform(mmj_employees, organization_id, debug, fake_email, source_db):
    """
    Load the transformed data into the destination(s)
    """
    # source data table
    source_dt = utils.view_to_list(mmj_employees)

    cut_data = [
        'id', 'email', 'first_name', 'organization_id', 'last_name',
        'created_at', 'updated_at', 'login'
    ]
    employee_data = etl.cut(source_dt, cut_data)

    employees = (etl.addfield(employee_data, 'keys')
                 .addfield('name')
                 .addfield('role')
                 .addfield('dateOfBirth'))

    mappings = OrderedDict()
    mappings['id'] = 'id'
    mappings['name'] = \
        lambda name: _set_name(name.first_name, name.last_name, name.login)

    """
    Roles:
        1 = site-admin
        2 = site-admin
        3 = store-manager
        4 = budtender
    """
    mappings['role'] = lambda x: _assign_role(x.id, source_db)
    mappings['createdAt'] = 'created_at'
    mappings['updatedAt'] = 'updated_at'
    mappings['dateOfBirth'] = \
        lambda _: datetime.datetime(year=1970, month=1, day=1, hour=2, minute=30)
    mappings['organization_id'] = 'organization_id'  # keep mmj org
    mappings['accountStatus'] = lambda x: _active(x.id, source_db)

    fields = etl.fieldmap(employees, mappings)
    merged_employees = etl.merge(employees, fields, key='id')

    mapped_employees = []
    for item in etl.dicts(merged_employees):
        item['keys'] = {
            'id': item['id'],
            'organization_id': item['organization_id']
        }

        # remove any item['keys'] entries with None values
        # (iterate over a copy of the keys so deletion is safe)
        for key in list(item['keys'].keys()):
            if not item['keys'][key]:
                del item['keys'][key]

        item['email'] = _set_email(item['email'], fake_email, debug)

        del item['login']
        del item['first_name']
        del item['last_name']
        del item['created_at']
        del item['id']
        del item['organization_id']

        # set up final structure for API
        mapped_employees.append(item)

    if debug:
        result = json.dumps(mapped_employees, sort_keys=True, indent=4,
                            default=utils.json_serial)
        print(result)

    return mapped_employees
# introduction
example_data = """foo,bar,baz
a,1,2.3
b,4,5.6
c,7,8.9
"""
with open('example.csv', 'w') as f:
    f.write(example_data)

import petl as etl

table1 = etl.fromcsv('example.csv')
print(table1)
table2 = etl.convert(table1, 'foo', 'upper')
print(table2)
table3 = etl.convert(table2, {'bar': int, 'baz': float})
print(table3)
table4 = etl.addfield(table3, 'bar*baz', lambda row: row.baz * row.bar)
print(table4.look())

# pipelined operations
table = (
    etl
    .fromcsv('example.csv')
    .convert({'foo': 'upper', 'bar': int, 'baz': float})
    .addfield('bar*bar_new', lambda r: r.baz * r.bar)
    .convert('baz', lambda v, row: v * float(row.bar), pass_row=True)
)
print(table.look())
def transform(mmj_menu_items, mmj_categories, prices, organization_id,
              source_db, debug):
    """
    Transform data
    """
    # source data table
    source_dt = utils.view_to_list(mmj_menu_items)

    cut_menu_data = [
        'id', 'vendor_id', 'menu_id', 'dispensary_id', 'strain_id',
        'created_at', 'updated_at', 'category_id', 'name', 'sativa', 'indica',
        'on_hold', 'product_type', 'image_file_name', 'medicine_amount',
        'product_type'
    ]
    cut_prices = [
        'menu_item_id', 'dispensary_id', 'price_half_gram', 'price_gram',
        'price_two_gram', 'price_eigth', 'price_quarter', 'price_half',
        'price_ounce'
    ]

    # Cut out all the fields we don't need to load
    menu_items = etl.cut(source_dt, cut_menu_data)
    prices_data = etl.cut(prices, cut_prices)

    menu_items = (etl.addfield(menu_items, 'createdAtEpoch')
                  .addfield('unitOfMeasure')
                  .addfield('locationProductDetails')
                  .addfield('keys')
                  .addfield('restockLevel'))

    # Two-step transform and cut. First we need to cut the name
    # and id from the source data to map to.
    cut_source_cats = etl.cut(mmj_categories, 'name', 'id', 'measurement')
    source_values = etl.values(cut_source_cats, 'name', 'id')

    # Then we need a dict of categories to compare against.
    # id is stored to match against when transforming and mapping categories
    mmj_categories = dict([(value, id) for (value, id) in source_values])

    mappings = OrderedDict()
    mappings['id'] = 'id'
    mappings['createdAt'] = 'created_at'
    mappings['updatedAt'] = 'updated_at'
    mappings['createdAtEpoch'] = lambda x: utils.create_epoch(x.created_at)
    mappings['name'] = 'name'
    mappings['shareOnWM'] = lambda x: _wm_integration(x.id, source_db)
    """
    1 = Units
    2 = Grams (weight)
    """
    mappings['unitOfMeasure'] = \
        lambda x: _map_uom(x.category_id, source_db)

    fields = etl.fieldmap(menu_items, mappings)
    data = etl.merge(menu_items, fields, key='id')

    items = []
    for item in etl.dicts(data):

        breakpoint_pricing = (etl.select(
            prices_data,
            lambda x: x.dispensary_id == item['dispensary_id'])
            .rename({'price_eigth': 'price_eighth'})
            .cutout('menu_item_id'))

        # Set image url for load to download
        url = None
        if debug and item['image_file_name'] is not None:
            url = ("https://wm-mmjmenu-images-development.s3."
                   "amazonaws.com/menu_items/images/{0}/large/"
                   "{1}").format(item['id'], item['image_file_name'])
        elif item['image_file_name'] is not None:
            url = ("https://wm-mmjmenu-images-production.s3."
                   "amazonaws.com/menu_items/images/{0}/large/"
                   "{1}").format(item['id'], item['image_file_name'])

        item['image_file_name'] = url

        item['categoryId'] = _map_categories(item['category_id'],
                                             item['sativa'], item['indica'],
                                             mmj_categories, menu_items)
        item['keys'] = {
            'dispensary_id': item['dispensary_id'],
            'id': item['id'],
            'menu_id': item['menu_id'],
            'vendor_id': item['vendor_id'],
            'strain_id': item['strain_id'],
            'category_id': item['category_id']
        }

        # set a default netMJ value if the menu item is a unit product
        if item['unitOfMeasure'] == 2:
            item['netMarijuana'] = int(item['medicine_amount'])

        # iterate over a copy of the keys so deletion is safe
        for key in list(item['keys'].keys()):
            if not item['keys'][key]:
                del item['keys'][key]

        item['locationProductDetails'] = {
            'id': item['id'],
            'active': _active(item['on_hold'])
        }

        item['restockLevel'] = _restock_level(item['dispensary_id'],
                                              item['product_type'], source_db)

        if item['shareOnWM'] is None:
            item['shareOnWM'] = False

        for price in etl.dicts(breakpoint_pricing):
            try:
                price_two_gram = price['price_two_gram']
            except KeyError:
                price_two_gram = 0.0

            item['locationProductDetails']['weightPricing'] = {
                'price_half_gram':
                    utils.dollars_to_cents(price['price_half_gram']),
                'price_two_gram': utils.dollars_to_cents(price_two_gram),
                'price_gram': utils.dollars_to_cents(price['price_gram']),
                'price_eighth': utils.dollars_to_cents(price['price_eighth']),
                'price_quarter': utils.dollars_to_cents(price['price_quarter']),
                'price_half': utils.dollars_to_cents(price['price_half']),
                'price_ounce': utils.dollars_to_cents(price['price_ounce'])
            }

        del item['vendor_id']
        del item['indica']
        del item['dispensary_id']
        del item['id']
        del item['strain_id']
        del item['on_hold']
        del item['menu_id']
        del item['sativa']
        del item['category_id']
        del item['updated_at']
        del item['created_at']
        del item['product_type']

        if item['image_file_name'] is None:
            del item['image_file_name']

        # set up final structure for API
        items.append(item)

    # Remove inactive items (build a new list rather than removing
    # elements while iterating, which skips items)
    items = [item for item in items
             if item['locationProductDetails']['active'] is not False]

    if debug:
        result = json.dumps(items, sort_keys=True, indent=4,
                            default=utils.json_serial)
        print(result)

    return items
# Then group and accumulate the results by country
df_confirmed = etl.todataframe(t_confirmed)
df = df_confirmed.groupby(['Country']).sum()
t_confirmed = etl.fromdataframe(df, include_index=True)

# Rename the Country field again
t_confirmed = etl.rename(t_confirmed, {'index': 'Country'})

# Melt the date columns into rows and rename the new columns
t_confirmed = etl.melt(t_confirmed, 'Country')
t_confirmed = etl.rename(t_confirmed, {'variable': 'Date'})
t_confirmed = etl.rename(t_confirmed, {'value': 'Cases'})

# Add the continent for grouping
t_confirmed = etl.addfield(t_confirmed, 'Continent',
                           lambda rec: get_continent_code(rec['Country']))

# And again make sure the fields have the data types they should have
t_confirmed = etl.convert(t_confirmed, 'Cases', int)
t_confirmed = etl.convert(t_confirmed, 'Date',
                          lambda v: datetime.datetime.strptime(v, '%Y-%m-%d'))

# Finally, upload the table to the data repository
conn = pymysql.connect(password='******', database='covid', user='******')
conn.cursor().execute('SET SQL_MODE=ANSI_QUOTES')
etl.todb(t_confirmed, conn, 'confirmed', create=True, drop=True)
conn.close()

# Example output for debugging
# print(etl.header(t_confirmed))
# print(etl.records(t_confirmed))
           ['C', True],
           ['D', False]]
table10 = etl.cat(table8, table9, header=['A', 'foo', 'B', 'bar', 'C'])
table10

# addfield()
############
import petl as etl

table1 = [['foo', 'bar'],
          ['M', 12],
          ['F', 34],
          ['-', 56]]
# using a fixed value
table2 = etl.addfield(table1, 'baz', 42)
table2
# calculating the value
table2 = etl.addfield(table1, 'baz', lambda rec: rec['bar'] * 2)
table2

# rowslice()
############
import petl as etl

table1 = [['foo', 'bar'],
          ['a', 1],
          ['b', 2],
          ['c', 5],
          ['d', 7],