import petl as pt


def DataIntegration(clinics_LOC, Services_LOC, Location_LOC):
    # Read the clinics.csv file
    fileData = pt.fromcsv(clinics_LOC)
    # Read the clinic_services.csv file
    servicesData = pt.fromcsv(Services_LOC)
    # Read the cliniclocations.xml file
    locationXML = pt.fromxml(Location_LOC, 'clinic', {
        "ClinicID": "ClinicID",
        "Lat": "Lat",
        "Lon": "Lon"
    })
    # Join the two CSV tables using ClinicID as the key
    fileJoin = pt.join(servicesData, fileData, key="ClinicID")
    # Join the XML locations onto the result, again on ClinicID
    MainJoin = pt.join(fileJoin, locationXML, key="ClinicID")
    # Keep only the required columns
    result = pt.cut(MainJoin, 'ClinicServiceID', 'Service', 'ClinicID', 'Suburb',
                    'Postcode', 'Lat', 'Lon')
    # Write the final CSV file, clinic_service_locations.csv
    pt.tocsv(result, "clinic_service_locations.csv")
    print('CSV file generated.')
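# A minimal call sketch for the function above. The file names are inferred
# from the comments in the function and are otherwise hypothetical.
DataIntegration('clinics.csv', 'clinic_services.csv', 'cliniclocations.xml')
# Produces clinic_service_locations.csv in the working directory.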
def _shape_data(self, raw_planets: etl.Table, raw_people: etl.Table) -> etl.Table:
    planets = etl.cut(raw_planets, (
        Planet.Columns.NAME,
        Planet.Columns.URL,
    ))
    people = etl.cut(
        raw_people,
        (
            Person.Columns.NAME,
            Person.Columns.HEIGHT,
            Person.Columns.MASS,
            Person.Columns.HAIR_COLOR,
            Person.Columns.SKIN_COLOR,
            Person.Columns.EYE_COLOR,
            Person.Columns.BIRTH_YEAR,
            Person.Columns.GENDER,
            Person.Columns.HOMEWORLD,
            Person.Columns.EDITED,
        ),
    )
    combined = etl.join(
        planets,
        people,
        lkey=Planet.Columns.URL,
        rkey=Person.Columns.HOMEWORLD,
        lprefix=Planet.PREFIX,
    )
    renamed = etl.rename(
        combined,
        {
            Person.Columns.EDITED: Person.RenamedColumns.DATE,
            Planet.prefix_value(Planet.Columns.NAME): Person.Columns.HOMEWORLD,
        },
    )
    converted = etl.convert(
        renamed,
        {
            Person.RenamedColumns.DATE: lambda v: parse(v).date(),
        })
    return etl.cut(
        converted,
        (
            Person.Columns.NAME,
            Person.Columns.HEIGHT,
            Person.Columns.MASS,
            Person.Columns.HAIR_COLOR,
            Person.Columns.SKIN_COLOR,
            Person.Columns.EYE_COLOR,
            Person.Columns.BIRTH_YEAR,
            Person.Columns.GENDER,
            Person.Columns.HOMEWORLD,
            Person.RenamedColumns.DATE,
        ),
    )
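# _shape_data relies on Planet and Person constant holders defined elsewhere
# (and `parse` is presumably dateutil.parser.parse). A purely illustrative
# sketch of what those holders might look like, inferred only from the
# attributes used above; the string values are assumptions.
class Planet:
    PREFIX = 'planet_'

    class Columns:
        NAME = 'name'
        URL = 'url'

    @classmethod
    def prefix_value(cls, column):
        # e.g. 'name' -> 'planet_name', matching lprefix in the join above
        return cls.PREFIX + column


class Person:
    class Columns:
        NAME = 'name'
        HEIGHT = 'height'
        MASS = 'mass'
        HAIR_COLOR = 'hair_color'
        SKIN_COLOR = 'skin_color'
        EYE_COLOR = 'eye_color'
        BIRTH_YEAR = 'birth_year'
        GENDER = 'gender'
        HOMEWORLD = 'homeworld'
        EDITED = 'edited'

    class RenamedColumns:
        DATE = 'date'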
def join(data, strategy, source_left, source_right, destination, key_left,
         key_right, prefix_left, prefix_right, presorted, buffersize, tempdir,
         cache, missing):
    """Perform a join on two data tables."""
    source_left = data.get(source_left)
    source_right = data.get(source_right)
    kwargs = {}
    if key_left == key_right:
        kwargs['key'] = key_left
    else:
        kwargs['lkey'] = key_left
        kwargs['rkey'] = key_right
    if presorted is True:
        kwargs['presorted'] = presorted
    if buffersize is not None:
        kwargs['buffersize'] = buffersize
    if tempdir:
        kwargs['tempdir'] = tempdir
    if 'anti' not in strategy:
        if prefix_left is not None:
            kwargs['lprefix'] = prefix_left
        if prefix_right is not None:
            kwargs['rprefix'] = prefix_right
    if strategy not in ['join', 'antijoin', 'hashjoin', 'hashantijoin']:
        kwargs['missing'] = missing

    if strategy == 'join':
        o = petl.join(source_left, source_right, **kwargs)
    elif strategy == 'leftjoin':
        o = petl.leftjoin(source_left, source_right, **kwargs)
    elif strategy == 'lookupjoin':
        o = petl.lookupjoin(source_left, source_right, **kwargs)
    elif strategy == 'rightjoin':
        o = petl.rightjoin(source_left, source_right, **kwargs)
    elif strategy == 'outerjoin':
        o = petl.outerjoin(source_left, source_right, **kwargs)
    elif strategy == 'antijoin':
        o = petl.antijoin(source_left, source_right, **kwargs)
    elif strategy == 'hashjoin':
        o = petl.hashjoin(source_left, source_right, **kwargs)
    elif strategy == 'hashantijoin':
        o = petl.hashantijoin(source_left, source_right, **kwargs)
    elif strategy == 'hashleftjoin':
        o = petl.hashleftjoin(source_left, source_right, **kwargs)
    elif strategy == 'hashlookupjoin':
        o = petl.hashlookupjoin(source_left, source_right, **kwargs)
    elif strategy == 'hashrightjoin':
        o = petl.hashrightjoin(source_left, source_right, **kwargs)
    data.set(destination, o)
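# The long if/elif chain above maps each strategy name one-to-one onto a petl
# function of the same name, so a lookup keeps the mapping in one place. This
# is a sketch of the idea, not the original code; it assumes the same strategy
# strings as above, all of which petl exposes as top-level functions.
import petl


def resolve_join(strategy):
    # e.g. 'hashleftjoin' -> petl.hashleftjoin
    return getattr(petl, strategy)

# o = resolve_join(strategy)(source_left, source_right, **kwargs)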
def createFacts(events, users):
    try:
        events_uid = etl.cutout(events, 'tracking_id', 'utm_medium', 'utm_campaign')
        events_tui = etl.cutout(events, 'user_id')

        stage_uid = etl.join(users, events_uid, key='user_id')
        stage_tui = etl.join(users, events_tui, key='tracking_id')
        stage_utm = etl.cut(stage_tui, 'user_id', 'utm_medium', 'utm_campaign')
        stage_uid_utm = etl.join(stage_uid, stage_utm, key='user_id')
        stage_m_s = etl.mergesort(stage_uid_utm, stage_tui, key=['created_at', 'email'])

        mappings = OrderedDict()
        mappings['tid'] = 'tracking_id'
        mappings['uid'] = 'user_id'
        mappings['utm_medium'] = 'utm_medium'
        mappings['utm_campaign'] = 'utm_campaign', {'audio': 'none', 'social': 'none'}
        mappings['utm_campaigntype'] = 'utm_campaign'
        mappings['email'] = 'email'
        mappings['subscription'] = 'type'
        mappings['sub_order'] = 'type', {'Signup Completed': '1',
                                         'Trial Started': '2',
                                         'Subscription Started': '3',
                                         'Subscription Ended': '4'}
        mappings['created_at'] = 'created_at'

        # Mapping
        stage_mapping = etl.fieldmap(stage_m_s, mappings)

        # Sort
        stage_mapping_ordered = etl.sort(stage_mapping,
                                         key=['created_at', 'email', 'sub_order'])

        # Datetime split
        t1 = etl.split(stage_mapping_ordered, 'created_at', 'T', ['date', 'time'],
                       include_original=True)
        t2 = etl.split(t1, 'date', '-', ['year', 'month', 'day'])
        stage_ready = etl.split(t2, 'time', ':', ['hour', 'minute', 'second'])

        # Export as csv to load folder
        etl.tocsv(stage_ready, 'load/facts.csv')
    except Exception as e:
        print("Something went wrong. Error {0}".format(e))
def xref_symbol_reports():
    symbol_reports = [
        f for f in os.listdir()
        if re.match(r'OCLC Datasync Unresolved.*\.csv', f)
    ]

    today = str(date.today())

    for report in symbol_reports:
        symbol_split = re.split(r'^.*processing.(M[A-Z]{2}).*$', report)
        symbol = symbol_split[1]
        xlsx_outfile = symbol + '_datasync_unresolved_' + today + '.xlsx'
        xls_outfile = symbol + '_datasync_unresolved_' + today + '.xls'
        txt_outfile = symbol + '_staging_OCNs_' + today + '.txt'

        symbol_table_raw = etl.fromcsv(report, encoding='utf-8')
        symbol_table = etl.rename(symbol_table_raw, '\ufeffMMS Id', 'MMS ID')
        symbol_table2 = etl.select(symbol_table, "{MMS ID} is not None")
        symbol_table_sorted = etl.sort(symbol_table2, 'MMS ID')

        xref_table = etl.fromcsv('unresxref.csv')
        xref_table2 = etl.select(xref_table, "{MMS ID} is not None")
        xref_table_sorted = etl.sort(xref_table2, 'MMS ID')

        symbol_xref_table = etl.join(symbol_table_sorted, xref_table_sorted,
                                     presorted=True, lkey="MMS ID", rkey="MMS ID")

        try:
            etl.toxlsx(symbol_xref_table, xlsx_outfile, encoding='utf-8')
        except TypeError:
            etl.toxls(symbol_xref_table, xls_outfile, 'Sheet1', encoding='utf-8')

        staging_ocns_table = etl.cut(symbol_xref_table, 'Staging OCN')
        template = '{Staging OCN}\n'
        etl.totext(staging_ocns_table, txt_outfile, template=template)
def kcmo_convert(filepath, xtrapath):
    """
    Takes the file path to a CSV in the format used by Kansas City proper,
    converts it to the universal format, and outputs a CSV.
    """
    kcmo = etl.fromcsv(filepath)
    kcx = etl.fromxlsx(xtrapath)
    kcjoin = etl.join(kcmo, kcx, lkey='POLEID', rkey='IDNumber')
    del kcmo
    del kcx

    kcjoin = etl.addfield(kcjoin, 'PoleID', lambda x: x['POLEID'])
    kcjoin = etl.addfield(kcjoin, 'Longitude', lambda x: geom_to_tuple(x['the_geom'])[0])
    kcjoin = etl.addfield(kcjoin, 'Latitude', lambda x: geom_to_tuple(x['the_geom'])[1])
    kcjoin = etl.addfield(kcjoin, 'LightbulbType', lambda x: x['LUMINAIRE TYPE'])
    kcjoin = etl.addfield(kcjoin, 'Wattage', lambda x: x['WATTS'])
    kcjoin = etl.addfield(kcjoin, 'Lumens', None)
    kcjoin = etl.addfield(
        kcjoin, 'LightAttributes',
        lambda x: make_a_list(
            x['ATTACHMENT 10'], x['ATTACHMENT 9'], x['ATTACHMENT 8'],
            x['ATTACHMENT 7'], x['ATTACHMENT 6'], x['ATTACHMENT 5'],
            x['ATTACHMENT 4'], x['ATTACHMENT 3'], x['ATTACHMENT 2'],
            x['ATTACHMENT 1'], x['SPECIAL_N2'], x['SPECIAL_NO']))
    kcjoin = etl.addfield(kcjoin, 'AttachedTech', lambda x: bool(x['LightAttributes']))
    kcjoin = etl.addfield(
        kcjoin, 'FiberWiFiEnable',
        lambda x: find_wifi(*x['LightAttributes'], x['SPECIAL_N2'], x['SPECIAL_NO']))
    kcjoin = etl.addfield(kcjoin, 'PoleType', lambda x: x['POLE TYPE'])
    kcjoin = etl.addfield(kcjoin, 'PoleOwner', lambda x: x['POLE OWNER'])
    kcjoin = etl.addfield(kcjoin, 'DataSource', 'Kansas City')
    kcjoin = etl.cut(kcjoin, 'PoleID', 'Longitude', 'Latitude', 'LightbulbType',
                     'Wattage', 'Lumens', 'AttachedTech', 'LightAttributes',
                     'FiberWiFiEnable', 'PoleType', 'PoleOwner', 'DataSource')
    etl.tocsv(kcjoin, 'data/kcmo_clean.csv')
def join_execute(cl, cr, join, **kwargs):
    cl, cr = cl(), cr()
    if 'addLfields' in kwargs:
        cl = etl.addfields(cl, kwargs['addLfields'])
    if 'addRfields' in kwargs:
        cr = etl.addfields(cr, kwargs['addRfields'])
    args = cl, cr
    if join == Join.UNION:
        c = etl.crossjoin(*args)
    else:
        kwargs = filter_keys(kwargs, ("key", "lkey", "rkey", "missing",
                                      "presorted", "buffersize", "tempdir",
                                      "cache"))
        if join == Join.INNER:
            c = etl.join(*args, **kwargs)
        elif join == Join.LEFT:
            c = etl.leftjoin(*args, **kwargs)
        elif join == Join.RIGHT:
            c = etl.rightjoin(*args, **kwargs)
        elif join == Join.FULL:
            c = etl.outerjoin(*args, **kwargs)
    return c
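# join_execute expects a Join enumeration that is defined elsewhere. A minimal
# sketch of one plausible definition; the member names come from the branches
# above, everything else is an assumption.
from enum import Enum, auto


class Join(Enum):
    UNION = auto()   # handled with etl.crossjoin above
    INNER = auto()
    LEFT = auto()
    RIGHT = auto()
    FULL = auto()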
def xls_tidy(xls, qvalue):
    d = etl.fromtsv(xls)
    sd = etl.select(d, lambda x: float(x.PepQValue) <= float(qvalue))
    psmsummary = sd

    ssd = etl.cut(sd, 'Peptide', 'Protein', 'PepQValue')
    # remove the mod info in peptide
    ssd = etl.transform.regex.sub(ssd, 'Peptide', r'^[\w-]\.(.+)\.[\w-]$', r'\1')
    ssd = etl.transform.regex.sub(ssd, 'Peptide', r'[\d\.\+]+', r'')

    aggregation = OrderedDict()
    aggregation['SpecCount'] = len
    cssd = etl.aggregate(ssd, 'Peptide', aggregation)

    fssd = etl.groupselectfirst(ssd, key=('Peptide', 'Protein', 'PepQValue'))

    aggregation = OrderedDict()
    aggregation['Protein'] = 'Protein', etl.strjoin(';')
    aggregation['PepQValue'] = 'PepQValue', etl.strjoin(';')
    assd = etl.aggregate(fssd, 'Peptide', aggregation)

    pepsummary = etl.join(assd, cssd, key='Peptide')

    return (psmsummary, pepsummary)
# Convenience function to convert values under the given field using a
# regular expression substitution.
def substitute(table, field, pattern, repl, count=0, flags=0):
    program = re.compile(pattern, flags)
    convert = lambda tempData: program.sub(repl, tempData, count=count)
    return pt.convert(table, field, convert)


# Read the CSV files using the petl framework.
# Read the services CSV file
fileData = pt.fromcsv('services.csv')
# Read the clinicservices CSV file
servicesData = pt.fromcsv('clinicservices.csv')
# Join the two tables using ServiceID as the key
fileJoin = pt.join(servicesData, fileData, key="ServiceID")
# Read the clinics CSV file
readCsv = pt.fromcsv('clinics.csv')
# Join onto the clinics table using ClinicID as the key
doubleJoin = pt.join(fileJoin, readCsv, key='ClinicID')
# Read the cliniclocations.xml file
locationXML = pt.fromxml('cliniclocations.xml', 'clinic', {
    "ClinicID": "ClinicID",
    "Lat": "Lat",
    "Lon": "Lon"
})
# Join the locations onto the result using ClinicID as the key
doubleJoin2 = pt.join(doubleJoin, locationXML, key="ClinicID")
# Remove whitespace from the Email field
cleanOne = substitute(doubleJoin2, 'Email', r'\s', '')
# adding @myclinic.com.au behind every email id
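# The snippet stops at the comment about appending the domain. One way that
# step might look, sketched with petl's convert; the field and domain come
# from the comment above, the variable name is an assumption.
withDomain = pt.convert(cleanOne, 'Email', lambda v: v + '@myclinic.com.au')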
def dimension_values():
    connection = psycopg2.connect(dbname='voyager',
                                  user='******',
                                  password='******',
                                  host='172.16.0.45')
    engine = create_engine('postgresql://*****:*****@172.16.0.45:5432/voyager')

    com = 'select id as id_component, name as component from dim_com'
    table_com = etl.fromdb(connection, com)
    loc = 'select id as id_location, name as name from dim_loc'
    table_loc = etl.fromdb(connection, loc)
    tim = 'select id as id_time, time as timestamp from dim_time'
    table_time = etl.fromdb(connection, tim)
    print(table_com)
    print(table_loc)
    print(table_time)

    for ran in range(0, 65424, 1000):
        sql = "select * from KNMI_station_data kk " \
              "RIGHT JOIN weatherstations w ON " \
              " CAST (kk.weather_station_id AS INTEGER) = CAST (w.station_number AS INTEGER) " \
              "WHERE w.station_number NOT LIKE 'NL%%' AND date > 20190901 LIMIT 1000 OFFSET %s" % ran
        print(sql)
        table = etl.fromdb(connection, sql)
        print('knmi')
        print(table)
        table.log_progress()

        table = etl.convert(table, 'date', str)
        table = etl.convert(table, 'hour', str)
        table = etl.convert(table, 'temperature', int)
        table = etl.convert(table, 'temperature_dew', int)
        table = etl.convert(table, 'temperature_min', int)
        table = etl.convert(table, 'wind_speed_avg', int)
        table = etl.convert(table, 'wind_speed', int)
        table = etl.convert(table, 'wind_speed_max', int)
        table = etl.convert(table, 'temperature', lambda v: v / 10)
        table = etl.convert(table, 'temperature_dew', lambda v: v / 10)
        table = etl.convert(table, 'temperature_min', lambda v: v / 10)
        table = etl.convert(table, 'wind_speed_avg', lambda v: v / 10)
        table = etl.convert(table, 'wind_speed', lambda v: v / 10)
        table = etl.convert(table, 'wind_speed_max', lambda v: v / 10)

        df = pd.DataFrame(table)
        df.columns = df.iloc[0]
        df = df.drop(0)
        df['timestamp'] = df['date'] + df['hour']
        df['weather_station_id'] = df['weather_station_id'].astype(str)
        df['timestamp'] = df['timestamp'].apply(custom_to_datetime)
        df['timestamp'] = df['timestamp'].astype(str)
        df = df.drop(columns=['date', 'hour'], axis=1)

        final_knmi_table = etl.fromdataframe(df)
        final_knmi_table = etl.melt(final_knmi_table, key=[
            'weather_station_id', 'timestamp', 'id', 'latitude', 'longitude',
            'name', 'station_number', 'data_source_id', 'altitude'
        ])
        final_knmi_table = etl.rename(final_knmi_table, 'variable', 'component')
        print(final_knmi_table)

        final_knmi_table2 = etl.join(final_knmi_table, table_com, key='component')
        final_knmi_table2 = etl.join(final_knmi_table2, table_loc, key='name')
        final_knmi_table2 = etl.join(final_knmi_table2, table_time, key='timestamp')
        print('dos')
        print(final_knmi_table2)

        df = pd.DataFrame(final_knmi_table2)
        df.columns = df.iloc[0]
        df = df.drop(0)
        fact_source = df[[
            'id_component', 'id_location', 'id_time', 'value',
            'data_source_id', 'weather_station_id'
        ]]
        print(fact_source)
        fact_source.to_sql('fact_source', engine, if_exists='append',
                           index=False, method='multi')

    for rn in range(0, 1148, 1000):
        print('lmn')
        final_lmn_table = etl.fromdb(
            connection,
            "select ld.id, ld.station_number, ld.value, ld.timestamp, ls.name as component, "
            "ws.id as lid, ws.latitude, ws.longitude, ws.data_source_id, ws.altitude, ws.name as name"
            " from luchtmeetnet_data ld "
            "right join luchtmeetnet_sensors ls on ld.formula = ls.formula "
            " join weatherstations ws on ld.station_number = ws.station_number "
            "where ws.station_number like 'NL%%' AND timestamp > '2019-09-01' "
            "LIMIT 1000 OFFSET %s" % rn)
        final_lmn_table = etl.rename(final_lmn_table,
                                     {'station_number': 'weather_station_id'})
        final_lmn_table = etl.movefield(final_lmn_table, 'timestamp', 1)
        # print(final_lmn_table)
        # print(table_com)
        final_lmn_table2 = etl.join(final_lmn_table, table_com, key='component')
        # print(final_lmn_table2)
        final_lmn_table2 = etl.join(final_lmn_table2, table_loc, key='name')
        # print(final_lmn_table2)

        df = pd.DataFrame(final_lmn_table2)
        df.columns = df.iloc[0]
        df = df.drop(0)
        df['timestamp'] = df['timestamp'].str[:-6]
        # print(df)

        final_lmn_table2 = etl.fromdataframe(df)
        final_lmn_table2 = etl.join(final_lmn_table2, table_time, key='timestamp')
        print(final_lmn_table2)

        final_lmn_df = pd.DataFrame(final_lmn_table2)
        final_lmn_df.columns = final_lmn_df.iloc[0]
        final_lmn_df = final_lmn_df.drop(0)
        fact_source = final_lmn_df[[
            'id_component', 'id_location', 'id_time', 'value',
            'data_source_id', 'weather_station_id'
        ]]
        print(fact_source)
        fact_source.to_sql('fact_source', engine, if_exists='append',
                           index=False, method='multi')
def sales_summary(start_dt=None, end_dt=None):
    """Tally up gross (sale over list) profits.

    TODO: tally up net profits (gross profit vs inventory purchase total)

    Keyword Arguments:
        start_dt {[type]} -- datetime for start of query (default: {None})
        end_dt {[type]} -- datetime for end of query (default: {None})

    Returns:
        [dict] -- various types of sales information, stored in a dictionary.
    """
    # products = db.session.query(Product).all()
    # sales = db.session.query(Sale).all()

    # retrieve existing tables
    products_records = etl.fromdb(db.engine, 'SELECT * FROM product')
    sales_records = etl.fromdb(db.engine, 'SELECT * FROM sale')

    # join product info to sales data
    sales_data = etl.join(sales_records, products_records,
                          lkey='product_id', rkey='id')

    # prep joined sales data for tabulation
    sales_data = etl.convert(sales_data, 'date', lambda dt: format_date(dt))
    sales_data = etl.sort(sales_data, 'date')
    sales_data = etl.convert(sales_data, 'quantity',
                             lambda q: handle_none(q, replace_with=1))
    sales_data = etl.addfield(sales_data, 'profit',
                              lambda rec: calculate_profit(rec))
    sales_data = etl.addfield(sales_data, 'gross_sales',
                              lambda rec: calculate_gross_sales(rec))

    # summarize data into charting-friendly data structures
    chart_count = etl.fold(sales_data, 'date', operator.add, 'quantity',
                           presorted=True)
    chart_count = etl.rename(chart_count, {'key': 'x', 'value': 'y'})
    chart_count, chart_count_missing_date = etl.biselect(
        chart_count, lambda rec: rec.x is not None)

    chart_gross = etl.fold(sales_data, 'date', operator.add, 'gross_sales',
                           presorted=True)
    chart_gross = etl.rename(chart_gross, {'key': 'x', 'value': 'y'})
    chart_gross, chart_gross_missing_date = etl.biselect(
        chart_gross, lambda rec: rec.x is not None)

    chart_profit = etl.fold(sales_data, 'date', operator.add, 'profit',
                            presorted=True)
    chart_profit = etl.rename(chart_profit, {'key': 'x', 'value': 'y'})
    chart_profit, chart_profit_missing_date = etl.biselect(
        chart_profit, lambda rec: rec.x is not None)

    # tabulate some figures
    gross_sales = 0
    profits = 0
    for sale in etl.dicts(sales_data):
        profits += calculate_profit(sale)
        gross_sales += calculate_gross_sales(sale)

    return {
        'gross_sales': gross_sales,
        'profits': profits,
        'chart_gross': list(etl.dicts(chart_gross)),
        'chart_gross_missing_date': list(etl.dicts(chart_gross_missing_date)),
        'chart_profit': list(etl.dicts(chart_profit)),
        'chart_profit_missing_date': list(etl.dicts(chart_profit_missing_date)),
        'chart_count': list(etl.dicts(chart_count)),
        'chart_count_missing_date': list(etl.dicts(chart_count_missing_date))
    }
actors = etl.pushheader(actors, ['id', 'first_name', 'last_name', 'gender'])

# actorfullname table
originalCursor.execute('SELECT * FROM actorfullname')
actorfullname = originalCursor.fetchall()
actorfullname = etl.pushheader(actorfullname, ['full_name', 'id'])

# roles table w/o role
originalCursor.execute('SELECT movie_id, actor_id FROM roles')
actorIdOnly = originalCursor.fetchall()
actorIdOnly = etl.pushheader(actorIdOnly, ['movie_id', 'actor_id'])

#### Denormalizing Original Tables ####

# Denormalize movies_directors into movies
moviesAndDirectors = etl.join(movies, movies_directors, key='movie_id')

# Denormalize roles into movies
moviesAndDirectorsAndRoles = etl.join(moviesAndDirectors, actorIdOnly, key='movie_id')

# Add fullname to actors
actors = etl.join(actors, actorfullname, key='id')

# Denormalize roles into actors
"""
actorsAndRoles = etl.join(
    actors, actorIdOnly, lkey='id', rkey='actor_id')
"""

# Add fullname to directors
directors = etl.join(directors, directorfullname, key='id')
def append_tailings_reports_to_code_required_reports(connection, commit=False):
    src_table = etl.fromdb(
        connection,
        'SELECT exp_doc.mine_guid, exp_doc.exp_document_guid, req_doc.req_document_name, exp_doc.due_date, exp_doc.exp_document_status_code, exp_doc.received_date, exp_doc.active_ind, exp_doc_x.mine_document_guid, exp_doc.create_user, exp_doc.create_timestamp, exp_doc.update_user, exp_doc.update_timestamp from mine_expected_document exp_doc \
        inner join mine_expected_document_xref exp_doc_x on exp_doc.exp_document_guid = exp_doc_x.exp_document_guid\
        inner join mds_required_document req_doc on req_doc.req_document_guid = exp_doc.req_document_guid'
    )

    req_document_crr_definition_map = [
        ['req_document_name', 'mine_report_definition_id'],
        ['Summary of TSF and Dam Safety Recommendations', 28],
        ['ITRB Activities Report', 27],
        ['Register of Tailings Storage Facilities and Dams', 47],
        ['Dam Safety Inspection (DSI) Report', 26],
        ['Dam Safety Review (DSR) Report', 31],
        ['“As-built” Reports', 32],
        ['Annual Reclamation', 25],
        ['MERP Record of Testing', 3],
        # ['Annual Manager\'s Report', __________________ ], no mapping or data, ignore.
        ['OMS Manual', 33],
        ['Annual reconciliation of water balance and water management plans', 44],
        ['TSF risk assessment', 46],
        ['Mine Emergency Preparedness and Response Plan (MERP)', 24],
        ['Performance of high risk dumps', 29]
    ]

    table1 = etl.join(src_table, req_document_crr_definition_map, 'req_document_name')
    mine_report = etl.cutout(table1, 'req_document_name')

    # to be inserted into db
    mine_report = etl.addfield(mine_report, 'submission_year', 2019)
    mine_report = etl.rename(mine_report, 'exp_document_status_code',
                             'mine_report_submission_status_code')
    mine_report = etl.addfield(mine_report, 'deleted_ind', lambda x: not x.active_ind)
    mine_report = etl.cutout(mine_report, 'active_ind')

    # to determine what FK's will be so we can insert into related tables
    max_report_id = etl.fromdb(
        connection,
        'select last_value from public.mine_report_mine_report_id_seq')[1][0]
    max_report_submission_id = etl.fromdb(
        connection,
        'select last_value from public.mine_report_submission_mine_report_submission_id_seq')[1][0]

    # if a sequence hasn't been used yet, fix the off-by-one
    if max_report_id == 1:
        max_report_id = 0
    if max_report_submission_id == 1:
        max_report_submission_id = 0

    # get one-to-many
    mine_report, mine_report_submission_documents = etl.unjoin(
        mine_report, 'mine_document_guid', key='exp_document_guid')

    # add PK's for mappings
    mine_report_with_ids = etl.addrownumbers(mine_report,
                                             start=max_report_id + 1,
                                             step=1,
                                             field='mine_report_id')
    mine_report_with_ids = etl.addrownumbers(mine_report_with_ids,
                                             start=max_report_submission_id + 1,
                                             step=1,
                                             field='mine_report_submission_id')
    print(f'max_report_id= {max_report_id}, max_report_submission_id={max_report_submission_id}')

    # copy out fields for submission tables
    mine_report_submissions = etl.cut(mine_report_with_ids, [
        'mine_report_id', 'exp_document_guid', 'mine_report_submission_status_code',
        'create_user', 'create_timestamp', 'update_user', 'update_timestamp'
    ])
    mine_report_submissions = etl.addfield(mine_report_submissions,
                                           'submission_date',
                                           lambda x: x.create_timestamp)

    # remove fields not in mine_report
    mine_report = etl.cutout(mine_report, 'mine_report_submission_status_code')

    # replace exp_document_guid FK with mine_report_submission FK
    submission_id_lookup = etl.cut(mine_report_with_ids,
                                   ['mine_report_submission_id', 'exp_document_guid'])
    mine_report_submission_documents = etl.join(submission_id_lookup,
                                                mine_report_submission_documents,
                                                key='exp_document_guid')
    mine_report_submission_documents = etl.cutout(mine_report_submission_documents,
                                                  'exp_document_guid')

    # remove original PK
    mine_report = etl.cutout(mine_report, 'exp_document_guid')
    mine_report_submissions = etl.cutout(mine_report_submissions, 'exp_document_guid')

    print(etl.valuecounter(etl.distinct(table1, key='exp_document_guid'), 'req_document_name'))
    print(etl.valuecounter(mine_report, 'mine_report_definition_id'))
    print(table1)
    print(mine_report)
    print(mine_report_submissions)
    print(mine_report_submission_documents)

    etl.appenddb(mine_report, connection, 'mine_report', commit=False)
    print('INSERT mine_report staged')
    etl.appenddb(mine_report_submissions, connection, 'mine_report_submission', commit=False)
    print('INSERT mine_report_submission staged')
    etl.appenddb(mine_report_submission_documents, connection, 'mine_report_document_xref', commit=False)
    print('INSERT mine_report_document_xref staged')

    if commit:
        connection.commit()
        print('DATA CREATION COMPLETE')
    else:
        connection.rollback()
        print('NO DATA CREATED: add --commit=true to insert report rows')
table2 = (etl.fromcsv('current_covid.csv')
          .convert('median_age', float)
          .convert('aged_65_older', float)
          .convert('aged_70_older', float))

# Same columns as the table above; the list is declared with a header row
table2_header = [['iso_code', 'median_age', 'aged_65_older', 'aged_70_older']]

table2_data = etl.cut(table2, 'iso_code', 'date', 'median_age',
                      'aged_65_older', 'aged_70_older')
table2_dated = etl.select(table2_data, 'date', lambda v: v == '2020-04-30')
table2_sort = etl.sort(table2_dated, key='iso_code')

count = 0
for j in etl.values(table2_sort, 'iso_code', 'median_age', 'aged_65_older',
                    'aged_70_older'):
    if count == 15:
        break
    table2_header.append(j)
    count = count + 1

table_new = etl.head(table2_header, 15)

# Add the three extra columns as required; join combines the tables on iso_code
final_table = etl.join(table_old, table_new, key='iso_code')
print(final_table)

# Open the CSV file in write mode and export the data
with open('covid.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerows(final_table)
def transfer_data(from_db_conn, to_db_conn):
    '''
    Transfer data between the two connected databases.

    Limitations:
    1. poc.address_id is currently marked as -1 since it was not provided in test data and is a FK non-null constraint
    2. institution2poc table is not available in old schema
    3. role table is already populated in bill.sql file so that table is skipped by this script
    4. poc_poc_id is currently set to be poc_id since no relevant information is available about the column
    5. project2moc_project.role_id column is not available in old schema and is a not null field in new schema so we default it to 1 for now.
    6. project2moc_project.username is not available from old schema so currently set to empty
    7. raw_item_ts.item_id has duplicates when imported from item_ts. So we currently filter out and insert only uniques.

    :param from_db_conn: source database connection
    :param to_db_conn: destination database connection
    '''

    # Emptying out tables with possible foreign key constraint issues
    fk_dep_tables = [
        'poc2project', 'poc2moc_project', 'poc', 'raw_item_ts', 'item',
        'project', 'institution2moc_project'
    ]
    for table_name in fk_dep_tables:
        table = etl.fromdb(to_db_conn, "select * from {} where 1=0".format(table_name))
        etl.todb(table, to_db_conn, table_name)

    # Tables with no change in schema
    insert_as_tables = [
        'institution', 'address', 'item_type', 'item2item', 'catalog_item'
    ]
    for table_name in insert_as_tables:
        table = etl.fromdb(from_db_conn, "select * from {}".format(table_name))
        etl.todb(table, to_db_conn, table_name)

    # Inserting dummy address for constraint matching
    dummy_address = [{'address_id': -1}]
    dummy_address_table = etl.fromdicts(dummy_address)
    etl.appenddb(dummy_address_table, to_db_conn, 'address')

    poc = etl.fromdb(from_db_conn, 'select * from poc')
    poc_transformed = etl.cutout(poc, 'domain_id', 'user_uid')
    poc_dummy_address = etl.replace(poc_transformed, 'address_id', None, -1)
    etl.todb(poc_dummy_address, to_db_conn, 'poc')

    project_names_table = etl.fromdb(from_db_conn,
                                     "select distinct project_name from project")
    moc_project_transformed = etl.addrownumbers(project_names_table)
    moc_project_transformed = etl.rename(moc_project_transformed,
                                         {'row': 'moc_project_id'})
    etl.todb(moc_project_transformed, to_db_conn, 'moc_project')

    domain = etl.fromdb(from_db_conn, "select * from domain")
    domain_table_transformed = etl.cutout(domain, 'domain_uid')
    domain_table_transformed = etl.rename(domain_table_transformed, {
        'domain_id': 'service_id',
        'domain_name': 'service_name'
    })
    etl.todb(domain_table_transformed, to_db_conn, 'service')

    project = etl.fromdb(from_db_conn, "select * from project")
    moc_project = etl.fromdb(to_db_conn, "select * from moc_project")
    project_moc_project_joined = etl.join(project, moc_project, key='project_name')
    project_table_transformed = etl.cutout(project_moc_project_joined, 'project_name')
    project_table_transformed = etl.rename(project_table_transformed, {
        'domain_id': 'service_id',
        'project_uid': 'project_uuid'
    })
    etl.todb(project_table_transformed, to_db_conn, 'project')

    institution2project = etl.fromdb(from_db_conn, "Select * from institution2project")
    project = etl.fromdb(to_db_conn, "select project_id, moc_project_id from project")
    inst2project_project_joined = etl.join(institution2project, project, key='project_id')
    inst2moc_project = etl.cutout(inst2project_project_joined, 'domain_id')
    etl.todb(inst2moc_project, to_db_conn, 'institution2moc_project')

    project2poc = etl.fromdb(from_db_conn, "select * from project2poc")
    project2poc_project_joined = etl.join(project2poc, project, key='project_id')
    poc2moc_project = etl.cutout(project2poc_project_joined, 'project_id', 'domain_id')
    poc2moc_project = etl.addfield(poc2moc_project, 'role_id', 1)
    poc2moc_project = etl.addfield(poc2moc_project, 'poc_poc_id',
                                   lambda rec: rec['poc_id'])
    etl.todb(poc2moc_project, to_db_conn, 'poc2moc_project')

    poc2project = etl.cutout(project2poc, 'domain_id')
    poc2project = etl.addfield(poc2project, 'role_id', 1)
    poc2project = etl.addfield(poc2project, 'username', '')
    etl.todb(poc2project, to_db_conn, 'poc2project')

    item = etl.fromdb(from_db_conn, "select * from item")
    item_transformed = etl.cutout(item, 'domain_id')
    etl.todb(item_transformed, to_db_conn, 'item')

    raw_item_ts_unique = etl.fromdb(
        from_db_conn,
        "WITH summary AS ( SELECT its.item_id, its.start_ts, its.end_ts, its.state, its.catalog_item_id, ROW_NUMBER() OVER(PARTITION BY its.item_id) AS rk FROM ITEM_TS its) SELECT s.* FROM summary s WHERE s.rk = 1"
    )
    raw_item_ts_unique = etl.cutout(raw_item_ts_unique, 'rk')
    etl.todb(raw_item_ts_unique, to_db_conn, 'raw_item_ts')
d_date = etl.addcolumn(d_date, 'date_id', generated)

# country
d_country = etl.distinct(etl.cut(m_table, 'country'))
rows = etl.nrows(d_country)
generated = []
for i in range(rows):
    uuid = out_cursor.execute('SELECT UUID();')
    uuid = out_cursor.fetchone()[0]
    generated.append(uuid)
d_country = etl.addcolumn(d_country, 'country_id', generated)

# movie (fact table)
f_movie = etl.cut(m_table, 'imdb_title_id', 'year', 'date_published', 'genre', 'country')

# foreign key for imdb names id (movie personnel)
f_movie = etl.join(f_movie, tp_table, key='imdb_title_id')
# keep only the necessary columns
f_movie = etl.cut(f_movie, 'imdb_title_id', 'imdb_name_id', 'year',
                  'date_published', 'genre', 'country')

# foreign key for genre id (genre)
f_movie = etl.join(f_movie, d_genre, key='genre')
# keep only the necessary columns
f_movie = etl.cut(f_movie, 'imdb_title_id', 'imdb_name_id', 'year',
                  'date_published', 'genre_id', 'country')

# foreign key for date id (date)
f_movie = etl.join(f_movie, d_date, key=['year', 'date_published'])
# keep only the necessary columns
f_movie = etl.cut(f_movie, 'imdb_title_id', 'imdb_name_id', 'date_id',
                  'genre_id', 'country')

# foreign key for country id (country)
f_movie = etl.join(f_movie, d_country, key='country')
# keep only the necessary columns
f_movie = etl.cut(f_movie, 'imdb_title_id', 'imdb_name_id', 'date_id',
                  'genre_id', 'country_id')

# get the four ratings
def retrieve_rna_data(self):
    """
    Parse 'rna_tissue' csv file, RNA levels in 56 cell lines and 37 tissues
    based on RNA-seq from HPA.

    :return: dict
    """
    self.logger.info('get rna tissue rows into dicts')
    self.logger.debug('melting rna level table into geneid tissue level')

    t_level = (petl.fromcsv(URLZSource(self.rna_level_url), delimiter='\t')
               .melt(key='ID', variablefield='tissue', valuefield='rna_level')
               .rename({'ID': 'gene'})
               .addfield('tissue_label',
                         lambda rec: name_from_tissue(rec['tissue'].strip(), self.t2m))
               .addfield('tissue_code',
                         lambda rec: code_from_tissue(rec['tissue_label'], self.t2m))
               .addfield('anatomical_systems',
                         lambda rec: asys_from_tissue(rec['tissue_label'], self.t2m))
               .addfield('organs',
                         lambda rec: organs_from_tissue(rec['tissue_label'], self.t2m))
               .cutout('tissue'))

    t_value = (petl.fromcsv(URLZSource(self.rna_value_url), delimiter='\t')
               .melt(key='ID', variablefield='tissue', valuefield='rna_value')
               .rename({'ID': 'gene'})
               .addfield('tissue_label',
                         lambda rec: name_from_tissue(rec['tissue'].strip(), self.t2m))
               .addfield('tissue_code',
                         lambda rec: code_from_tissue(rec['tissue_label'], self.t2m))
               .addfield('rna_unit', 'TPM')
               .cutout('tissue'))

    t_zscore = (petl.fromcsv(URLZSource(self.rna_zscore_url), delimiter='\t')
                .melt(key='ID', variablefield='tissue', valuefield='zscore_level')
                .rename({'ID': 'gene'})
                .addfield('tissue_label',
                          lambda rec: name_from_tissue(rec['tissue'].strip(), self.t2m))
                .addfield('tissue_code',
                          lambda rec: code_from_tissue(rec['tissue_label'], self.t2m))
                .cutout('tissue'))

    t_vl = petl.join(t_level, t_value,
                     key=('gene', 'tissue_code', 'tissue_label'),
                     presorted=True)

    t_join = (petl.join(t_vl, t_zscore,
                        key=('gene', 'tissue_code', 'tissue_label'),
                        presorted=True)
              .aggregate('gene',
                         aggregation={
                             'data': (('tissue_code', 'tissue_label', 'rna_level',
                                       'rna_value', 'rna_unit', 'anatomical_systems',
                                       'organs', 'zscore_level'), list)
                         },
                         presorted=True))

    return t_join
import petl as etl
import csv

table1 = etl.fromcsv('covid.csv')

# importing data from xml file and creating table
table2 = etl.fromxml('Country_location.xml', './/tr', ('th', 'td'))
# print(table2)

# removing column country from table
table3 = etl.cutout(table2, 'country')

# merging the covid table with xml data
table4 = etl.join(table1, table3, key='location')
print(table4)

# writing result to csv file
with open('covid_countries.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerows(table4)
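# As an aside, petl can also write the joined table straight to disk, so the
# csv module isn't strictly needed for the last step; a minimal alternative:
etl.tocsv(table4, 'covid_countries.csv')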
stores = etl.fromcsv('stores.csv')

# Open XML document
locations = etl.fromxml('locations.xml', 'store',
                        {'Name': 'Name', 'Lat': 'Lat', 'Lon': 'Lon'})
print(locations)

# Set output
output_table = [["ID", "Name", "Suburb", "State", "Postcode"]]
store_id = 1

# Read through stores.csv to generate output_table
store = etl.cut(stores, 'Name', 'Suburb', 'State', 'Postcode').distinct()
print(store)
for s in etl.values(store, 'Name', 'Suburb', 'State', 'Postcode'):
    # unpack the row tuple so each value lands in its own column
    output_table.append([store_id] + list(s))
    store_id += 1
print(output_table)

# Merge and join XML and CSV together
merge_output = etl.join(stores, locations, key="Name")
print(merge_output)
store_table = etl.cut(merge_output, 'ID', 'Name', 'Suburb', 'State',
                      'Postcode', 'Lat', 'Lon')
print(etl.head(store_table, 5))

# Export to CSV file
etl.tocsv(merge_output, 'store_locations.csv')
# Load a full year (2018) with the simplest datetime breakdown:
# year, month, day, hour, minute, second.
# For the full loading process, see the reference in references.txt.
# This should be a procedure with all the validation logic in place,
# so it can create the next X months when it is called.

# Facts
# This facts table will be the staging table with all the info needed to
# quickly update with the dimension keys and load into the facts table.
# The facts table will have columns matching each column on the dim Time
# table, to make it easier to get the reference key.

events_uid = etl.cutout(events, 'tracking_id', 'utm_medium', 'utm_campaign')
events_tui = etl.cutout(events, 'user_id')

stage_uid = etl.join(users, events_uid, key='user_id')
stage_tui = etl.join(users, events_tui, key='tracking_id')
stage_utm = etl.cut(stage_tui, 'user_id', 'utm_medium', 'utm_campaign')
stage_uid_utm = etl.join(stage_uid, stage_utm, key='user_id')
stage_m_s = etl.mergesort(stage_uid_utm, stage_tui, key=['created_at', 'email'])

# Mapping definitions
mappings = OrderedDict()
mappings['tid'] = 'tracking_id'
mappings['uid'] = 'user_id'
mappings['utm_medium'] = 'utm_medium'
mappings['utm_campaign'] = 'utm_campaign', {'audio': 'none', 'social': 'none'}
mappings['utm_campaign_type'] = 'utm_campaign'
mappings['email'] = 'email'
def joinTables(tableOne, tableTwo, key):
    return etl.join(tableOne, tableTwo, key=key)
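# A quick usage sketch for this wrapper with two made-up in-memory tables;
# the data below is illustrative only.
import petl as etl

left = [['id', 'colour'], [1, 'blue'], [2, 'red']]
right = [['id', 'shape'], [1, 'circle'], [2, 'square']]

# Inner join on the shared 'id' field.
print(etl.look(joinTables(left, right, 'id')))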
def extract_backend(offline=OFFLINE):
    # Done in 4 steps: (1) grab the driver table from the CloudSQL,
    # (2) use the user uuids to query for users one by one through
    # the API, (3) get the fleet table from CloudSQL and (4) join
    # everything together.

    def extract_drivers():
        query = SQLReader('sql.drivers_from_cloudsql')
        drivers_df = sql.execute(query.statements[0])
        drivers_tb = fromdataframe(drivers_df)
        mappings = {
            'driver_uuid': lambda rec: str(UUID(bytes=rec['uuid'], version=4)),
            'fleet_uuid': lambda rec: str(UUID(bytes=rec['fleet_uuid'], version=4)),
            'user_uuid': lambda rec: str(UUID(bytes=rec['user_ds_uuid'], version=4)),
            'fullname': lambda rec: rec['last_name'].strip() + ', ' + rec['first_name'].strip(),
        }
        drivers_tb = drivers_tb.fieldmap(mappings)
        drivers_tb = drivers_tb.suffixheader('_in_backend')
        return drivers_tb

    def extract_users():
        users_records = [api.get_record('users', driver.user_uuid_in_backend)
                         for driver in drivers.namedtuples()]
        users_df = DataFrame().from_records(users_records)
        users_tb = fromdataframe(users_df)
        mappings = {
            'driver_uuid': 'driver',
            'user_uuid': 'uuid',
            'backend_username': '******'
        }
        users_tb = users_tb.fieldmap(mappings)
        users_tb = users_tb.suffixheader('_in_backend')
        return users_tb

    def extract_fleets_from_dwh():
        query = SQLReader('sql.fleets_from_tableau')
        fleets_df = dwh.execute(query.statements[0])
        fleets_tb = fromdataframe(fleets_df)
        mappings = {
            'fleet_uuid': 'uuid',
            'fleetname': lambda rec: rec['backend_name'].replace('_', ' '),
            'country_code': 'country_code',
        }
        fleets_tb = fleets_tb.cutout('country_code')
        fleets_tb = fleets_tb.fieldmap(mappings)
        fleets_tb = fleets_tb.suffixheader('_in_backend')
        return fleets_tb

    if not offline:
        sql = CloudSQLConnector()
        api = ValkfleetConnector()
        dwh = WarehouseConnector()

        drivers = extract_drivers()
        fleets = extract_fleets_from_dwh()
        users = extract_users()

        drivers.topickle(DRIVERS_IN_BACKEND_FILEPATH)
        fleets.topickle(FLEETS_IN_BACKEND_FILEPATH)
        users.topickle(USERS_IN_BACKEND_FILEPATH)
    else:
        drivers = frompickle(DRIVERS_IN_BACKEND_FILEPATH)
        fleets = frompickle(FLEETS_IN_BACKEND_FILEPATH)
        users = frompickle(USERS_IN_BACKEND_FILEPATH)

    write_to_log(drivers, 'drivers', 'backend')
    write_to_log(fleets, 'fleets', 'backend')
    write_to_log(users, 'users', 'backend')

    drivers_without_fleet = antijoin(drivers, fleets, key='fleet_uuid_in_backend')
    drivers_without_user = antijoin(drivers, users, key='user_uuid_in_backend')
    write_to_log(drivers_without_fleet, 'drivers without fleet', 'backend')
    write_to_log(drivers_without_user, 'drivers without user', 'backend')

    drivers_n_fleets = join(drivers, fleets,
                            key='fleet_uuid_in_backend').cutout('fleet_uuid_in_backend')
    backend_drivers = join(drivers_n_fleets, users, key='user_uuid_in_backend')
    backend_drivers = backend_drivers.addfield('backend_username',
                                               lambda rec: rec['backend_username_in_backend'])
    backend_drivers = backend_drivers.cutout('driver_uuid_in_backend')
    backend_drivers = standardize_missing_values(backend_drivers)

    write_to_log(backend_drivers, 'drivers', 'backend')
    return backend_drivers
          [1, 'circle'],
          [1, 'square'],
          [2, 'ellipse']]
table8 = [['id', 'time', 'height'],
          [1, 1, 12.3],
          [1, 2, 34.5],
          [2, 1, 56.7]]
table9 = [['id', 'time', 'weight'],
          [1, 2, 4.5],
          [2, 1, 6.7],
          [2, 2, 8.9]]

from petl import join, look

look(table1)
look(table2)
table3 = join(table1, table2, key='id')
look(table3)

# if no key is given, a natural join is tried
table4 = join(table1, table2)
look(table4)

# note behaviour if the key is not unique in either or both tables
look(table5)
look(table6)
table7 = join(table5, table6, key='id')
look(table7)

# compound keys are supported
look(table8)
look(table9)
table10 = join(table8, table9, key=['id', 'time'])
look(table10)
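# The fragment above references table1, table2, table5, and table6, whose
# definitions were cut off. Minimal definitions in the spirit of petl's own
# join() documentation (the exact values are an assumption) would let the
# calls above run:
table1 = [['id', 'colour'], [1, 'blue'], [2, 'red'], [3, 'purple']]
table2 = [['id', 'shape'], [1, 'circle'], [3, 'square'], [4, 'ellipse']]
# non-unique keys, used to show join behaviour with duplicates
table5 = [['id', 'colour'], [1, 'blue'], [1, 'red'], [2, 'purple']]
table6 = [['id', 'shape'], [1, 'circle'], [1, 'square'], [2, 'ellipse']]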
print("Reading parcels...") dor_parcel_read_stmt = ''' select parcel_id, street_address, address_low, address_low_suffix, address_low_frac, address_high, street_predir, street_name, street_suffix, street_postdir, street_full from {dor_parcel_table} '''.format(dor_parcel_table='dor_parcel') engine_dor_parcel_rows = etl.fromdb(pg_db, dor_parcel_read_stmt) if DEV: print(etl.look(engine_dor_parcel_rows)) # Get duplicate parcel_ids: non_unique_parcel_id_rows = engine_dor_parcel_rows.duplicates(key='parcel_id') unique_parcel_id_rows = etl.complement(engine_dor_parcel_rows, non_unique_parcel_id_rows) # Get address comps for condos by joining to dor_parcel with unique parcel_id on parcel_id: print("Relating condos to parcels...") joined = etl.join(source_dor_condo_rows, unique_parcel_id_rows, key='parcel_id') \ .convert('street_address', lambda a, row: row.street_address + ' # ' + row.unit_num, pass_row=True) print("joined rowcount: ", etl.nrows(joined)) if DEV: print(etl.look(joined)) # Calculate errors print("Calculating errors...") unjoined = etl.antijoin(source_dor_condo_rows, joined, key='source_object_id') print("unjoined rowcount: ", etl.nrows(unjoined)) dor_condos_unjoined_unmatched = etl.antijoin(unjoined, non_unique_parcel_id_rows, key='parcel_id').addfield('reason', 'non-active/remainder mapreg') print("non-active/remainder mapreg error rowcount: ", etl.nrows(dor_condos_unjoined_unmatched)) if DEV: print(etl.look(dor_condos_unjoined_unmatched)) dor_condos_unjoined_duplicates = etl.antijoin(unjoined, dor_condos_unjoined_unmatched, key='source_object_id').addfield('reason', 'non-unique active/remainder mapreg') print("non-unique active/remainder mapreg error rowcount: ", etl.nrows(dor_condos_unjoined_duplicates)) if DEV:
international_code = "(+61)"

with open(IN_FILE, 'r') as infile, open(OUT_FILE, "w") as outfile:
    csv_reader = csv.reader(infile)
    writer = csv.writer(outfile)
    headers = next(csv_reader, None)  # skipping header row
    writer.writerow(headers)
    for row in csv_reader:
        number_column = row[5]
        state_column = row[3]
        clean_num = re.sub(r"\D", "", row[5])[-8:]
        formatted_num = (international_code + " " +
                         regional_code[state_column] + " " + clean_num)
        row[5] = formatted_num
        writer.writerow(row)

services = petl.fromcsv(SERVICES_FILE)

offices = petl.fromcsv(OUT_FILE)
offices = offices.rename({"Contact Name": "Office", "Phone Number": "Phone"})
offices = petl.cutout(offices, "State", "Postcode")

locations = petl.fromcsv(LOC_FILE)
locations = locations.rename({"officeID": "OfficeID"})

office_service = petl.join(services, offices, key='OfficeID')
office_service_locations = petl.join(office_service, locations, key='OfficeID')
office_service_locations = petl.convert(office_service_locations, 'OfficeServiceID', int)
office_service_locations = petl.sort(office_service_locations, 'OfficeServiceID')
petl.tocsv(office_service_locations, 'office_service_locations.csv')