def _petl_transform(self, record_set):
    if "transform" in self.task:
        transform = self.task["transform"]
        if "convert" in transform:
            conversions = {}
            for field, func in transform["convert"]:
                conversions[field] = func
            record_set = etl.convert(record_set, conversions)
        if "filter" in transform:
            record_set = etl.select(record_set, transform["filter"])
        if "remove" in transform:
            cuts = []
            for field in transform["remove"]:
                cuts.append(field)
            record_set = etl.cutout(record_set, cuts)
        if "rename" in transform:
            names = {}
            for old, new_one in transform["rename"]:
                names[old] = new_one
            record_set = etl.rename(record_set, names)
    return record_set
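# Illustrative usage of _petl_transform above. The exact schema of
# self.task["transform"] is not shown in this snippet, so the keys and shapes
# below are assumptions: "convert" and "rename" as (field, value) pairs,
# "filter" as a petl select expression, "remove" as a list of field names.
import petl as etl

task = {
    "transform": {
        "convert": [("age", int)],
        "filter": "{age} >= 18",
        "remove": ["comment"],
        "rename": [("name", "full_name")],
    }
}

table = [("name", "age", "comment"),
         ("Ada", "36", "x"),
         ("Bob", "17", "y")]

# Apply the same steps the method performs, in the same order.
t = etl.convert(table, dict(task["transform"]["convert"]))
t = etl.select(t, task["transform"]["filter"])
t = etl.cutout(t, *task["transform"]["remove"])
t = etl.rename(t, dict(task["transform"]["rename"]))
print(etl.look(t))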
def synctable(self, sourceDb, targetDb, sourceTable, targetTable):
    sourceCursor = sourceDb.cursor()
    targetCursor = targetDb.cursor()
    affected_total = 0
    init_rowCount = targetTable.rowCount if targetTable.rowCount < sourceTable.rowCount else sourceTable.rowCount
    pbar = tqdm(total=sourceTable.rowCount, unit='records')
    pbar.update(init_rowCount)
    while sourceTable.lastUpdatetime > targetTable.lastUpdatetime:
        affected_rows = 0
        batchSize = 100000
        sql = "SELECT * FROM (SELECT * FROM {schema}.{tablename} WHERE {timestamp}>=to_timestamp('{last_updatetime}','yyyy-mm-dd hh24:mi:ss.ff6') ORDER BY {timestamp}) WHERE ROWNUM<={batch_size}".format(
            timestamp=sourceTable.timestampField,
            schema=sourceTable.schema,
            tablename=sourceTable.tablename,
            last_updatetime=targetTable.lastUpdatetime,
            batch_size=batchSize)
        sourceRecord = etl.fromdb(lambda: CursorProxy(sourceDb.cursor()), sql)
        targetRecord = etl.fromdb(
            lambda: CursorProxy(targetDb.cursor()),
            "SELECT * FROM {schema}.{tablename} WHERE 1=0".format(
                schema=targetTable.schema, tablename=targetTable.tablename))
        sourceTable.columns = etl.header(sourceRecord)
        targetTable.columns = etl.header(targetRecord)
        # drop source columns that do not exist in the target table
        for column in list(set(sourceTable.columns) - set(targetTable.columns)):
            sourceRecord = etl.cutout(sourceRecord, column)
        max_updatetime = sourceRecord.cut(sourceTable.timestampField).skip(1).max()[0]
        sourceRecord = sourceRecord.sort(sourceTable.timestampField)
        etl.appenddb(sourceRecord,
                     CursorProxy(targetCursor),
                     targetTable.tablename,
                     schema=targetTable.schema,
                     commit=True)
        affected_rows += targetCursor.rowcount
        targetTable.lastUpdatetime = max_updatetime.strftime('%Y-%m-%d %H:%M:%S.%f')
        targetTable.rowCount += affected_rows
        pbar.update(affected_rows
                    if init_rowCount + affected_total + affected_rows < sourceTable.rowCount
                    else sourceTable.rowCount - init_rowCount - affected_total)
        affected_total += affected_rows
        pbar.set_description("%s |%d records updated." % (targetTable.tablename, affected_total))

    if targetTable.lastUpdatetime > sourceTable.lastUpdatetime:
        pbar.set_description("%s |timestamp >, skip." % (targetTable.tablename))
    elif targetTable.lastUpdatetime == sourceTable.lastUpdatetime and targetTable.rowCount == sourceTable.rowCount:
        pbar.set_description("%s |no data change." % (targetTable.tablename))
    elif targetTable.lastUpdatetime == sourceTable.lastUpdatetime and targetTable.rowCount > sourceTable.rowCount:
        pbar.set_description("%s |RowCount > but timestamp ==, skip." % (targetTable.tablename))
    elif targetTable.lastUpdatetime == sourceTable.lastUpdatetime and targetTable.rowCount < sourceTable.rowCount:
        pbar.set_description("%s |RowCount < but timestamp ==, skip." % (targetTable.tablename))
    pbar.close()
def drained_entries(ctx: typer.Context, issues, entries, project):
    config = ctx.meta['config']
    empty_entries, unset_entries = petl.biselect(
        entries, lambda row: row['issue_id'] is None)
    drain_issues = list(
        petl.dicts(
            transform.select_drain_issues(
                issues,
                assignee_id=ctx.meta['rdm_user']['id'],
                drain_cf_id=get_proj_attr(config, project, 'rdm_drain_cf_id'))))
    if not len(drain_issues):
        log.error('No drain issues found')
        return petl.head(unset_entries, 0), entries
    if len(drain_issues) > 1:
        log.warning(
            f'Found {len(drain_issues)} drain issues. Will use only first one')
    drain_issue = drain_issues[0]
    drained = petl.addfield(petl.cutout(empty_entries, 'issue_id'),
                            'issue_id', drain_issue['id'])
    return drained, unset_entries
def lookup_and_transform(ts_kv_table):
    """The table has the following structure:

    +---------------------------------+---------------+---------------+-------+
    | entity_id                       | key           | ts            | value |
    +=================================+===============+===============+=======+
    | 1ea47494dc14d40bd76a73c738b665f | Temperature   | 1583010011665 | -1.8  |
    +---------------------------------+---------------+---------------+-------+
    | 1ea47494dc14d40bd76a73c738b665f | WindDirection | 1583010000692 | 227   |
    +---------------------------------+---------------+---------------+-------+

    The output is a dictionary {device_id: table} of tables like this:

    +---------------+-------------+---------------+
    | ts            | Temperature | WindDirection |
    +---------------+-------------+---------------+
    | 1583010011665 | -1.8        | 230           |
    +---------------+-------------+---------------+
    | 1583010000692 | -2.5        | 227           |
    +---------------+-------------+---------------+
    """
    lkp = petl.lookup(ts_kv_table, 'entity_id', value=('key', 'ts', 'value'))
    for entity_id in lkp:
        tbl = [('key', 'ts', 'value')] + lkp[entity_id]
        tbl = petl.recast(tbl, variablefield='key', valuefield='value')
        cut_keys = KEYS_TO_REMOVE & set(petl.fieldnames(tbl))
        tbl = petl.cutout(tbl, *cut_keys)
        tbl = petl.transform.headers.sortheader(tbl)
        tbl = petl.transform.basics.movefield(tbl, 'ts', 0)
        lkp[entity_id] = petl.sort(tbl, 'ts')
    return lkp
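# Minimal sketch of the recast step used in lookup_and_transform: long rows of
# (key, ts, value) are pivoted into one column per key, indexed by 'ts'.
# Only petl itself is assumed here; the data is made up for illustration.
import petl

long_tbl = [('key', 'ts', 'value'),
            ('Temperature', 1583010011665, -1.8),
            ('WindDirection', 1583010000692, 227)]

wide_tbl = petl.recast(long_tbl, variablefield='key', valuefield='value')
print(petl.look(wide_tbl))  # one row per distinct 'ts', one column per key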
def transform_fields_254(tbl, ts_kv_dict):
    """The input is a dump of ts_kv table for TB version 2.5.4:

    +---------------------------------+-----+---------------+--------+-------+--------+-------+
    | entity_id                       | key | ts            | bool_v | str_v | long_v | dbl_v |
    +=================================+=====+===============+========+=======+========+=======+
    | 1ea47494dc14d40bd76a73c738b665f | 25  | 1583010011665 |        |       |        | -1.8  |
    +---------------------------------+-----+---------------+--------+-------+--------+-------+
    | 1ea47494dc14d40bd76a73c738b665f | 36  | 1583010000692 |        |       | 227    |       |
    +---------------------------------+-----+---------------+--------+-------+--------+-------+

    The output:

    +---------------------------------+---------------+---------------+-------+
    | entity_id                       | key           | ts            | value |
    +=================================+===============+===============+=======+
    | 1ea47494dc14d40bd76a73c738b665f | Temperature   | 1583010011665 | -1.8  |
    +---------------------------------+---------------+---------------+-------+
    | 1ea47494dc14d40bd76a73c738b665f | WindDirection | 1583010000692 | 227   |
    +---------------------------------+---------------+---------------+-------+

    ts_kv_dict is a dict like {25: 'Temperature', 36: 'WindDirection'}
    """
    ts_kv_table = petl.transform.conversions.convert(
        tbl, {'ts': int, 'key': lambda k: ts_kv_dict[k]})
    ts_kv_table = petl.addfield(ts_kv_table, 'value', lambda row: get_value(row))
    ts_kv_table = petl.cutout(ts_kv_table, 'bool_v', 'str_v', 'long_v', 'dbl_v')
    return ts_kv_table
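# get_value() is referenced above but not shown in these snippets. A plausible
# sketch, assuming it simply returns whichever typed ts_kv column is populated:
def get_value(row):
    for field in ('bool_v', 'str_v', 'long_v', 'dbl_v'):
        value = row[field]
        if value is not None and value != '':
            return value
    return None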
def get_counter_table(fields, csv_name):
    if not fields:
        return 'No data to count'
    csv_file = f'{CSV_DIR}/{csv_name}'
    csv_data = petl.fromcsv(csv_file)
    cut_csv_data = petl.cutout(petl.valuecounts(csv_data, *fields), 'frequency')
    html_data = get_html_data(cut_csv_data)
    return html_data
def dataPreProcessing(fileName):
    inputData = fromcsv(fileName)
    table1 = cutout(inputData, 'member_id', 'grade', 'sub_grade', 'emp_title',
                    'url', 'desc', 'title', 'accept_d', 'exp_d', 'list_d',
                    'issue_d', 'purpose', 'addr_city', 'addr_state',
                    'earliest_cr_line', 'last_pymnt_d', 'next_pymnt_d',
                    'last_credit_pull_d')
    table2 = select(
        table1,
        lambda i: i['term'] == ' 36 months' and i['loan_status'] != "")
    labelMapping = OrderedDict()
    labelMapping['loan_status'] = 'loan_status'
    labelMapping['id'] = 'id'
    table6 = fieldmap(table2, labelMapping)
    table8 = sort(table6, 'id')
    table10 = cutout(table8, 'id')
    mappings = OrderedDict()
    mappings['id'] = 'id'
    mappings['home_ownership'] = 'ownership', {
        'MORTGAGE': '-1',
        'RENT': '0',
        'OWN': '1'
    }
    mappings['emp_length'] = 'empLength', {'n/a': 0}
    mappings['is_inc_v'] = 'verificationStatus', {
        'Source Verified': 1,
        'Verified': 0,
        'Not Verified': -1
    }
    mappings['pymnt_plan'] = 'paymentPlan', {'n': 0, 'y': 1}
    mappings['initial_list_status'] = 'listStatus', {'f': 0, 'w': 1}
    table3 = fieldmap(table2, mappings)
    table4 = cutout(table2, 'home_ownership', 'is_inc_v', 'pymnt_plan',
                    'initial_list_status', 'term', 'loan_status')
    table5 = merge(table3, table4, key='id')
    table7 = sort(table5, 'id')
    table9 = cutout(table7, 'id')
    featureFileCsv = tocsv(table9, 'featureFileCsv.csv')
    labelsFileCsv = tocsv(table10, 'labelsFileCsv.csv')
    return featureFileCsv, labelsFileCsv
def createFacts(events, users):
    try:
        events_uid = etl.cutout(events, 'tracking_id', 'utm_medium', 'utm_campaign')
        events_tui = etl.cutout(events, 'user_id')

        stage_uid = etl.join(users, events_uid, key='user_id')
        stage_tui = etl.join(users, events_tui, key='tracking_id')
        stage_utm = etl.cut(stage_tui, 'user_id', 'utm_medium', 'utm_campaign')
        stage_uid_utm = etl.join(stage_uid, stage_utm, key='user_id')
        stage_m_s = etl.mergesort(stage_uid_utm, stage_tui, key=['created_at', 'email'])

        mappings = OrderedDict()
        mappings['tid'] = 'tracking_id'
        mappings['uid'] = 'user_id'
        mappings['utm_medium'] = 'utm_medium'
        mappings['utm_campaign'] = 'utm_campaign', {'audio': 'none', 'social': 'none'}
        mappings['utm_campaigntype'] = 'utm_campaign'
        mappings['email'] = 'email'
        mappings['subscription'] = 'type'
        mappings['sub_order'] = 'type', {'Signup Completed': '1', 'Trial Started': '2',
                                         'Subscription Started': '3', 'Subscription Ended': '4'}
        mappings['created_at'] = 'created_at'

        # Mapping
        stage_mapping = etl.fieldmap(stage_m_s, mappings)

        # Sort
        stage_mapping_ordered = etl.sort(stage_mapping, key=['created_at', 'email', 'sub_order'])

        # Datetime split
        t1 = etl.split(stage_mapping_ordered, 'created_at', 'T', ['date', 'time'], include_original=True)
        t2 = etl.split(t1, 'date', '-', ['year', 'month', 'day'])
        stage_ready = etl.split(t2, 'time', ':', ['hour', 'minute', 'second'])

        # Export as csv to load folder
        etl.tocsv(stage_ready, 'load/facts.csv')
    except Exception as e:
        print("Something went wrong. Error {0}".format(e))
def remove_column(self, *columns):
    """
    Remove a column from your table

    `Args:`
        \*columns: str
            Column names

    `Returns:`
        `Parsons Table` and also updates self
    """  # noqa: W605

    self.table = petl.cutout(self.table, *columns)

    return self
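# Example usage of remove_column, assuming the Parsons Table wrapper this
# method belongs to (constructing a Table from a list of dicts):
from parsons import Table

tbl = Table([{'first': 'Ada', 'last': 'Lovelace', 'tmp': 1}])
tbl.remove_column('tmp')   # updates self.table in place and returns self
print(tbl.columns)         # ['first', 'last']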
def load_devices(file):
    """
    File header:
    id;additional_info;customer_id;type;name;label;search_text;tenant_id

    Output: dictionary like {'id_1': {name: 'Device 1'}, ...}
    """
    tbl_devices = petl.io.csv.fromcsv(file, delimiter=';', encoding='utf-8')
    tbl_devices = petl.cutout(tbl_devices, 'customer_id', 'search_text', 'tenant_id')
    # PETL docs:
    # https://petl.readthedocs.io/en/stable/util.html#petl.util.lookups.dictlookupone
    devices = petl.dictlookupone(tbl_devices, 'id')
    return devices
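# Small illustration of petl.dictlookupone as used in load_devices: the key
# column must be unique, and each value is the full row as a dict. Data below
# is made up for illustration.
import petl

tbl = [('id', 'name', 'label'),
       ('id_1', 'Device 1', 'dev-1'),
       ('id_2', 'Device 2', 'dev-2')]

devices = petl.dictlookupone(tbl, 'id')
print(devices['id_1']['name'])  # Device 1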
def group_entries_by_day(inp):
    hdr = petl.header(inp)
    agg = OrderedDict()
    for field in hdr:
        # using first found value
        agg[field] = field, next
    agg['dur'] = 'dur', lambda durs: sum(durs, timedelta())
    agg['start'] = 'start', min
    with_day = petl.addfield(inp, 'start_date',
                             lambda row: row.get('start').date())
    index_keys = ('start_date', 'description')
    result = petl.aggregate(with_day, index_keys, agg)
    return petl.cutout(result, 'start_date')
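# Sketch of how the aggregation spec above behaves: each output field maps to
# a (source field, reducer) pair. Plain integers stand in for the timedelta
# durations used in group_entries_by_day; the rows are made up.
import petl
from collections import OrderedDict

tbl = [('start_date', 'description', 'dur'),
       ('2024-01-01', 'standup', 15),
       ('2024-01-01', 'standup', 10),
       ('2024-01-02', 'review', 30)]

agg = OrderedDict()
agg['dur'] = 'dur', sum
print(petl.look(petl.aggregate(tbl, key=('start_date', 'description'), aggregation=agg)))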
def order_by_constraint(base_path, table, schema, self_dep_set):
    file_name = base_path + "/content/data/" + table + ".tsv"
    tempfile = NamedTemporaryFile(mode='w',
                                  dir=base_path + "/content/data/",
                                  delete=False)
    table = etl.fromcsv(file_name,
                        delimiter='\t',
                        skipinitialspace=True,
                        quoting=csv.QUOTE_NONE,
                        quotechar='',
                        escapechar='')
    key_dep_dict = {}
    # print(file_name)
    for constraint in self_dep_set:
        child_dep, parent_dep = constraint.split(':')
        data = etl.values(table, child_dep, parent_dep)
        for d in data:
            key_dep_set = {d[1]}
            key_dep_dict.update({d[0]: key_dep_set})
    key_dep_list = toposort_flatten(key_dep_dict)
    table = etl.addfield(table, 'pwb_index',
                         lambda rec: int(key_dep_list.index(rec[child_dep])))
    table = etl.sort(table, 'pwb_index')
    table = etl.cutout(table, 'pwb_index')
    writer = csv.writer(tempfile,
                        delimiter='\t',
                        quoting=csv.QUOTE_NONE,
                        quotechar='',
                        lineterminator='\n',
                        escapechar='')
    writer.writerows(table)
    shutil.move(tempfile.name, file_name)
def transform_fields_old(tbl):
    """The input is a dump of ts_kv table for TB version <= 2.5.4:

    +-------------+---------------------------------+---------------+---------------+--------+-------+--------+-------+
    | entity_type | entity_id                       | key           | ts            | bool_v | str_v | long_v | dbl_v |
    +=============+=================================+===============+===============+========+=======+========+=======+
    | DEVICE      | 1ea47494dc14d40bd76a73c738b665f | Temperature   | 1583010011665 |        |       |        | -1.8  |
    +-------------+---------------------------------+---------------+---------------+--------+-------+--------+-------+
    | DEVICE      | 1ea47494dc14d40bd76a73c738b665f | WindDirection | 1583010000692 |        |       | 227    |       |
    +-------------+---------------------------------+---------------+---------------+--------+-------+--------+-------+

    The output:

    +-------------+---------------------------------+---------------+---------------+-------+
    | entity_type | entity_id                       | key           | ts            | value |
    +=============+=================================+===============+===============+=======+
    | DEVICE      | 1ea47494dc14d40bd76a73c738b665f | Temperature   | 1583010011665 | -1.8  |
    +-------------+---------------------------------+---------------+---------------+-------+
    | DEVICE      | 1ea47494dc14d40bd76a73c738b665f | WindDirection | 1583010000692 | 227   |
    +-------------+---------------------------------+---------------+---------------+-------+
    """
    ts_kv_table = petl.transform.conversions.convert(tbl, 'ts', int)
    ts_kv_table = petl.addfield(ts_kv_table, 'value', lambda row: get_value(row))
    ts_kv_table = petl.cutout(ts_kv_table, 'bool_v', 'str_v', 'long_v', 'dbl_v')
    return ts_kv_table
file_name = 'datasets-%s.csv' % datetime.now().strftime('%Y%m%d%H%M%S')
directory = 'csv'
if not os.path.exists(directory):
    os.makedirs(directory)

# etl.tocsv(aggregated_summary, './%s/%s' % (directory, file_name))
# logging.info('This %s has been exported' % file_name)

rooms, participations = storing_data_preparation(aggregated_summary)

participations = etl.leftjoin(participations, external_ids,
                              lkey='participant_id', rkey='id', rprefix='r_')
participations = etl.cutout(participations, 'participant_id')
participations = etl.rename(participations, 'r_external_id', 'participant_id')

rooms = etl.leftjoin(rooms, external_ids, lkey='creator', rkey='id', rprefix='r_')
rooms = etl.cutout(rooms, 'creator')
rooms = etl.rename(rooms, 'r_external_id', 'creator')

logging.info('Storing data %s to database' % file_name)
loader.truncate_table(dBConnection.connect(**config))
loader.store_to_db(dBConnection.connect(**config), tablename='rooms',
print("Writing tables to Databridge...") # address standardization report: # etl.todb(processed_rows, get_cursor, address_standardization_report_table_name) # other tables go to ais_sources account: dsn = get_dsn('ais_sources') connection = cx_Oracle.Connection(dsn) # zip4: etl.fromcsv(temp_zip4_outfile_path).todb(get_cursor, zip4_write_table_name) # cityzip: etl.fromcsv(cityzip_outfile_path).todb(get_cursor, cityzip_write_table_name) # alias: etl.fromcsv(alias_outfile_path).todb(get_cursor, alias_write_table_name) # Write processed_rows to uspszip4.csv: print("Writing cleaned_usps output to {zip4_outfile_path}".format(zip4_outfile_path=zip4_outfile_path)) etl.cutout(processed_rows, 'base', 'pre', 'name', 'suffix', 'post', 'change_pre', 'change_name', 'change_suffix', 'change_post') \ .rename({'std_base': 'base', 'std_pre': 'pre', 'std_name': 'name', 'std_suffix': 'suffix', 'std_post': 'post'}) \ .cut('street_full', 'pre', 'name', 'suffix', 'post', 'low', 'high', 'oeb', 'unit', 'unitlow', 'unithigh', 'unitoeb', 'buildingorfirm', 'recordtype', 'zipcode', 'zip4') \ .convert('low', int) \ .select("{low} is not None") \ .sort(key=['name', 'pre', 'suffix', 'post', 'low', 'high', 'unit', 'unitlow', 'unithigh']) \ .tocsv(zip4_outfile_path, write_header=False) # Write processed_rows to s3: print("Writing {zip4_outfile_path} to s3".format(zip4_outfile_path=zip4_outfile_path)) # s3 = boto3.resource('s3', config=Config(proxies={'http': os.environ['HTTP_PROXY'], 'https': os.environ['HTTPS_PROXY']})) s3 = boto3.resource('s3') s3.meta.client.upload_file(zip4_outfile_path, s3_bucket, 'static files/' + zip4_outfile_path) # Clean up: os.remove(temp_zip4_outfile_path)
table = etl.convert(table, 'rep_tm', timeparser('%H:%M'))
table = etl.addfield(
    table, 'reported_timestamp', lambda x: datetime.combine(
        x['rep_dt'], (x['rep_tm'] or time(0, 0))) + timedelta(hours=8))
debug(table, ['rep_dt', 'rep_tm', 'reported_timestamp'])
table = clean_up(table, 'rep_dt')
table = clean_up(table, 'rep_tm')

####### Number of fatalities
print('JOINING number_of_fatalities from mms.mssoccd checkbox')
fatalities_table = etl.fromcsv('do_fatalities.csv', encoding='utf-8')
fatalities_table = etl.convert(fatalities_table, 'min_acc_no', str)
table = etl.leftjoin(table, fatalities_table, key='min_acc_no')
table = etl.addfield(table, 'number_of_fatalities',
                     lambda x: 1 if x['chk'] else 0)  # this is a boolean in src
table = etl.cutout(table, 'chk')
print('\t# OF INCIDENTS WITH FATALITIES ' +
      str(etl.valuecount(table, 'number_of_fatalities', 1)[0]))

####### Number of Injuries
print('JOINING number_of_injuries from mms.mssoccd occ_typ D02 textbox')
injuries_table = etl.fromcsv('do_injuries.csv', encoding='utf-8')
injuries_table = etl.cutout(injuries_table, 'occ_typ')
injuries_table = etl.convert(injuries_table, 'min_acc_no', str)
injuries_table = etl.convert(injuries_table, 'val',
                             lambda x: int(x.strip()) if x.strip() else 0)
table = etl.leftjoin(table, injuries_table, key='min_acc_no')
table = etl.addfield(table, 'number_of_injuries', lambda x: x['val'] or 0)
num_val_zero_or_none = etl.valuecount(table, 'val', 0)[0] + etl.valuecount(
    table, 'val', None)[0]
def concat_columns(table, new_column, column_left, column_right):
    new_table = etl.addfield(
        table, new_column,
        lambda rec: rec[column_left] + ', ' + rec[column_right], 1)
    return etl.cutout(new_table, column_left, column_right)
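# Example usage of concat_columns: 'location' is added at index 1 and the two
# source columns are dropped. Table contents here are made up for illustration.
import petl as etl

tbl = [('city', 'state', 'pop'),
       ('Austin', 'TX', 961855)]

print(etl.look(concat_columns(tbl, 'location', 'city', 'state')))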
# Load Master.csv from the Lahman database.
table = etl.fromcsv(sys.argv[1])

# Use US births only
table2 = etl.select(table, lambda rec: rec.birthCountry == 'USA')

# Only use these fields
table3 = etl.cut(table2, 'nameFirst', 'nameLast', 'debut', 'bbrefID', 'weight',
                 'height', 'finalGame', 'birthCity', 'birthState', 'birthYear')

# Remove null birth city and birth year
table4 = etl.select(table3, lambda rec: rec.birthCity != "" and rec.birthYear != "")

# Add Baseball Reference URL
table5 = etl.addfield(table4, 'baseball_ref_url', add_bbreflink)

# Remove unnecessary bbrefid
table6 = etl.cutout(table5, "bbrefID")

# Load city,state lat long table.
city = etl.fromcsv(sys.argv[2])

# Only use these fields
city2 = etl.cut(city, "city", "state", "lat", "long")

# Join tables by two keys
lat_table = etl.leftjoin(table6, city2, lkey=["birthCity", "birthState"],
                         rkey=["city", "state"])

# Output merged file to csv
lat_table.tocsv(sys.argv[3])
def append_tailings_reports_to_code_required_reports(connection, commit=False):
    src_table = etl.fromdb(
        connection,
        'SELECT exp_doc.mine_guid, exp_doc.exp_document_guid, req_doc.req_document_name, exp_doc.due_date, exp_doc.exp_document_status_code, exp_doc.received_date, exp_doc.active_ind, exp_doc_x.mine_document_guid, exp_doc.create_user, exp_doc.create_timestamp, exp_doc.update_user, exp_doc.update_timestamp from mine_expected_document exp_doc \
        inner join mine_expected_document_xref exp_doc_x on exp_doc.exp_document_guid = exp_doc_x.exp_document_guid \
        inner join mds_required_document req_doc on req_doc.req_document_guid = exp_doc.req_document_guid'
    )

    req_document_crr_defintion_map = [
        ['req_document_name', 'mine_report_definition_id'],
        ['Summary of TSF and Dam Safety Recommendations', 28],
        ['ITRB Activities Report', 27],
        ['Register of Tailings Storage Facilities and Dams', 47],
        ['Dam Safety Inspection (DSI) Report', 26],
        ['Dam Safety Review (DSR) Report', 31],
        ['“As-built” Reports', 32],
        ['Annual Reclamation', 25],
        ['MERP Record of Testing', 3],
        #['Annual Manager\'s Report', __________________ ], no mapping or data, ignore.
        ['OMS Manual', 33],
        ['Annual reconciliation of water balance and water management plans', 44],
        ['TSF risk assessment', 46],
        ['Mine Emergency Preparedness and Response Plan (MERP)', 24],
        ['Performance of high risk dumps', 29]
    ]

    table1 = etl.join(src_table, req_document_crr_defintion_map, 'req_document_name')
    mine_report = etl.cutout(table1, 'req_document_name')

    #to be inserted into db
    mine_report = etl.addfield(mine_report, 'submission_year', 2019)
    mine_report = etl.rename(mine_report, 'exp_document_status_code',
                             'mine_report_submission_status_code')
    mine_report = etl.addfield(mine_report, 'deleted_ind', lambda x: not x.active_ind)
    mine_report = etl.cutout(mine_report, 'active_ind')

    #to determine what FK's will be so can insert into related tables
    max_report_id = etl.fromdb(
        connection,
        'select last_value from public.mine_report_mine_report_id_seq')[1][0]
    max_report_submission_id = etl.fromdb(
        connection,
        'select last_value from public.mine_report_submission_mine_report_submission_id_seq')[1][0]

    #if sequence hasn't been used yet, fix off by one
    if max_report_id == 1:
        max_report_id = 0
    if max_report_submission_id == 1:
        max_report_submission_id = 0

    #get one-to-many
    mine_report, mine_report_submission_documents = etl.unjoin(
        mine_report, 'mine_document_guid', key='exp_document_guid')

    #add PK's for mappings
    mine_report_with_ids = etl.addrownumbers(mine_report,
                                             start=max_report_id + 1,
                                             step=1,
                                             field='mine_report_id')
    mine_report_with_ids = etl.addrownumbers(mine_report_with_ids,
                                             start=max_report_submission_id + 1,
                                             step=1,
                                             field='mine_report_submission_id')
    print(f'max_report_id= {max_report_id}, max_report_submission_id={max_report_submission_id}')

    #copy out fields for submission tables
    mine_report_submissions = etl.cut(mine_report_with_ids, [
        'mine_report_id', 'exp_document_guid', 'mine_report_submission_status_code',
        'create_user', 'create_timestamp', 'update_user', 'update_timestamp'
    ])
    mine_report_submissions = etl.addfield(mine_report_submissions, 'submission_date',
                                           lambda x: x.create_timestamp)

    #remove fields not in mine_report
    mine_report = etl.cutout(mine_report, 'mine_report_submission_status_code')

    #replace exp_document_guid FK with mine_report_submission FK
    submission_id_lookup = etl.cut(mine_report_with_ids,
                                   ['mine_report_submission_id', 'exp_document_guid'])
    mine_report_submission_documents = etl.join(submission_id_lookup,
                                                mine_report_submission_documents,
                                                key='exp_document_guid')
    mine_report_submission_documents = etl.cutout(mine_report_submission_documents,
                                                  'exp_document_guid')

    #removed original PK
    mine_report = etl.cutout(mine_report, 'exp_document_guid')
    mine_report_submissions = etl.cutout(mine_report_submissions, 'exp_document_guid')

    print(etl.valuecounter(etl.distinct(table1, key='exp_document_guid'), 'req_document_name'))
    print(etl.valuecounter(mine_report, 'mine_report_definition_id'))
    print(table1)
    print(mine_report)
    print(mine_report_submissions)
    print(mine_report_submission_documents)

    etl.appenddb(mine_report, connection, 'mine_report', commit=False)
    print('INSERT mine_report staged')
    etl.appenddb(mine_report_submissions, connection, 'mine_report_submission', commit=False)
    print('INSERT mine_report_submission staged')
    etl.appenddb(mine_report_submission_documents, connection, 'mine_report_document_xref',
                 commit=False)
    print('INSERT mine_report_document_xref staged')
    if commit:
        connection.commit()
        print('DATA CREATION COMPLETE')
    else:
        connection.rollback()
        print('NO DATA CREATED: add --commit=true to insert report rows')
# From column 5 onward, the data type is integer (the number of people/cases).
# We also take the opportunity to change the date format in the header from 1/23/20 to 2020-01-23.
headers = etl.fieldnames(t_confirmed)
i = 0
for header in headers:
    if i >= 4:
        t_confirmed = etl.convert(t_confirmed, header, int)  # fix the data type
        fecha = datetime.datetime.strptime(header, '%m/%d/%y')  # compute the date in the correct format
        t_confirmed = etl.rename(t_confirmed, header, fecha.strftime('%Y-%m-%d'))
    i = i + 1

# Remove the Province/State, Lat and Lon columns, which we will not use
t_confirmed = etl.cutout(t_confirmed, 0, 2, 3)

# Adjust some country names so we can later assign them a region/continent
t_confirmed = etl.convert(t_confirmed, 'Country', 'replace', 'Congo (Brazzaville)', 'Congo')
t_confirmed = etl.convert(t_confirmed, 'Country', 'replace', 'Congo (Kinshasa)', 'Democratic Republic of the Congo')
t_confirmed = etl.convert(t_confirmed, 'Country', 'replace', 'Cote d\'Ivoire', 'Ivory Coast')
t_confirmed = etl.convert(t_confirmed, 'Country', 'replace', 'Korea, South', 'South Korea')
t_confirmed = etl.convert(t_confirmed, 'Country', 'replace', 'West Bank and Gaza', 'Palestine')
t_confirmed = etl.convert(t_confirmed, 'Country', 'replace', 'Burma', 'Myanmar')
import petl as etl
import csv

table1 = etl.fromcsv('covid.csv')

# importing data from xml file and creating table
table2 = etl.fromxml('Country_location.xml', './/tr', ('th', 'td'))
# print(table2)

# removing column country from table
table3 = etl.cutout(table2, 'country')

# merging the covid table with xml data
table4 = etl.join(table1, table3, key='location')
print(table4)

# writing result to csv file
with open('covid_countries.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerows(table4)
    key_dep_dict = {}
    print(file_name)
    for constraint in value:
        child_dep, parent_dep = constraint.split(':')
        data = etl.values(table, child_dep, parent_dep)
        for d in data:
            key_dep_set = {d[1]}
            key_dep_dict.update({d[0]: key_dep_set})
    key_dep_list = toposort_flatten(key_dep_dict)
    table = etl.addfield(
        table, 'pwb_index',
        lambda rec: int(key_dep_list.index(rec[child_dep])))
    table = etl.sort(table, 'pwb_index')
    table = etl.cutout(table, 'pwb_index')
    writer = csv.writer(tempfile,
                        delimiter='\t',
                        quoting=csv.QUOTE_NONE,
                        quotechar='',
                        lineterminator='\n',
                        escapechar='')
    writer.writerows(table)
    shutil.move(tempfile.name, file_name)
    open(tsv_done_file, 'a').close()

ddl = []
for table in deps_list:
international_code = "(+61)" with open(IN_FILE, 'r') as infile, open(OUT_FILE, "w") as outfile: csv_reader = csv.reader(infile) writer = csv.writer(outfile) headers = next(csv_reader, None) #skipping header row writer.writerow(headers) for row in csv_reader: number_column = row[5] state_column = row[3] clean_num = re.sub("\D", "", row[5])[-8:] formatted_num = international_code + " " + regional_code[ state_column] + " " + clean_num row[5] = formatted_num writer.writerow(row) services = petl.fromcsv(SERVICES_FILE) offices = petl.fromcsv(OUT_FILE) offices = offices.rename({"Contact Name": "Office", "Phone Number": "Phone"}) offices = petl.cutout(offices,"State","Postcode") locations = petl.fromcsv(LOC_FILE) locations = locations.rename({"officeID": "OfficeID"}) office_service = petl.join(services, offices, key='OfficeID') office_service_locations = petl.join( office_service, locations, key='OfficeID') office_service_locations = petl.convert(office_service_locations,'OfficeServiceID',int) office_service_locations = petl.sort(office_service_locations,'OfficeServiceID') petl.tocsv(office_service_locations, 'office_service_locations.csv')
def get_value_counts_from_csv_file(collection_file_name: str,
                                   fields_to_fetch: List[str]) -> List:
    table_data = etl.fromcsv(collection_file_name)
    value_counts = etl.valuecounts(table_data, *fields_to_fetch)
    return list(etl.cutout(value_counts, "frequency").dicts())
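# Rough illustration of the valuecounts/cutout combination above: dropping
# 'frequency' leaves only the raw counts. Data is made up for illustration.
import petl as etl

tbl = [('color', 'size'),
       ('red', 'S'),
       ('red', 'M'),
       ('blue', 'S')]

counts = etl.valuecounts(tbl, 'color')            # adds 'count' and 'frequency'
print(list(etl.cutout(counts, 'frequency').dicts()))
# e.g. [{'color': 'red', 'count': 2}, {'color': 'blue', 'count': 1}]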
def procesar_fuente(path, nombre):
    try:
        # Process confirmed cases first
        tabla = etl.fromcsv(path)

        # Rename the headers
        tabla = etl.rename(tabla, {'Country/Region': 'Country'})

        # Adjust the data types
        # From column 5 onward, the data type is integer (the number of people/cases).
        # We also take the opportunity to change the date format in the header from 1/23/20 to 2020-01-23.
        headers = etl.fieldnames(tabla)
        i = 0
        for header in headers:
            if i >= 4:
                tabla = etl.convert(tabla, header, int)  # fix the data type
                fecha = datetime.datetime.strptime(header, '%m/%d/%y')  # compute the date in the correct format
                tabla = etl.rename(tabla, header, fecha.strftime('%Y-%m-%d'))
            i = i + 1

        # Remove the Province/State, Lat and Lon columns, which we will not use
        tabla = etl.cutout(tabla, 0, 2, 3)

        # Adjust some country names so we can later assign them a region/continent
        tabla = etl.convert(tabla, 'Country', 'replace', 'Congo (Brazzaville)', 'Congo')
        tabla = etl.convert(tabla, 'Country', 'replace', 'Congo (Kinshasa)', 'Democratic Republic of the Congo')
        tabla = etl.convert(tabla, 'Country', 'replace', 'Cote d\'Ivoire', 'Ivory Coast')
        tabla = etl.convert(tabla, 'Country', 'replace', 'Korea, South', 'South Korea')
        tabla = etl.convert(tabla, 'Country', 'replace', 'West Bank and Gaza', 'Palestine')
        tabla = etl.convert(tabla, 'Country', 'replace', 'Burma', 'Myanmar')
        tabla = etl.convert(tabla, 'Country', 'replace', 'US', 'USA')
        tabla = etl.convert(tabla, 'Country', 'replace', 'Taiwan*', 'Taiwan')

        # Then group and accumulate the results by country
        df_confirmed = etl.todataframe(tabla)
        df = df_confirmed.groupby(['Country']).sum()
        tabla = etl.fromdataframe(df, include_index=True)

        # Rename the Country field again
        tabla = etl.rename(tabla, {'index': 'Country'})

        # Then add the date columns as data and rename the new columns
        tabla = etl.melt(tabla, 'Country')
        tabla = etl.rename(tabla, {'variable': 'Date'})
        tabla = etl.rename(tabla, {'value': 'Cases'})

        # Then add the continent for grouping
        tabla = etl.addfield(tabla, 'Continent', lambda rec: get_continent_code(rec['Country']))

        # And again make sure the fields have the data types they should have
        tabla = etl.convert(tabla, 'Cases', int)
        tabla = etl.convert(tabla, 'Date', lambda v: datetime.datetime.strptime(v, '%Y-%m-%d'))

        # Finally, upload the table to the data repository
        conn = pymysql.connect(password='******', database='covid', user='******')
        conn.cursor().execute('SET SQL_MODE=ANSI_QUOTES')
        etl.todb(tabla, conn, nombre, create=True, drop=True)
        conn.close()
    except:
        print('An error has occurred! ', sys.exc_info()[0])
        raise
def remove_fields(fields, table):
    for field in fields:
        if field in etl.fieldnames(table):
            table = etl.cutout(table, field)
    return table
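# Example usage of remove_fields: fields that are not present ('z' here) are
# simply skipped rather than raising an error. Data is made up for illustration.
import petl as etl

tbl = [('a', 'b', 'c'), (1, 2, 3)]
print(etl.look(remove_fields(['b', 'z'], tbl)))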
def transfer_data(from_db_conn, to_db_conn):
    '''
    Transfer data between the connected databases, using cursors from the given
    connections to execute queries.

    Limitations:
    1. poc.address_id is currently marked as -1 since it was not provided in test data and is a FK non-null constraint
    2. institution2poc table is not available in old schema
    3. role table is already populated in bill.sql file so that table is skipped by this script
    4. poc_poc_id is currently set to be poc_id since no relevant information is available about the column
    5. project2moc_project.role_id column is not available in old schema and is a not null field in new schema,
       so we default it to 1 for now.
    6. project2moc_project.username is not available from old schema so currently set to empty
    7. raw_item_ts.item_id has duplicates when imported from item_ts. So we currently filter out and insert only uniques.

    :param from_db_conn: source database connection
    :param to_db_conn: destination database connection
    '''

    # Emptying out tables with possible foreign key constraint issues
    fk_dep_tables = [
        'poc2project', 'poc2moc_project', 'poc', 'raw_item_ts', 'item',
        'project', 'institution2moc_project'
    ]
    for table_name in fk_dep_tables:
        table = etl.fromdb(to_db_conn, "select * from {} where 1=0".format(table_name))
        etl.todb(table, to_db_conn, table_name)

    # Tables with no change in schema
    insert_as_tables = [
        'institution', 'address', 'item_type', 'item2item', 'catalog_item'
    ]
    for table_name in insert_as_tables:
        table = etl.fromdb(from_db_conn, "select * from {}".format(table_name))
        etl.todb(table, to_db_conn, table_name)

    # inserting dummy address for constraint matching
    dummy_address = [{'address_id': -1}]
    dummy_address_table = etl.fromdicts(dummy_address)
    etl.appenddb(dummy_address_table, to_db_conn, 'address')

    poc = etl.fromdb(from_db_conn, 'select * from poc')
    poc_transformed = etl.cutout(poc, 'domain_id', 'user_uid')
    poc_dummy_address = etl.replace(poc_transformed, 'address_id', None, -1)
    etl.todb(poc_dummy_address, to_db_conn, 'poc')

    project_names_table = etl.fromdb(from_db_conn,
                                     "select distinct project_name from project")
    moc_project_transformed = etl.addrownumbers(project_names_table)
    moc_project_transformed = etl.rename(moc_project_transformed,
                                         {'row': 'moc_project_id'})
    etl.todb(moc_project_transformed, to_db_conn, 'moc_project')

    domain = etl.fromdb(from_db_conn, "select * from domain")
    domain_table_transformed = etl.cutout(domain, 'domain_uid')
    domain_table_transformed = etl.rename(domain_table_transformed, {
        'domain_id': 'service_id',
        'domain_name': 'service_name'
    })
    etl.todb(domain_table_transformed, to_db_conn, 'service')

    project = etl.fromdb(from_db_conn, "select * from project")
    moc_project = etl.fromdb(to_db_conn, "select * from moc_project")
    project_moc_project_joined = etl.join(project, moc_project, key='project_name')
    project_table_transformed = etl.cutout(project_moc_project_joined, 'project_name')
    project_table_transformed = etl.rename(project_table_transformed, {
        'domain_id': 'service_id',
        'project_uid': 'project_uuid'
    })
    etl.todb(project_table_transformed, to_db_conn, 'project')

    institution2project = etl.fromdb(from_db_conn, "Select * from institution2project")
    project = etl.fromdb(to_db_conn, "select project_id, moc_project_id from project")
    inst2project_project_joined = etl.join(institution2project, project, key='project_id')
    inst2moc_project = etl.cutout(inst2project_project_joined, 'domain_id')
    etl.todb(inst2moc_project, to_db_conn, 'institution2moc_project')

    project2poc = etl.fromdb(from_db_conn, "select * from project2poc")
    project2poc_project_joined = etl.join(project2poc, project, key='project_id')
    poc2moc_project = etl.cutout(project2poc_project_joined, 'project_id', 'domain_id')
    poc2moc_project = etl.addfield(poc2moc_project, 'role_id', 1)
    poc2moc_project = etl.addfield(poc2moc_project, 'poc_poc_id', lambda rec: rec['poc_id'])
    etl.todb(poc2moc_project, to_db_conn, 'poc2moc_project')

    poc2project = etl.cutout(project2poc, 'domain_id')
    poc2project = etl.addfield(poc2project, 'role_id', 1)
    poc2project = etl.addfield(poc2project, 'username', '')
    etl.todb(poc2project, to_db_conn, 'poc2project')

    item = etl.fromdb(from_db_conn, "select * from item")
    item_transformed = etl.cutout(item, 'domain_id')
    etl.todb(item_transformed, to_db_conn, 'item')

    raw_item_ts_unique = etl.fromdb(
        from_db_conn,
        "WITH summary AS ( SELECT its.item_id, its.start_ts, its.end_ts, its.state, its.catalog_item_id, ROW_NUMBER() OVER(PARTITION BY its.item_id) AS rk FROM ITEM_TS its) SELECT s.* FROM summary s WHERE s.rk = 1"
    )
    raw_item_ts_unique = etl.cutout(raw_item_ts_unique, 'rk')
    etl.todb(raw_item_ts_unique, to_db_conn, 'raw_item_ts')
def clean_up(table, column):
    if CLEAN_UP:
        return etl.cutout(table, column)
    # leave the table unchanged when clean-up is disabled, so callers that
    # reassign the result do not end up with None
    return table
table5 = cut(table1, *range(0, 2))
look(table5)


# cutout

table1 = [['foo', 'bar', 'baz'],
          ['A', 1, 2.7],
          ['B', 2, 3.4],
          ['B', 3, 7.8],
          ['D', 42, 9.0],
          ['E', 12]]

from petl import cutout, look
look(table1)
table2 = cutout(table1, 'bar')
look(table2)


# cat

table1 = [['foo', 'bar'],
          [1, 'A'],
          [2, 'B']]
table2 = [['bar', 'baz'],
          ['C', True],
          ['D', False]]
table4 = [['foo', 'bar', 'baz'],
          ['A', 1, 2],
          ['B', '2', '3.4'],
          [u'B', u'3', u'7.8', True],
# Dim Time
# TO DO
# Load a full year (2018) with the most simple datetime analysis:
# year, month, day, hour, minute, second.
# For the full loading process, use the reference in references.txt.
# This should be a procedure with all the validation logic there, to create the next X months when it is called.

# Facts
# This facts table will be the staging with all the needed info to quickly update with the dimension keys and load to the facts table.
# The facts table will have columns to match each column on the dim Time table, to make it easier to get the reference key.

events_uid = etl.cutout(events, 'tracking_id', 'utm_medium', 'utm_campaign')
events_tui = etl.cutout(events, 'user_id')

stage_uid = etl.join(users, events_uid, key='user_id')
stage_tui = etl.join(users, events_tui, key='tracking_id')
stage_utm = etl.cut(stage_tui, 'user_id', 'utm_medium', 'utm_campaign')
stage_uid_utm = etl.join(stage_uid, stage_utm, key='user_id')
stage_m_s = etl.mergesort(stage_uid_utm, stage_tui, key=['created_at', 'email'])

# Mapping definitions
mappings = OrderedDict()
mappings['tid'] = 'tracking_id'
mappings['uid'] = 'user_id'
mappings['utm_medium'] = 'utm_medium'
# select a range of fields
table5 = etl.cut(table1, *range(0, 2))
table5


# cutout()
##########

import petl as etl
table1 = [['foo', 'bar', 'baz'],
          ['A', 1, 2.7],
          ['B', 2, 3.4],
          ['B', 3, 7.8],
          ['D', 42, 9.0],
          ['E', 12]]
table2 = etl.cutout(table1, 'bar')
table2


# cat()
#######

import petl as etl
table1 = [['foo', 'bar'],
          [1, 'A'],
          [2, 'B']]
table2 = [['bar', 'baz'],
          ['C', True],
          ['D', False]]
table3 = etl.cat(table1, table2)
table3