def main(argv):
    global g

    parser = argparse.ArgumentParser()
    parser.add_argument("--contributions-filename", required=True,
                        help="Input UTF8 CSV with contributions data dumped from Servant Keeper")
    parser.add_argument("--split-detail-files", required=False, nargs='*', default=argparse.SUPPRESS,
                        help="List of CSV files which have records that can be used to replace top-level "
                             "'Split Transaction' records in the main contributions file.")
    parser.add_argument("--chart-of-accounts-filename", required=True,
                        help="Input UTF8 CSV with Chart of Accounts data from Servant Keeper")
    parser.add_argument("--output-filename", required=True,
                        help="Output CSV filename which will be loaded with contributions data in CCB import format")
    parser.add_argument('--trace', action='store_true',
                        help="If specified, prints tracing/progress messages to stdout")
    args = parser.parse_args()

    assert os.path.isfile(args.contributions_filename), \
        "Error: cannot open file '" + args.contributions_filename + "'"

    dict_split_transaction_details = load_split_transaction_details(args.split_detail_files)

    table = petl.fromcsv(args.contributions_filename)
    table = petl.rename(table, {
        'Individual ID': 'SK Individual ID',
        'Amount': 'SK Amount'
    })

    trace('REMOVING SPLIT TRANSACTIONS...', args.trace, banner=True)
    table = replace_split_transactions(table, dict_split_transaction_details)

    table_coa = petl.fromcsv(args.chart_of_accounts_filename)
    table = petl.leftjoin(table, table_coa, lkey='Account', rkey='SK Account')

    table = petl.addfield(table, 'Individual ID', lambda rec: rec['SK Individual ID'])
    table = petl.addfield(table, 'Date of Contribution', lambda rec: rec['Batch Date'])
    table = petl.addfield(table, 'Amount', lambda rec: rec['SK Amount'])
    table = petl.addfield(table, 'Type of Gift', lambda rec: rec['Type'])
    table = petl.addfield(table, 'Check Number', lambda rec: rec['Check #'])
    table = petl.addfield(table, 'Fund', convert_fund)
    table = petl.addfield(table, 'Sub Fund', convert_sub_fund)
    table = petl.addfield(table, 'Campus', '')
    table = petl.addfield(table, 'Transaction Grouping', '')
    table = petl.addfield(table, 'Batch Number/Name', '')
    table = petl.addfield(table, 'Tax Deductible', lambda rec: rec['Tax'])
    table = petl.addfield(table, 'Memo', convert_notes)

    trace('CONVERTING AND THEN EMITTING TO CSV FILE...', args.trace, banner=True)
    table.progress(200).tocsv(args.output_filename)
    trace('OUTPUT TO CSV COMPLETE.', args.trace, banner=True)

    if len(g.set_unfound_accounts) > 0:
        trace('UNMATCHED SK ACCOUNTS!', args.trace, banner=True)
        for acct in g.set_unfound_accounts:
            trace(acct, args.trace)

    trace('DONE!', args.trace, banner=True)
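# A minimal invocation sketch for the converter above; the script and CSV
# file names here are hypothetical examples, not paths from the original
# project:
#
#   python contributions_to_ccb.py \
#       --contributions-filename sk_contributions.csv \
#       --chart-of-accounts-filename sk_chart_of_accounts.csv \
#       --split-detail-files splits_2019.csv splits_2020.csv \
#       --output-filename ccb_contributions.csv \
#       --trace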
def produce_delivery_count_table():
    # keep a reference so the same filter instance can be removed at the end;
    # removeFilter(MultilineFilter()) would remove a fresh instance and do nothing
    multiline_filter = MultilineFilter()  # useful for tables
    log.addFilter(multiline_filter)
    log.info('Starting to generate the monthly german payroll table')

    # ------------------------------
    # Extract driver names from Odoo
    # ------------------------------
    log.info('Extracting driver names from Odoo')
    odoo = OdooConnector()
    filters = [('supplier', '=', True),
               ('active', '=', True),
               ('company_id', '=', 5)]  # 5 is germany
    df = odoo.extract('res.partner', filters)
    odoo_drivers = fromdataframe(df)
    mappings = {
        'driver_app_username': '******',
        'planday_salary_id_in_odoo': 'x_salary_id',
        'odoo_id': 'id',
        'fullname_in_odoo': 'display_name'
    }
    odoo_drivers = odoo_drivers.fieldmap(mappings)
    # cache the results
    odoo_drivers.toxlsx(odoo_cache_file)
    log.info('%s drivers found in Odoo', odoo_drivers.nrows())
    log.debug(odoo_drivers.look())

    # ------------------------------------------
    # Extract delivery counts from the warehouse
    # ------------------------------------------
    log.info('Extracting delivery counts from the DWH')
    dwh = WarehouseConnector()
    query = SQLReader('sql.german_drivers_delivery_counts').statements[0]
    log.debug(query)
    df = dwh.execute(query)
    driver_counts = fromdataframe(df)
    # cache the results
    driver_counts.toxlsx(dwh_cache_file)
    log.info('%s drivers found in the DWH', driver_counts.nrows())
    log.info('Deliveries per driver %s', driver_counts.stats('number_of_deliveries'))
    log.debug(driver_counts.look())

    # ----------------------------
    # Join the two tables together
    # ----------------------------
    payroll = leftjoin(driver_counts, odoo_drivers, key='driver_app_username')
    # Some usernames appear multiple times in Odoo
    payroll = payroll.distinct('driver_app_username')
    log.debug(payroll.look())

    payroll.toxlsx(output_file)
    log.info('Payroll table saved to %s', output_file)
    log.removeFilter(multiline_filter)
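# MultilineFilter is not defined in this snippet. A minimal sketch of one
# plausible implementation, assuming its job is simply to keep multi-line
# table renderings readable in log output; only the class name comes from the
# code above, the body is an assumption.
import logging

class MultilineFilter(logging.Filter):
    """Indent continuation lines so multi-line messages (e.g. petl table
    renderings) stay visually grouped under a single log record."""

    def filter(self, record):
        message = record.getMessage()
        if '\n' in message:
            # fold the formatted message back into msg and indent each line
            record.msg = message.replace('\n', '\n    ')
            record.args = ()
        return True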
def join_mine_guids(connection, application_table):
    current_mines = etl.fromdb(
        connection,
        'select distinct on (minenumber) mine_guid, mine_no as minenumber '
        'from public.mine order by minenumber, create_timestamp;')
    application_table_guid_lookup = etl.leftjoin(application_table, current_mines, key='minenumber')
    return application_table_guid_lookup
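# A short usage sketch for join_mine_guids(), assuming a psycopg2-style
# DB-API connection; the connection string and CSV file are hypothetical.
import petl as etl
import psycopg2

connection = psycopg2.connect('dbname=mds user=mds')
applications = etl.fromcsv('applications.csv')  # must include a 'minenumber' column
applications = join_mine_guids(connection, applications)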
def join(data, strategy, source_left, source_right, destination, key_left,
         key_right, prefix_left, prefix_right, presorted, buffersize, tempdir,
         cache, missing):
    """Perform a join on two data tables."""
    source_left = data.get(source_left)
    source_right = data.get(source_right)

    kwargs = {}
    if key_left == key_right:
        kwargs['key'] = key_left
    else:
        kwargs['lkey'] = key_left
        kwargs['rkey'] = key_right
    if presorted is True:
        kwargs['presorted'] = presorted
    if buffersize is not None:
        kwargs['buffersize'] = buffersize
    if tempdir:
        kwargs['tempdir'] = tempdir
    # anti joins discard right-hand fields, so prefixes do not apply
    if 'anti' not in strategy:
        if prefix_left is not None:
            kwargs['lprefix'] = prefix_left
        if prefix_right is not None:
            kwargs['rprefix'] = prefix_right
    # inner and anti joins never produce unmatched rows, so 'missing' is moot
    if strategy not in ['join', 'antijoin', 'hashjoin', 'hashantijoin']:
        kwargs['missing'] = missing

    if strategy == 'join':
        o = petl.join(source_left, source_right, **kwargs)
    elif strategy == 'leftjoin':
        o = petl.leftjoin(source_left, source_right, **kwargs)
    elif strategy == 'lookupjoin':
        o = petl.lookupjoin(source_left, source_right, **kwargs)
    elif strategy == 'rightjoin':
        o = petl.rightjoin(source_left, source_right, **kwargs)
    elif strategy == 'outerjoin':
        o = petl.outerjoin(source_left, source_right, **kwargs)
    elif strategy == 'antijoin':
        o = petl.antijoin(source_left, source_right, **kwargs)
    elif strategy == 'hashjoin':
        # was petl.antijoin, which silently performed the wrong join
        o = petl.hashjoin(source_left, source_right, **kwargs)
    elif strategy == 'hashantijoin':
        # this strategy was accepted above but had no dispatch branch,
        # leaving `o` unbound
        o = petl.hashantijoin(source_left, source_right, **kwargs)
    elif strategy == 'hashleftjoin':
        o = petl.hashleftjoin(source_left, source_right, **kwargs)
    elif strategy == 'hashlookupjoin':
        o = petl.hashlookupjoin(source_left, source_right, **kwargs)
    elif strategy == 'hashrightjoin':
        o = petl.hashrightjoin(source_left, source_right, **kwargs)

    data.set(destination, o)
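# A hedged usage sketch for the dispatcher above. The real `data` container
# comes from the surrounding tool; here it is mocked with a minimal dict
# wrapper just to exercise the dispatch logic.
class _MockData:
    def __init__(self):
        self._tables = {}

    def get(self, name):
        return self._tables[name]

    def set(self, name, table):
        self._tables[name] = table

data = _MockData()
data.set('colours', [['id', 'colour'], [1, 'blue'], [2, 'red']])
data.set('shapes', [['id', 'shape'], [1, 'circle'], [3, 'square']])
join(data, 'leftjoin', 'colours', 'shapes', 'joined',
     'id', 'id', None, None, False, None, None, True, None)
# data.get('joined') now holds the petl left join of the two tables on 'id'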
def get_relationships(self):
    """Parse the LOINC multi-axial hierarchy into `Relationship` rows."""
    core_file = _find_loinc_table_core_file(self.uri.path)
    core = etl.fromcsv(core_file, delimiter=',')
    core = etl.cut(core, ['LOINC_NUM', 'LONG_COMMON_NAME'])

    hierarchy_file = _find_multi_axial_hierarchy_file(self.uri.path)
    hierarchy = etl.fromcsv(hierarchy_file, delimiter=',')
    hierarchy = etl.leftjoin(hierarchy, core, lkey='CODE', rkey='LOINC_NUM')
    hierarchy = etl.cut(hierarchy, ['IMMEDIATE_PARENT', 'CODE', 'CODE_TEXT', 'LONG_COMMON_NAME'])
    # Codes missing from the core table have no LONG_COMMON_NAME after the
    # join; fill rightwards so CODE_TEXT is used as the fallback.
    hierarchy = etl.fillright(hierarchy)
    hierarchy = etl.cut(hierarchy, ['IMMEDIATE_PARENT', 'CODE', 'LONG_COMMON_NAME'])
    hierarchy = etl.rename(hierarchy, 'LONG_COMMON_NAME', 'CODE_TEXT')

    # Self-join to resolve each parent code to its display text
    parents = etl.cut(hierarchy, ['CODE', 'CODE_TEXT'])
    hierarchy = etl.selectne(hierarchy, 'IMMEDIATE_PARENT', '')
    hierarchy = etl.leftjoin(hierarchy, parents, lkey='IMMEDIATE_PARENT',
                             rkey='CODE', lprefix='source.', rprefix='target.')
    hierarchy = etl.distinct(hierarchy)

    if self.versioned:
        version = _parse_version(hierarchy_file)
        hierarchy = etl.addfield(hierarchy, 'version', version)

    hierarchy = etl.rowmapmany(hierarchy, _to_json, ['relationship'])
    return hierarchy
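# The pipeline above leans on etl.fillright() to backfill LONG_COMMON_NAME
# from CODE_TEXT when the core join found no match. A small illustration of
# that behaviour with made-up rows:
import petl as etl

table = [['IMMEDIATE_PARENT', 'CODE', 'CODE_TEXT', 'LONG_COMMON_NAME'],
         ['LP1', 'LP2', 'Microbiology', None],                   # no match in core
         ['LP2', '600-7', 'Bact Cult', 'Bacteria identified']]   # matched
filled = etl.fillright(table)
# In the first row the missing LONG_COMMON_NAME is filled from the value to
# its left, i.e. 'Microbiology'; matched rows are left untouched.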
def join_execute(cl, cr, join, **kwargs):
    cl, cr = cl(), cr()
    if 'addLfields' in kwargs:
        cl = etl.addfields(cl, kwargs['addLfields'])
    if 'addRfields' in kwargs:
        cr = etl.addfields(cr, kwargs['addRfields'])
    args = cl, cr
    if join == Join.UNION:
        c = etl.crossjoin(*args)
    else:
        kwargs = filter_keys(kwargs, ("key", "lkey", "rkey", "missing",
                                      "presorted", "buffersize", "tempdir", "cache"))
        if join == Join.INNER:
            c = etl.join(*args, **kwargs)
        elif join == Join.LEFT:
            c = etl.leftjoin(*args, **kwargs)
        elif join == Join.RIGHT:
            c = etl.rightjoin(*args, **kwargs)
        elif join == Join.FULL:
            c = etl.outerjoin(*args, **kwargs)
    return c
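# A brief usage sketch, assuming `Join` is the enum dispatched on above and
# that `cl`/`cr` are zero-argument callables returning petl-compatible tables
# (both assumptions follow from how they are used in join_execute).
left = lambda: [['id', 'colour'], [1, 'blue'], [2, 'red']]
right = lambda: [['id', 'shape'], [1, 'circle'], [3, 'square']]
result = join_execute(left, right, Join.LEFT, key='id')
# -> petl left join of the two tables on 'id'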
# leftjoin
table1 = [['id', 'colour'],
          [1, 'blue'],
          [2, 'red'],
          [3, 'purple']]
table2 = [['id', 'shape'],
          [1, 'circle'],
          [3, 'square'],
          [4, 'ellipse']]
from petl import leftjoin, look
look(table1)
look(table2)
table3 = leftjoin(table1, table2, key='id')
look(table3)

# rightjoin
table1 = [['id', 'colour'],
          [1, 'blue'],
          [2, 'red'],
          [3, 'purple']]
table2 = [['id', 'shape'],
          [1, 'circle'],
          [3, 'square'],
          [4, 'ellipse']]
from petl import rightjoin, look
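# For reference (standard petl semantics): leftjoin() keeps every row of
# table1, so id 2 ('red') comes through with a missing (None) shape, while
# rightjoin() keeps every row of table2, so id 4 ('ellipse') comes through
# with a missing colour.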
table6 = [['id', 'shape'],
          [1, 'circle'],
          [1, 'square'],
          [2, 'ellipse']]
table7 = etl.join(table5, table6, key='id')
table7

# compound keys are supported
table8 = [['id', 'time', 'height'],
          [1, 1, 12.3],
          [1, 2, 34.5],
          [2, 1, 56.7]]
table9 = [['id', 'time', 'weight'],
          [1, 2, 4.5],
          [2, 1, 6.7],
          [2, 2, 8.9]]
table10 = etl.join(table8, table9, key=['id', 'time'])
table10

# leftjoin()
############

import petl as etl
table1 = [['id', 'colour'],
          [1, 'blue'],
          [2, 'red'],
          [3, 'purple']]
table2 = [['id', 'shape'],
          [1, 'circle'],
          [3, 'square'],
          [4, 'ellipse']]
table3 = etl.leftjoin(table1, table2, key='id')
table3

# rightjoin()
#############

import petl as etl
table1 = [['id', 'colour'],
          [1, 'blue'],
          [2, 'red'],
          [3, 'purple']]
table2 = [['id', 'shape'],
          [1, 'circle'],
          [3, 'square'],
          [4, 'ellipse']]
table3 = etl.rightjoin(table1, table2, key='id')
table3

# outerjoin()
#############

import petl as etl
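# For reference (standard petl semantics): with the compound key above, a row
# joins only when both 'id' and 'time' match, so table10 contains the (1, 2)
# and (2, 1) rows; (2, 2) exists only in table9 and is dropped by the inner
# join. Duplicate key values, as in table6, yield one output row per matching
# left/right pair.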
# Load Master.csv from the Lahman database.
table = etl.fromcsv(sys.argv[1])

# Use US births only
table2 = etl.select(table, lambda rec: rec.birthCountry == 'USA')

# Only use these fields
table3 = etl.cut(table2, 'nameFirst', 'nameLast', 'debut', 'bbrefID', 'weight',
                 'height', 'finalGame', 'birthCity', 'birthState', 'birthYear')

# Remove rows with empty birth city or birth year
table4 = etl.select(table3, lambda rec: rec.birthCity != "" and rec.birthYear != "")

# Add Baseball Reference URL
table5 = etl.addfield(table4, 'baseball_ref_url', add_bbreflink)

# Remove the bbrefID field now that the URL has been built
table6 = etl.cutout(table5, "bbrefID")

# Load the city/state -> lat/long lookup table
city = etl.fromcsv(sys.argv[2])

# Only use these fields
city2 = etl.cut(city, "city", "state", "lat", "long")

# Join tables on the (city, state) compound key
lat_table = etl.leftjoin(table6, city2,
                         lkey=["birthCity", "birthState"],
                         rkey=["city", "state"])

# Output merged file to csv
lat_table.tocsv(sys.argv[3])
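# add_bbreflink() is referenced above but not defined in this snippet. A
# plausible sketch, assuming the usual Baseball Reference player URL layout
# (first letter of the bbrefID as a subdirectory); treat the exact pattern as
# an assumption, not part of the original script.
def add_bbreflink(rec):
    # e.g. bbrefID 'aaronha01' -> http://www.baseball-reference.com/players/a/aaronha01.shtml
    return 'http://www.baseball-reference.com/players/%s/%s.shtml' % (
        rec['bbrefID'][0], rec['bbrefID'])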
table = clean_up(table, 'geo_ind')
table = clean_up(table, 'cid')
table = clean_up(table, 'occ_typ')
print('TRIMMED HEADERS = ' + str(etl.header(table)))

table = etl.select(table, 'occ_dt', lambda x: x > datetime(2000, 1, 1))
print('ROWS POST YR 2000 = ' + str(etl.nrows(table)))

mine_table = etl.fromcsv('mines.csv', encoding='utf-8')

# Normalize mine numbers by stripping leading zeros so the join keys match
mine_table = etl.convert(mine_table, 'mine_no', lambda x: str(int(x)))
table = etl.convert(table, 'mine_no', lambda x: str(int(x)))

# Map mine_no to mine_guid
table = etl.leftjoin(table, mine_table, key='mine_no')
table = clean_up(table, 'mine_no')

# Every row must have matched; an unmatched row would have a None mine_guid
if etl.valuecount(table, 'mine_guid', None)[0] > 0:
    print('mine_guid, mine_no pair missing from mines.csv')
    exit(1)

######
print('CONVERT AND RENAME descript1 to recommendation')
table = etl.addfield(table, 'recommendation', lambda x: x['descript1'])
table = clean_up(table, 'descript1')

######
print('CONVERTING sta_cd to status_code')
table = etl.addfield(table, 'status_code', lambda x: x['sta_cd'])
table = etl.convert(table, 'status_code', 'replace', 'O', 'F')
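# clean_up() is not defined in this snippet. Given how it is used above
# (always dropping a column once its data has been copied or normalized), a
# minimal sketch might be:
def clean_up(table, column):
    # Drop a column that is no longer needed
    return etl.cutout(table, column)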
                              header=['id', 'external_id'])
aggregated_summary = etl.unpackdict(aggregated_summary, 'creation_data')

file_name = 'datasets-%s.csv' % datetime.now().strftime('%Y%m%d%H%M%S')
directory = 'csv'
if not os.path.exists(directory):
    os.makedirs(directory)
# etl.tocsv(aggregated_summary, './%s/%s' % (directory, file_name))
# logging.info('This %s has been exported' % file_name)

rooms, participations = storing_data_preparation(aggregated_summary)

# Replace internal participant ids with external ids
participations = etl.leftjoin(participations, external_ids,
                              lkey='participant_id', rkey='id', rprefix='r_')
participations = etl.cutout(participations, 'participant_id')
participations = etl.rename(participations, 'r_external_id', 'participant_id')

# Same swap for the room creator
rooms = etl.leftjoin(rooms, external_ids, lkey='creator', rkey='id', rprefix='r_')
rooms = etl.cutout(rooms, 'creator')
rooms = etl.rename(rooms, 'r_external_id', 'creator')

logging.info('Storing data %s to database' % file_name)
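# In isolation, the leftjoin/cutout/rename sequence above swaps an internal
# id for an external one. A self-contained illustration with made-up data:
import petl as etl

external_ids = [['id', 'external_id'], [10, 'ext-a'], [11, 'ext-b']]
participations = [['participant_id', 'room'], [10, 'r1'], [11, 'r2']]

out = etl.leftjoin(participations, external_ids,
                   lkey='participant_id', rkey='id', rprefix='r_')
out = etl.cutout(out, 'participant_id')
out = etl.rename(out, 'r_external_id', 'participant_id')
# header is now ('room', 'participant_id'), rows ('r1', 'ext-a') and ('r2', 'ext-b')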