def save_characters_to_file(generated_file_path, characters_pages):
    etl.setheader(
        etl.empty(),
        settings.STAR_WARS_CHARACTERS_OUTPUT_FILE_HEADER_FIELDS,
    ).tocsv(generated_file_path)
    logger.info('Created file: %s', generated_file_path)

    for characters_page in characters_pages:
        etl.appendcsv(
            characters_page,
            generated_file_path,
            write_header=False,
        )
        logger.info('Added data to file: %s', generated_file_path)
def alter_varchar_column_widths(self, tbl, table_name):
    """
    Alter the width of varchar columns in a Redshift table to match the widths
    of a Parsons table. The columns are matched by column name and not their index.

    `Args:`
        tbl: obj
            A Parsons table
        table_name:
            The target table name (e.g. ``my_schema.my_table``)
    `Returns:`
        ``None``
    """

    # Make the Parsons table column names match valid Redshift names
    tbl.table = petl.setheader(tbl.table, self.column_name_validate(tbl.columns))

    # Create a list of column names and max width for string values.
    pc = {c: tbl.get_column_max_width(c) for c in tbl.columns}

    # Determine the max width of the varchar columns in the Redshift table
    s, t = self.split_full_table_name(table_name)
    cols = self.get_columns(s, t)
    rc = {k: v['max_length'] for k, v in cols.items() if v['data_type'] == 'character varying'}  # noqa: E501, E261

    # Figure out if any of the destination table varchar columns are smaller than the
    # associated Parsons table columns. If they are, then alter column types to expand
    # their width.
    for c in set(rc.keys()).intersection(set(pc.keys())):
        if rc[c] < pc[c]:
            logger.info(f'{c} not wide enough. Expanding column width.')
            self.alter_table_column_type(table_name, c, 'varchar', varchar_width=pc[c])
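# --- Usage sketch (not from the original source) ---
# Assumes this method lives on Parsons' Redshift connector and that Redshift
# credentials are available via the usual environment variables; the table data
# and destination table name below are hypothetical.
from parsons import Redshift, Table

# Hypothetical table whose 'notes' values may be wider than the destination column
tbl = Table([['id', 'notes'], [1, 'a fairly long free-text note']])

rs = Redshift()  # assumption: credentials picked up from the environment
# Widen any varchar columns in the target table that are narrower than the
# corresponding Parsons table columns
rs.alter_varchar_column_widths(tbl, 'my_schema.my_table')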
def create_statement(self, tbl, table_name, padding=None, distkey=None, sortkey=None,
                     varchar_max=None, varchar_truncate=True, columntypes=None):
    # Generate a table create statement

    # Validate and rename column names if needed
    tbl.table = petl.setheader(tbl.table, self.column_name_validate(tbl.columns))

    if tbl.num_rows == 0:
        raise ValueError('Table is empty. Must have 1 or more rows.')

    mapping = self.generate_data_types(tbl)

    if padding:
        mapping['longest'] = self.vc_padding(mapping, padding)

    if varchar_max:
        mapping['longest'] = self.vc_max(mapping, varchar_max)

    if varchar_truncate:
        mapping['longest'] = self.vc_trunc(mapping)

    mapping['longest'] = self.vc_validate(mapping)

    # Add any provided column type overrides
    if columntypes:
        for i in range(len(mapping['headers'])):
            col = mapping['headers'][i]
            if columntypes.get(col):
                mapping['type_list'][i] = columntypes[col]

    # Enclose in quotes
    mapping['headers'] = ['"{}"'.format(h) for h in mapping['headers']]

    return self.create_sql(table_name, mapping, distkey=distkey, sortkey=sortkey)
def create_statement(self, tbl, table_name, strict_length=True):
    # Generate create statement SQL for a given Parsons table.

    # Validate and rename column names if needed
    tbl.table = petl.setheader(tbl.table, self.columns_convert(tbl.columns))

    # Generate the table map
    table_map = self.evaluate_table(tbl)

    # Generate the column syntax
    column_syntax = []
    for c in table_map:
        if strict_length:
            col_width = int(c['width'] + (self.VARCHAR_PAD * c['width']))
        else:
            col_width = self.round_longest(c['width'])

        if c['type'] == 'varchar':
            column_syntax.append(f"{c['name']} {c['type']}({col_width}) \n")
        else:
            column_syntax.append(f"{c['name']} {c['type']} \n")

    # Generate full statement
    return f"CREATE TABLE {table_name} ( \n {','.join(column_syntax)});"
def set_header(self, new_header):
    """
    Replace the header row of the table.

    `Args:`
        new_header: list
            List of new header column names
    `Returns:`
        `Parsons Table` and also updates self
    """

    self.table = petl.setheader(self.table, new_header)

    return self
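# --- Usage sketch (not from the original source) ---
# Assumes the top-level parsons.Table constructor accepts a list of lists whose
# first row is the header; the data below is hypothetical.
from parsons import Table

tbl = Table([['first', 'last'], ['Jane', 'Doe']])

# Replace the header row; the method also returns self, so calls can be chained
tbl.set_header(['first_name', 'last_name'])
print(tbl.columns)  # ['first_name', 'last_name']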
def create_statement(self, tbl, table_name, padding=None, distkey=None, sortkey=None,
                     varchar_max=None, varchar_truncate=True, columntypes=None,
                     strict_length=True):
    # Generate a table create statement. Distkeys and sortkeys are only used by
    # Redshift and should not be passed when generating a create statement for
    # Postgres.

    if tbl.num_rows == 0:
        raise ValueError('Table is empty. Must have 1 or more rows.')

    # Validate and rename column names if needed
    tbl.table = petl.setheader(tbl.table, self.column_name_validate(tbl.columns))

    mapping = self.generate_data_types(tbl)

    if padding:
        mapping['longest'] = self.vc_padding(mapping, padding)
    elif not strict_length:
        mapping['longest'] = self.vc_step(mapping)

    if varchar_max:
        mapping['longest'] = self.vc_max(mapping, varchar_max)

    if varchar_truncate:
        mapping['longest'] = self.vc_trunc(mapping)

    mapping['longest'] = self.vc_validate(mapping)

    # Add any provided column type overrides
    if columntypes:
        for i in range(len(mapping['headers'])):
            col = mapping['headers'][i]
            if columntypes.get(col):
                mapping['type_list'][i] = columntypes[col]

    # Enclose in quotes
    mapping['headers'] = [f'"{h}"' for h in mapping['headers']]

    return self.create_sql(table_name, mapping, distkey=distkey, sortkey=sortkey)
def rowgenerator(rec):
    yield [rec['id'], 'age_months', rec['age'] * 12]
    yield [rec['id'], 'bmi', rec['weight'] / rec['height'] ** 2]

table2 = recordmapmany(table1, rowgenerator, fields=['subject_id', 'variable', 'value'])
look(table2)


# setheader
table1 = [['foo', 'bar'],
          ['a', 1],
          ['b', 2]]

from petl import setheader, look
look(table1)
table2 = setheader(table1, ['foofoo', 'barbar'])
look(table2)


# extendheader
table1 = [['foo'],
          ['a', 1, True],
          ['b', 2, False]]

from petl import extendheader, look
look(table1)
table2 = extendheader(table1, ['bar', 'baz'])
look(table2)
def main(argv):
    parser = argparse.ArgumentParser()
    parser.add_argument("--input-csv-filename", required=True,
                        help="Input UTF8 CSV to summarize")
    parser.add_argument("--sep-columns", required=False, nargs='*', default=argparse.SUPPRESS,
                        help="Column names of columns containing comma- or semi-colon-separated values")
    parser.add_argument("--sep-character", required=False,
                        help="Character used to separate values in multi-value "
                             "fields. Defaults to ';' if not specified.")
    parser.add_argument("--skip-columns", required=False, nargs='*', default=argparse.SUPPRESS,
                        help="Column names to NOT generate stats for")
    parser.add_argument("--skip-num-rows", required=False, type=int,
                        help="Skip specified number of header rows")
    parser.add_argument("--first-ccb-column", required=False,
                        help="String name of first CCB column. If specified, all preceding "
                             "columns will be labeled 'Servant Keeper' and this column and "
                             "all subsequent will be labeled 'CCB'")
    args = parser.parse_args()

    if args.first_ccb_column is not None:
        column_prefix = 'Servant Keeper '
    else:
        column_prefix = ''

    assert os.path.isfile(args.input_csv_filename), \
        "Error: cannot open file '" + args.input_csv_filename + "'"

    table = petl.fromcsv(args.input_csv_filename)

    # Skip header rows
    if args.skip_num_rows:
        skip_num = args.skip_num_rows
        assert skip_num > 0, \
            "--skip-num-rows value '" + str(skip_num) + "' is invalid. Must be positive."
        it = iter(table)
        while skip_num >= 0:
            row = next(it)
            skip_num -= 1
        table = petl.setheader(table, row)
        table = petl.tail(table, petl.nrows(table) - args.skip_num_rows)

    # Print nicely formatted stats for each column
    sep = ''
    args_dict = vars(args)
    skip_columns_specified = 'skip_columns' in args_dict
    sep_columns_specified = 'sep_columns' in args_dict
    sep_char_specified = 'sep_character' in args_dict
    for column in petl.header(table):
        if args.first_ccb_column is not None and column == args.first_ccb_column:
            column_prefix = 'CCB '
        if not skip_columns_specified or column not in args.skip_columns:
            output_str = column_prefix + "Column '" + column + "'"
            print sep + output_str
            print >> sys.stderr, output_str
            if sep_columns_specified and column in args.sep_columns:
                if sep_char_specified:
                    sep_character = args.sep_character
                else:
                    sep_character = ';'
                output_str = num_dict2str(dict_dump(sep_valuecounter(table, column, sep_character)))
                print output_str
            else:
                output_str = num_dict2str(dict_dump(valuecounts(table, column)))
                print output_str
            sep = '\n'

    # Flush to ensure all output is written
    sys.stdout.flush()
    sys.stderr.flush()
def match_columns(self, desired_columns, fuzzy_match=True, if_extra_columns='remove',
                  if_missing_columns='add'):
    """
    Changes the column names and ordering in this Table to match a list of desired
    column names.

    `Args:`
        desired_columns: list
            Ordered list of desired column names
        fuzzy_match: bool
            Whether to normalize column names when matching against the desired column
            names, removing whitespace and non-alphanumeric characters, and lowercasing
            everything. Eg. With this flag set, "FIRST NAME" would match "first_name".
            If the Table has two columns that normalize to the same string (eg.
            "FIRST NAME" and "first_name"), the latter will be considered an extra column.
        if_extra_columns: string
            If the Table has columns that don't match any desired columns, either
            'remove' them, 'ignore' them, or 'fail' (raising an error).
        if_missing_columns: string
            If the Table is missing some of the desired columns, either 'add' them
            (with a value of None), 'ignore' them, or 'fail' (raising an error).
    `Returns:`
        `Parsons Table` and also updates self
    """
    from parsons.etl import Table  # Just trying to avoid recursive imports.

    normalize_fn = Table.get_normalized_column_name if fuzzy_match else (lambda s: s)

    # Create a mapping of our "normalized" name to the original column name
    current_columns_normalized = {normalize_fn(col): col for col in self.columns}

    # Track any columns we need to add to our current table from our desired columns
    columns_to_add = []
    # We are going to do a "cut" later to trim our table and re-order the columns, but
    # we won't have renamed our columns yet, so we need to remember their un-normalized
    # form
    cut_columns = []
    # We are going to also rename our columns AFTER we cut, so we want to remember their
    # normalized names
    final_header = []

    # Loop through our desired columns -- the columns we want to see in our final table
    for desired_column in desired_columns:
        normalized_desired = normalize_fn(desired_column)
        # Try to find our desired column in our Table
        if normalized_desired not in current_columns_normalized:
            # If we can't find our desired column in our current columns, then it's "missing"
            if if_missing_columns == 'fail':
                # If our missing strategy is to fail, raise an exception
                raise TypeError(f"Table is missing column {desired_column}")
            elif if_missing_columns == 'add':
                # We have to add to our table
                columns_to_add.append(desired_column)
                # We will need to remember this column when we cut down to desired columns
                cut_columns.append(desired_column)
                # This will be in the final table
                final_header.append(desired_column)
            elif if_missing_columns != 'ignore':
                # If it's not ignore, add, or fail, then it's not a valid strategy
                raise TypeError(f"Invalid option {if_missing_columns} for "
                                "argument `if_missing_columns`")
        else:
            # We have found this in our current columns, so take it out of our list to search
            current_column = current_columns_normalized.pop(normalized_desired)
            # Add the column to our intermediate table as the old column name
            cut_columns.append(current_column)
            # Add to our final header list as the "desired" name
            final_header.append(desired_column)

    # Look for any "extra" columns from our current table that aren't in our desired columns
    for current_column in current_columns_normalized.values():
        # Figure out what to do with our "extra" columns
        if if_extra_columns == 'fail':
            # If our extra-column strategy is to fail, raise an exception
            raise TypeError(f"Table has extra column {current_column}")
        elif if_extra_columns == 'ignore':
            # If we're "ignore"ing our extra columns, we should keep them by adding them to
            # our intermediate and final columns list
            cut_columns.append(current_column)
            final_header.append(current_column)
        elif if_extra_columns != 'remove':
            # If it's not ignore, add, or fail, then it's not a valid strategy
            raise TypeError(f"Invalid option {if_extra_columns} for "
                            "argument `if_extra_columns`")

    # Add any columns we need to add
    for column in columns_to_add:
        self.table = petl.addfield(self.table, column, None)

    # Cut down to just the columns we care about
    self.table = petl.cut(self.table, *cut_columns)

    # Rename any columns
    self.table = petl.setheader(self.table, final_header)

    return self
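# --- Usage sketch (not from the original source) ---
# Hypothetical table illustrating the fuzzy matching, extra-column removal, and
# missing-column addition described in the docstring above.
from parsons import Table

tbl = Table([['FIRST NAME', 'Last_Name', 'Extra'],
             ['Jane', 'Doe', 'x']])

# "FIRST NAME" normalizes to "first_name"; "Extra" is removed; "email" is added with None
tbl.match_columns(['first_name', 'last_name', 'email'],
                  fuzzy_match=True,
                  if_extra_columns='remove',
                  if_missing_columns='add')

print(tbl.columns)  # ['first_name', 'last_name', 'email']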
def get_registration_report(self, report_id, block=False, poll_interval_seconds=60,
                            report_timeout_seconds=3600):
    """
    Get data from an existing registration report.

    `Args:`
        report_id: int
            The ID of the report to get data from
        block: bool
            Whether or not to block execution until the report is complete
        poll_interval_seconds: int
            If blocking, how long to pause between attempts to check if the report is done
        report_timeout_seconds: int
            If blocking, how long to wait for the report before timing out
    `Returns:`
        Parsons Table
            Parsons table with the report data.
    """
    logger.info(f"Getting report with id {report_id}...")
    credentials = {
        'partner_id': self.partner_id,
        'partner_API_key': self.partner_api_key,
    }
    status_url = f'registrant_reports/{report_id}'
    download_url = None

    # Let's figure out at what time we should just give up because we waited too long
    end_time = datetime.datetime.now() + datetime.timedelta(seconds=report_timeout_seconds)

    # If we have a download URL, we can move on and just download the report. Otherwise,
    # as long as we haven't run out of time, we will check the status.
    while not download_url and datetime.datetime.now() < end_time:
        logger.debug('Registrations report not ready yet, sleeping %s seconds',
                     poll_interval_seconds)

        # Check the status again via the status endpoint
        status_response = self.client.request(status_url, 'get', params=credentials)

        # Check to make sure the call got a valid response
        if status_response.status_code == requests.codes.ok:
            status_json = status_response.json()

            # Grab the download_url from the response.
            download_url = status_json.get('download_url')

            if not download_url and not block:
                return None
        else:
            raise RTVFailure("Couldn't get report status")

        if not download_url:
            # We just got the status, so we should wait a minute before we check it again.
            time.sleep(poll_interval_seconds)

    # If we never got a valid download_url, then we timed out waiting for the report
    # to generate. We will log an error and exit.
    if not download_url:
        raise RTVFailure('Timed out waiting for report')

    # Download the report data
    download_response = self.client.request(download_url, 'get', params=credentials)

    # Check to make sure the call got a valid response
    if download_response.status_code == requests.codes.ok:
        report_data = download_response.text

        # Load the report data into a Parsons Table
        table = Table.from_csv_string(report_data)

        # Transform the data from the report's CSV format to something more Pythonic
        # (snake case)
        normalized_column_names = [re.sub(r'\s', '_', name).lower() for name in table.columns]
        normalized_column_names = [re.sub(r'[^A-Za-z\d_]', '', name)
                                   for name in normalized_column_names]
        table.table = petl.setheader(table.table, normalized_column_names)

        return table
    else:
        raise RTVFailure('Unable to download report data')
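# --- Usage sketch (not from the original source) ---
# Assumes this method belongs to Parsons' RockTheVote connector and that the
# constructor accepts partner_id / partner_api_key; the credentials and report
# id below are hypothetical.
from parsons import RockTheVote

rtv = RockTheVote(partner_id='123', partner_api_key='abc123')

# Block until the report is ready (or the default one-hour timeout elapses),
# polling every 60 seconds, then return the normalized Parsons Table
report_table = rtv.get_registration_report(42, block=True, poll_interval_seconds=60)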
import petl as etl

table_header = ["Fixed Acidity", "Volatile Acidity", "Citric Acid", "Sugar", "Chlorides",
                "Free SO2", "Total SO2", "Density", "pH", "Sulfates", "Alcohol", "Quality"]

table1 = etl.addfield(
    etl.convertnumbers(
        etl.setheader(etl.fromcsv('winequality-red.csv'), table_header)),
    "Type", "Red")
table2 = etl.addfield(
    etl.convertnumbers(
        etl.setheader(etl.fromcsv('winequality-white.csv'), table_header)),
    "Type", "White")

#print(etl.head(table1))
#print(etl.head(table2))

table1_filtered = etl.select(table1, "Quality", lambda v: v > 6)
table2_filtered = etl.select(table2, "Quality", lambda v: v > 4)

good_wines = etl.cat(table1_filtered, table2_filtered)

good_wines_enhanced = etl.addfields(
    good_wines,
    [("Max Acidity", lambda rec: rec["Fixed Acidity"] + rec["Volatile Acidity"]),
     ("Locked SO2", lambda rec: rec["Total SO2"] - rec["Free SO2"])])

#print(etl.head(good_wines_enhanced))
#print(etl.tail(good_wines_enhanced))
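# --- Possible final step (not in the original snippet) ---
# tocsv evaluates the lazy petl pipeline built above and writes the combined,
# enhanced table to disk; the output filename is hypothetical.
etl.tocsv(good_wines_enhanced, 'good_wines.csv')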