Example #1
def save_characters_to_file(generated_file_path, characters_pages):
    etl.setheader(
        etl.empty(),
        settings.STAR_WARS_CHARACTERS_OUTPUT_FILE_HEADER_FIELDS,
    ).tocsv(generated_file_path)
    logger.info('Created file: %s', generated_file_path)
    for characters_page in characters_pages:
        etl.appendcsv(
            characters_page,
            generated_file_path,
            write_header=False,
        )
        logger.info('Added data to file: %s', generated_file_path)
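For context, a minimal sketch of how this function might be driven. The pages here are illustrative petl tables sharing one header; settings and logger are assumed to be configured where the function is defined.

import petl as etl

characters_pages = [
    etl.wrap([['name', 'height'], ['Luke Skywalker', '172']]),
    etl.wrap([['name', 'height'], ['C-3PO', '167']]),
]

# Writes the configured header once, then appends each page's data rows
# (appendcsv skips each page's own header row).
save_characters_to_file('characters.csv', characters_pages)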
Example #2
    def alter_varchar_column_widths(self, tbl, table_name):
        """
        Alter the widths of varchar columns in a Redshift table to match the
        widths of the corresponding Parsons table columns. The columns are
        matched by column name and not their index.

        `Args:`
            tbl: obj
                A Parsons table
            table_name: str
                The target table name (e.g. ``my_schema.my_table``)
        `Returns:`
            ``None``
        """

        # Make the Parsons table column names match valid Redshift names
        tbl.table = petl.setheader(tbl.table, self.column_name_validate(tbl.columns))

        # Create a list of column names and max width for string values.
        pc = {c: tbl.get_column_max_width(c) for c in tbl.columns}

        # Determine the max width of the varchar columns in the Redshift table
        s, t = self.split_full_table_name(table_name)
        cols = self.get_columns(s, t)
        rc = {k: v['max_length'] for k, v in cols.items() if v['data_type'] == 'character varying'} # noqa: E501, E261

        # Figure out if any of the destination table varchar columns are smaller than the
        # associated Parsons table columns. If they are, then alter column types to expand
        # their width.
        for c in set(rc.keys()).intersection(set(pc.keys())):
            if rc[c] < pc[c]:
                logger.info(f'{c} not wide enough. Expanding column width.')
                self.alter_table_column_type(table_name, c, 'varchar', varchar_width=pc[c])
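A hedged usage sketch for the method above, assuming a Parsons Redshift connector configured via environment variables and an existing target table; all names are illustrative:

from parsons import Redshift, Table

rs = Redshift()  # credentials are read from the environment
tbl = Table([['id', 'notes'], [1, 'a string longer than the current column']])

# Widen any varchar columns in my_schema.my_table that are narrower
# than the matching Parsons table columns.
rs.alter_varchar_column_widths(tbl, 'my_schema.my_table')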
Example #3
    def create_statement(self, tbl, table_name, padding=None, distkey=None, sortkey=None,
                         varchar_max=None, varchar_truncate=True, columntypes=None):
        # Generate a table create statement

        # Validate and rename column names if needed
        tbl.table = petl.setheader(tbl.table, self.column_name_validate(tbl.columns))

        if tbl.num_rows == 0:
            raise ValueError('Table is empty. Must have 1 or more rows.')

        mapping = self.generate_data_types(tbl)

        if padding:
            mapping['longest'] = self.vc_padding(mapping, padding)

        if varchar_max:
            mapping['longest'] = self.vc_max(mapping, varchar_max)

        if varchar_truncate:
            mapping['longest'] = self.vc_trunc(mapping)

        mapping['longest'] = self.vc_validate(mapping)

        # Add any provided column type overrides
        if columntypes:
            for i in range(len(mapping['headers'])):
                col = mapping['headers'][i]
                if columntypes.get(col):
                    mapping['type_list'][i] = columntypes[col]

        # Enclose in quotes
        mapping['headers'] = ['"{}"'.format(h) for h in mapping['headers']]

        return self.create_sql(table_name, mapping, distkey=distkey, sortkey=sortkey)
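The columntypes argument is the escape hatch when the inferred types are wrong. A sketch, assuming db is an instance of the connector class that defines create_statement; the column names are hypothetical:

from parsons import Table

tbl = Table([['id', 'created_at'], [1, '2020-01-01']])

# Override the inferred types for these two columns
sql = db.create_statement(
    tbl,
    'my_schema.people',
    columntypes={'id': 'bigint', 'created_at': 'timestamp'},
)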
Example #4
    def create_statement(self, tbl, table_name, strict_length=True):
        # Generate create statement SQL for a given Parsons table.

        # Validate and rename column names if needed
        tbl.table = petl.setheader(tbl.table,
                                   self.columns_convert(tbl.columns))

        # Generate the table map
        table_map = self.evaluate_table(tbl)

        # Generate the column syntax
        column_syntax = []
        for c in table_map:
            if strict_length:
                col_width = int(c['width'] + (self.VARCHAR_PAD * c['width']))
            else:
                col_width = self.round_longest(c['width'])

            if c['type'] == 'varchar':
                column_syntax.append(
                    f"{c['name']} {c['type']}({col_width}) \n")
            else:
                column_syntax.append(f"{c['name']} {c['type']} \n")

        # Generate full statement
        return f"CREATE TABLE {table_name} ( \n {','.join(column_syntax)});"
Example #5
    def set_header(self, new_header):
        """
        Replace the header row of the table.

        `Args:`
            new_header: list
                List of new header column names
        `Returns:`
            `Parsons Table` and also updates self
        """
        self.table = petl.setheader(self.table, new_header)
        return self
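A quick usage sketch; a Parsons Table accepts a list of lists, so the behavior mirrors petl.setheader directly:

from parsons import Table

tbl = Table([['a', 'b'], [1, 2]])
tbl.set_header(['x', 'y'])
print(tbl.columns)  # ['x', 'y']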
Example #6
    def create_statement(self,
                         tbl,
                         table_name,
                         padding=None,
                         distkey=None,
                         sortkey=None,
                         varchar_max=None,
                         varchar_truncate=True,
                         columntypes=None,
                         strict_length=True):
        # Generate a table create statement. Distkeys and sortkeys are only used by
        # Redshift and should not be passed when generating a create statement for
        # Postgres.

        if tbl.num_rows == 0:
            raise ValueError('Table is empty. Must have 1 or more rows.')

        # Validate and rename column names if needed
        tbl.table = petl.setheader(tbl.table,
                                   self.column_name_validate(tbl.columns))

        mapping = self.generate_data_types(tbl)

        if padding:
            mapping['longest'] = self.vc_padding(mapping, padding)
        elif not strict_length:
            mapping['longest'] = self.vc_step(mapping)

        if varchar_max:
            mapping['longest'] = self.vc_max(mapping, varchar_max)

        if varchar_truncate:
            mapping['longest'] = self.vc_trunc(mapping)

        mapping['longest'] = self.vc_validate(mapping)

        # Add any provided column type overrides
        if columntypes:
            for i in range(len(mapping['headers'])):
                col = mapping['headers'][i]
                if columntypes.get(col):
                    mapping['type_list'][i] = columntypes[col]

        # Enclose in quotes
        mapping['headers'] = [f'"{h}"' for h in mapping['headers']]

        return self.create_sql(table_name,
                               mapping,
                               distkey=distkey,
                               sortkey=sortkey)
Example #7
from petl import recordmapmany, look

def rowgenerator(rec):
    # table1 is assumed defined above with 'id', 'age', 'weight' and 'height' fields
    yield [rec['id'], 'age_months', rec['age'] * 12]
    yield [rec['id'], 'bmi', rec['weight'] / rec['height'] ** 2]

table2 = recordmapmany(table1, rowgenerator, fields=['subject_id', 'variable', 'value'])
look(table2)


# setheader

table1 = [['foo', 'bar'],
          ['a', 1],
          ['b', 2]]

from petl import setheader, look
look(table1)
table2 = setheader(table1, ['foofoo', 'barbar'])
look(table2)


# extendheader

table1 = [['foo'],
          ['a', 1, True],
          ['b', 2, False]]

from petl import extendheader, look
look(table1)
table2 = extendheader(table1, ['bar', 'baz'])
look(table2)
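To make the difference concrete: setheader replaces the entire header row, while extendheader appends field names to the existing one. petl's header() shows the resulting fields:

from petl import extendheader, header, setheader

print(header(setheader([['foo', 'bar'], ['a', 1]], ['foofoo', 'barbar'])))
# ('foofoo', 'barbar')
print(header(extendheader([['foo'], ['a', 1, True]], ['bar', 'baz'])))
# ('foo', 'bar', 'baz')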

Example #9
def main(argv):

    parser = argparse.ArgumentParser()
    parser.add_argument("--input-csv-filename",
                        required=True,
                        help="Input UTF8 CSV to summarize")
    parser.add_argument(
        "--sep-columns",
        required=False,
        nargs='*',
        default=argparse.SUPPRESS,
        help=
        "Column names of columns containing comma- or semi-colon-separated values"
    )
    parser.add_argument("--sep-character", required=False, help="Character used to separate values in multi-value " \
        "fields.  Defaults to ';' if not specified.")
    parser.add_argument("--skip-columns",
                        required=False,
                        nargs='*',
                        default=argparse.SUPPRESS,
                        help="Column names to NOT generate stats for")
    parser.add_argument("--skip-num-rows",
                        required=False,
                        type=int,
                        help="Skip specified number "
                        "of header rows")
    parser.add_argument(
        "--first-ccb-column",
        required=False,
        help="String name of first CCB column.  If "
        "specified, all preceeding columns will be labeled 'Servant Keeper' and this column "
        "and all subsequent will be labeled 'CCB'")
    args = parser.parse_args()

    if args.first_ccb_column is not None:
        column_prefix = 'Servant Keeper '
    else:
        column_prefix = ''

    assert os.path.isfile(
        args.input_csv_filename
    ), "Error: cannot open file '" + args.input_csv_filename + "'"

    table = petl.fromcsv(args.input_csv_filename)

    # Skip header rows
    if args.skip_num_rows:
        skip_num = args.skip_num_rows
        assert skip_num > 0, "--skip-num-rows value '" + str(
            skip_num) + "' is invalid.  Must be positive."
        it = iter(table)
        while skip_num >= 0:
            row = next(it)
            skip_num -= 1
        table = petl.setheader(table, row)
        table = petl.tail(table, petl.nrows(table) - args.skip_num_rows)

    # Print nicely formatted stats for each column
    sep = ''
    args_dict = vars(args)
    skip_columns_specified = 'skip_columns' in args_dict
    sep_columns_specified = 'sep_columns' in args_dict
    sep_char_specified = 'sep_character' in args_dict
    for column in petl.header(table):
        if args.first_ccb_column is not None and column == args.first_ccb_column:
            column_prefix = 'CCB '
        if not skip_columns_specified or column not in args.skip_columns:
            output_str = column_prefix + "Column '" + column + "'"
            print(sep + output_str)
            print(output_str, file=sys.stderr)
            if sep_columns_specified and column in args.sep_columns:
                if sep_char_specified:
                    sep_character = args.sep_character
                else:
                    sep_character = ';'
                output_str = num_dict2str(
                    dict_dump(sep_valuecounter(table, column, sep_character)))
                print(output_str)
            else:
                output_str = num_dict2str(dict_dump(valuecounts(table,
                                                                column)))
                print(output_str)
        sep = '\n'

    # Flush to ensure all output is written
    sys.stdout.flush()
    sys.stderr.flush()
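The hand-rolled iterator loop above advances past the unwanted rows manually; if your petl version provides skip(), the same effect can be sketched more directly (skip() drops the first n raw rows, header included, and the next row becomes the header):

import petl

def skip_header_rows(table, n):
    # Roughly equivalent to the while-loop + setheader + tail sequence above
    return petl.skip(table, n)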
Example #10
    def match_columns(self,
                      desired_columns,
                      fuzzy_match=True,
                      if_extra_columns='remove',
                      if_missing_columns='add'):
        """
        Changes the column names and ordering in this Table to match a list of desired column
        names.

        `Args:`
            desired_columns: list
                Ordered list of desired column names
            fuzzy_match: bool
                Whether to normalize column names when matching against the desired column names,
                removing whitespace and non-alphanumeric characters, and lowercasing everything.
                E.g. with this flag set, "FIRST NAME" would match "first_name".
                If the Table has two columns that normalize to the same string (e.g. "FIRST NAME"
                and "first_name"), the latter will be considered an extra column.
            if_extra_columns: string
                If the Table has columns that don't match any desired columns, either 'remove'
                them, 'ignore' them, or 'fail' (raising an error).
            if_missing_columns: string
                If the Table is missing some of the desired columns, either 'add' them (with a
                value of None), 'ignore' them, or 'fail' (raising an error).

        `Returns:`
            `Parsons Table` and also updates self
        """

        from parsons.etl import Table  # Just trying to avoid recursive imports.

        normalize_fn = Table.get_normalized_column_name if fuzzy_match else (
            lambda s: s)

        # Create a mapping of our "normalized" name to the original column name
        current_columns_normalized = {
            normalize_fn(col): col
            for col in self.columns
        }

        # Track any columns we need to add to our current table from our desired columns
        columns_to_add = []
        # We are going to do a "cut" later to trim our table and re-order the columns, but
        # we won't have renamed our columns yet, so we need to remember their un-normalized
        # form
        cut_columns = []
        # We are going to also rename our columns AFTER we cut, so we want to remember their
        # normalized names
        final_header = []

        # Loop through our desired columns -- the columns we want to see in our final table
        for desired_column in desired_columns:
            normalized_desired = normalize_fn(desired_column)
            # Try to find our desired column in our Table
            if normalized_desired not in current_columns_normalized:
                # If we can't find our desired column in our current columns, then it's "missing"
                if if_missing_columns == 'fail':
                    # If our missing strategy is to fail, raise an exception
                    raise TypeError(
                        f"Table is missing column {desired_column}")
                elif if_missing_columns == 'add':
                    # We have to add to our table
                    columns_to_add.append(desired_column)
                    # We will need to remember this column when we cut down to desired columns
                    cut_columns.append(desired_column)
                    # This will be in the final table
                    final_header.append(desired_column)
                elif if_missing_columns != 'ignore':
                    # If it's not ignore, add, or fail, then it's not a valid strategy
                    raise TypeError(f"Invalid option {if_missing_columns} for "
                                    "argument `if_missing_columns`")
            else:
                # We have found this in our current columns, so take it out of our list to search
                current_column = current_columns_normalized.pop(
                    normalized_desired)
                # Add the column to our intermediate table as the old column name
                cut_columns.append(current_column)
                # Add to our final header list as the "desired" name
                final_header.append(desired_column)

        # Look for any "extra" columns from our current table that aren't in our desired columns
        for current_column in current_columns_normalized.values():
            # Figure out what to do with our "extra" columns
            if if_extra_columns == 'fail':
                # If our extra-columns strategy is to fail, raise an exception
                raise TypeError(f"Table has extra column {current_column}")
            elif if_extra_columns == 'ignore':
                # If we're "ignore"ing our extra columns, we should keep them by adding them to
                # our intermediate and final columns list
                cut_columns.append(current_column)
                final_header.append(current_column)
            elif if_extra_columns != 'remove':
                # If it's not remove, ignore, or fail, then it's not a valid strategy
                raise TypeError(f"Invalid option {if_extra_columns} for "
                                "argument `if_extra_columns`")

        # Add any columns we need to add
        for column in columns_to_add:
            self.table = petl.addfield(self.table, column, None)

        # Cut down to just the columns we care about
        self.table = petl.cut(self.table, *cut_columns)

        # Rename any columns
        self.table = petl.setheader(self.table, final_header)

        return self
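A short usage sketch of match_columns with the defaults (fuzzy matching on, extra columns removed, missing columns added as None); the data is illustrative:

from parsons import Table

tbl = Table([['FIRST NAME', 'Extra'], ['Jane', 'x']])
tbl.match_columns(['first_name', 'last_name'])
print(tbl.columns)  # ['first_name', 'last_name']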
Example #11
    def get_registration_report(self,
                                report_id,
                                block=False,
                                poll_interval_seconds=60,
                                report_timeout_seconds=3600):
        """
        Get data from an existing registration report.

        `Args:`
            report_id: int
                The ID of the report to get data from
            block: bool
                Whether or not to block execution until the report is complete
            poll_interval_seconds: int
                If blocking, how long to pause between attempts to check if the report is done
            report_timeout_seconds: int
                If blocking, how long to wait for the report before timing out
        `Returns:`
            Parsons Table
                Parsons table with the report data.
        """
        logger.info(f"Getting report with id {report_id}...")
        credentials = {
            'partner_id': self.partner_id,
            'partner_API_key': self.partner_api_key,
        }
        status_url = f'registrant_reports/{report_id}'
        download_url = None

        # Figure out the time at which we should give up because we've
        # waited too long
        end_time = datetime.datetime.now() + datetime.timedelta(
            seconds=report_timeout_seconds)

        # If we have a download URL, we can move on and just download the
        # report. Otherwise, as long as we haven't run out of time, we will
        # check the status.
        while not download_url and datetime.datetime.now() < end_time:
            logger.debug(
                'Registration report not ready yet, sleeping %s seconds',
                poll_interval_seconds)

            # Check the status again via the status endpoint
            status_response = self.client.request(status_url,
                                                  'get',
                                                  params=credentials)

            # Check to make sure the call got a valid response
            if status_response.status_code == requests.codes.ok:
                status_json = status_response.json()

                # Grab the download_url from the response.
                download_url = status_json.get('download_url')

                if not download_url and not block:
                    return None
            else:
                raise RTVFailure("Couldn't get report status")

            if not download_url:
                # We just checked the status, so wait out the poll interval
                # before checking again.
                time.sleep(poll_interval_seconds)

        # If we never got a valid download_url, then we timed out waiting for
        # the report to generate. We will log an error and exit.
        if not download_url:
            raise RTVFailure('Timed out waiting for report')

        # Download the report data
        download_response = self.client.request(download_url,
                                                'get',
                                                params=credentials)

        # Check to make sure the call got a valid response
        if download_response.status_code == requests.codes.ok:
            report_data = download_response.text

            # Load the report data into a Parsons Table
            table = Table.from_csv_string(report_data)

            # Transform the data from the report's CSV format to something more
            # Pythonic (snake case)
            normalized_column_names = [
                re.sub(r'\s', '_', name).lower() for name in table.columns
            ]
            normalized_column_names = [
                re.sub(r'[^A-Za-z\d_]', '', name)
                for name in normalized_column_names
            ]
            table.table = petl.setheader(table.table, normalized_column_names)
            return table
        else:
            raise RTVFailure('Unable to download report data')
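The two-step column-name normalization near the end is easy to verify in isolation:

import re

names = ['First Name', 'Zip/Postal Code']
snake = [re.sub(r'\s', '_', n).lower() for n in names]
snake = [re.sub(r'[^A-Za-z\d_]', '', n) for n in snake]
print(snake)  # ['first_name', 'zippostal_code']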
Example #12
import petl as etl

table_header = [
    "Fixed Acidity", "Volatile Acidity", "Citric Acid", "Sugar", "Chlorides",
    "Free SO2", "Total SO2", "Density", "pH", "Sulfates", "Alcohol", "Quality"
]

table1 = etl.addfield(
    etl.convertnumbers(
        etl.setheader(etl.fromcsv('winequality-red.csv'), table_header)),
    "Type", "Red")
table2 = etl.addfield(
    etl.convertnumbers(
        etl.setheader(etl.fromcsv('winequality-white.csv'), table_header)),
    "Type", "White")

#print(etl.head(table1))
#print(etl.head(table2))

table1_filtered = etl.select(table1, "Quality", lambda v: v > 6)
table2_filtered = etl.select(table2, "Quality", lambda v: v > 4)

good_wines = etl.cat(table1_filtered, table2_filtered)

good_wines_enhanced = etl.addfields(
    good_wines,
    [("Max Acidity",
      lambda rec: rec["Fixed Acidity"] + rec["Volatile Acidity"]),
     ("Locked SO2", lambda rec: rec["Total SO2"] - rec["Free SO2"])])
#print(etl.head(good_wines_enhanced))
#print(etl.tail(good_wines_enhanced))
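Note that petl pipelines are lazy: nothing above reads the CSVs until a sink or inspection call runs. To materialize the combined table (the output path is illustrative):

etl.tocsv(good_wines_enhanced, 'good_wines.csv')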
Example #13
def main(argv):

    parser = argparse.ArgumentParser()
    parser.add_argument("--input-csv-filename", required=True, help="Input UTF8 CSV to summarize")
    parser.add_argument("--sep-columns", required=False, nargs = '*', default=argparse.SUPPRESS,
        help="Column names of columns containing comma- or semi-colon-separated values")
    parser.add_argument("--sep-character", required=False, help="Character used to separate values in multi-value " \
        "fields.  Defaults to ';' if not specified.")
    parser.add_argument("--skip-columns", required=False, nargs='*', default=argparse.SUPPRESS,
        help="Column names to NOT generate stats for")
    parser.add_argument("--skip-num-rows", required=False, type=int, help="Skip specified number "
        "of header rows")
    parser.add_argument("--first-ccb-column", required=False, help="String name of first CCB column.  If "
        "specified, all preceeding columns will be labeled 'Servant Keeper' and this column "
        "and all subsequent will be labeled 'CCB'")
    args = parser.parse_args()

    if args.first_ccb_column is not None:
        column_prefix = 'Servant Keeper '
    else:
        column_prefix = ''

    assert os.path.isfile(args.input_csv_filename), "Error: cannot open file '" + args.input_csv_filename + "'"

    table = petl.fromcsv(args.input_csv_filename)

    # Skip header rows
    if args.skip_num_rows:
        skip_num = args.skip_num_rows
        assert skip_num > 0, "--skip-num-rows value '" + str(skip_num) + "' is invalid.  Must be positive."
        it = iter(table)
        while skip_num >= 0:
            row = next(it)
            skip_num -= 1
        table = petl.setheader(table, row)
        table = petl.tail(table, petl.nrows(table) - args.skip_num_rows)

    # Print nicely formatted stats for each column
    sep = ''
    args_dict = vars(args)
    skip_columns_specified = 'skip_columns' in args_dict
    sep_columns_specified = 'sep_columns' in args_dict
    sep_char_specified = 'sep_character' in args_dict
    for column in petl.header(table):
        if args.first_ccb_column is not None and column == args.first_ccb_column:
            column_prefix = 'CCB '
        if not skip_columns_specified or column not in args.skip_columns:
            output_str = column_prefix + "Column '" + column + "'"
            print(sep + output_str)
            print(output_str, file=sys.stderr)
            if sep_columns_specified and column in args.sep_columns:
                if sep_char_specified:
                    sep_character = args.sep_character
                else:
                    sep_character = ';'
                output_str = num_dict2str(dict_dump(sep_valuecounter(table, column, sep_character)))
                print(output_str)
            else:
                output_str = num_dict2str(dict_dump(valuecounts(table, column)))
                print(output_str)
        sep = '\n'

    # Flush to ensure all output is written
    sys.stdout.flush()
    sys.stderr.flush()