Пример #1
0
    def cast(self, field_type: 'FieldType') -> 'RecordsSchemaFieldConstraints':
        """Downcast these generic constraints to the subclass for field_type.

        Type-specific attributes (e.g. min_/max_ for integers) are left
        unset; only required/unique carry over from this instance.
        """
        # Imported locally to avoid circular imports between constraint modules.
        from .integer import RecordsSchemaFieldIntegerConstraints
        from .decimal import RecordsSchemaFieldDecimalConstraints
        from .string import RecordsSchemaFieldStringConstraints

        result: RecordsSchemaFieldConstraints
        if field_type == 'integer':
            result = RecordsSchemaFieldIntegerConstraints(required=self.required,
                                                          unique=self.unique,
                                                          min_=None,
                                                          max_=None)
        elif field_type == 'string':
            result = RecordsSchemaFieldStringConstraints(required=self.required,
                                                         unique=self.unique,
                                                         max_length_bytes=None,
                                                         max_length_chars=None)
        elif field_type == 'decimal':
            result = RecordsSchemaFieldDecimalConstraints(required=self.required,
                                                          unique=self.unique)
        elif field_type in ('boolean', 'date', 'time', 'timetz',
                            'datetime', 'datetimetz'):
            # These types have no constraints beyond the generic ones.
            result = RecordsSchemaFieldConstraints(required=self.required,
                                                   unique=self.unique)
        else:
            _assert_never(
                field_type, 'Teach me how to downcast constraints '
                f'for {field_type}')

        return result
Пример #2
0
def rewound_decompressed_fileobj(
        fileobj: IO[bytes],
        compression: HintCompression) -> Iterator[IO[bytes]]:
    """Yield a binary file object positioned at the start of the
    decompressed data described by `compression`.

    Intended for use as a context manager (generator with a single yield).

    :param fileobj: seekable binary stream to inspect.
    :param compression: records compression hint (None, 'GZIP', 'LZO', 'BZIP').
    :raises NotImplementedError: for LZO, which we can't decompress yet.
    """
    with rewound_fileobj(fileobj) as fileobj_after_rewind:
        if compression is None:
            # Fix: yield the rewound handle, as every other branch does --
            # yielding the raw `fileobj` bypassed the rewind guarantee
            # established by rewound_fileobj() above.
            yield fileobj_after_rewind
        elif compression == 'GZIP':
            yield gzip.GzipFile(mode='rb',
                                fileobj=fileobj_after_rewind)  # type: ignore
        elif compression == 'LZO':
            # This might be useful to implement this:
            #  https://github.com/ir193/python-lzo/blob/master/lzo.py#L44
            raise NotImplementedError(
                'Records mover does not currently know how '
                'to decompress LZO files for inspection')
        elif compression == 'BZIP':
            yield bz2.BZ2File(mode='rb', filename=fileobj_after_rewind)
        else:
            _assert_never(compression)
def add_load_job_csv_config(unhandled_hints: Set[str],
                            hints: ValidatedRecordsHints,
                            fail_if_cant_handle_hint: bool,
                            config: bigquery.LoadJobConfig) -> None:
    """Translate validated records hints into a BigQuery CSV LoadJobConfig.

    Hints BigQuery can honor are applied to `config`; hints it can't are
    reported via cant_handle_hint().  Every hint considered is removed
    from `unhandled_hints`.
    """
    # source_format: File format of the data.
    config.source_format = 'CSV'

    # encoding: BigQuery supports only UTF-8 or ISO-8859-1; records hints
    # currently only express UTF8.
    if hints.encoding == 'UTF8':
        config.encoding = 'UTF-8'
    else:
        cant_handle_hint(fail_if_cant_handle_hint, 'encoding', hints)
    quiet_remove(unhandled_hints, 'encoding')

    # field_delimiter: The separator for fields in a CSV file.
    assert isinstance(hints.field_delimiter, str)
    config.field_delimiter = hints.field_delimiter
    quiet_remove(unhandled_hints, 'field-delimiter')

    # allow_jagged_rows / null_marker: left at BigQuery defaults.

    # quote_character: BigQuery converts the string to ISO-8859-1 and uses
    # the first byte to split quoted sections; default is '"'.  Observed
    # behavior with the default quote character:
    # * quoted fields are loaded without the surrounding quotes
    # * "" becomes " inside a quoted field, stays "" outside one
    # * nonnumeric and full quoting both work fine
    if hints.quoting is None:
        config.quote_character = ''
    elif hints.quoting in ('all', 'minimal', 'nonnumeric'):
        # Quoted data may legitimately contain newline characters.
        config.allow_quoted_newlines = True

        assert isinstance(hints.quotechar, str)
        config.quote_character = hints.quotechar
        if not hints.doublequote:
            cant_handle_hint(fail_if_cant_handle_hint, 'doublequote', hints)
    else:
        _assert_never(hints.quoting)
    quiet_remove(unhandled_hints, 'quoting')
    quiet_remove(unhandled_hints, 'quotechar')
    quiet_remove(unhandled_hints, 'doublequote')

    # BigQuery documentation doesn't mention escaping, and in practice
    # backslashes come through without being interpreted.
    if hints.escape is not None:
        cant_handle_hint(fail_if_cant_handle_hint, 'escape', hints)
    quiet_remove(unhandled_hints, 'escape')

    # skip_leading_rows: Number of rows to skip when reading data (CSV only).
    config.skip_leading_rows = 1 if hints.header_row else 0
    quiet_remove(unhandled_hints, 'header-row')

    # "When you load CSV or JSON data, values in DATE columns must use the
    #  dash (-) separator and the date must be in the following format:
    #  YYYY-MM-DD (year-month-day)."
    if hints.dateformat != 'YYYY-MM-DD':
        cant_handle_hint(fail_if_cant_handle_hint, 'dateformat', hints)
    quiet_remove(unhandled_hints, 'dateformat')

    # Timestamp columns require a dash-separated YYYY-MM-DD date portion and
    # a colon-separated hh:mm:ss time portion.  Tested empirically via the
    # BigQuery console (e.g. SELECT TIMESTAMP("...")):
    # * DATETIME accepts only 'YYYY-MM-DD HH:MM:SS' and the 'T'-separated
    #   ISO form; AM/PM, named zones and bare offsets are rejected.
    # * TIMESTAMP additionally accepts ISO 8601 offsets with a colon
    #   ('...+00:00', '...-08') and a trailing ' UTC'/'UTC'; '+0000',
    #   'EST' and 'US/Eastern' suffixes are rejected.
    # * TIME accepts only 'HH:MM:SS'.
    #
    # Note Python's strftime %z can't emit the colon in the offset, so
    # Python exporters should use 'YYYY-MM-DD HH24:MI:SS' in UTC.
    # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#examples
    # https://stackoverflow.com/questions/47466296/bigquery-datetime-format-csv-to-bigquery-yyyy-mm-dd-hhmmss-ssssss
    # https://stackoverflow.com/questions/44836581/does-python-time-strftime-process-timezone-options-correctly-for-rfc-3339
    # https://stackoverflow.com/questions/28729212/pandas-save-date-in-iso-format
    if hints.datetimeformat not in ('YYYY-MM-DD HH24:MI:SS',
                                    'YYYY-MM-DD HH:MI:SS'):
        cant_handle_hint(fail_if_cant_handle_hint, 'datetimeformat', hints)
    quiet_remove(unhandled_hints, 'datetimeformat')

    if hints.datetimeformattz not in ('YYYY-MM-DD HH:MI:SSOF',
                                      'YYYY-MM-DD HH24:MI:SSOF',
                                      'YYYY-MM-DD HH:MI:SS'):
        cant_handle_hint(fail_if_cant_handle_hint, 'datetimeformattz', hints)
    quiet_remove(unhandled_hints, 'datetimeformattz')

    if hints.timeonlyformat not in ('HH24:MI:SS', 'HH:MI:SS'):
        cant_handle_hint(fail_if_cant_handle_hint, 'timeonlyformat', hints)
    quiet_remove(unhandled_hints, 'timeonlyformat')

    # No option to configure newlines, but unix, dos and mac newlines were
    # all understood in testing.
    if hints.record_terminator not in ('\n', '\r\n', '\r', None):
        cant_handle_hint(fail_if_cant_handle_hint, 'record-terminator', hints)
    quiet_remove(unhandled_hints, 'record-terminator')

    # No way to flag compression, but tested: uncompressed and gzip work
    # great; .bz2 gives "400 Unsupported compression type"; .lzo untested
    # but pandas can't produce it regardless.
    if hints.compression not in (None, 'GZIP'):
        cant_handle_hint(fail_if_cant_handle_hint, 'compression', hints)
    quiet_remove(unhandled_hints, 'compression')
Пример #4
0
def redshift_unload_options(
        unhandled_hints: Set[str], records_format: BaseRecordsFormat,
        fail_if_cant_handle_hint: bool) -> RedshiftUnloadOptions:
    """Build Redshift UNLOAD options from a records format.

    Only Parquet and delimited formats are supported; delimited hints that
    UNLOAD can't honor are reported via cant_handle_hint().
    """
    unload_options: RedshiftUnloadOptions = {}

    if isinstance(records_format, ParquetRecordsFormat):
        unload_options['format'] = Format.parquet
        return unload_options
    if not isinstance(records_format, DelimitedRecordsFormat):
        raise NotImplementedError(
            "Redshift export only supported via Parquet and "
            "delimited currently")

    hints = records_format.validate(
        fail_if_cant_handle_hint=fail_if_cant_handle_hint)

    if hints.escape == '\\':
        unload_options['escape'] = True
    elif hints.escape is not None:
        _assert_never(hints.escape)
    quiet_remove(unhandled_hints, 'escape')

    unload_options['delimiter'] = hints.field_delimiter
    quiet_remove(unhandled_hints, 'field-delimiter')

    # "\n" is Redshift's one and only export record terminator.
    if hints.record_terminator != "\n":
        cant_handle_hint(fail_if_cant_handle_hint, 'record-terminator', hints)
    quiet_remove(unhandled_hints, 'record-terminator')

    if hints.quoting == 'all':
        # ADDQUOTES always uses '"' and never doubles embedded quotes.
        if hints.doublequote is not False:
            cant_handle_hint(fail_if_cant_handle_hint, 'doublequote', hints)
        if hints.quotechar != '"':
            cant_handle_hint(fail_if_cant_handle_hint, 'quotechar', hints)
        unload_options['add_quotes'] = True
    elif hints.quoting is None:
        unload_options['add_quotes'] = False
    else:
        cant_handle_hint(fail_if_cant_handle_hint, 'quoting', hints)
    quiet_remove(unhandled_hints, 'quoting')
    quiet_remove(unhandled_hints, 'doublequote')
    quiet_remove(unhandled_hints, 'quotechar')

    if hints.compression == 'GZIP':
        unload_options['gzip'] = True
    elif hints.compression is not None:
        cant_handle_hint(fail_if_cant_handle_hint, 'compression', hints)
    quiet_remove(unhandled_hints, 'compression')

    if hints.encoding != 'UTF8':
        cant_handle_hint(fail_if_cant_handle_hint, 'encoding', hints)
    quiet_remove(unhandled_hints, 'encoding')

    if hints.datetimeformattz not in ('YYYY-MM-DD HH:MI:SSOF',
                                      'YYYY-MM-DD HH24:MI:SSOF'):
        cant_handle_hint(fail_if_cant_handle_hint, 'datetimeformattz', hints)
    quiet_remove(unhandled_hints, 'datetimeformattz')

    if hints.datetimeformat not in ('YYYY-MM-DD HH24:MI:SS',
                                    'YYYY-MM-DD HH:MI:SS'):
        cant_handle_hint(fail_if_cant_handle_hint, 'datetimeformat', hints)
    quiet_remove(unhandled_hints, 'datetimeformat')

    if hints.dateformat != 'YYYY-MM-DD':
        cant_handle_hint(fail_if_cant_handle_hint, 'dateformat', hints)
    quiet_remove(unhandled_hints, 'dateformat')

    # Redshift doesn't have a time-without-date type, so there's nothing
    # that could be exported there.
    quiet_remove(unhandled_hints, 'timeonlyformat')

    if hints.header_row:
        # Redshift unload doesn't know how to add headers on an unload.
        # https://stackoverflow.com/questions/24681214/unloading-from-redshift-to-s3-ith-headers
        cant_handle_hint(fail_if_cant_handle_hint, 'header-row', hints)
    else:
        quiet_remove(unhandled_hints, 'header-row')

    return unload_options
Пример #5
0
def redshift_copy_options(
        unhandled_hints: Set[str], records_format: BaseRecordsFormat,
        fail_if_cant_handle_hint: bool, fail_if_row_invalid: bool,
        max_failure_rows: Optional[int]) -> RedshiftCopyOptions:
    """Build Redshift COPY options from a records format.

    Supports Avro directly; otherwise requires a delimited format, whose
    hints are mapped to COPY parameters (unsupported hints are reported
    via cant_handle_hint()).
    """
    copy_options: RedshiftCopyOptions = {}

    if isinstance(records_format, AvroRecordsFormat):
        copy_options['format'] = Format.avro
        return copy_options
    if not isinstance(records_format, DelimitedRecordsFormat):
        raise NotImplementedError(f"Teach me how to COPY to {records_format}")

    hints = records_format.validate(
        fail_if_cant_handle_hint=fail_if_cant_handle_hint)

    # Compression hints map one-to-one onto COPY's compression choices.
    compression_for_hint = {
        'GZIP': Compression.gzip,
        'LZO': Compression.lzop,
        'BZIP': Compression.bzip2,
        None: None,
    }
    if hints.compression in compression_for_hint:
        copy_options['compression'] = compression_for_hint[hints.compression]
    else:
        _assert_never(hints.compression)
    quiet_remove(unhandled_hints, 'compression')

    copy_options['date_format'] = \
        'auto' if hints.dateformat is None else hints.dateformat
    quiet_remove(unhandled_hints, 'dateformat')

    encoding_for_hint = {
        'UTF8': Encoding.utf8,
        'UTF16': Encoding.utf16,
        'UTF16LE': Encoding.utf16le,
        'UTF16BE': Encoding.utf16be,
    }
    if hints.encoding in encoding_for_hint:
        copy_options['encoding'] = encoding_for_hint[hints.encoding]
    else:
        cant_handle_hint(fail_if_cant_handle_hint, 'encoding', hints)
        # Best effort: let the Encoding enum reject it if truly unknown.
        copy_options['encoding'] = Encoding(hints.encoding)
    quiet_remove(unhandled_hints, 'encoding')

    copy_options['quote'] = hints.quotechar
    quiet_remove(unhandled_hints, 'quotechar')

    if hints.quoting == 'minimal':
        # COPY's CSV mode fixes escape/delimiter/doublequote behavior.
        if hints.escape is not None:
            cant_handle_hint(fail_if_cant_handle_hint, 'escape', hints)
        if hints.field_delimiter != ',':
            cant_handle_hint(fail_if_cant_handle_hint, 'field-delimiter',
                             hints)
        if hints.doublequote is not True:
            cant_handle_hint(fail_if_cant_handle_hint, 'doublequote', hints)

        copy_options['format'] = Format.csv
    else:
        copy_options['delimiter'] = hints.field_delimiter
        if hints.escape == '\\':
            copy_options['escape'] = True
        elif hints.escape is None:
            copy_options['escape'] = False
        else:
            _assert_never(hints.escape)
        if hints.quoting == 'all':
            copy_options['remove_quotes'] = True
            if hints.doublequote is not False:
                cant_handle_hint(fail_if_cant_handle_hint, 'doublequote',
                                 hints)
        elif hints.quoting is None:
            copy_options['remove_quotes'] = False
        else:
            cant_handle_hint(fail_if_cant_handle_hint, 'quoting', hints)
    quiet_remove(unhandled_hints, 'quoting')
    quiet_remove(unhandled_hints, 'escape')
    quiet_remove(unhandled_hints, 'field-delimiter')
    quiet_remove(unhandled_hints, 'doublequote')

    if hints.datetimeformat is None:
        copy_options['time_format'] = 'auto'
    elif hints.datetimeformat != hints.datetimeformattz:
        # Redshift's TIMEFORMAT string can't express a timezone offset, and
        # in testing it silently drops the offset when loading into a
        # timestamptz column with an explicit format.  Its automatic parser
        # is smarter and handled every format we tried (e.g.
        # '2018-01-01 12:34:56', '01/02/18 15:34', '2018-01-02 10:34 PM'),
        # so fall back to 'auto' when the tz/non-tz hints diverge:
        # https://docs.aws.amazon.com/redshift/latest/dg/automatic-recognition.html
        copy_options['time_format'] = 'auto'
    else:
        copy_options['time_format'] = hints.datetimeformat
    quiet_remove(unhandled_hints, 'datetimeformat')
    quiet_remove(unhandled_hints, 'datetimeformattz')

    # Redshift doesn't support time-only fields, so these will come in as
    # strings regardless.
    quiet_remove(unhandled_hints, 'timeonlyformat')

    if max_failure_rows is not None:
        copy_options['max_error'] = max_failure_rows
    elif fail_if_row_invalid:
        copy_options['max_error'] = 0
    else:
        # max allowed value
        copy_options['max_error'] = 100000

    if hints.record_terminator not in (None, "\n"):
        cant_handle_hint(fail_if_cant_handle_hint, 'record-terminator', hints)
    quiet_remove(unhandled_hints, 'record-terminator')

    copy_options['ignore_header'] = 1 if hints.header_row else 0
    quiet_remove(unhandled_hints, 'header-row')

    return copy_options
Пример #6
0
def pandas_to_csv_options(
        records_format: DelimitedRecordsFormat, unhandled_hints: Set[str],
        processing_instructions: ProcessingInstructions) -> Dict[str, object]:
    """Map delimited-records hints onto pandas DataFrame.to_csv() kwargs.

    https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_csv.html
    """
    fail_if_cant_handle_hint = processing_instructions.fail_if_cant_handle_hint
    hints = records_format.validate(
        fail_if_cant_handle_hint=fail_if_cant_handle_hint)

    pandas_options: Dict[str, object] = {}

    pandas_options['encoding'] = hints.encoding
    quiet_remove(unhandled_hints, 'encoding')

    # No 'compression' key means pandas' default, an uncompressed csv.
    if hints.compression == 'GZIP':
        pandas_options['compression'] = 'gzip'
    elif hints.compression == 'BZIP':
        pandas_options['compression'] = 'bz2'
    elif hints.compression is not None:
        cant_handle_hint(fail_if_cant_handle_hint, 'compression', hints)
    quiet_remove(unhandled_hints, 'compression')

    quoting_for_hint = {
        None: csv.QUOTE_NONE,
        'all': csv.QUOTE_ALL,
        'minimal': csv.QUOTE_MINIMAL,
        'nonnumeric': csv.QUOTE_NONNUMERIC,
    }
    if hints.quoting in quoting_for_hint:
        pandas_options['quoting'] = quoting_for_hint[hints.quoting]
    else:
        _assert_never(hints.quoting)
    quiet_remove(unhandled_hints, 'quoting')

    pandas_options['doublequote'] = hints.doublequote
    quiet_remove(unhandled_hints, 'doublequote')
    pandas_options['quotechar'] = hints.quotechar
    quiet_remove(unhandled_hints, 'quotechar')

    if hints.escape is not None:
        pandas_options['escapechar'] = hints.escape
    quiet_remove(unhandled_hints, 'escape')

    pandas_options['header'] = hints.header_row
    quiet_remove(unhandled_hints, 'header-row')

    # strftime date component for each supported dateformat hint; None
    # (unspecified) gets the ISO default.
    date_part_for_hint = {
        None: '%Y-%m-%d',
        'YYYY-MM-DD': '%Y-%m-%d',
        'MM-DD-YYYY': '%m-%d-%Y',
        'DD-MM-YYYY': '%d-%m-%Y',
        'MM/DD/YY': '%m/%d/%y',
    }
    if hints.dateformat in date_part_for_hint:
        date_format = date_part_for_hint[hints.dateformat] + ' %H:%M:%S.%f'
        if hints.datetimeformattz != hints.datetimeformat:
            date_format += '%z'
        # else: when the tz and non-tz hints match, we omit %z entirely.
        # BigQuery requires timezone offsets with a colon, which Python's
        # strftime (and thus Pandas) can't emit; unfortunately dropping the
        # timezone also drops the offset instead of converting to UTC,
        # corrupting times BigQuery then assumes are UTC:
        # https://github.com/bluelabsio/records-mover/issues/95
        pandas_options['date_format'] = date_format
    else:
        cant_handle_hint(fail_if_cant_handle_hint, 'dateformat', hints)
    quiet_remove(unhandled_hints, 'dateformat')

    # pandas can't seem to export a date and time format separately :(
    # might be nice someday to only emit the errors if the actual data
    # being moved is affected by whatever limitation...
    acceptable_tz_formats = tuple(
        f"{hints.dateformat} {time_part}"
        for time_part in ('HH24:MI:SSOF', 'HH:MI:SSOF',
                          'HH24:MI:SS', 'HH:MI:SS',
                          'HH:MIOF', 'HH:MI',
                          'HH24:MIOF', 'HH24:MI'))
    if hints.datetimeformattz not in acceptable_tz_formats:
        cant_handle_hint(fail_if_cant_handle_hint, 'datetimeformattz', hints)
    quiet_remove(unhandled_hints, 'datetimeformattz')

    acceptable_datetime_formats = tuple(
        f"{hints.dateformat} {time_part}"
        for time_part in ('HH24:MI:SS', 'HH:MI:SS', 'HH24:MI', 'HH:MI'))
    if hints.datetimeformat not in acceptable_datetime_formats:
        cant_handle_hint(fail_if_cant_handle_hint, 'datetimeformat', hints)
    quiet_remove(unhandled_hints, 'datetimeformat')

    if hints.timeonlyformat != 'HH24:MI:SS':
        cant_handle_hint(fail_if_cant_handle_hint, 'timeonlyformat', hints)
    quiet_remove(unhandled_hints, 'timeonlyformat')

    pandas_options['sep'] = hints.field_delimiter
    quiet_remove(unhandled_hints, 'field-delimiter')

    pandas_options['line_terminator'] = hints.record_terminator
    quiet_remove(unhandled_hints, 'record-terminator')

    return pandas_options
Пример #7
0
def pandas_to_csv_options(records_format: DelimitedRecordsFormat,
                          unhandled_hints: Set[str],
                          processing_instructions: ProcessingInstructions) -> Dict[str, object]:
    """Map delimited-records hints onto pandas DataFrame.to_csv() kwargs.

    https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_csv.html
    """
    fail_if_cant_handle_hint = processing_instructions.fail_if_cant_handle_hint
    hints = records_format.validate(
        fail_if_cant_handle_hint=fail_if_cant_handle_hint)

    pandas_options: Dict[str, object] = {}

    pandas_options['encoding'] = hints.encoding
    quiet_remove(unhandled_hints, 'encoding')

    # Leaving 'compression' unset produces pandas' default, an
    # uncompressed csv.
    if hints.compression == 'GZIP':
        pandas_options['compression'] = 'gzip'
    elif hints.compression == 'BZIP':
        pandas_options['compression'] = 'bz2'
    elif hints.compression is not None:
        cant_handle_hint(fail_if_cant_handle_hint, 'compression', hints)
    quiet_remove(unhandled_hints, 'compression')

    quoting_for_hint = {
        None: csv.QUOTE_NONE,
        'all': csv.QUOTE_ALL,
        'minimal': csv.QUOTE_MINIMAL,
        'nonnumeric': csv.QUOTE_NONNUMERIC,
    }
    if hints.quoting in quoting_for_hint:
        pandas_options['quoting'] = quoting_for_hint[hints.quoting]
    else:
        _assert_never(hints.quoting)
    quiet_remove(unhandled_hints, 'quoting')

    pandas_options['doublequote'] = hints.doublequote
    quiet_remove(unhandled_hints, 'doublequote')
    pandas_options['quotechar'] = hints.quotechar
    quiet_remove(unhandled_hints, 'quotechar')

    if hints.escape is not None:
        pandas_options['escapechar'] = hints.escape
    quiet_remove(unhandled_hints, 'escape')

    pandas_options['header'] = hints.header_row
    quiet_remove(unhandled_hints, 'header-row')

    # Pandas exposes a single date_format parameter for Timestamp values,
    # so the datetimeformat and datetimeformattz hints must be nearly
    # identical - modulo the trailing timezone offset, which only appears
    # when the source data carries one anyway.  Note also the limitation
    # on Pandas export with BigQuery around datetimeformattz:
    # https://github.com/bluelabsio/records-mover/issues/95
    tz_format = hints.datetimeformattz.replace('HH24', 'HH')
    plain_format = hints.datetimeformat.replace('HH24', 'HH')
    if tz_format not in (plain_format, f"{plain_format}OF"):
        cant_handle_hint(fail_if_cant_handle_hint, 'datetimeformat', hints)

    # An AM/PM marker implies 12-hour clock for the bare 'HH' token.
    hour_code = '%I' if 'AM' in hints.datetimeformattz else '%H'

    # Hint-token -> strftime-code translations, applied in order ('HH24'
    # before 'HH', 'YYYY' before 'YY', etc.).
    token_translations = (
        ('YYYY', '%Y'),
        ('YY', '%y'),
        ('MM', '%m'),
        ('DD', '%d'),
        ('HH24', '%H'),
        ('HH12', '%I'),
        ('HH', hour_code),
        ('MI', '%M'),
        ('SS', '%S.%f'),
        ('OF', '%z'),
        ('AM', '%p'),
    )
    date_format = hints.datetimeformattz
    for hint_token, strftime_code in token_translations:
        date_format = date_format.replace(hint_token, strftime_code)
    pandas_options['date_format'] = date_format
    quiet_remove(unhandled_hints, 'datetimeformat')
    quiet_remove(unhandled_hints, 'datetimeformattz')

    # timeonlyformat and dateformat are handled in prep_for_csv.py and
    # raw times and dates never appear in dataframes passed a
    # .to_csv() call.
    quiet_remove(unhandled_hints, 'timeonlyformat')
    quiet_remove(unhandled_hints, 'dateformat')

    pandas_options['sep'] = hints.field_delimiter
    quiet_remove(unhandled_hints, 'field-delimiter')

    pandas_options['line_terminator'] = hints.record_terminator
    quiet_remove(unhandled_hints, 'record-terminator')

    return pandas_options