def test_determine_output_date_order_style_datetime(self):
     unhandled_hints = set()
     # Records Mover only supports Postgres in ISO format at this
     # point (YYYY-MM-DD) - see comments in types.py and in
     # date_output_style.py for more detail.
     expected_failures = {
         'MM-DD-YYYY',
         'DD-MM-YYYY',
         'MM/DD/YY',
         'DD/MM/YY',
         'DD-MM-YY',
     }
     for dateformat in DATE_CASES:
         records_format = DelimitedRecordsFormat(
             hints={
                 'dateformat': dateformat,
                 'timeonlyformat': 'HH24:MI:SS',
                 'datetimeformattz': f'{dateformat} HH:MI:SSOF',
                 'datetimeformat': f'{dateformat} HH24:MI:SS'
             })
         fail_if_cant_handle_hint = True
         validated_hints =\
             records_format.validate(fail_if_cant_handle_hint=fail_if_cant_handle_hint)
         try:
             out = determine_date_output_style(unhandled_hints,
                                               validated_hints,
                                               fail_if_cant_handle_hint)
         except NotImplementedError:
             if dateformat in expected_failures:
                 pass
             else:
                 raise
         self.assertEqual(out, ('ISO', None))
 def test_determine_output_date_order_style_datetimeformat(self):
     unhandled_hints = set()
     # Records Mover only supports Postgres in ISO format at this
     # point (YYYY-MM-DD) - see comments in types.py and in
     # date_output_style.py for more detail.
     expected_failures = {
         # no timezone, even though otherwise in ISO format
         'YYYY-MM-DD HH:MI:SS',
         # not in ISO format
         'MM/DD/YY HH24:MI',
         # not in ISO format
         'YYYY-MM-DD HH12:MI AM',
     }
     natural_dateformat = {
         'YYYY-MM-DD HH:MI:SS': 'YYYY-MM-DD',
         'MM/DD/YY HH24:MI': 'MM/DD/YY',
         'YYYY-MM-DD HH24:MI:SS': 'YYYY-MM-DD',
         'YYYY-MM-DD HH12:MI AM': 'YYYY-MM-DD',
     }
     natural_timeonlyformat = {
         'YYYY-MM-DD HH:MI:SS': 'HH:MI:SS',
         'MM/DD/YY HH24:MI': 'HH24:MI',
         'YYYY-MM-DD HH24:MI:SS': 'HH24:MI:SS',
         'YYYY-MM-DD HH12:MI AM': 'HH12:MI AM',
     }
     natural_datetimeformattz = {
         'YYYY-MM-DD HH:MI:SS': 'YYYY-MM-DD HH:MI:SSOF',
         'MM/DD/YY HH24:MI': 'MM/DD/YY HH24:MIOF',
         'YYYY-MM-DD HH24:MI:SS': 'YYYY-MM-DD HH24:MI:SSOF',
         'YYYY-MM-DD HH12:MI AM': 'YYYY-MM-DD HH12:MI AM'
     }
     for datetimeformat in DATETIME_CASES:
         records_format = DelimitedRecordsFormat(
             hints={
                 'dateformat': natural_dateformat[datetimeformat],
                 'timeonlyformat': natural_timeonlyformat[datetimeformat],
                 'datetimeformattz':
                 natural_datetimeformattz[datetimeformat],
                 'datetimeformat': datetimeformat,
             })
         fail_if_cant_handle_hint = True
         validated_hints =\
             records_format.validate(fail_if_cant_handle_hint=fail_if_cant_handle_hint)
         try:
             out = determine_date_output_style(unhandled_hints,
                                               validated_hints,
                                               fail_if_cant_handle_hint)
         except NotImplementedError:
             if datetimeformat in expected_failures:
                 pass
             else:
                 raise
         self.assertEqual(out, ('ISO', None))
    def test_postgres_copy_options_csv_no_quoting(self):
        records_format = DelimitedRecordsFormat(variant='csv',
                                                hints={
                                                    'quoting': None,
                                                    'compression': None,
                                                })
        unhandled_hints = set(records_format.hints)
        fail_if_cant_handle_hint = True

        with self.assertRaises(NotImplementedError):
            postgres_copy_options_csv(
                unhandled_hints,
                records_format.validate(fail_if_cant_handle_hint=True),
                fail_if_cant_handle_hint, CopyOptionsMode.UNLOADING)
    def test_determine_output_date_order_style_iso(self):
        unhandled_hints = set()
        records_format = DelimitedRecordsFormat(hints={
            'dateformat': 'YYYY-MM-DD',
            'timeonlyformat': 'HH24:MI:SS',
            'datetimeformattz': 'YYYY-MM-DD HH:MI:SSOF',
            'datetimeformat': 'YYYY-MM-DD HH24:MI:SS'
        })
        fail_if_cant_handle_hint = True
        validated_hints = records_format.validate(fail_if_cant_handle_hint=fail_if_cant_handle_hint)

        out = determine_date_output_style(unhandled_hints,
                                          validated_hints,
                                          fail_if_cant_handle_hint)
        self.assertEqual(out, ('ISO', None))
Exemplo n.º 5
0
    def test_redshift_copy_options_encodings(self):
        tests = {
            'UTF16': Encoding.utf16,
            'UTF16LE': Encoding.utf16le,
            'UTF16BE': Encoding.utf16be
        }
        for hint_spelling, redshift_sqlalchemy_spelling in tests.items():

            records_format =\
                DelimitedRecordsFormat(variant='bluelabs',
                                       hints={
                                           'encoding': hint_spelling
                                       })
            unhandled_hints = set(records_format.hints.keys())
            out = redshift_copy_options(
                unhandled_hints,
                records_format.validate(fail_if_cant_handle_hint=True),
                fail_if_cant_handle_hint=True,
                fail_if_row_invalid=True,
                max_failure_rows=0)
            self.assertIs(out['encoding'], redshift_sqlalchemy_spelling)
    def test_postgres_copy_options_csv_minimal_quoting(self):
        records_format = DelimitedRecordsFormat(variant='csv',
                                                hints={
                                                    'quoting': 'minimal',
                                                    'compression': None,
                                                })
        unhandled_hints = set(records_format.hints)
        fail_if_cant_handle_hint = True
        mode = CopyOptionsMode.UNLOADING
        hints = records_format.validate(fail_if_cant_handle_hint=True)

        out = postgres_copy_options_csv(unhandled_hints, hints,
                                        fail_if_cant_handle_hint, mode)
        self.assertEqual(
            out, {
                'format': 'csv',
                'quote': '"',
                'delimiter': ',',
                'encoding': 'UTF8',
                'format': 'csv',
                'header': True,
            })
 def test_determine_output_date_order_style_timeonlyformat(self):
     unhandled_hints = set()
     # Records Mover only supports Postgres in ISO format at this
     # point (HH24:MI:SS aka HH:MI:SS) - see comments in types.py and in
     # date_output_style.py for more detail.
     expected_failures = {
         # only HH:MM:SS/HH24MM:SS are supported via the 'ISO'
         # format output
         'HH12:MI AM'
     }
     for timeonlyformat in TIMEONLY_CASES:
         if 'AM' in timeonlyformat:
             datetimeformattz = f"YYYY-MM-DD {timeonlyformat}"
         else:
             datetimeformattz = f"YYYY-MM-DD {timeonlyformat}OF"
         records_format = DelimitedRecordsFormat(
             hints={
                 'dateformat': "YYYY-MM-DD",
                 'timeonlyformat': timeonlyformat,
                 'datetimeformattz': datetimeformattz,
                 'datetimeformat': f"YYYY-MM-DD {timeonlyformat}",
             })
         fail_if_cant_handle_hint = True
         validated_hints =\
             records_format.validate(fail_if_cant_handle_hint=fail_if_cant_handle_hint)
         out = None
         try:
             out = determine_date_output_style(unhandled_hints,
                                               validated_hints,
                                               fail_if_cant_handle_hint)
         except NotImplementedError:
             if timeonlyformat in expected_failures:
                 pass
             else:
                 raise
         if out is not None:
             self.assertEqual(out, ('ISO', None))
def mysql_load_options(unhandled_hints: Set[str],
                       records_format: DelimitedRecordsFormat,
                       fail_if_cant_handle_hint: bool) -> MySqlLoadOptions:
    hints = records_format.validate(fail_if_cant_handle_hint=fail_if_cant_handle_hint)

    #
    # The server uses the character set indicated by the
    # character_set_database system variable to interpret the
    # information in the file. SET NAMES and the setting of
    # character_set_client do not affect interpretation of input. If
    # the contents of the input file use a character set that differs
    # from the default, it is usually preferable to specify the
    # character set of the file by using the CHARACTER SET clause. A
    # character set of binary specifies “no conversion.”
    #
    hint_encoding: HintEncoding = hints.encoding
    character_set = MYSQL_CHARACTER_SETS_FOR_LOAD.get(hint_encoding)
    if character_set is not None:
        mysql_character_set = character_set
    else:
        cant_handle_hint(fail_if_cant_handle_hint, 'encoding', records_format.hints)
        mysql_character_set = 'utf8'
    quiet_remove(unhandled_hints, 'encoding')

    field_terminator: HintFieldDelimiter = hints.field_delimiter
    mysql_fields_terminator = field_terminator
    quiet_remove(unhandled_hints, 'field-delimiter')

    # https://dev.mysql.com/doc/refman/8.0/en/load-data.html
    #
    #
    # LOAD DATA can be used to read files obtained from external
    # sources. For example, many programs can export data in
    # comma-separated values (CSV) format, such that lines have fields
    # separated by commas and enclosed within double quotation marks,
    # with an initial line of column names. If the lines in such a
    # file are terminated by carriage return/newline pairs, the
    # statement shown here illustrates the field- and line-handling
    # options you would use to load the file:
    #
    # LOAD DATA INFILE 'data.txt' INTO TABLE tbl_name
    #  FIELDS TERMINATED BY ',' ENCLOSED BY '"'
    #  LINES TERMINATED BY '\r\n'
    #  IGNORE 1 LINES;
    #
    #
    mysql_fields_enclosed_by = None
    mysql_fields_optionally_enclosed_by = None
    hint_quotechar: HintQuoteChar = hints.quotechar
    hint_quoting: HintQuoting = hints.quoting
    if hint_quoting == 'all':
        mysql_fields_enclosed_by = hint_quotechar
    elif hint_quoting == 'minimal':
        # "If the input values are not necessarily enclosed within
        # quotation marks, use OPTIONALLY before the ENCLOSED BY option."
        #
        # This implies to me that parsing here is permissive -
        # otherwise unambiguous strings without double quotes around
        # them will be understood as a string, not rejected.
        mysql_fields_optionally_enclosed_by = hint_quotechar
    elif hint_quoting == 'nonnumeric':
        mysql_fields_optionally_enclosed_by = hint_quotechar
    elif hint_quoting is None:
        pass
    else:
        _assert_never(hint_quotechar)
    quiet_remove(unhandled_hints, 'quoting')
    quiet_remove(unhandled_hints, 'quotechar')

    # If the field begins with the ENCLOSED BY character, instances of
    # that character are recognized as terminating a field value only
    # if followed by the field or line TERMINATED BY sequence. To
    # avoid ambiguity, occurrences of the ENCLOSED BY character within
    # a field value can be doubled and are interpreted as a single
    # instance of the character. For example, if ENCLOSED BY '"' is
    # specified, quotation marks are handled as shown here:
    hint_doublequote: HintDoublequote = hints.doublequote
    if hint_quoting is not None:
        # We need to ignore flake8's "is vs ==" check because 'is'
        # doesn't work currently with MyPy's Literal[] case checking
        if hint_doublequote == True:  # noqa: E712
            pass
        elif hint_doublequote == False:  # noqa: E712
            cant_handle_hint(fail_if_cant_handle_hint, 'doublequote', records_format.hints)
        else:
            _assert_never(hint_doublequote)
    quiet_remove(unhandled_hints, 'doublequote')

    # FIELDS ESCAPED BY controls how to read or write special characters:
    #
    # * For input, if the FIELDS ESCAPED BY character is not empty,
    #   occurrences of that character are stripped and the following
    #   character is taken literally as part of a field value. Some
    #   two-character sequences that are exceptions, where the first
    #   character is the escape character.
    #
    # [...]
    #
    # If the FIELDS ESCAPED BY character is empty, escape-sequence
    # interpretation does not occur.
    hint_escape: HintEscape = hints.escape
    if hint_escape is None:
        mysql_fields_escaped_by = None
    elif hint_escape == '\\':
        mysql_fields_escaped_by = '\\'
    else:
        _assert_never(hint_quoting)
    quiet_remove(unhandled_hints, 'escape')

    mysql_lines_starting_by = ''

    hint_record_terminator: HintRecordTerminator = hints.record_terminator
    mysql_lines_terminated_by = hint_record_terminator
    quiet_remove(unhandled_hints, 'record-terminator')

    hint_header_row: HintHeaderRow = hints.header_row
    # We need to ignore flake8's "is vs ==" check because 'is'
    # doesn't work currently with MyPy's Literal[] case checking
    if hint_header_row == True:  # noqa: E712
        mysql_ignore_n_lines = 1
    elif hint_header_row == False:  # noqa: E712
        mysql_ignore_n_lines = 0
    else:
        _assert_never(hint_header_row)
    quiet_remove(unhandled_hints, 'header-row')

    hint_compression: HintCompression = hints.compression
    if hint_compression is not None:
        cant_handle_hint(fail_if_cant_handle_hint, 'compression', records_format.hints)
    quiet_remove(unhandled_hints, 'compression')

    #
    # Date/time parsing in MySQL seems to be permissive enough to
    # accept all formats we've sent at it in integration tests.  That
    # said, DD/MM support is unlikely to work if the server isn't set
    # to a compatible locale.  This is true for a number of the
    # database drivers; the backlog item to address is here:
    #
    # https://github.com/bluelabsio/records-mover/issues/75
    #
    # To address, we'll want to look into "set trade_date" per
    # https://stackoverflow.com/questions/44171283/load-data-local-infile-with-sqlalchemy-and-pymysql
    #
    quiet_remove(unhandled_hints, 'dateformat')
    quiet_remove(unhandled_hints, 'timeonlyformat')
    quiet_remove(unhandled_hints, 'datetimeformat')
    quiet_remove(unhandled_hints, 'datetimeformattz')

    return MySqlLoadOptions(character_set=mysql_character_set,
                            fields_terminated_by=mysql_fields_terminator,
                            fields_enclosed_by=mysql_fields_enclosed_by,
                            fields_optionally_enclosed_by=mysql_fields_optionally_enclosed_by,
                            fields_escaped_by=mysql_fields_escaped_by,
                            lines_starting_by=mysql_lines_starting_by,
                            lines_terminated_by=mysql_lines_terminated_by,
                            ignore_n_lines=mysql_ignore_n_lines)
 def test_valid(self):
     variants = list(get_args(DelimitedVariant))  # type: ignore
     for variant in variants:
         records_format = DelimitedRecordsFormat(variant=variant)
         records_format.validate(fail_if_cant_handle_hint=True)
 def test_invalid_dateformattz(self):
     records_format = DelimitedRecordsFormat(
         variant='bluelabs', hints={'datetimeformattz': 'invalid'})
     with self.assertRaises(NotImplementedError):
         records_format.validate(fail_if_cant_handle_hint=True)