def load(self, hints, fail_if):
    """Drive the Redshift loader with the given hints, using one strictness
    flag for all three fail_if_* processing-instruction knobs."""
    instructions = ProcessingInstructions()
    instructions.fail_if_cant_handle_hint = fail_if
    instructions.fail_if_dont_understand = fail_if
    instructions.fail_if_row_invalid = fail_if
    self.mock_records_load_plan.records_format =\
        DelimitedRecordsFormat(hints=hints)
    self.mock_records_load_plan.processing_instructions = instructions
    loader = self.redshift_db_driver.loader()
    return loader.load(schema='myschema',
                       table='mytable',
                       load_plan=self.mock_records_load_plan,
                       directory=self.mock_directory)
def test_vertica_format_permissive(self):
    """The 'vertica' variant maps to the expected COPY options when invalid
    rows are tolerated, and every hint gets consumed."""
    fmt = DelimitedRecordsFormat(variant='vertica')
    instructions = ProcessingInstructions(fail_if_row_invalid=False)
    plan = RecordsLoadPlan(processing_instructions=instructions,
                           records_format=fmt)
    remaining_hints = set(plan.records_format.hints)
    options = vertica_import_options(remaining_hints, plan)
    self.assertDictEqual(options, {
        'abort_on_error': False,
        'delimiter': '\x01',
        'enclosed_by': None,
        'enforcelength': False,
        'error_tolerance': True,
        'escape_as': None,
        'gzip': False,
        'load_method': 'AUTO',
        'no_commit': False,
        'null_as': None,
        'record_terminator': '\x02',
        'rejectmax': None,
        'skip': 0,
        'trailing_nullcols': True,
    })
    self.assertEqual(remaining_hints, set())
def test_datetimeformat(self):
    """Verify which datetimeformat hints BigQuery load config accepts.

    Double check this before adding anything else in here to see if it
    has changed, but YYYY-MM-DD HH:MI:SS, YYYY-MM-DD HH24:MI:SS and
    YYYY-MM-DD HH:MI:SS are the only formats accepted by BigQuery as of
    this writing.
    """
    # Formats expected to be rejected; anything not listed is expected
    # to be handled without raising.
    should_raise = {
        'YYYY-MM-DD HH12:MI AM': True,
        'MM/DD/YY HH24:MI': True,
    }
    processing_instructions = ProcessingInstructions(
        fail_if_dont_understand=True,
        fail_if_cant_handle_hint=True,
        fail_if_row_invalid=True)
    for datetimeformat in DATETIME_CASES:
        records_format =\
            DelimitedRecordsFormat(variant='bigquery',
                                   hints={
                                       'datetimeformat': datetimeformat
                                   })
        load_plan = RecordsLoadPlan(
            processing_instructions=processing_instructions,
            records_format=records_format)
        unhandled_hints = set(records_format.hints.keys())
        try:
            load_job_config(unhandled_hints, load_plan)
        except NotImplementedError:
            # .get() instead of [] so an unexpected rejection of an
            # unlisted format re-raises instead of KeyError-ing.
            if not should_raise.get(datetimeformat, False):
                raise
        else:
            # Previously the test passed silently when an expected
            # NotImplementedError never arrived; make that a failure.
            self.assertFalse(
                should_raise.get(datetimeformat, False),
                "Expected NotImplementedError for %r" % datetimeformat)
def test_dataframe_to_int64_and_back_to_object_produces_int_columns(self) -> None:
    # This reproduces a situation found when a user worked around
    # a separate historical Records Mover limitation by doing an
    # unusual cast on their dataframe...and then hit a separate
    # limitation:
    #
    # https://github.com/bluelabsio/records-mover/pull/103
    data = {'Population': [11190846, 1303171035, 207847528]}
    df = DataFrame(data, columns=['Population'])
    # Nullable Int64 -> object cast; schema inference should still
    # produce an integer column for Redshift.
    df['Population'] = df['Population'].astype("Int64")
    df['Population'] = df['Population'].astype("object")
    source = DataframesRecordsSource(dfs=[df])
    processing_instructions = ProcessingInstructions()
    schema = source.initial_records_schema(processing_instructions)
    dialect = RedshiftDialect()
    mock_engine = Mock(name='engine')
    mock_engine.dialect = dialect
    driver = RedshiftDBDriver(db=mock_engine)
    schema_sql = schema.to_schema_sql(driver=driver,
                                      schema_name='my_schema_name',
                                      table_name='my_table_name')
    # NOTE(review): exact newline layout of this literal reconstructed
    # from a whitespace-mangled source -- confirm against CI output.
    expected_schema_sql = """
CREATE TABLE my_schema_name.my_table_name (
\t"Population" INTEGER
)
"""
    self.assertEqual(schema_sql, expected_schema_sql)
def test_christmas_tree_format_1_permissive(self):
    """Oddball hint combo 1: unsupported hints are logged as warnings
    rather than failing when fail_if_cant_handle_hint is off."""
    fmt = DelimitedRecordsFormat(variant='dumb',
                                 hints=christmas_tree_format_1_hints)
    instructions = ProcessingInstructions(fail_if_cant_handle_hint=False)
    plan = RecordsLoadPlan(processing_instructions=instructions,
                           records_format=fmt)
    remaining_hints = set(plan.records_format.hints)
    with patch.object(driver_logger, 'warning') as mock_warning:
        options = vertica_import_options(remaining_hints, plan)
    expected = {
        'abort_on_error': True,
        'delimiter': '\x01',
        'enforcelength': True,
        'error_tolerance': False,
        'escape_as': '\\',
        'load_method': 'AUTO',
        'no_commit': False,
        'null_as': None,
        'record_terminator': '\x02',
        'rejectmax': 1,
        'skip': 1,
        'trailing_nullcols': False,
    }
    self.assertDictEqual(options, expected)
    self.assertListEqual(mock_warning.mock_calls,
                         [call("Ignoring hint compression = 'LZO'"),
                          call("Ignoring hint quoting = 'nonnumeric'")])
    self.assertEqual(remaining_hints, set())
def test_pandas_read_csv_options_vertica(self):
    """Vertica-style hints translate to the expected read_csv kwargs."""
    self.maxDiff = None
    instructions = ProcessingInstructions()
    fmt = DelimitedRecordsFormat(hints=vertica_format_hints)
    remaining_hints = set(fmt.hints)
    actual = pandas_read_csv_options(fmt,
                                     self.records_schema,
                                     remaining_hints,
                                     instructions)
    expected = {
        'dayfirst': False,
        'compression': None,
        'delimiter': '\x01',
        'doublequote': False,
        'engine': 'c',
        'error_bad_lines': True,
        'header': None,
        'lineterminator': '\x02',
        'prefix': 'untitled_',
        'quotechar': '"',
        'quoting': 3,
        'warn_bad_lines': True,
        'parse_dates': [0, 1, 2, 3],
    }
    self.assertEqual(expected, actual)
    self.assertFalse(remaining_hints)
def test_pandas_read_csv_options_bluelabs(self):
    """Bluelabs-style hints translate to the expected read_csv kwargs."""
    instructions = ProcessingInstructions()
    fmt = DelimitedRecordsFormat(hints=bluelabs_format_hints)
    remaining_hints = set(fmt.hints)
    actual = pandas_read_csv_options(fmt,
                                     self.records_schema,
                                     remaining_hints,
                                     instructions)
    expected = {
        'dayfirst': False,
        'compression': 'gzip',
        'delimiter': ',',
        'doublequote': False,
        'encoding': 'UTF8',
        'engine': 'python',
        'error_bad_lines': True,
        'escapechar': '\\',
        'header': None,
        'prefix': 'untitled_',
        'quotechar': '"',
        'quoting': 3,
        'warn_bad_lines': True,
        'parse_dates': [0, 1, 2, 3],
    }
    self.assertEqual(expected, actual)
    self.assertFalse(remaining_hints)
def test_timeonlyformat(self):
    """Verify which timeonlyformat hints BigQuery load config accepts.

    Double check this before adding anything else in here to see if it
    has changed, but HH:MI:SS is the only format accepted by BigQuery
    as of this writing.
    """
    # Whether each known format is expected to raise NotImplementedError.
    should_raise = {
        'HH:MI:SS': False,
        'HH24:MI:SS': False,
        'HH12:MI AM': True,
    }
    processing_instructions = ProcessingInstructions(
        fail_if_dont_understand=True,
        fail_if_cant_handle_hint=True,
        fail_if_row_invalid=True)
    for timeonlyformat in TIMEONLY_CASES:
        records_format =\
            DelimitedRecordsFormat(variant='bigquery',
                                   hints={
                                       'timeonlyformat': timeonlyformat,
                                   })
        load_plan = RecordsLoadPlan(
            processing_instructions=processing_instructions,
            records_format=records_format)
        unhandled_hints = set(records_format.hints.keys())
        try:
            load_job_config(unhandled_hints, load_plan)
        except NotImplementedError:
            # .get() instead of [] so an unexpected rejection of an
            # unlisted format re-raises instead of KeyError-ing.
            if not should_raise.get(timeonlyformat, False):
                raise
        else:
            # Previously the test passed silently when an expected
            # NotImplementedError never arrived; make that a failure.
            self.assertFalse(
                should_raise.get(timeonlyformat, False),
                "Expected NotImplementedError for %r" % timeonlyformat)
def test_load_job_config_permissive(self):
    """With fail_if_row_invalid off, the BigQuery job config tolerates
    bad records via a large maxBadRecords."""
    fmt = DelimitedRecordsFormat(variant='bigquery')
    instructions = ProcessingInstructions(fail_if_dont_understand=True,
                                          fail_if_cant_handle_hint=True,
                                          fail_if_row_invalid=False)
    plan = RecordsLoadPlan(processing_instructions=instructions,
                           records_format=fmt)
    remaining_hints = set(fmt.hints)
    out = load_job_config(remaining_hints, plan)
    self.assertEqual(out.to_api_repr()['load'], {
        'allowJaggedRows': True,
        'allowQuotedNewlines': True,
        'autodetect': False,
        'createDisposition': 'CREATE_NEVER',
        'destinationTableProperties': {},
        'encoding': 'UTF-8',
        'fieldDelimiter': ',',
        'ignoreUnknownValues': False,
        'maxBadRecords': 999999,
        'quote': '"',
        'schemaUpdateOptions': None,
        'skipLeadingRows': '1',
        'sourceFormat': 'CSV',
        'writeDisposition': 'WRITE_APPEND'
    })
def test_pandas_to_csv_options_christmas_tree_format_3(self):
    """Oddball hint combo 3: pandas-unmappable hints are warned about,
    the rest become to_csv kwargs."""
    instructions = ProcessingInstructions(fail_if_cant_handle_hint=False)
    fmt = DelimitedRecordsFormat(hints=christmas_tree_format_3_hints)
    remaining_hints = set(fmt.hints)
    with patch.object(driver_logger, 'warning') as mock_warning:
        actual = pandas_to_csv_options(fmt,
                                       remaining_hints,
                                       instructions)
    expected = {
        'compression': 'bz2',
        'date_format': '%d-%m-%Y %H:%M:%S.%f%z',
        'doublequote': True,
        'encoding': 'UTF8',
        'escapechar': '\\',
        'header': False,
        'line_terminator': '\x02',
        'quotechar': '"',
        'quoting': 0,
        'sep': '\x01',
    }
    self.assertEqual(expected, actual)
    self.assertListEqual(mock_warning.mock_calls, [
        call("Ignoring hint quoting = "
             "'some_future_option_not_supported_now'"),
        call("Ignoring hint escape = '@'"),
        call("Ignoring hint datetimeformattz = 'HH:MI:SSOF YYYY-MM-DD'"),
        call("Ignoring hint datetimeformattz = "
             "'YYYY-MM-DD HH24:MI:SSOF'"),
        call("Ignoring hint datetimeformat = 'YYYY-MM-DD HH24:MI:SS'")
    ])
    self.assertFalse(remaining_hints)
def setUp(self):
    # Fake engine: autospec'd Vertica dialect with a stubbed identifier
    # quoter so generated SQL text is deterministic.
    self.mock_db_engine = MagicMock()
    self.mock_db_engine.dialect = create_autospec(VerticaDialect)
    self.mock_db_engine.dialect.preparer.return_value.quote = fake_quote
    # .engine points back at itself, mimicking SQLAlchemy's
    # Engine.engine self-reference.
    self.mock_db_engine.engine = self.mock_db_engine
    self.mock_s3_temp_base_loc = MagicMock(name='s3_temp_base_loc')
    self.mock_url_resolver = Mock(name='url_resolver')
    self.mock_directory_url = self.mock_url_resolver.directory_url
    self.mock_s3_temp_base_loc.url = 's3://fakebucket/fakedir/fakesubdir/'
    # Patch VerticaLoader for the duration of driver construction so the
    # driver wires itself to a mock loader we can assert against later.
    with patch('records_mover.db.vertica.vertica_db_driver.VerticaLoader') \
            as mock_VerticaLoader:
        self.vertica_db_driver = VerticaDBDriver(
            db=self.mock_db_engine,
            s3_temp_base_loc=self.mock_s3_temp_base_loc,
            url_resolver=self.mock_url_resolver)
    self.mock_VerticaLoader = mock_VerticaLoader
    self.mock_vertica_loader = mock_VerticaLoader.return_value
    # Canned unload plan: delimited records, no variant.
    mock_records_unload_plan = create_autospec(RecordsUnloadPlan)
    mock_records_unload_plan.records_format = create_autospec(
        DelimitedRecordsFormat)
    mock_records_unload_plan.records_format.format_type = 'delimited'
    mock_records_unload_plan.records_format.variant = None
    mock_records_unload_plan.processing_instructions = ProcessingInstructions()
    self.mock_records_unload_plan = mock_records_unload_plan
    # Canned load plan with default processing instructions.
    mock_records_load_plan = Mock()
    mock_records_load_plan.processing_instructions = ProcessingInstructions()
    self.mock_records_load_plan = mock_records_load_plan
    # S3-backed records directory with fake AWS credentials (no token).
    mock_directory = Mock()
    mock_directory.loc.url = 's3://mybucket/myparent/mychild/'
    mock_directory.loc.aws_creds.return_value = Mock(name='aws creds')
    mock_directory.loc.aws_creds.return_value.access_key = 'fake_aws_id'
    mock_directory.loc.aws_creds.return_value.secret_key = 'fake_aws_secret'
    mock_directory.loc.aws_creds.return_value.token = None
    self.mock_directory = mock_directory
def test_weird_timeonlyformat(self):
    """An unrecognized timeonlyformat hint raises under strict handling.

    Uses assertRaisesRegex instead of the assertRaisesRegexp alias,
    which has been deprecated since Python 3.2 -- this also matches the
    other raising tests in this file.
    """
    vertica_format = DelimitedRecordsFormat(variant='dumb',
                                            hints={
                                                'timeonlyformat': 'something else'
                                            })
    processing_instructions = ProcessingInstructions()
    load_plan = RecordsLoadPlan(processing_instructions=processing_instructions,
                                records_format=vertica_format)
    unhandled_hints = set(load_plan.records_format.hints.keys())
    with self.assertRaisesRegex(NotImplementedError,
                                "Implement hint timeonlyformat='something else' or try again "
                                "with fail_if_cant_handle_hint=False"):
        vertica_import_options(unhandled_hints, load_plan)
def test_load_job_config_vertica(self):
    """BigQuery job config can't express vertica's record terminator, so
    strict hint handling must raise."""
    fmt = DelimitedRecordsFormat(variant='vertica')
    instructions = ProcessingInstructions(fail_if_dont_understand=True,
                                          fail_if_cant_handle_hint=True,
                                          fail_if_row_invalid=True)
    plan = RecordsLoadPlan(processing_instructions=instructions,
                           records_format=fmt)
    remaining_hints = set(fmt.hints)
    with self.assertRaisesRegex(NotImplementedError,
                                r"Implement hint record-terminator='\\x02' "
                                "or try again with fail_if_cant_handle_hint=False"):
        load_job_config(remaining_hints, plan)
def test_quote_all_with_doublequote(self):
    """quoting='all' with doublequoting enabled isn't supported for
    Vertica imports and must raise under strict hint handling.

    Uses assertRaisesRegex instead of the assertRaisesRegexp alias,
    which has been deprecated since Python 3.2 -- this also matches the
    other raising tests in this file.
    """
    vertica_format = DelimitedRecordsFormat(variant='csv',
                                            hints={
                                                'quoting': 'all'
                                            })
    processing_instructions = ProcessingInstructions()
    load_plan = RecordsLoadPlan(processing_instructions=processing_instructions,
                                records_format=vertica_format)
    unhandled_hints = set(load_plan.records_format.hints.keys())
    with self.assertRaisesRegex(NotImplementedError,
                                r"Implement hint doublequote=True or try again with "
                                "fail_if_cant_handle_hint=False"):
        vertica_import_options(unhandled_hints, load_plan)
def test_load_job_config_unknown_quoting(self):
    """An unrecognized quoting hint value is rejected outright."""
    fmt = DelimitedRecordsFormat(variant='bigquery',
                                 hints={'quoting': 'blah'})
    instructions = ProcessingInstructions(fail_if_dont_understand=True,
                                          fail_if_cant_handle_hint=True,
                                          fail_if_row_invalid=True)
    plan = RecordsLoadPlan(processing_instructions=instructions,
                           records_format=fmt)
    remaining_hints = set(fmt.hints)
    with self.assertRaises(NotImplementedError):
        load_job_config(remaining_hints, plan)
def test_load_job_config_no_bzip_support(self):
    """BZIP-compressed input can't be loaded into BigQuery; strict hint
    handling must raise."""
    fmt = DelimitedRecordsFormat(variant='bigquery',
                                 hints={'compression': 'BZIP'})
    instructions = ProcessingInstructions(fail_if_dont_understand=True,
                                          fail_if_cant_handle_hint=True,
                                          fail_if_row_invalid=True)
    plan = RecordsLoadPlan(processing_instructions=instructions,
                           records_format=fmt)
    remaining_hints = set(fmt.hints)
    with self.assertRaisesRegex(
            NotImplementedError,
            r"Implement hint compression='BZIP' "
            "or try again with fail_if_cant_handle_hint=False"):
        load_job_config(remaining_hints, plan)
def test_pandas_read_csv_options_inconsistent_date_format(self):
    """Day-first and month-first date hints mixed together can't be
    honored by a single read_csv configuration."""
    instructions = ProcessingInstructions()
    hints = dict(bluelabs_format_hints)
    hints['dateformat'] = 'DD-MM-YYYY'
    hints['datetimeformattz'] = 'MM-DD-YYYY HH24:MIOF'
    hints['datetimeformat'] = 'DD-MM-YYYY HH24:MI'
    fmt = DelimitedRecordsFormat(hints=hints)
    remaining_hints = set(fmt.hints)
    with self.assertRaises(NotImplementedError):
        pandas_read_csv_options(fmt,
                                self.records_schema,
                                remaining_hints,
                                instructions)
def test_load_job_config_unsupported_datetimeformattz(self):
    """A datetimeformattz BigQuery can't parse must raise under strict
    hint handling."""
    fmt = DelimitedRecordsFormat(
        variant='bigquery',
        hints={'datetimeformattz': 'MM/DD/YY HH:MI:SSOF'})
    instructions = ProcessingInstructions(fail_if_dont_understand=True,
                                          fail_if_cant_handle_hint=True,
                                          fail_if_row_invalid=True)
    plan = RecordsLoadPlan(processing_instructions=instructions,
                           records_format=fmt)
    remaining_hints = set(fmt.hints)
    with self.assertRaisesRegex(
            NotImplementedError,
            r"Implement hint datetimeformattz='MM/DD/YY HH:MI:SSOF' "
            "or try again with fail_if_cant_handle_hint=False"):
        load_job_config(remaining_hints, plan)
def test_quote_all_without_doublequote(self):
    """quoting='all' with doublequoting disabled maps to enclosed_by."""
    fmt = DelimitedRecordsFormat(variant='csv', hints={
        'quoting': 'all',
        'doublequote': False,
        # Vertica doesn't support exporting CSV variant style dates by
        # default, so let's pick some it can for purposes of this
        # test:
        'dateformat': 'YYYY-MM-DD',
        'datetimeformat': 'YYYY-MM-DD HH:MI:SS',
        'datetimeformattz': 'YYYY-MM-DD HH:MI:SSOF',
    })
    instructions = ProcessingInstructions()
    plan = RecordsLoadPlan(processing_instructions=instructions,
                           records_format=fmt)
    remaining_hints = set(plan.records_format.hints)
    options = vertica_import_options(remaining_hints, plan)
    self.assertEqual(options['enclosed_by'], '"')
def test_pandas_to_csv_options_vertica(self):
    """Vertica-style hints translate to the expected to_csv kwargs."""
    instructions = ProcessingInstructions()
    fmt = DelimitedRecordsFormat(hints=vertica_format_hints)
    remaining_hints = set(fmt.hints)
    actual = pandas_to_csv_options(fmt,
                                   remaining_hints,
                                   instructions)
    expected = {
        'date_format': '%Y-%m-%d %H:%M:%S.%f%z',
        'doublequote': False,
        'encoding': 'UTF8',
        'header': False,
        'line_terminator': '\x02',
        'quotechar': '"',
        'quoting': 3,
        'sep': '\x01',
    }
    self.assertEqual(expected, actual)
    self.assertFalse(remaining_hints)
def test_pandas_to_csv_options_csv(self):
    """CSV-variant hints translate to the expected to_csv kwargs, even
    with strict hint handling enabled."""
    instructions = ProcessingInstructions(fail_if_cant_handle_hint=True)
    fmt = DelimitedRecordsFormat(hints=csv_format_hints)
    remaining_hints = set(fmt.hints)
    actual = pandas_to_csv_options(fmt,
                                   remaining_hints,
                                   instructions)
    expected = {
        'compression': 'gzip',
        'date_format': '%m/%d/%y %H:%M',
        'doublequote': True,
        'encoding': 'UTF8',
        'header': True,
        'line_terminator': '\n',
        'quotechar': '"',
        'quoting': 0,
        'sep': ','
    }
    self.assertEqual(expected, actual)
    self.assertFalse(remaining_hints)
def test_load_job_config_parquet(self):
    """Parquet loads carry no delimited hints and produce a PARQUET
    sourceFormat job config."""
    fmt = ParquetRecordsFormat()
    instructions = ProcessingInstructions(fail_if_dont_understand=True,
                                          fail_if_cant_handle_hint=True,
                                          fail_if_row_invalid=True)
    plan = RecordsLoadPlan(processing_instructions=instructions,
                           records_format=fmt)
    remaining_hints = set()
    out = load_job_config(remaining_hints, plan)
    expected = {
        'allowJaggedRows': False,
        'autodetect': False,
        'createDisposition': 'CREATE_NEVER',
        'destinationTableProperties': {},
        'ignoreUnknownValues': True,
        'maxBadRecords': 0,
        'schemaUpdateOptions': None,
        'sourceFormat': 'PARQUET',
        'writeDisposition': 'WRITE_APPEND'
    }
    self.assertEqual(expected, out.to_api_repr()['load'])
def test_pandas_to_csv_options_christmas_tree_format_1(self):
    """Oddball hint combo 1: only the LZO compression hint gets dropped
    (with a warning); the rest become to_csv kwargs."""
    instructions = ProcessingInstructions(fail_if_cant_handle_hint=False)
    fmt = DelimitedRecordsFormat(hints=christmas_tree_format_1_hints)
    remaining_hints = set(fmt.hints)
    with patch.object(driver_logger, 'warning') as mock_warning:
        actual = pandas_to_csv_options(fmt,
                                       remaining_hints,
                                       instructions)
    expected = {
        'date_format': '%Y-%m-%d %H:%M:%S.%f%z',
        'doublequote': False,
        'encoding': 'UTF8',
        'escapechar': '\\',
        'header': True,
        'line_terminator': '\x02',
        'quotechar': '"',
        'quoting': 2,
        'sep': '\x01'
    }
    self.assertEqual(expected, actual)
    self.assertListEqual(mock_warning.mock_calls,
                         [call("Ignoring hint compression = 'LZO'")])
    self.assertFalse(remaining_hints)
def test_pandas_numeric_types_and_constraints(self):
    # Round-trips an empty DataFrame with one column per numpy numeric
    # dtype through RecordsSchema.from_dataframe and checks the
    # generated per-field constraints (int ranges as strings, float
    # significand/total bit widths).
    self.maxDiff = None
    # https://docs.scipy.org/doc/numpy/reference/arrays.scalars.html
    # https://stackoverflow.com/a/53828986/9795956
    dtypes = np.dtype([
        ('int8', np.int8),
        ('int16', np.int16),
        ('int32', np.int32),
        ('int64', np.int64),
        ('ubyte', np.ubyte),
        ('uint8', np.uint8),
        ('uint16', np.uint16),
        ('uint32', np.uint32),
        ('uint64', np.uint64),
        ('float16', np.float16),
        ('float32', np.float32),
        ('float64', np.float64),
        # 'float96', np.float96),  # not supported by numpy on macOS on amd64, apparently
        ('float128', np.float128),
    ])
    # Zero rows is enough: constraints derive from dtypes, not values.
    data = np.empty(0, dtype=dtypes)
    df = pd.DataFrame(data)
    processing_instructions = ProcessingInstructions()
    schema = RecordsSchema.from_dataframe(df,
                                          processing_instructions,
                                          include_index=False)
    data = schema.to_data()
    fields = data['fields']
    fields_and_constraints = {
        field_name: fields[field_name]['constraints']
        for field_name in fields
    }
    # min/max bounds are serialized as strings to avoid JSON precision
    # loss on 64-bit integer extremes.
    expected_fields = {
        'int8': {
            'required': False,
            'unique': False,
            'min': '-128',
            'max': '127'
        },
        'float128': {
            'fp_significand_bits': 64,
            'fp_total_bits': 80,
            'required': False,
            'unique': False
        },
        'float16': {
            'fp_significand_bits': 11,
            'fp_total_bits': 16,
            'required': False,
            'unique': False
        },
        'float32': {
            'fp_significand_bits': 23,
            'fp_total_bits': 32,
            'required': False,
            'unique': False
        },
        'float64': {
            'fp_significand_bits': 53,
            'fp_total_bits': 64,
            'required': False,
            'unique': False
        },
        'int16': {
            'max': '32767',
            'min': '-32768',
            'required': False,
            'unique': False
        },
        'int32': {
            'max': '2147483647',
            'min': '-2147483648',
            'required': False,
            'unique': False
        },
        'int64': {
            'max': '9223372036854775807',
            'min': '-9223372036854775808',
            'required': False,
            'unique': False
        },
        'ubyte': {
            'max': '255',
            'min': '0',
            'required': False,
            'unique': False
        },
        'uint16': {
            'max': '65535',
            'min': '0',
            'required': False,
            'unique': False
        },
        'uint32': {
            'max': '4294967295',
            'min': '0',
            'required': False,
            'unique': False
        },
        'uint64': {
            'max': '18446744073709551615',
            'min': '0',
            'required': False,
            'unique': False
        },
        'uint8': {
            'max': '255',
            'min': '0',
            'required': False,
            'unique': False
        }
    }
    self.assertEqual(fields_and_constraints, expected_fields)