def known_supported_records_formats_for_load(self) -> List[BaseRecordsFormat]:
    """Return the delimited variants this loader can ingest natively.

    MySQL supports a healthy amount of load types, but doesn't support
    loading compressed files, so compression is disabled on each variant.
    """
    supported_variants = ('bluelabs', 'bigquery', 'vertica')
    return [
        DelimitedRecordsFormat(variant=variant, hints={'compression': None})
        for variant in supported_variants
    ]
def test_christmas_tree_format_1_permissive(self):
    """Unsupported hints are logged as warnings (not raised) when permissive."""
    records_format = DelimitedRecordsFormat(variant='dumb',
                                            hints=christmas_tree_format_1_hints)
    instructions = ProcessingInstructions(fail_if_cant_handle_hint=False)
    plan = RecordsLoadPlan(processing_instructions=instructions,
                           records_format=records_format)
    remaining_hints = set(plan.records_format.hints.keys())
    with patch.object(driver_logger, 'warning') as mock_warning:
        options = vertica_import_options(remaining_hints, plan)
    expected_options = {
        'abort_on_error': True,
        'delimiter': '\x01',
        'enforcelength': True,
        'error_tolerance': False,
        'escape_as': '\\',
        'load_method': 'AUTO',
        'no_commit': False,
        'null_as': None,
        'record_terminator': '\x02',
        'rejectmax': 1,
        'skip': 1,
        'trailing_nullcols': False,
    }
    self.assertDictEqual(options, expected_options)
    # The two hints Vertica can't honor should each have been warned about.
    self.assertListEqual(mock_warning.mock_calls,
                         [call("Ignoring hint compression = 'LZO'"),
                          call("Ignoring hint quoting = 'nonnumeric'")])
    self.assertEqual(remaining_hints, set())
def test_load_job_config_permissive(self):
    """bigquery-variant hints map onto the expected BigQuery load config."""
    records_format = DelimitedRecordsFormat(variant='bigquery')
    instructions = ProcessingInstructions(
        fail_if_dont_understand=True,
        fail_if_cant_handle_hint=True,
        fail_if_row_invalid=False)
    plan = RecordsLoadPlan(
        processing_instructions=instructions,
        records_format=records_format)
    remaining_hints = set(records_format.hints.keys())
    config = load_job_config(remaining_hints, plan)
    expectations = {
        'allowJaggedRows': True,
        'allowQuotedNewlines': True,
        'autodetect': False,
        'createDisposition': 'CREATE_NEVER',
        'destinationTableProperties': {},
        'encoding': 'UTF-8',
        'fieldDelimiter': ',',
        'ignoreUnknownValues': False,
        'maxBadRecords': 999999,
        'quote': '"',
        'schemaUpdateOptions': None,
        'skipLeadingRows': '1',
        'sourceFormat': 'CSV',
        'writeDisposition': 'WRITE_APPEND'
    }
    self.assertEqual(config.to_api_repr()['load'], expectations)
def test_timeonlyformat(self):
    """Check which timeonlyformat hints BigQuery's load config accepts.

    Double check this before adding anything else in here to see if it
    has changed, but HH:MI:SS is the only format accepted by BigQuery
    as of this writing.
    """
    should_raise = {
        'HH:MI:SS': False,
        'HH24:MI:SS': False,
        'HH12:MI AM': True,
    }
    processing_instructions = ProcessingInstructions(
        fail_if_dont_understand=True,
        fail_if_cant_handle_hint=True,
        fail_if_row_invalid=True)
    for timeonlyformat in TIMEONLY_CASES:
        records_format =\
            DelimitedRecordsFormat(variant='bigquery',
                                   hints={
                                       'timeonlyformat': timeonlyformat,
                                   })
        load_plan = RecordsLoadPlan(
            processing_instructions=processing_instructions,
            records_format=records_format)
        unhandled_hints = set(records_format.hints.keys())
        # Bug fix: the old try/except only *tolerated* NotImplementedError
        # when should_raise was True -- it never verified the exception
        # actually fired, so a format that should be rejected but wasn't
        # would pass silently.  assertRaises makes the expectation explicit.
        if should_raise.get(timeonlyformat, False):
            with self.assertRaises(NotImplementedError):
                load_job_config(unhandled_hints, load_plan)
        else:
            load_job_config(unhandled_hints, load_plan)
def test_pandas_read_csv_options_vertica(self):
    """Vertica-style hints produce the expected pandas read_csv kwargs."""
    self.maxDiff = None
    expected = {
        'dayfirst': False,
        'compression': None,
        'delimiter': '\x01',
        'doublequote': False,
        'engine': 'c',
        'error_bad_lines': True,
        'header': None,
        'lineterminator': '\x02',
        'prefix': 'untitled_',
        'quotechar': '"',
        'quoting': 3,
        'warn_bad_lines': True,
        'parse_dates': [0, 1, 2, 3],
    }
    instructions = ProcessingInstructions()
    records_format = DelimitedRecordsFormat(hints=vertica_format_hints)
    remaining_hints = set(records_format.hints)
    result = pandas_read_csv_options(records_format,
                                     self.records_schema,
                                     remaining_hints,
                                     instructions)
    self.assertEqual(expected, result)
    # Every hint should have been consumed.
    self.assertFalse(remaining_hints)
def test_datetimeformat(self):
    """Check which datetimeformat hints BigQuery's load config accepts.

    Double check this before adding anything else in here to see if it
    has changed, but YYYY-MM-DD HH:MI:SS, YYYY-MM-DD HH24:MI:SS and
    YYYY-MM-DD HH:MI:SS are the only formats accepted by BigQuery as of
    this writing.
    """
    should_raise = {
        'YYYY-MM-DD HH12:MI AM': True,
        'MM/DD/YY HH24:MI': True,
    }
    processing_instructions = ProcessingInstructions(
        fail_if_dont_understand=True,
        fail_if_cant_handle_hint=True,
        fail_if_row_invalid=True)
    for datetimeformat in DATETIME_CASES:
        records_format =\
            DelimitedRecordsFormat(variant='bigquery',
                                   hints={
                                       'datetimeformat': datetimeformat
                                   })
        load_plan = RecordsLoadPlan(
            processing_instructions=processing_instructions,
            records_format=records_format)
        unhandled_hints = set(records_format.hints.keys())
        # Bug fix: the old try/except only *tolerated* NotImplementedError
        # when should_raise was True -- it never verified the exception
        # actually fired, so a format that should be rejected but wasn't
        # would pass silently.  assertRaises makes the expectation explicit.
        if should_raise.get(datetimeformat, False):
            with self.assertRaises(NotImplementedError):
                load_job_config(unhandled_hints, load_plan)
        else:
            load_job_config(unhandled_hints, load_plan)
def test_pandas_to_csv_options_christmas_tree_format_3(self):
    """Unsupportable hints fall back to warnings when permissive."""
    expected = {
        'compression': 'bz2',
        'date_format': '%d-%m-%Y %H:%M:%S.%f%z',
        'doublequote': True,
        'encoding': 'UTF8',
        'escapechar': '\\',
        'header': False,
        'line_terminator': '\x02',
        'quotechar': '"',
        'quoting': 0,
        'sep': '\x01',
    }
    instructions = ProcessingInstructions(fail_if_cant_handle_hint=False)
    records_format = DelimitedRecordsFormat(
        hints=christmas_tree_format_3_hints)
    remaining_hints = set(records_format.hints)
    with patch.object(driver_logger, 'warning') as mock_warning:
        result = pandas_to_csv_options(records_format,
                                       remaining_hints,
                                       instructions)
    self.assertEqual(expected, result)
    # Each hint pandas can't express should produce exactly one warning.
    expected_warnings = [
        call("Ignoring hint quoting = 'some_future_option_not_supported_now'"),
        call("Ignoring hint escape = '@'"),
        call("Ignoring hint datetimeformattz = 'HH:MI:SSOF YYYY-MM-DD'"),
        call("Ignoring hint datetimeformattz = 'YYYY-MM-DD HH24:MI:SSOF'"),
        call("Ignoring hint datetimeformat = 'YYYY-MM-DD HH24:MI:SS'"),
    ]
    self.assertListEqual(mock_warning.mock_calls, expected_warnings)
    self.assertFalse(remaining_hints)
def test_vertica_format_permissive(self):
    """vertica variant with permissive row handling yields lax import options."""
    records_format = DelimitedRecordsFormat(variant='vertica')
    instructions = ProcessingInstructions(fail_if_row_invalid=False)
    plan = RecordsLoadPlan(processing_instructions=instructions,
                           records_format=records_format)
    remaining_hints = set(plan.records_format.hints.keys())
    options = vertica_import_options(remaining_hints, plan)
    expected_options = {
        'abort_on_error': False,
        'delimiter': '\x01',
        'enclosed_by': None,
        'enforcelength': False,
        'error_tolerance': True,
        'escape_as': None,
        'gzip': False,
        'load_method': 'AUTO',
        'no_commit': False,
        'null_as': None,
        'record_terminator': '\x02',
        'rejectmax': None,
        'skip': 0,
        'trailing_nullcols': True,
    }
    self.assertDictEqual(options, expected_options)
    self.assertEqual(remaining_hints, set())
def test_pandas_read_csv_options_bluelabs(self):
    """bluelabs-style hints produce the expected pandas read_csv kwargs."""
    expected = {
        'dayfirst': False,
        'compression': 'gzip',
        'delimiter': ',',
        'doublequote': False,
        'encoding': 'UTF8',
        'engine': 'python',
        'error_bad_lines': True,
        'escapechar': '\\',
        'header': None,
        'prefix': 'untitled_',
        'quotechar': '"',
        'quoting': 3,
        'warn_bad_lines': True,
        'parse_dates': [0, 1, 2, 3],
    }
    instructions = ProcessingInstructions()
    records_format = DelimitedRecordsFormat(hints=bluelabs_format_hints)
    remaining_hints = set(records_format.hints)
    result = pandas_read_csv_options(records_format,
                                     self.records_schema,
                                     remaining_hints,
                                     instructions)
    self.assertEqual(expected, result)
    # Every hint should have been consumed.
    self.assertFalse(remaining_hints)
def test_can_unload_format_delimited_false(self):
    """BigQueryUnloader reports it cannot unload delimited formats."""
    db = Mock(name='mock_db')
    url_resolver = MagicMock(name='mock_url_resolver')
    gcs_temp_base_loc = MagicMock(name='gcs_temp_base_loc')
    unloader = BigQueryUnloader(db=db,
                                url_resolver=url_resolver,
                                gcs_temp_base_loc=gcs_temp_base_loc)
    self.assertFalse(unloader.can_unload_format(DelimitedRecordsFormat()))
def test_known_supported_records_formats_for_load(self):
    """Redshift loader advertises the expected formats, in priority order."""
    expected = [
        DelimitedRecordsFormat(variant='csv',
                               hints={
                                   'dateformat': 'YYYY-MM-DD',
                                   'timeonlyformat': 'HH24:MI:SS',
                                   'datetimeformat': 'YYYY-MM-DD HH:MI:SS',
                                   'datetimeformattz': 'YYYY-MM-DD HH:MI:SSOF',
                               }),
        DelimitedRecordsFormat(variant='bigquery'),
        DelimitedRecordsFormat(variant='csv'),
        DelimitedRecordsFormat(variant='bluelabs', hints={'quoting': 'all'}),
        DelimitedRecordsFormat(variant='bluelabs'),
    ]
    actual = self.redshift_loader.known_supported_records_formats_for_load()
    self.assertEqual(actual, expected)
def test_known_supported_records_formats_for_unload(self):
    """Vertica unloader supports exactly the vertica delimited variant."""
    db = Mock(name='db')
    source_format = Mock(name='source_records_format',
                         spec=DelimitedRecordsFormat)
    source_format.hints = {}
    s3_temp_base_loc = Mock(name='s3_temp_base_loc')
    unloader = VerticaUnloader(db=db, s3_temp_base_loc=s3_temp_base_loc)
    actual = unloader.known_supported_records_formats_for_unload()
    self.assertEqual(actual, [DelimitedRecordsFormat(variant='vertica')])
def test_weird_timeonlyformat(self):
    """An unrecognized timeonlyformat hint raises when strict (the default)."""
    vertica_format = DelimitedRecordsFormat(variant='dumb',
                                            hints={
                                                'timeonlyformat': 'something else'
                                            })
    processing_instructions = ProcessingInstructions()
    load_plan = RecordsLoadPlan(processing_instructions=processing_instructions,
                                records_format=vertica_format)
    unhandled_hints = set(load_plan.records_format.hints.keys())
    # assertRaisesRegexp is a deprecated alias (since Python 3.2);
    # use assertRaisesRegex, matching the other tests in this suite.
    with self.assertRaisesRegex(NotImplementedError,
                                "Implement hint timeonlyformat='something else' or try again "
                                "with fail_if_cant_handle_hint=False"):
        vertica_import_options(unhandled_hints, load_plan)
def test_load_job_config_vertica(self):
    """vertica's \\x02 record terminator is not supported by BigQuery loads."""
    records_format = DelimitedRecordsFormat(variant='vertica')
    instructions = ProcessingInstructions(fail_if_dont_understand=True,
                                          fail_if_cant_handle_hint=True,
                                          fail_if_row_invalid=True)
    plan = RecordsLoadPlan(processing_instructions=instructions,
                           records_format=records_format)
    remaining_hints = set(records_format.hints.keys())
    with self.assertRaisesRegex(NotImplementedError,
                                r"Implement hint record-terminator='\\x02' "
                                "or try again with fail_if_cant_handle_hint=False"):
        load_job_config(remaining_hints, plan)
def test_quote_all_with_doublequote(self):
    """quoting='all' with (default) doublequote can't be expressed for Vertica."""
    vertica_format = DelimitedRecordsFormat(variant='csv',
                                            hints={
                                                'quoting': 'all'
                                            })
    processing_instructions = ProcessingInstructions()
    load_plan = RecordsLoadPlan(processing_instructions=processing_instructions,
                                records_format=vertica_format)
    unhandled_hints = set(load_plan.records_format.hints.keys())
    # assertRaisesRegexp is a deprecated alias (since Python 3.2);
    # use assertRaisesRegex, matching the other tests in this suite.
    with self.assertRaisesRegex(NotImplementedError,
                                r"Implement hint doublequote=True or try again with "
                                "fail_if_cant_handle_hint=False"):
        vertica_import_options(unhandled_hints, load_plan)
def test_json(self):
    """The default records format serializes to the documented bluelabs JSON."""
    records_format = DelimitedRecordsFormat()
    expected = {
        'hints': {
            'compression': 'GZIP',
            'dateformat': 'YYYY-MM-DD',
            'datetimeformat': 'YYYY-MM-DD HH24:MI:SS',
            'datetimeformattz': 'YYYY-MM-DD HH:MI:SSOF',
            'doublequote': False,
            'encoding': 'UTF8',
            'escape': '\\',
            'field-delimiter': ',',
            'header-row': False,
            'quotechar': '"',
            'quoting': None,
            'record-terminator': '\n',
            'timeonlyformat': 'HH24:MI:SS'
        },
        'type': 'delimited',
        'variant': 'bluelabs'
    }
    self.assertEqual(expected, json.loads(records_format.json()))
def load(self, hints, fail_if):
    """Drive the redshift loader with the given hints.

    fail_if toggles all three strictness flags on the processing
    instructions at once.
    """
    instructions = ProcessingInstructions()
    instructions.fail_if_cant_handle_hint = fail_if
    instructions.fail_if_dont_understand = fail_if
    instructions.fail_if_row_invalid = fail_if
    self.mock_records_load_plan.records_format = DelimitedRecordsFormat(
        hints=hints)
    self.mock_records_load_plan.processing_instructions = instructions
    loader = self.redshift_db_driver.loader()
    return loader.load(schema='myschema',
                       table='mytable',
                       load_plan=self.mock_records_load_plan,
                       directory=self.mock_directory)
def test_load_job_config_unknown_quoting(self):
    """An unrecognized quoting hint raises NotImplementedError when strict."""
    records_format = DelimitedRecordsFormat(variant='bigquery',
                                            hints={'quoting': 'blah'})
    instructions = ProcessingInstructions(
        fail_if_dont_understand=True,
        fail_if_cant_handle_hint=True,
        fail_if_row_invalid=True)
    plan = RecordsLoadPlan(
        processing_instructions=instructions,
        records_format=records_format)
    remaining_hints = set(records_format.hints.keys())
    with self.assertRaises(NotImplementedError):
        load_job_config(remaining_hints, plan)
def test_move_from_dataframe_compressed_with_header_row(self,
                                                        mock_complain_on_unhandled_hints,
                                                        mock_io,
                                                        mock_prep_df_for_csv_output):
    # Moving two dataframes into a gzipped fileobj with header-row=True:
    # both chunks are appended (mode="a"), but only the first chunk should
    # carry the header line (header=True vs header=False below).
    mock_fileobj = Mock(name='fileobj')
    mock_records_format = DelimitedRecordsFormat(hints={
        'encoding': 'UTF8',
        'compression': 'GZIP',
        'header-row': True,
        'quoting': 'all'
    })
    fileobj_target = FileobjTarget(fileobj=mock_fileobj,
                                   records_format=mock_records_format)
    # Two single-row dataframes; prep_df_for_csv_output is patched to
    # return them unchanged, in order.
    mock_df_1 = Mock(name='df_1')
    mock_df_1.index = ['a']
    mock_df_2 = Mock(name='df_2')
    mock_df_2.index = ['a']
    mock_processing_instructions = Mock(name='processing_instructions')
    mock_dfs_source = Mock(name='dfs_source')
    mock_dfs_source.dfs = [mock_df_1, mock_df_2]
    mock_prep_df_for_csv_output.side_effect = [mock_df_1, mock_df_2]
    out = fileobj_target.move_from_dataframes_source(mock_dfs_source,
                                                     mock_processing_instructions)
    # First chunk writes the header row (header=True); quoting=1 is
    # csv.QUOTE_ALL, matching the 'quoting': 'all' hint above.
    mock_df_1.to_csv.assert_called_with(path_or_buf=ANY,
                                        index=mock_dfs_source.include_index,
                                        mode="a",
                                        compression='gzip',
                                        date_format='%Y-%m-%d %H:%M:%S.%f%z',
                                        doublequote=False,
                                        encoding='UTF8',
                                        escapechar='\\',
                                        header=True,
                                        line_terminator='\n',
                                        quotechar='"',
                                        quoting=1,
                                        sep=',')
    # Second chunk appends with no header so the header isn't duplicated.
    mock_df_2.to_csv.assert_called_with(path_or_buf=ANY,
                                        index=mock_dfs_source.include_index,
                                        mode="a",
                                        compression='gzip',
                                        date_format='%Y-%m-%d %H:%M:%S.%f%z',
                                        doublequote=False,
                                        encoding='UTF8',
                                        escapechar='\\',
                                        header=False,
                                        line_terminator='\n',
                                        quotechar='"',
                                        quoting=1,
                                        sep=',')
    # One move per dataframe; a fileobj target produces no output URLs.
    self.assertEqual(out, MoveResult(move_count=2, output_urls=None))
def test_pandas_read_csv_options_inconsistent_date_format(self):
    """Mismatched date/datetime hint families cannot be expressed to pandas."""
    instructions = ProcessingInstructions()
    hints = bluelabs_format_hints.copy()
    hints['dateformat'] = 'DD-MM-YYYY'
    hints['datetimeformattz'] = 'MM-DD-YYYY HH24:MIOF'
    hints['datetimeformat'] = 'DD-MM-YYYY HH24:MI'
    records_format = DelimitedRecordsFormat(hints=hints)
    remaining_hints = set(records_format.hints)
    with self.assertRaises(NotImplementedError):
        pandas_read_csv_options(records_format,
                                self.records_schema,
                                remaining_hints,
                                instructions)
def test_known_supported_records_formats_for_unload(self):
    """With the S3 export library installed, the vertica variant is offered."""
    db = Mock(name='db')
    source_format = Mock(name='source_records_format',
                         spec=DelimitedRecordsFormat)
    source_format.hints = {}
    s3_temp_base_loc = Mock(name='s3_temp_base_loc')
    unloader = VerticaUnloader(db=db, s3_temp_base_loc=s3_temp_base_loc)
    # Simulate the awslib library being present in user_libraries.
    resultset = Mock(name='resultset')
    resultset.fetchall.return_value = ['awslib']
    db.execute.return_value = resultset
    out = unloader.known_supported_records_formats_for_unload()
    db.execute.assert_called_with(
        "SELECT lib_name from user_libraries where lib_name = 'awslib'")
    self.assertEqual(out, [DelimitedRecordsFormat(variant='vertica')])
def test_load_job_config_no_bzip_support(self):
    """BigQuery cannot load BZIP-compressed delimited files."""
    records_format = DelimitedRecordsFormat(variant='bigquery',
                                            hints={'compression': 'BZIP'})
    instructions = ProcessingInstructions(
        fail_if_dont_understand=True,
        fail_if_cant_handle_hint=True,
        fail_if_row_invalid=True)
    plan = RecordsLoadPlan(
        processing_instructions=instructions,
        records_format=records_format)
    remaining_hints = set(records_format.hints.keys())
    with self.assertRaisesRegex(
            NotImplementedError,
            r"Implement hint compression='BZIP' "
            "or try again with fail_if_cant_handle_hint=False"):
        load_job_config(remaining_hints, plan)
def test_quote_all_without_doublequote(self):
    """quoting='all' without doublequote maps to enclosed_by '"'."""
    hints = {
        'quoting': 'all',
        'doublequote': False,
        # Vertica doesn't support exporting CSV variant style dates by
        # default, so let's pick some it can for purposes of this
        # test:
        'dateformat': 'YYYY-MM-DD',
        'datetimeformat': 'YYYY-MM-DD HH:MI:SS',
        'datetimeformattz': 'YYYY-MM-DD HH:MI:SSOF',
    }
    records_format = DelimitedRecordsFormat(variant='csv', hints=hints)
    instructions = ProcessingInstructions()
    plan = RecordsLoadPlan(processing_instructions=instructions,
                           records_format=records_format)
    remaining_hints = set(plan.records_format.hints.keys())
    options = vertica_import_options(remaining_hints, plan)
    self.assertEqual(options['enclosed_by'], '"')
def test_load_job_config_unsupported_datetimeformattz(self):
    """A datetimeformattz BigQuery can't parse raises when strict."""
    records_format = DelimitedRecordsFormat(
        variant='bigquery',
        hints={'datetimeformattz': 'MM/DD/YY HH:MI:SSOF'})
    instructions = ProcessingInstructions(
        fail_if_dont_understand=True,
        fail_if_cant_handle_hint=True,
        fail_if_row_invalid=True)
    plan = RecordsLoadPlan(
        processing_instructions=instructions,
        records_format=records_format)
    remaining_hints = set(records_format.hints.keys())
    with self.assertRaisesRegex(
            NotImplementedError,
            r"Implement hint datetimeformattz='MM/DD/YY HH:MI:SSOF' "
            "or try again with fail_if_cant_handle_hint=False"):
        load_job_config(remaining_hints, plan)
def test_pandas_to_csv_options_vertica(self):
    """Vertica-style hints produce the expected pandas to_csv kwargs."""
    expected = {
        'date_format': '%Y-%m-%d %H:%M:%S.%f%z',
        'doublequote': False,
        'encoding': 'UTF8',
        'header': False,
        'line_terminator': '\x02',
        'quotechar': '"',
        'quoting': 3,
        'sep': '\x01',
    }
    instructions = ProcessingInstructions()
    records_format = DelimitedRecordsFormat(hints=vertica_format_hints)
    remaining_hints = set(records_format.hints)
    result = pandas_to_csv_options(records_format,
                                   remaining_hints,
                                   instructions)
    self.assertEqual(expected, result)
    # Every hint should have been consumed.
    self.assertFalse(remaining_hints)
def test_pandas_to_csv_options_csv(self):
    """csv-style hints produce the expected pandas to_csv kwargs."""
    expected = {
        'compression': 'gzip',
        'date_format': '%m/%d/%y %H:%M',
        'doublequote': True,
        'encoding': 'UTF8',
        'header': True,
        'line_terminator': '\n',
        'quotechar': '"',
        'quoting': 0,
        'sep': ','
    }
    instructions = ProcessingInstructions(fail_if_cant_handle_hint=True)
    records_format = DelimitedRecordsFormat(hints=csv_format_hints)
    remaining_hints = set(records_format.hints)
    result = pandas_to_csv_options(records_format,
                                   remaining_hints,
                                   instructions)
    self.assertEqual(expected, result)
    # Every hint should have been consumed.
    self.assertFalse(remaining_hints)
def test_dumb(self):
    """The 'dumb' variant's default hints match the records spec.

    Should match up with
    https://github.com/bluelabsio/records-mover/blob/master/docs/RECORDS_SPEC.md#dumb-variant
    """
    expected_hints = {
        'compression': 'GZIP',
        'dateformat': 'YYYY-MM-DD',
        'datetimeformat': 'YYYY-MM-DD HH:MI:SS',
        'datetimeformattz': 'YYYY-MM-DD HH:MI:SSOF',
        'doublequote': False,
        'encoding': 'UTF8',
        'escape': None,
        'field-delimiter': ',',
        'quotechar': '"',
        'quoting': None,
        'record-terminator': '\n',
        'timeonlyformat': 'HH24:MI:SS',
        'header-row': False,
    }
    records_format = DelimitedRecordsFormat(variant='dumb')
    self.assertEqual(expected_hints, records_format.hints)
def test_with_altered_hints(self):
    """alter_hints() overrides one hint and records it as a custom hint.

    Should match up with
    https://github.com/bluelabsio/records-mover/blob/master/docs/RECORDS_SPEC.md#csv-variant
    """
    records_format = DelimitedRecordsFormat(variant='csv').alter_hints(
        {'quotechar': 'A'})
    expected_hints = {
        'compression': 'GZIP',
        'dateformat': 'MM/DD/YY',
        'datetimeformat': 'MM/DD/YY HH24:MI',
        'datetimeformattz': 'MM/DD/YY HH24:MI',
        'doublequote': True,
        'encoding': 'UTF8',
        'escape': None,
        'field-delimiter': ',',
        'quotechar': 'A',
        'quoting': 'minimal',
        'record-terminator': '\n',
        'timeonlyformat': 'HH24:MI:SS',
        'header-row': True,
    }
    self.assertEqual(expected_hints, records_format.hints)
    # Only the overridden hint counts as custom.
    self.assertEqual({'quotechar': 'A'}, records_format.custom_hints)
def postgres_copy_to_options(unhandled_hints: Set[str],
                             delimited_records_format: DelimitedRecordsFormat,
                             fail_if_cant_handle_hint: bool) ->\
        Tuple[DateOutputStyle, Optional[DateOrderStyle], PostgresCopyOptions]:
    """Translate a delimited records format into Postgres COPY TO settings.

    Returns a (date output style, optional date order style, copy options)
    tuple for unloading.
    """
    hints = delimited_records_format.validate(
        fail_if_cant_handle_hint=fail_if_cant_handle_hint)
    # CSV-flavored hints need Postgres' CSV mode; otherwise text mode works.
    build_options = (postgres_copy_options_csv
                     if needs_csv_format(hints)
                     else postgres_copy_options_text)
    copy_options = build_options(unhandled_hints,
                                 hints,
                                 fail_if_cant_handle_hint,
                                 CopyOptionsMode.UNLOADING)
    date_output_style, date_order_style = determine_date_output_style(
        unhandled_hints, hints, fail_if_cant_handle_hint)
    return (date_output_style, date_order_style, copy_options)
def test_pandas_to_csv_options_christmas_tree_format_1(self):
    """LZO compression can't be expressed to pandas; warned when permissive."""
    expected = {
        'date_format': '%Y-%m-%d %H:%M:%S.%f%z',
        'doublequote': False,
        'encoding': 'UTF8',
        'escapechar': '\\',
        'header': True,
        'line_terminator': '\x02',
        'quotechar': '"',
        'quoting': 2,
        'sep': '\x01'
    }
    instructions = ProcessingInstructions(fail_if_cant_handle_hint=False)
    records_format = DelimitedRecordsFormat(
        hints=christmas_tree_format_1_hints)
    remaining_hints = set(records_format.hints)
    with patch.object(driver_logger, 'warning') as mock_warning:
        result = pandas_to_csv_options(records_format,
                                       remaining_hints,
                                       instructions)
    self.assertEqual(expected, result)
    self.assertListEqual(mock_warning.mock_calls,
                         [call("Ignoring hint compression = 'LZO'")])
    self.assertFalse(remaining_hints)