def test_json_roundtrip_future_incompatible_format(self):
    """Loading a schema JSON in an unsupported future format must raise."""
    here = os.path.dirname(os.path.realpath(__file__))
    sample_path = os.path.join(
        here, 'future_incompatible_redshift_example_1.json')
    with open(sample_path) as fileobj:
        contents = fileobj.read()
    # The parser should refuse rather than silently misread the schema.
    with self.assertRaises(UnsupportedSchemaError):
        RecordsSchema.from_json(contents)
def test_cast_dataframe_types_no_fields(self):
    """With no fields, cast_dataframe_types still delegates to df.apply."""
    known_reps = Mock(name='known_representations')
    schema = RecordsSchema(fields=[], known_representations=known_reps)
    mock_df = Mock(name='df')
    result = schema.cast_dataframe_types(mock_df)
    self.assertEqual(result, mock_df.apply.return_value)
def test_assign_dataframe_names_no_index(self):
    """Columns are renamed from the schema's field names when no index field is used."""
    df = DataFrame.from_dict([{'a': 1}])
    field = Mock(name='field_a')
    field.name = 'mya'  # Mock(name=...) sets the repr name, so assign .name separately
    schema = RecordsSchema(
        fields=[field],
        known_representations=Mock(name='known_representations'))
    renamed = schema.assign_dataframe_names(False, df)
    self.assertEqual(renamed.to_dict(orient='records'), [{'mya': 1}])
def test_to_schema_sql(self, mock_schema_to_schema_sql):
    """to_schema_sql forwards its arguments to the patched schema_to_schema_sql."""
    driver = Mock(name='driver')
    schema_name = Mock(name='schema_name')
    table_name = Mock(name='table_name')
    schema = RecordsSchema(fields=[], known_representations={})
    result = schema.to_schema_sql(driver, schema_name, table_name)
    mock_schema_to_schema_sql.assert_called_with(driver=driver,
                                                records_schema=schema,
                                                schema_name=schema_name,
                                                table_name=table_name)
    self.assertEqual(result, mock_schema_to_schema_sql.return_value)
def test_assign_dataframe_names_with_index(self):
    """The first schema field names the index; remaining fields name the columns."""
    df = DataFrame.from_dict([{'b': 1}])
    # Sanity-check the starting index before renaming
    self.assertEqual(df.to_dict(orient='index'), {0: {'b': 1}})
    index_field = Mock(name='field_a')
    index_field.name = 'mya'
    column_field = Mock(name='field_b')
    column_field.name = 'myb'
    schema = RecordsSchema(
        fields=[index_field, column_field],
        known_representations=Mock(name='known_representations'))
    renamed = schema.assign_dataframe_names(True, df)
    self.assertEqual(renamed.to_dict(orient='records'), [{'myb': 1}])
    self.assertEqual(renamed.to_dict(orient='index'), {'mya': {'myb': 1}})
def test_refine_from_dataframe(self, mock_refine_schema_from_dataframe):
    """refine_from_dataframe delegates to the patched module-level helper."""
    schema = RecordsSchema(
        fields=Mock(name='fields'),
        known_representations=Mock(name='known_representations'))
    df = Mock(name='df')
    instructions = Mock(name='processing_instructions')
    result = schema.refine_from_dataframe(df, instructions)
    mock_refine_schema_from_dataframe.assert_called_with(
        records_schema=schema,
        df=df,
        processing_instructions=instructions)
    self.assertEqual(result, mock_refine_schema_from_dataframe.return_value)
def test_from_db_table(self, mock_RecordsSchemaKnownRepresentation,
                       mock_RecordsSchemaField):
    """from_db_table builds one field per DB column plus an 'origin' representation."""
    schema_name = Mock(name='schema_name')
    table_name = Mock(name='table_name')
    driver = Mock(name='driver')
    column = Mock(name='column')
    driver.table.return_value.columns = [column]
    origin_rep = \
        mock_RecordsSchemaKnownRepresentation.from_db_driver.return_value
    expected_field = mock_RecordsSchemaField.from_sqlalchemy_column.return_value

    schema = RecordsSchema.from_db_table(schema_name=schema_name,
                                         table_name=table_name,
                                         driver=driver)

    driver.table.assert_called_with(schema_name, table_name)
    mock_RecordsSchemaKnownRepresentation.from_db_driver.assert_called_with(
        driver, schema_name, table_name)
    # Each column is converted using the origin representation's type
    mock_RecordsSchemaField.from_sqlalchemy_column.assert_called_with(
        column=column,
        driver=driver,
        rep_type=origin_rep.type)
    self.assertEqual(schema.fields, [expected_field])
    self.assertEqual(schema.known_representations, {'origin': origin_rep})
def test_dateformat(self) -> None:
    """Each dateformat hint produces read_csv options that parse sample dates.

    Builds compatible datetimeformat/datetimeformattz hints from the
    dateformat, asks pandas_read_csv_options() for read_csv options, and
    verifies pandas parses a sample date into the expected
    year/month/day under those options (including the 'dayfirst' flag).
    """
    class DateFormatExpectations(TypedDict):
        # Use the datetimeformat/datetimeformattz which is
        # compatible, as pandas doesn't let you configure those
        # separately
        dayfirst: bool
    # Expected read_csv option subset per dateformat hint
    testcases: Dict[HintDateFormat, DateFormatExpectations] = {
        'YYYY-MM-DD': {
            'dayfirst': False,
        },
        'MM-DD-YYYY': {
            'dayfirst': False,
        },
        'DD-MM-YYYY': {
            'dayfirst': True,
        },
        'MM/DD/YY': {
            'dayfirst': False,
        },
        'DD/MM/YY': {
            'dayfirst': True,
        },
        'DD-MM-YY': {
            'dayfirst': True,
        },
    }
    for dateformat in DATE_CASES:
        records_format = DelimitedRecordsFormat(hints={
            'dateformat': dateformat,
            'datetimeformat': f"{dateformat} HH:MI:SS",
            'datetimeformattz': f"{dateformat} HH:MI:SSOF",
            'compression': None,
        })
        records_schema = RecordsSchema.from_data({
            'schema': 'bltypes/v1',
            'fields': {
                'first': {
                    'type': 'date'
                }
            },
        })
        unhandled_hints = set(records_format.hints)
        processing_instructions = ProcessingInstructions()
        expectations = testcases[dateformat]
        try:
            options = pandas_read_csv_options(records_format,
                                              records_schema,
                                              unhandled_hints,
                                              processing_instructions)
        except NotImplementedError:
            self.fail(f'Could not handle combination for {dateformat}')
        # The expected options must be a subset of what was generated
        self.assertTrue(all(item in options.items()
                            for item in expectations.items()))
        fileobj = io.StringIO(create_sample(dateformat))
        df = pandas.read_csv(filepath_or_buffer=fileobj, **options)
        timestamp = df['untitled_0'][0]
        self.assertEqual(timestamp.year, SAMPLE_YEAR)
        self.assertEqual(timestamp.month, SAMPLE_MONTH)
        self.assertEqual(timestamp.day, SAMPLE_DAY)
def test_prep_df_for_csv_output_include_index(self):
    """Index and time columns are stringified for CSV output.

    With include_index=True, prep_df_for_csv_output() should render the
    dataframe's timestamp index as a 'YYYY-MM-DD' string and a time
    column as 'HH:MM:SS' (fractional seconds dropped).
    """
    schema_data = {
        'schema': "bltypes/v1",
        'fields': {
            "date": {
                "type": "date",
                "index": 1,
            },
            "time": {
                "type": "time",
                "index": 2,
            },
            "timetz": {
                "type": "timetz",
                "index": 3,
            },
        }
    }
    records_format = DelimitedRecordsFormat(variant='bluelabs')
    records_schema = RecordsSchema.from_data(schema_data)
    processing_instructions = ProcessingInstructions()
    # us_eastern = pytz.timezone('US/Eastern')
    data = {
        'time': [
            pd.Timestamp(year=1970, month=1, day=1,
                         hour=12, minute=33, second=53,
                         microsecond=1234)
        ],
        # timetz is not well supported in records mover yet.  For
        # instance, specifying how it's turned into a CSV is not
        # currently part of the records spec:
        #
        # https://github.com/bluelabsio/records-mover/issues/76
        #
        # In addition, Vertica suffers from a driver limitation:
        #
        # https://github.com/bluelabsio/records-mover/issues/77
        #
        # 'timetz': [
        #     us_eastern.localize(pd.Timestamp(year=1970, month=1, day=1,
        #                                      hour=12, minute=33, second=53,
        #                                      microsecond=1234)),
        # ],
    }
    df = pd.DataFrame(data,
                      index=[pd.Timestamp(year=1970, month=1, day=1)],
                      columns=['time', 'timetz'])
    new_df = prep_df_for_csv_output(df=df,
                                    include_index=True,
                                    records_schema=records_schema,
                                    records_format=records_format,
                                    processing_instructions=processing_instructions)
    self.assertEqual(new_df.index[0], '1970-01-01')
    self.assertEqual(new_df['time'][0], '12:33:53')
    # self.assertEqual(new_df['timetz'][0], '12:33:53-05')
    self.assertIsNotNone(new_df)
def test_pandas_read_csv_options_bzip(self):
    """A 'BZIP' compression hint maps to pandas' 'bz2' compression option."""
    records_format = DelimitedRecordsFormat(hints={'compression': 'BZIP'})
    records_schema = RecordsSchema.from_data({'schema': 'bltypes/v1'})
    unhandled_hints = set(records_format.hints)
    processing_instructions = ProcessingInstructions()
    options = pandas_read_csv_options(records_format,
                                      records_schema,
                                      unhandled_hints,
                                      processing_instructions)
    # The generated options must contain the expected compression setting
    self.assertTrue(options.items() >= {'compression': 'bz2'}.items())
def test_json_roundtrip_redshift_v1(self):
    """A v1 Redshift schema JSON survives a from_json()/to_json() round trip."""
    here = os.path.dirname(os.path.realpath(__file__))
    sample_path = os.path.join(here, 'redshift_example_1.json')
    with open(sample_path) as fileobj:
        sample_str = fileobj.read()
    records_schema = RecordsSchema.from_json(sample_str)
    # Compare parsed JSON so key order and whitespace don't matter
    self.assertDictEqual(json.loads(sample_str),
                         json.loads(records_schema.to_json()))
def test_timeonlyformat(self):
    """Time values render per the timeonlyformat hint, for both input types.

    For every supported timeonlyformat, a time held either as a pandas
    Timestamp or as a datetime.time should be formatted into the same
    sample string by prep_df_for_csv_output().
    """
    schema_data = {
        'schema': "bltypes/v1",
        'fields': {
            "time_as_timestamp": {
                "type": "time",
                "index": 1,
            },
            "time_as_time": {
                "type": "time",
                "index": 2,
            },
        }
    }
    records_schema = RecordsSchema.from_data(schema_data)
    processing_instructions = ProcessingInstructions()
    for timeonlyformat in TIMEONLY_CASES:
        records_format = DelimitedRecordsFormat(variant='bluelabs',
                                                hints={
                                                    'timeonlyformat': timeonlyformat,
                                                })
        # us_eastern = pytz.timezone('US/Eastern')
        time_as_timestamp = pd.Timestamp(year=SAMPLE_YEAR, month=SAMPLE_MONTH,
                                         day=SAMPLE_DAY,
                                         hour=SAMPLE_HOUR, minute=SAMPLE_MINUTE,
                                         second=SAMPLE_SECOND)
        time_as_time = datetime.time(hour=SAMPLE_HOUR, minute=SAMPLE_MINUTE,
                                     second=SAMPLE_SECOND)
        data = {
            'time_as_timestamp': [
                time_as_timestamp
            ],
            'time_as_time': [
                time_as_time
            ],
        }
        df = pd.DataFrame(data, columns=['time_as_timestamp', 'time_as_time'])
        new_df = prep_df_for_csv_output(df=df,
                                        include_index=False,
                                        records_schema=records_schema,
                                        records_format=records_format,
                                        processing_instructions=processing_instructions)
        # Third argument is the assertion message: identifies the failing format
        self.assertEqual(new_df['time_as_timestamp'][0],
                         create_sample(timeonlyformat),
                         timeonlyformat)
        self.assertEqual(new_df['time_as_time'][0],
                         create_sample(timeonlyformat),
                         timeonlyformat)
        # self.assertEqual(new_df['timetz'][0], '12:33:53-05')
        self.assertIsNotNone(new_df)
def test_dateformat(self):
    """Date values render per the dateformat hint, for both input types.

    For every supported dateformat, a date held either as a pandas
    Timestamp or as a datetime.date should be formatted into the same
    sample string by prep_df_for_csv_output().
    """
    schema_data = {
        'schema': "bltypes/v1",
        'fields': {
            "date_as_timestamp": {
                "type": "date",
                "index": 1,
            },
            "date_as_date": {
                "type": "date",
                # Bug fix: both fields previously declared "index": 1;
                # field indices should be unique and sequential.
                "index": 2,
            },
        }
    }
    records_schema = RecordsSchema.from_data(schema_data)
    processing_instructions = ProcessingInstructions()
    for dateformat in DATE_CASES:
        records_format = DelimitedRecordsFormat(variant='bluelabs',
                                                hints={
                                                    'dateformat': dateformat
                                                })
        # us_eastern = pytz.timezone('US/Eastern')
        data = {
            'date_as_timestamp': [
                pd.Timestamp(year=SAMPLE_YEAR, month=SAMPLE_MONTH,
                             day=SAMPLE_DAY)
            ],
            'date_as_date': [
                datetime.date(year=SAMPLE_YEAR, month=SAMPLE_MONTH,
                              day=SAMPLE_DAY)
            ],
        }
        df = pd.DataFrame(data, columns=['date_as_timestamp', 'date_as_date'])
        new_df = prep_df_for_csv_output(df=df,
                                        include_index=False,
                                        records_schema=records_schema,
                                        records_format=records_format,
                                        processing_instructions=processing_instructions)
        # Both representations must format identically
        self.assertEqual(new_df['date_as_timestamp'][0],
                         create_sample(dateformat))
        self.assertEqual(new_df['date_as_date'][0],
                         create_sample(dateformat))
        self.assertIsNotNone(new_df)
def test_datetimeformat(self):
    """prep_df_for_csv_output leaves datetime values unchanged for every hint.

    pandas' own CSV writer handles datetimes correctly, so no string
    conversion is expected regardless of the datetimeformat hint.
    """
    schema_data = {
        'schema': "bltypes/v1",
        'fields': {
            # Bug fix: this field was named "datetimez", which never matched
            # the 'datetime' column built below, so the schema entry was dead.
            # NOTE(review): the assertion below expects datetimes to pass
            # through unconverted either way — confirm prep_df_for_csv_output
            # does not convert 'datetime'-typed fields.
            "datetime": {
                "type": "datetime",
                "index": 1,
            },
        }
    }
    records_schema = RecordsSchema.from_data(schema_data)
    processing_instructions = ProcessingInstructions()
    for datetimeformat in DATETIME_CASES:
        records_format = DelimitedRecordsFormat(variant='bluelabs',
                                                hints={
                                                    'datetimeformat': datetimeformat
                                                })
        # us_eastern = pytz.timezone('US/Eastern')
        timestamp = pd.Timestamp(year=SAMPLE_YEAR, month=SAMPLE_MONTH,
                                 day=SAMPLE_DAY,
                                 hour=SAMPLE_HOUR, minute=SAMPLE_MINUTE,
                                 second=SAMPLE_SECOND)
        data = {
            'datetime': [
                timestamp
            ],
        }
        df = pd.DataFrame(data, columns=['datetime'])
        new_df = prep_df_for_csv_output(df=df,
                                        include_index=False,
                                        records_schema=records_schema,
                                        records_format=records_format,
                                        processing_instructions=processing_instructions)
        # No conversion is done of datetime as pandas' CSV
        # outputter handles it properly, so we should expect the
        # original again
        self.assertEqual(new_df['datetime'][0],
                         timestamp,
                         create_sample(datetimeformat))
        # self.assertEqual(new_df['timetz'][0], '12:33:53-05')
        self.assertIsNotNone(new_df)
def setUp(self):
    """Build the date/time-typed records schema shared by these tests."""
    typed_fields = [("date", "date"),
                    ("time", "time"),
                    ("timestamp", "datetime"),
                    ("timestamptz", "datetimetz")]
    # Field indices are 1-based and follow declaration order
    fields = {
        name: {"type": field_type, "index": index}
        for index, (name, field_type) in enumerate(typed_fields, start=1)
    }
    self.records_schema = RecordsSchema.from_data({
        'schema': 'bltypes/v1',
        'fields': fields,
    })
def test_timeonlyformat(self) -> None:
    """Each timeonlyformat hint produces read_csv options that parse times.

    For every supported format, pandas_read_csv_options() should
    schedule the single column for date parsing, and pandas should
    parse a sample time string into a Timestamp with the expected
    hour/minute/second.
    """
    for timeonlyformat in TIMEONLY_CASES:
        records_format = DelimitedRecordsFormat(hints={
            'timeonlyformat': timeonlyformat,
            'compression': None,
        })
        records_schema = RecordsSchema.from_data({
            'schema': 'bltypes/v1',
            'fields': {
                'first': {
                    'type': 'time'
                }
            },
        })
        unhandled_hints = set(records_format.hints)
        processing_instructions = ProcessingInstructions()
        try:
            options = pandas_read_csv_options(records_format,
                                              records_schema,
                                              unhandled_hints,
                                              processing_instructions)
        except NotImplementedError:
            self.fail(f'Could not handle combination for {timeonlyformat}')
        # Column 0 (the only column) should be slated for date parsing
        self.assertEqual(options['parse_dates'], [0])
        timeonly = create_sample(timeonlyformat)
        fileobj = io.StringIO(timeonly)
        df = pandas.read_csv(filepath_or_buffer=fileobj, **options)
        timestamp = df['untitled_0'][0]
        self.assertIsInstance(timestamp, pandas.Timestamp,
                              f"Pandas did not parse {timeonly} as a timestamp object")
        self.assertEqual(timestamp.hour, SAMPLE_HOUR)
        self.assertEqual(timestamp.minute, SAMPLE_MINUTE)
        if 'SS' in timeonlyformat:
            self.assertEqual(timestamp.second, SAMPLE_SECOND)
        else:
            # Formats without seconds should parse with second == 0
            self.assertEqual(timestamp.second, 0, timeonly)
def schema_from_dataframe(df: DataFrame,
                          processing_instructions: ProcessingInstructions,
                          include_index: bool) -> 'RecordsSchema':
    """Build a RecordsSchema describing *df*, optionally including its index.

    :param df: dataframe to describe.
    :param processing_instructions: controls how fields are inferred.
    :param include_index: when True, the dataframe index becomes the
        first schema field.
    :return: a new RecordsSchema with one field per (index and) column
        and an 'origin' known representation for the dataframe.
    """
    # Imported here to avoid a circular import at module load time
    from records_mover.records.schema import RecordsSchema  # noqa
    from records_mover.records.schema.field import RecordsSchemaField  # noqa
    origin = RecordsSchemaKnownRepresentation.from_dataframe(
        df, processing_instructions)
    known_representations: Dict[str, RecordsSchemaKnownRepresentation] = {
        'origin': origin
    }
    fields = []
    if include_index:
        fields.append(
            RecordsSchemaField.from_index(
                df.index,
                processing_instructions=processing_instructions))
    fields.extend(
        RecordsSchemaField.from_series(
            df[column_name],
            processing_instructions=processing_instructions)
        for column_name in df)
    return RecordsSchema(fields=fields,
                         known_representations=known_representations)
def refine_schema_from_dataframe(records_schema: 'RecordsSchema',
                                 df: DataFrame,
                                 processing_instructions:
                                 ProcessingInstructions = ProcessingInstructions()) ->\
        'RecordsSchema':
    """Re-infer each field's details from (a sample of) df's actual data.

    Samples at most processing_instructions.max_inference_rows rows
    (all rows when that limit is None or not exceeded) and asks each
    field to refine itself against its column in the sample.

    NOTE(review): the ProcessingInstructions() default is evaluated once
    at definition time and shared across calls; harmless as long as
    ProcessingInstructions stays immutable — confirm.
    """
    # Imported here to avoid a circular import at module load time
    from records_mover.records.schema import RecordsSchema
    sample_limit = processing_instructions.max_inference_rows
    total_rows = len(df.index)
    if sample_limit is not None and sample_limit < total_rows:
        sampled_df = df.sample(n=sample_limit)
    else:
        sampled_df = df
    rows_sampled = len(sampled_df.index)
    refined_fields = []
    for field in records_schema.fields:
        refined_fields.append(
            field.refine_from_series(sampled_df[field.name],
                                     total_rows=total_rows,
                                     rows_sampled=rows_sampled))
    return RecordsSchema(
        fields=refined_fields,
        known_representations=records_schema.known_representations)
def test_pandas_numeric_types_and_constraints(self):
    """Numeric dtypes map to the expected schema constraints.

    Builds an empty dataframe with one column per numpy numeric dtype
    and verifies that RecordsSchema.from_dataframe() records the
    expected min/max bounds for integer types and the expected
    significand/total bit widths for float types.
    """
    self.maxDiff = None
    # https://docs.scipy.org/doc/numpy/reference/arrays.scalars.html
    # https://stackoverflow.com/a/53828986/9795956
    dtypes = np.dtype([
        ('int8', np.int8),
        ('int16', np.int16),
        ('int32', np.int32),
        ('int64', np.int64),
        ('ubyte', np.ubyte),
        ('uint8', np.uint8),
        ('uint16', np.uint16),
        ('uint32', np.uint32),
        ('uint64', np.uint64),
        ('float16', np.float16),
        ('float32', np.float32),
        ('float64', np.float64),
        # 'float96', np.float96),  # not supported by numpy on macOS on amd64, apparently
        ('float128', np.float128),
    ])
    # Zero rows: only the dtypes matter for constraint inference
    data = np.empty(0, dtype=dtypes)
    df = pd.DataFrame(data)
    processing_instructions = ProcessingInstructions()
    schema = RecordsSchema.from_dataframe(df,
                                          processing_instructions,
                                          include_index=False)
    data = schema.to_data()
    fields = data['fields']
    fields_and_constraints = {
        field_name: fields[field_name]['constraints']
        for field_name in fields
    }
    # Integer bounds are serialized as strings; float widths as ints
    expected_fields = {
        'int8': {
            'required': False,
            'unique': False,
            'min': '-128',
            'max': '127'
        },
        'float128': {
            'fp_significand_bits': 64,
            'fp_total_bits': 80,
            'required': False,
            'unique': False
        },
        'float16': {
            'fp_significand_bits': 11,
            'fp_total_bits': 16,
            'required': False,
            'unique': False
        },
        'float32': {
            'fp_significand_bits': 23,
            'fp_total_bits': 32,
            'required': False,
            'unique': False
        },
        'float64': {
            'fp_significand_bits': 53,
            'fp_total_bits': 64,
            'required': False,
            'unique': False
        },
        'int16': {
            'max': '32767',
            'min': '-32768',
            'required': False,
            'unique': False
        },
        'int32': {
            'max': '2147483647',
            'min': '-2147483648',
            'required': False,
            'unique': False
        },
        'int64': {
            'max': '9223372036854775807',
            'min': '-9223372036854775808',
            'required': False,
            'unique': False
        },
        'ubyte': {
            'max': '255',
            'min': '0',
            'required': False,
            'unique': False
        },
        'uint16': {
            'max': '65535',
            'min': '0',
            'required': False,
            'unique': False
        },
        'uint32': {
            'max': '4294967295',
            'min': '0',
            'required': False,
            'unique': False
        },
        'uint64': {
            'max': '18446744073709551615',
            'min': '0',
            'required': False,
            'unique': False
        },
        'uint8': {
            'max': '255',
            'min': '0',
            'required': False,
            'unique': False
        }
    }
    self.assertEqual(fields_and_constraints, expected_fields)
def test_str(self):
    """str() of an empty schema renders an empty types mapping."""
    schema = RecordsSchema(fields=[], known_representations={})
    self.assertEqual("RecordsSchema(types={})", str(schema))
def test_datetimeformattz(self) -> None:
    """Each datetimeformattz hint produces read_csv options parsing tz stamps.

    For every supported format, pandas_read_csv_options() should
    schedule the column for date parsing with the expected 'dayfirst'
    option, and pandas should parse a sample timestamp string into a
    Timestamp with the expected date and time components.
    """
    class DateTimeFormatTzExpectations(TypedDict):
        # Use the datetimeformat/datetimeformattz which is
        # compatible, as pandas doesn't let you configure those
        # separately
        dayfirst: bool
    # Expected read_csv option subset per datetimeformattz hint
    testcases: Dict[HintDateTimeFormatTz, DateTimeFormatTzExpectations] = {
        'YYYY-MM-DD HH:MI:SSOF': {
            'dayfirst': False,
        },
        'YYYY-MM-DD HH:MI:SS': {
            'dayfirst': False,
        },
        'YYYY-MM-DD HH24:MI:SSOF': {
            'dayfirst': False,
        },
        'MM/DD/YY HH24:MI': {
            'dayfirst': False,
        },
    }
    for datetimeformattz in DATETIMETZ_CASES:
        records_format = DelimitedRecordsFormat(hints={
            'datetimeformattz': datetimeformattz,
            'compression': None,
        })
        records_schema = RecordsSchema.from_data({
            'schema': 'bltypes/v1',
            'fields': {
                'first': {
                    'type': 'datetimetz'
                }
            },
        })
        unhandled_hints = set(records_format.hints)
        processing_instructions = ProcessingInstructions()
        expectations = testcases[datetimeformattz]
        try:
            options = pandas_read_csv_options(records_format,
                                              records_schema,
                                              unhandled_hints,
                                              processing_instructions)
        except NotImplementedError:
            self.fail(f'Could not handle combination for {datetimeformattz}')
        # Column 0 (the only column) should be slated for date parsing
        self.assertEqual(options['parse_dates'], [0])
        # The expected options must be a subset of what was generated
        self.assertTrue(all(item in options.items()
                            for item in expectations.items()))
        datetimetz = create_sample(datetimeformattz)
        fileobj = io.StringIO(datetimetz)
        df = pandas.read_csv(filepath_or_buffer=fileobj, **options)
        timestamp = df['untitled_0'][0]
        self.assertIsInstance(timestamp, pandas.Timestamp,
                              f"Pandas did not parse {datetimetz} as a timestamp object")
        self.assertEqual(timestamp.year, SAMPLE_YEAR)
        self.assertEqual(timestamp.month, SAMPLE_MONTH)
        self.assertEqual(timestamp.day, SAMPLE_DAY)
        self.assertEqual(timestamp.hour, SAMPLE_HOUR)
        self.assertEqual(timestamp.minute, SAMPLE_MINUTE)
        if 'SS' in datetimeformattz:
            self.assertEqual(timestamp.second, SAMPLE_SECOND)
        else:
            # Formats without seconds should parse with second == 0
            self.assertEqual(timestamp.second, 0)
def test_from_fileobjs(self, mock_stream_csv, mock_RecordsSchema):
    """from_fileobjs samples a CSV stream, drops all-null 'Unnamed' columns,
    rewinds the fileobj, and refines a schema inferred from the cleaned
    dataframe.
    """
    mock_fileobj = Mock(name='fileobj')
    mock_fileobjs = [mock_fileobj]
    mock_records_format = Mock(name='records_format')
    mock_processing_instructions = Mock(name='processing_instructions')
    # Seekable so from_fileobjs can rewind after sampling
    mock_fileobj.seekable.return_value = True
    mock_reader = mock_stream_csv.return_value.__enter__.return_value
    # 'Unnamed: 1' simulates a trailing-delimiter artifact column
    data = [
        {
            'Country': 'Belgium',
            'Capital': 'Brussels',
            'Population': 11190846,
            'Unnamed: 1': None
        },
        {
            'Country': 'India',
            'Capital': 'New Delhi',
            'Population': 1303171035,
            'Unnamed: 1': None
        },
        {
            'Country': 'Brazil',
            'Capital': 'Brasília',
            'Population': 207847528,
            'Unnamed: 1': None
        },
    ]
    df = DataFrame.from_dict(data)
    mock_reader.get_chunk.return_value = df
    out = RecordsSchema.from_fileobjs(mock_fileobjs,
                                      mock_records_format,
                                      mock_processing_instructions)
    # Sampling should be bounded by max_inference_rows
    mock_reader.get_chunk.assert_called_with(
        mock_processing_instructions.max_inference_rows)
    # The fileobj must be rewound so callers can re-read from the start
    mock_fileobj.seek.assert_called_with(0)
    mock_RecordsSchema.from_dataframe.assert_called_with(
        ANY, mock_processing_instructions, include_index=False)
    # Inspect the dataframe actually passed to from_dataframe()
    actual_cleaned_up_df = mock_RecordsSchema.from_dataframe.mock_calls[0][1][0]
    actual_cleaned_up_df_data = actual_cleaned_up_df.to_dict(orient='records')
    # The all-null 'Unnamed: 1' column should have been dropped
    expected_cleaned_up_df_data = [
        {
            'Country': 'Belgium',
            'Capital': 'Brussels',
            'Population': 11190846
        },
        {
            'Country': 'India',
            'Capital': 'New Delhi',
            'Population': 1303171035
        },
        {
            'Country': 'Brazil',
            'Capital': 'Brasília',
            'Population': 207847528
        },
    ]
    self.assertEqual(actual_cleaned_up_df_data,
                     expected_cleaned_up_df_data)
    self.assertEqual(
        out,
        mock_RecordsSchema.from_dataframe.return_value.
        refine_from_dataframe.return_value)