示例#1
0
 def test_json_roundtrip_future_incompatible_format(self):
     """A schema JSON in a future, incompatible format must raise
     UnsupportedSchemaError rather than parse incorrectly."""
     here = os.path.dirname(os.path.realpath(__file__))
     sample_path = os.path.join(
         here, 'future_incompatible_redshift_example_1.json')
     with open(sample_path) as sample_file:
         contents = sample_file.read()
     with self.assertRaises(UnsupportedSchemaError):
         RecordsSchema.from_json(contents)
示例#2
0
 def test_cast_dataframe_types_no_fields(self):
     """With zero fields, cast_dataframe_types still returns whatever
     df.apply produces."""
     schema = RecordsSchema(
         fields=[],
         known_representations=Mock(name='known_representations'))
     mock_df = Mock(name='df')
     result = schema.cast_dataframe_types(mock_df)
     self.assertEqual(result, mock_df.apply.return_value)
示例#3
0
 def test_assign_dataframe_names_no_index(self):
     """Column names are replaced by schema field names when the index
     is excluded."""
     df = DataFrame.from_dict([{'a': 1}])
     field = Mock(name='field_a')
     field.name = 'mya'
     schema = RecordsSchema(
         fields=[field],
         known_representations=Mock(name='known_representations'))
     renamed = schema.assign_dataframe_names(False, df)
     self.assertEqual(renamed.to_dict(orient='records'), [{'mya': 1}])
示例#4
0
 def test_to_schema_sql(self, mock_schema_to_schema_sql):
     """to_schema_sql delegates to the patched schema_to_schema_sql helper
     with keyword arguments and returns its result."""
     driver = Mock(name='driver')
     schema_name = Mock(name='schema_name')
     table_name = Mock(name='table_name')
     schema = RecordsSchema(fields=[], known_representations={})
     result = schema.to_schema_sql(driver, schema_name, table_name)
     mock_schema_to_schema_sql.assert_called_with(driver=driver,
                                                  records_schema=schema,
                                                  schema_name=schema_name,
                                                  table_name=table_name)
     self.assertEqual(result, mock_schema_to_schema_sql.return_value)
示例#5
0
 def test_assign_dataframe_names_with_index(self):
     """With include_index=True, the first schema field names the index
     and subsequent fields name the columns."""
     df = DataFrame.from_dict([{'b': 1}])
     self.assertEqual(df.to_dict(orient='index'), {0: {'b': 1}})
     index_field = Mock(name='field_a')
     index_field.name = 'mya'
     column_field = Mock(name='field_b')
     column_field.name = 'myb'
     schema = RecordsSchema(
         fields=[index_field, column_field],
         known_representations=Mock(name='known_representations'))
     renamed = schema.assign_dataframe_names(True, df)
     self.assertEqual(renamed.to_dict(orient='records'), [{'myb': 1}])
     self.assertEqual(renamed.to_dict(orient='index'), {'mya': {'myb': 1}})
示例#6
0
    def test_refine_from_dataframe(self, mock_refine_schema_from_dataframe):
        """refine_from_dataframe delegates to the patched
        refine_schema_from_dataframe helper and returns its result."""
        schema = RecordsSchema(
            fields=Mock(name='fields'),
            known_representations=Mock(name='known_representations'))
        df = Mock(name='df')
        instructions = Mock(name='processing_instructions')

        result = schema.refine_from_dataframe(df, instructions)

        mock_refine_schema_from_dataframe.assert_called_with(
            records_schema=schema,
            df=df,
            processing_instructions=instructions)
        self.assertEqual(result,
                         mock_refine_schema_from_dataframe.return_value)
示例#7
0
    def test_from_db_table(self, mock_RecordsSchemaKnownRepresentation,
                           mock_RecordsSchemaField):
        """from_db_table should introspect the table via the driver,
        producing one field per column and an 'origin' representation."""
        mock_schema_name = Mock(name='schema_name')
        mock_table_name = Mock(name='table_name')
        mock_driver = Mock(name='driver')
        mock_column = Mock(name='column')
        # The driver's table() lookup yields a table with a single column.
        mock_table = mock_driver.table.return_value
        mock_table.columns = [mock_column]
        mock_origin_representation =\
            mock_RecordsSchemaKnownRepresentation.from_db_driver.return_value

        mock_known_representations = {
            'origin': mock_origin_representation,
        }
        mock_field = mock_RecordsSchemaField.from_sqlalchemy_column.return_value
        actual_schema = RecordsSchema.from_db_table(
            schema_name=mock_schema_name,
            table_name=mock_table_name,
            driver=mock_driver)
        # Table lookup and representation inference both go through the driver.
        mock_driver.table.assert_called_with(mock_schema_name, mock_table_name)
        mock_RecordsSchemaKnownRepresentation.from_db_driver.assert_called_with(
            mock_driver, mock_schema_name, mock_table_name)
        # Each column is converted with the origin representation's type.
        mock_RecordsSchemaField.\
            from_sqlalchemy_column.assert_called_with(column=mock_column,
                                                      driver=mock_driver,
                                                      rep_type=mock_origin_representation.type)

        self.assertEqual(actual_schema.fields, [mock_field])
        self.assertEqual(actual_schema.known_representations,
                         mock_known_representations)
示例#8
0
    def test_dateformat(self) -> None:
        """For every supported dateformat hint, pandas_read_csv_options
        should produce options (e.g. dayfirst) that let pandas.read_csv
        parse a sample date back to the expected year/month/day."""
        class DateFormatExpectations(TypedDict):
            # Use the datetimeformat/datetimeformattz which is
            # compatible, as pandas doesn't let you configure those
            # separately
            dayfirst: bool

        testcases: Dict[HintDateFormat, DateFormatExpectations] = {
            'YYYY-MM-DD': {
                'dayfirst': False,
            },
            'MM-DD-YYYY': {
                'dayfirst': False,
            },
            'DD-MM-YYYY': {
                'dayfirst': True,
            },
            'MM/DD/YY': {
                'dayfirst': False,
            },
            'DD/MM/YY': {
                'dayfirst': True,
            },
            'DD-MM-YY': {
                'dayfirst': True,
            },
        }
        for dateformat in DATE_CASES:
            records_format = DelimitedRecordsFormat(hints={
                'dateformat': dateformat,
                'datetimeformat': f"{dateformat} HH:MI:SS",
                'datetimeformattz': f"{dateformat} HH:MI:SSOF",
                'compression': None,
            })
            records_schema = RecordsSchema.from_data({
                'schema': 'bltypes/v1',
                'fields': {
                    'first': {
                        'type': 'date'
                    }
                },
            })
            unhandled_hints = set(records_format.hints)
            processing_instructions = ProcessingInstructions()
            expectations = testcases[dateformat]
            try:
                options = pandas_read_csv_options(records_format,
                                                  records_schema,
                                                  unhandled_hints,
                                                  processing_instructions)
            except NotImplementedError:
                self.fail(f'Could not handle combination for {dateformat}')
            # All expected option key/value pairs must appear in the output.
            self.assertTrue(all(item in options.items() for item in expectations.items()))
            # Round-trip: parse a sample written in this dateformat.
            fileobj = io.StringIO(create_sample(dateformat))
            df = pandas.read_csv(filepath_or_buffer=fileobj,
                                 **options)
            timestamp = df['untitled_0'][0]
            self.assertEqual(timestamp.year, SAMPLE_YEAR)
            self.assertEqual(timestamp.month, SAMPLE_MONTH)
            self.assertEqual(timestamp.day, SAMPLE_DAY)
示例#9
0
    def test_prep_df_for_csv_output_include_index(self):
        """With include_index=True, prep_df_for_csv_output should render the
        index as a date string and time columns as HH:MM:SS strings."""
        schema_data = {
            'schema': "bltypes/v1",
            'fields': {
                "date": {
                    "type": "date",
                    "index": 1,
                },
                "time": {
                    "type": "time",
                    "index": 2,
                },
                "timetz": {
                    "type": "timetz",
                    "index": 3,
                },
            }
        }
        records_format = DelimitedRecordsFormat(variant='bluelabs')
        records_schema = RecordsSchema.from_data(schema_data)
        processing_instructions = ProcessingInstructions()
        # us_eastern = pytz.timezone('US/Eastern')
        data = {
            'time': [
                pd.Timestamp(year=1970, month=1, day=1,
                             hour=12, minute=33, second=53, microsecond=1234)
            ],
            # timetz is not well supported in records mover yet.  For
            # instance, specifying how it's turned into a CSV is not
            # currently part of the records spec:
            #
            #   https://github.com/bluelabsio/records-mover/issues/76
            #
            # In addition, Vertica suffers from a driver limitation:
            #
            #   https://github.com/bluelabsio/records-mover/issues/77
            #
            # 'timetz': [
            #     us_eastern.localize(pd.Timestamp(year=1970, month=1, day=1,
            #                                      hour=12, minute=33, second=53,
            #                                      microsecond=1234)),
            # ],
        }
        # The index holds the 'date' field; 'timetz' stays an empty column.
        df = pd.DataFrame(data,
                          index=[pd.Timestamp(year=1970, month=1, day=1)],
                          columns=['time', 'timetz'])

        new_df = prep_df_for_csv_output(df=df,
                                        include_index=True,
                                        records_schema=records_schema,
                                        records_format=records_format,
                                        processing_instructions=processing_instructions)
        self.assertEqual(new_df.index[0], '1970-01-01')
        self.assertEqual(new_df['time'][0], '12:33:53')
        # self.assertEqual(new_df['timetz'][0], '12:33:53-05')
        self.assertIsNotNone(new_df)
 def test_pandas_read_csv_options_bzip(self):
     """A 'BZIP' compression hint maps to pandas' 'bz2' option."""
     records_format = DelimitedRecordsFormat(hints={'compression': 'BZIP'})
     records_schema = RecordsSchema.from_data({'schema': 'bltypes/v1'})
     unhandled = set(records_format.hints)
     instructions = ProcessingInstructions()
     options = pandas_read_csv_options(records_format, records_schema,
                                       unhandled, instructions)
     for key, value in {'compression': 'bz2'}.items():
         self.assertEqual(options.get(key), value)
示例#11
0
 def test_json_roundtrip_redshift_v1(self):
     """A Redshift v1 schema JSON survives a from_json/to_json round trip
     without data loss."""
     here = os.path.dirname(os.path.realpath(__file__))
     with open(os.path.join(here, 'redshift_example_1.json')) as f:
         sample_str = f.read()
     records_schema = RecordsSchema.from_json(sample_str)
     output_data = json.loads(records_schema.to_json())
     self.assertDictEqual(json.loads(sample_str), output_data)
示例#12
0
    def test_timeonlyformat(self):
        """Both pandas Timestamp and datetime.time values in time-typed
        columns should be rendered per the timeonlyformat hint."""
        schema_data = {
            'schema': "bltypes/v1",
            'fields': {
                "time_as_timestamp": {
                    "type": "time",
                    "index": 1,
                },
                "time_as_time": {
                    "type": "time",
                    "index": 2,
                },
            }
        }
        records_schema = RecordsSchema.from_data(schema_data)
        processing_instructions = ProcessingInstructions()
        for timeonlyformat in TIMEONLY_CASES:
            records_format = DelimitedRecordsFormat(variant='bluelabs',
                                                    hints={
                                                        'timeonlyformat': timeonlyformat,
                                                    })
            # us_eastern = pytz.timezone('US/Eastern')
            # Same wall-clock time expressed two ways: full timestamp and
            # bare time-of-day.
            time_as_timestamp = pd.Timestamp(year=SAMPLE_YEAR, month=SAMPLE_MONTH, day=SAMPLE_DAY,
                                             hour=SAMPLE_HOUR, minute=SAMPLE_MINUTE,
                                             second=SAMPLE_SECOND)
            time_as_time = datetime.time(hour=SAMPLE_HOUR,
                                         minute=SAMPLE_MINUTE,
                                         second=SAMPLE_SECOND)
            data = {
                'time_as_timestamp': [
                    time_as_timestamp
                ],
                'time_as_time': [
                    time_as_time
                ],
            }
            df = pd.DataFrame(data, columns=['time_as_timestamp', 'time_as_time'])

            new_df = prep_df_for_csv_output(df=df,
                                            include_index=False,
                                            records_schema=records_schema,
                                            records_format=records_format,
                                            processing_instructions=processing_instructions)
            # Both representations should stringify to the same sample.
            self.assertEqual(new_df['time_as_timestamp'][0],
                             create_sample(timeonlyformat),
                             timeonlyformat)
            self.assertEqual(new_df['time_as_time'][0],
                             create_sample(timeonlyformat),
                             timeonlyformat)
            # self.assertEqual(new_df['timetz'][0], '12:33:53-05')
            self.assertIsNotNone(new_df)
示例#13
0
    def test_dateformat(self):
        """Both pandas Timestamp and datetime.date values in date-typed
        columns should be rendered per the dateformat hint."""
        schema_data = {
            'schema': "bltypes/v1",
            'fields': {
                "date_as_timestamp": {
                    "type": "date",
                    "index": 1,
                },
                "date_as_date": {
                    "type": "date",
                    # Fixed: this previously duplicated "index": 1; each
                    # field needs its own distinct position in the schema.
                    "index": 2,
                },
            }
        }
        records_schema = RecordsSchema.from_data(schema_data)
        processing_instructions = ProcessingInstructions()
        for dateformat in DATE_CASES:
            records_format = DelimitedRecordsFormat(variant='bluelabs',
                                                    hints={
                                                        'dateformat': dateformat
                                                    })
            # Same calendar day expressed two ways: Timestamp and bare date.
            data = {
                'date_as_timestamp': [
                    pd.Timestamp(year=SAMPLE_YEAR, month=SAMPLE_MONTH, day=SAMPLE_DAY)
                ],
                'date_as_date': [
                    datetime.date(year=SAMPLE_YEAR, month=SAMPLE_MONTH, day=SAMPLE_DAY)
                ],
            }
            df = pd.DataFrame(data,
                              columns=['date_as_timestamp', 'date_as_date'])

            new_df = prep_df_for_csv_output(df=df,
                                            include_index=False,
                                            records_schema=records_schema,
                                            records_format=records_format,
                                            processing_instructions=processing_instructions)
            # Both representations should stringify to the same sample.
            self.assertEqual(new_df['date_as_timestamp'][0],
                             create_sample(dateformat))
            self.assertEqual(new_df['date_as_date'][0],
                             create_sample(dateformat))
            self.assertIsNotNone(new_df)
示例#14
0
    def test_datetimeformat(self):
        """Datetime-typed columns pass through prep_df_for_csv_output
        unconverted; pandas' CSV writer formats them correctly itself."""
        schema_data = {
            'schema': "bltypes/v1",
            'fields': {
                # Fixed: this field was named "datetimez", which matched no
                # dataframe column below, so the schema never actually
                # described the data under test.
                "datetime": {
                    "type": "datetime",
                    "index": 1,
                },
            }
        }
        records_schema = RecordsSchema.from_data(schema_data)
        processing_instructions = ProcessingInstructions()
        for datetimeformat in DATETIME_CASES:
            records_format = DelimitedRecordsFormat(variant='bluelabs',
                                                    hints={
                                                        'datetimeformat': datetimeformat
                                                    })
            timestamp = pd.Timestamp(year=SAMPLE_YEAR, month=SAMPLE_MONTH, day=SAMPLE_DAY,
                                     hour=SAMPLE_HOUR, minute=SAMPLE_MINUTE,
                                     second=SAMPLE_SECOND)

            data = {
                'datetime': [
                    timestamp
                ],
            }
            df = pd.DataFrame(data, columns=['datetime'])

            new_df = prep_df_for_csv_output(df=df,
                                            include_index=False,
                                            records_schema=records_schema,
                                            records_format=records_format,
                                            processing_instructions=processing_instructions)
            # No conversion is done of datetime as pandas' CSV
            # outputter handles it properly, so we should expect the
            # original again
            self.assertEqual(new_df['datetime'][0],
                             timestamp,
                             create_sample(datetimeformat))
            self.assertIsNotNone(new_df)
 def setUp(self):
     """Build a schema fixture with one field of each date/time-flavored
     records type, in declaration order."""
     self.records_schema = RecordsSchema.from_data({
         'schema': 'bltypes/v1',
         'fields': {
             "date": {
                 "type": "date",
                 "index": 1,
             },
             "time": {
                 "type": "time",
                 "index": 2,
             },
             "timestamp": {
                 "type": "datetime",
                 "index": 3,
             },
             "timestamptz": {
                 "type": "datetimetz",
                 "index": 4,
             }
         }
     })
示例#16
0
 def test_timeonlyformat(self) -> None:
     """For every timeonlyformat hint, pandas_read_csv_options should let
     pandas parse a sample time to the expected hour/minute/second."""
     for timeonlyformat in TIMEONLY_CASES:
         records_format = DelimitedRecordsFormat(hints={
             'timeonlyformat': timeonlyformat,
             'compression': None,
         })
         records_schema = RecordsSchema.from_data({
             'schema': 'bltypes/v1',
             'fields': {
                 'first': {
                     'type': 'time'
                 }
             },
         })
         unhandled_hints = set(records_format.hints)
         processing_instructions = ProcessingInstructions()
         try:
             options = pandas_read_csv_options(records_format,
                                               records_schema,
                                               unhandled_hints,
                                               processing_instructions)
         except NotImplementedError:
             self.fail(f'Could not handle combination for {timeonlyformat}')
         # The single time column (position 0) must be marked for parsing.
         self.assertEqual(options['parse_dates'], [0])
         timeonly = create_sample(timeonlyformat)
         fileobj = io.StringIO(timeonly)
         df = pandas.read_csv(filepath_or_buffer=fileobj,
                              **options)
         timestamp = df['untitled_0'][0]
         self.assertIsInstance(timestamp, pandas.Timestamp,
                               f"Pandas did not parse {timeonly} as a timestamp object")
         self.assertEqual(timestamp.hour, SAMPLE_HOUR)
         self.assertEqual(timestamp.minute, SAMPLE_MINUTE)
         # Formats without a seconds component should parse as :00.
         if 'SS' in timeonlyformat:
             self.assertEqual(timestamp.second, SAMPLE_SECOND)
         else:
             self.assertEqual(timestamp.second, 0,
                              timeonly)
示例#17
0
def schema_from_dataframe(df: DataFrame,
                          processing_instructions: ProcessingInstructions,
                          include_index: bool) -> 'RecordsSchema':
    """Build a RecordsSchema describing *df*.

    One field is inferred per column (plus one for the index when
    include_index is true), and the dataframe itself is recorded as the
    'origin' known representation.
    """
    from records_mover.records.schema import RecordsSchema  # noqa
    from records_mover.records.schema.field import RecordsSchemaField  # noqa
    known_representations: Dict[str, RecordsSchemaKnownRepresentation] = {
        'origin': RecordsSchemaKnownRepresentation.from_dataframe(
            df, processing_instructions)
    }

    fields = []
    if include_index:
        fields.append(RecordsSchemaField.from_index(
            df.index, processing_instructions=processing_instructions))
    fields.extend(
        RecordsSchemaField.from_series(
            df[column], processing_instructions=processing_instructions)
        for column in df)

    return RecordsSchema(fields=fields,
                         known_representations=known_representations)
示例#18
0
def refine_schema_from_dataframe(records_schema: 'RecordsSchema',
                                 df: DataFrame,
                                 processing_instructions:
                                 ProcessingInstructions = ProcessingInstructions()) ->\
        'RecordsSchema':
    """Return a new RecordsSchema with each field refined by sampling *df*.

    At most processing_instructions.max_inference_rows rows are sampled;
    each field is refined from its same-named column of the sample.
    """
    # NOTE(review): the ProcessingInstructions() default is evaluated once at
    # import time and shared across all calls.  Only max_inference_rows is
    # read here, so this looks benign today -- confirm the instance is never
    # mutated by callers.
    from records_mover.records.schema import RecordsSchema

    max_sample_size = processing_instructions.max_inference_rows
    total_rows = len(df.index)
    # Sample only when a cap is configured and the frame exceeds it.
    if max_sample_size is not None and max_sample_size < total_rows:
        sampled_df = df.sample(n=max_sample_size)
    else:
        sampled_df = df
    rows_sampled = len(sampled_df.index)

    fields = [
        field.refine_from_series(sampled_df[field.name],
                                 total_rows=total_rows,
                                 rows_sampled=rows_sampled)
        for field in records_schema.fields
    ]
    return RecordsSchema(
        fields=fields,
        known_representations=records_schema.known_representations)
示例#19
0
 def test_pandas_numeric_types_and_constraints(self):
     """Each numpy numeric dtype should be inferred into the expected
     min/max (integers) or significand/total-bit (floats) constraints."""
     self.maxDiff = None
     # https://docs.scipy.org/doc/numpy/reference/arrays.scalars.html
     # https://stackoverflow.com/a/53828986/9795956
     dtypes = np.dtype([
         ('int8', np.int8),
         ('int16', np.int16),
         ('int32', np.int32),
         ('int64', np.int64),
         ('ubyte', np.ubyte),
         ('uint8', np.uint8),
         ('uint16', np.uint16),
         ('uint32', np.uint32),
         ('uint64', np.uint64),
         ('float16', np.float16),
         ('float32', np.float32),
         ('float64', np.float64),
         # 'float96', np.float96), # not supported by numpy on macOS on amd64, apparantly
         ('float128', np.float128),
     ])
     # Empty frame: inference works from dtypes alone, no row data needed.
     data = np.empty(0, dtype=dtypes)
     df = pd.DataFrame(data)
     processing_instructions = ProcessingInstructions()
     schema = RecordsSchema.from_dataframe(df,
                                           processing_instructions,
                                           include_index=False)
     data = schema.to_data()
     fields = data['fields']
     fields_and_constraints = {
         field_name: fields[field_name]['constraints']
         for field_name in fields
     }
     # Integer bounds are serialized as strings to avoid JSON precision
     # issues; float constraints use IEEE-754 bit widths.
     expected_fields = {
         'int8': {
             'required': False,
             'unique': False,
             'min': '-128',
             'max': '127'
         },
         'float128': {
             'fp_significand_bits': 64,
             'fp_total_bits': 80,
             'required': False,
             'unique': False
         },
         'float16': {
             'fp_significand_bits': 11,
             'fp_total_bits': 16,
             'required': False,
             'unique': False
         },
         'float32': {
             'fp_significand_bits': 23,
             'fp_total_bits': 32,
             'required': False,
             'unique': False
         },
         'float64': {
             'fp_significand_bits': 53,
             'fp_total_bits': 64,
             'required': False,
             'unique': False
         },
         'int16': {
             'max': '32767',
             'min': '-32768',
             'required': False,
             'unique': False
         },
         'int32': {
             'max': '2147483647',
             'min': '-2147483648',
             'required': False,
             'unique': False
         },
         'int64': {
             'max': '9223372036854775807',
             'min': '-9223372036854775808',
             'required': False,
             'unique': False
         },
         'ubyte': {
             'max': '255',
             'min': '0',
             'required': False,
             'unique': False
         },
         'uint16': {
             'max': '65535',
             'min': '0',
             'required': False,
             'unique': False
         },
         'uint32': {
             'max': '4294967295',
             'min': '0',
             'required': False,
             'unique': False
         },
         'uint64': {
             'max': '18446744073709551615',
             'min': '0',
             'required': False,
             'unique': False
         },
         'uint8': {
             'max': '255',
             'min': '0',
             'required': False,
             'unique': False
         }
     }
     self.assertEqual(fields_and_constraints, expected_fields)
示例#20
0
 def test_str(self):
     """An empty schema stringifies to a types-free representation."""
     schema = RecordsSchema(fields=[], known_representations={})
     self.assertEqual(str(schema), "RecordsSchema(types={})")
示例#21
0
    def test_datetimeformattz(self) -> None:
        """For every datetimeformattz hint, pandas_read_csv_options should
        let pandas parse a sample timestamp back to the expected
        date and time components."""
        class DateTimeFormatTzExpectations(TypedDict):
            # Use the datetimeformat/datetimeformattz which is
            # compatible, as pandas doesn't let you configure those
            # separately
            dayfirst: bool

        testcases: Dict[HintDateTimeFormatTz, DateTimeFormatTzExpectations] = {
            'YYYY-MM-DD HH:MI:SSOF': {
                'dayfirst': False,
            },
            'YYYY-MM-DD HH:MI:SS': {
                'dayfirst': False,
            },
            'YYYY-MM-DD HH24:MI:SSOF': {
                'dayfirst': False,
            },
            'MM/DD/YY HH24:MI': {
                'dayfirst': False,
            },
        }
        for datetimeformattz in DATETIMETZ_CASES:
            records_format = DelimitedRecordsFormat(hints={
                'datetimeformattz': datetimeformattz,
                'compression': None,
            })
            records_schema = RecordsSchema.from_data({
                'schema': 'bltypes/v1',
                'fields': {
                    'first': {
                        'type': 'datetimetz'
                    }
                },
            })
            unhandled_hints = set(records_format.hints)
            processing_instructions = ProcessingInstructions()
            expectations = testcases[datetimeformattz]
            try:
                options = pandas_read_csv_options(records_format,
                                                  records_schema,
                                                  unhandled_hints,
                                                  processing_instructions)
            except NotImplementedError:
                self.fail(f'Could not handle combination for {datetimeformattz}')
            # The single datetimetz column (position 0) must be parsed, and
            # all expected option key/value pairs must appear in the output.
            self.assertEqual(options['parse_dates'], [0])
            self.assertTrue(all(item in options.items() for item in expectations.items()))
            datetimetz = create_sample(datetimeformattz)
            fileobj = io.StringIO(datetimetz)
            df = pandas.read_csv(filepath_or_buffer=fileobj,
                                 **options)
            timestamp = df['untitled_0'][0]
            self.assertIsInstance(timestamp, pandas.Timestamp,
                                  f"Pandas did not parse {datetimetz} as a timestamp object")
            self.assertEqual(timestamp.year, SAMPLE_YEAR)
            self.assertEqual(timestamp.month, SAMPLE_MONTH)
            self.assertEqual(timestamp.day, SAMPLE_DAY)
            self.assertEqual(timestamp.hour, SAMPLE_HOUR)
            self.assertEqual(timestamp.minute, SAMPLE_MINUTE)
            # Formats without a seconds component should parse as :00.
            if 'SS' in datetimeformattz:
                self.assertEqual(timestamp.second, SAMPLE_SECOND)
            else:
                self.assertEqual(timestamp.second, 0)
示例#22
0
 def test_from_fileobjs(self, mock_stream_csv, mock_RecordsSchema):
     """from_fileobjs should read a sample chunk, drop all-null 'Unnamed'
     columns, rewind the fileobj, and refine a schema from the cleaned-up
     dataframe."""
     mock_fileobj = Mock(name='fileobj')
     mock_fileobjs = [mock_fileobj]
     mock_records_format = Mock(name='records_format')
     mock_processing_instructions = Mock(name='processing_instructions')
     # Seekable stream allows rewinding after sampling.
     mock_fileobj.seekable.return_value = True
     mock_reader = mock_stream_csv.return_value.__enter__.return_value
     # Sample chunk includes a spurious all-None 'Unnamed: 1' column, as
     # produced by trailing delimiters in CSV input.
     data = [
         {
             'Country': 'Belgium',
             'Capital': 'Brussels',
             'Population': 11190846,
             'Unnamed: 1': None
         },
         {
             'Country': 'India',
             'Capital': 'New Delhi',
             'Population': 1303171035,
             'Unnamed: 1': None
         },
         {
             'Country': 'Brazil',
             'Capital': 'Brasília',
             'Population': 207847528,
             'Unnamed: 1': None
         },
     ]
     df = DataFrame.from_dict(data)
     mock_reader.get_chunk.return_value = df
     out = RecordsSchema.from_fileobjs(mock_fileobjs, mock_records_format,
                                       mock_processing_instructions)
     # Sampling is capped by the processing instructions, and the stream
     # is rewound so callers can re-read it from the start.
     mock_reader.get_chunk.assert_called_with(
         mock_processing_instructions.max_inference_rows)
     mock_fileobj.seek.assert_called_with(0)
     mock_RecordsSchema.from_dataframe.assert_called_with(
         ANY, mock_processing_instructions, include_index=False)
     # Inspect the dataframe actually passed to from_dataframe.
     actual_cleaned_up_df = mock_RecordsSchema.from_dataframe.mock_calls[0][
         1][0]
     actual_cleaned_up_df_data = actual_cleaned_up_df.to_dict(
         orient='records')
     # The 'Unnamed: 1' column should have been dropped.
     expected_cleaned_up_df_data = [
         {
             'Country': 'Belgium',
             'Capital': 'Brussels',
             'Population': 11190846
         },
         {
             'Country': 'India',
             'Capital': 'New Delhi',
             'Population': 1303171035
         },
         {
             'Country': 'Brazil',
             'Capital': 'Brasília',
             'Population': 207847528
         },
     ]
     self.assertEqual(actual_cleaned_up_df_data,
                      expected_cleaned_up_df_data)
     self.assertEqual(
         out, mock_RecordsSchema.from_dataframe.return_value.
         refine_from_dataframe.return_value)