示例#1
0
 def test_cast_dataframe_types_no_fields(self):
     # A schema built with an empty field list should hand back whatever
     # the dataframe's apply() call produced.
     df = Mock(name='df')
     empty_schema = RecordsSchema(
         fields=[],
         known_representations=Mock(name='known_representations'))
     result = empty_schema.cast_dataframe_types(df)
     self.assertEqual(result, df.apply.return_value)
示例#2
0
 def test_to_schema_sql(self, mock_schema_to_schema_sql):
     # mock_schema_to_schema_sql is presumably injected by a patch
     # decorator outside this view -- confirm against the enclosing class.
     driver = Mock(name='driver')
     schema_name = Mock(name='schema_name')
     table_name = Mock(name='table_name')
     schema = RecordsSchema(fields=[], known_representations={})

     result = schema.to_schema_sql(driver, schema_name, table_name)

     # The schema must forward itself plus all three arguments by keyword,
     # and return the helper's result untouched.
     mock_schema_to_schema_sql.assert_called_with(driver=driver,
                                                  records_schema=schema,
                                                  schema_name=schema_name,
                                                  table_name=table_name)
     self.assertEqual(result, mock_schema_to_schema_sql.return_value)
示例#3
0
 def test_assign_dataframe_names_no_index(self):
     # With include_index=False, the single column 'a' should come back
     # named after the schema's single field ('mya').
     field_a = Mock(name='field_a')
     field_a.name = 'mya'
     schema = RecordsSchema(
         fields=[field_a],
         known_representations=Mock(name='known_representations'))
     df = DataFrame.from_dict([{'a': 1}])

     renamed = schema.assign_dataframe_names(False, df)

     self.assertEqual(renamed.to_dict(orient='records'), [{'mya': 1}])
示例#4
0
 def test_assign_dataframe_names_with_index(self):
     df = DataFrame.from_dict([{'b': 1}])
     # Sanity-check the starting shape: default index 0, one column 'b'.
     self.assertEqual(df.to_dict(orient='index'), {0: {'b': 1}})

     # Two fields: the first is expected to apply to the index, the
     # second to the only column (see the assertions below).
     fields = []
     for mock_name, field_name in (('field_a', 'mya'), ('field_b', 'myb')):
         field = Mock(name=mock_name)
         field.name = field_name
         fields.append(field)
     schema = RecordsSchema(
         fields=fields,
         known_representations=Mock(name='known_representations'))

     renamed = schema.assign_dataframe_names(True, df)

     self.assertEqual(renamed.to_dict(orient='records'), [{'myb': 1}])
     self.assertEqual(renamed.to_dict(orient='index'), {'mya': {'myb': 1}})
示例#5
0
    def test_refine_from_dataframe(self, mock_refine_schema_from_dataframe):
        # mock_refine_schema_from_dataframe is presumably injected by a
        # patch decorator outside this view -- confirm against the class.
        df = Mock(name='df')
        processing_instructions = Mock(name='processing_instructions')
        schema = RecordsSchema(
            fields=Mock(name='fields'),
            known_representations=Mock(name='known_representations'))

        result = schema.refine_from_dataframe(df, processing_instructions)

        # The method should delegate by keyword, passing itself as
        # records_schema, and return the helper's result unchanged.
        mock_refine_schema_from_dataframe.assert_called_with(
            records_schema=schema,
            df=df,
            processing_instructions=processing_instructions)
        self.assertEqual(result, mock_refine_schema_from_dataframe.return_value)
示例#6
0
def schema_from_dataframe(df: DataFrame,
                          processing_instructions: ProcessingInstructions,
                          include_index: bool) -> 'RecordsSchema':
    """Build a RecordsSchema describing the given DataFrame.

    :param df: DataFrame to describe.
    :param processing_instructions: Passed through to every
        representation/field constructor called here.
    :param include_index: When true, a field built from ``df.index`` is
        placed ahead of the per-column fields.
    :return: A RecordsSchema with one field per column (optionally preceded
        by the index field) and a single known representation stored under
        the ``'origin'`` key.
    """
    # Imported locally (original behaviour); presumably to avoid a circular
    # import between the schema modules -- confirm before hoisting.
    from records_mover.records.schema import RecordsSchema  # noqa
    from records_mover.records.schema.field import RecordsSchemaField  # noqa

    known_representations: Dict[str, RecordsSchemaKnownRepresentation] = {
        'origin': RecordsSchemaKnownRepresentation.from_dataframe(df,
                                                                  processing_instructions),
    }

    # The index field, when requested, always comes first.
    if include_index:
        index_fields = [
            RecordsSchemaField.from_index(df.index,
                                          processing_instructions=processing_instructions)
        ]
    else:
        index_fields = []

    # Iterating a DataFrame yields its column labels.
    column_fields = [
        RecordsSchemaField.from_series(df[column_label],
                                       processing_instructions=processing_instructions)
        for column_label in df
    ]

    return RecordsSchema(fields=index_fields + column_fields,
                         known_representations=known_representations)
示例#7
0
def refine_schema_from_dataframe(records_schema: 'RecordsSchema',
                                 df: DataFrame,
                                 processing_instructions:
                                 ProcessingInstructions = ProcessingInstructions()) ->\
        'RecordsSchema':
    """Return a new RecordsSchema with each field refined from its column in ``df``.

    If ``processing_instructions.max_inference_rows`` is set and is smaller
    than the number of rows in ``df``, only a ``DataFrame.sample`` of that
    many rows is inspected; otherwise the whole frame is used.

    :param records_schema: Schema whose fields are refined; each field's
        ``name`` is used to look up its column in ``df``.
    :param df: DataFrame supplying the data to refine against.
    :param processing_instructions: Supplies ``max_inference_rows``.
        NOTE: the default instance is created once, when this function is
        defined, and is shared by every call that omits the argument.
    :return: A new RecordsSchema carrying the refined fields and the
        original schema's known representations.
    """
    from records_mover.records.schema import RecordsSchema

    row_limit = processing_instructions.max_inference_rows
    total_rows = len(df.index)
    exceeds_limit = row_limit is not None and row_limit < total_rows
    sampled_df = df.sample(n=row_limit) if exceeds_limit else df
    rows_sampled = len(sampled_df.index)

    # Each field is told both the full and the sampled row counts.
    refined_fields = []
    for field in records_schema.fields:
        refined = field.refine_from_series(sampled_df[field.name],
                                           total_rows=total_rows,
                                           rows_sampled=rows_sampled)
        refined_fields.append(refined)

    return RecordsSchema(fields=refined_fields,
                         known_representations=records_schema.known_representations)
示例#8
0
 def test_str(self):
     # A schema with no fields renders with an empty types mapping.
     empty_schema = RecordsSchema(fields=[], known_representations={})
     rendered = str(empty_schema)
     self.assertEqual(rendered, "RecordsSchema(types={})")