Exemplo n.º 1
0
    def test_table_publish_mixed_type_column(self, mock_session_helper,
                                             mock_create_table):
        """Publishing a dataframe whose text column has an int mixed in
        still forwards the derived dtype maps to the table-creation call.
        """
        dataframe = setup_grouped_dataframe()
        bucket, key = self.setup_s3()
        partitions = []
        rs_params = self.setup_redshift_params()
        session = mock_session_helper(
            region=rs_params['region'],
            cluster_id=rs_params['cluster_id'],
            host=rs_params['host'],
            port=rs_params['port'],
            db_name=rs_params['db_name'],
        )
        session.configure_session_helper()

        # Make the text column mixed-type by injecting an integer value.
        text_col_idx = dataframe.columns.get_loc("text_col")
        dataframe.iat[5, text_col_idx] = 45

        parq.publish(bucket=bucket, key=key, dataframe=dataframe,
                     partitions=partitions, redshift_params=rs_params)

        expected_df_types = parq._get_dataframe_datatypes(
            dataframe, partitions)
        expected_partition_types = parq._get_dataframe_datatypes(
            dataframe, partitions, True)

        mock_create_table.assert_called_once_with(
            rs_params['table_name'], rs_params['schema_name'],
            expected_df_types, expected_partition_types,
            parq.s3_url(bucket, key), session)
Exemplo n.º 2
0
    def test_table_publish(self, mock_session_helper, mock_create_table):
        """Publishing a partitioned dataframe passes the dataframe and
        partition dtype maps through to the table-creation call.
        """
        with get_s3_client() as s3_client:
            dataframe = setup_grouped_dataframe()
            bucket, key = self.setup_s3(s3_client)
            partitions = ["text_col", "int_col", "float_col"]
            rs_params = self.setup_redshift_params()
            session = mock_session_helper(
                region=rs_params['region'],
                cluster_id=rs_params['cluster_id'],
                host=rs_params['host'],
                port=rs_params['port'],
                db_name=rs_params['db_name'],
            )
            session.configure_session_helper()

            parq.publish(bucket=bucket, key=key, dataframe=dataframe,
                         partitions=partitions, redshift_params=rs_params)

            expected_df_types = parq._get_dataframe_datatypes(
                dataframe, partitions)
            expected_partition_types = parq._get_dataframe_datatypes(
                dataframe, partitions, True)

            mock_create_table.assert_called_once_with(
                rs_params['table_name'], rs_params['schema_name'],
                expected_df_types, expected_partition_types,
                parq.s3_url(bucket, key), session)
Exemplo n.º 3
0
 def test_dataframe_sans_partitions(self):
     """Non-partition dtype map equals the full dtype map minus the
     partition columns.
     """
     dataframe = setup_grouped_dataframe()
     partitions = ["text_col", "int_col", "float_col"]
     expected = {
         col: dtype
         for col, dtype in dataframe.dtypes.to_dict().items()
         if col not in partitions
     }
     assert parq._get_dataframe_datatypes(dataframe, partitions) == expected
Exemplo n.º 4
0
 def test_partition_datatypes(self):
     """Partition dtype map contains exactly the partition columns with
     their pandas dtype names.
     """
     columns, dataframe = self.setup_df()
     partitions = ["text_col", "int_col", "float_col"]
     expected = {
         'text_col': 'object',
         'int_col': 'int64',
         'float_col': 'float64',
     }
     result = parq._get_dataframe_datatypes(dataframe, partitions, True)
     assert result == expected
Exemplo n.º 5
0
 def test_df_datatypes(self):
     """With no partitions, every column appears in the dtype map."""
     columns, dataframe = self.setup_df()
     expected = {
         'grouped_col': 'object',
         'text_col': 'object',
         'int_col': 'int64',
         'float_col': 'float64',
     }
     assert parq._get_dataframe_datatypes(dataframe) == expected
Exemplo n.º 6
0
 def test_partition_datatypes(self):
     """Partition dtype map matches the dtypes pandas reports for the
     partition columns.
     """
     dataframe = setup_grouped_dataframe()
     partitions = ["text_col", "int_col", "float_col"]
     all_dtypes = dataframe.dtypes.to_dict()
     expected = {col: all_dtypes.pop(col, None) for col in partitions}
     result = parq._get_dataframe_datatypes(dataframe, partitions, True)
     assert result == expected
Exemplo n.º 7
0
 def test_df_datatypes(self):
     """With no partitions, the dtype map equals pandas' own dtypes dict."""
     dataframe = setup_grouped_dataframe()
     expected = dataframe.dtypes.to_dict()
     assert parq._get_dataframe_datatypes(dataframe) == expected
Exemplo n.º 8
0
 def test_dataframe_sans_partitions(self):
     """Only the non-partition column survives in the dataframe dtype map."""
     columns, dataframe = self.setup_df()
     partitions = ["text_col", "int_col", "float_col"]
     result = parq._get_dataframe_datatypes(dataframe, partitions)
     assert result == {'grouped_col': 'object'}