def test_s3_parquet_to_dataframe():
    with get_s3_client() as s3_client:
        columns = {
            "string_col": "string",
            "int_col": "integer",
            "float_col": "float",
            "bool_col": "boolean",
            "datetime_col": "datetime"
        }

        bucket = "foobucket"
        key = "fookey"

        df = setup_grouped_dataframe(count=10, columns=columns)
        bucket, parquet_paths = setup_partitioned_parquet(
            dataframe=df,
            bucket=bucket,
            key=key,
            partition_data_types={},
            s3_client=s3_client)

        first_published_file = parquet_paths[0]
        response = fetch_parq._s3_parquet_to_dataframe(
            bucket=bucket, key=first_published_file, partition_metadata={})

        assert isinstance(response, pd.DataFrame)
        for col in columns.keys():
            assert (col in response.columns)

        assert response.shape == df.shape
        sorted_dfs_equal_by_pandas_testing(response, df)
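
These examples all rely on a setup_grouped_dataframe helper that is not reproduced in this listing. Below is a minimal stand-in inferred only from how the tests call it (count plus a name-to-type dict); the body is an assumption for illustration, not the actual s3parq test utility.

# Hypothetical stand-in for the setup_grouped_dataframe test helper.
# Only the call signature (count, columns) and the type names used by the
# tests above are taken from the listing; all generated values are made up.
import numpy as np
import pandas as pd

def setup_grouped_dataframe(count=10, columns=None):
    columns = columns or {"string_col": "string", "int_col": "integer",
                          "float_col": "float", "bool_col": "boolean",
                          "datetime_col": "datetime", "text_col": "string",
                          "metrics": "int"}
    generators = {
        "string": lambda n: [f"row-{i}" for i in range(n)],
        "integer": lambda n: np.arange(n, dtype="int64"),
        "int": lambda n: np.arange(n, dtype="int64"),
        "float": lambda n: np.linspace(0.0, 1.0, num=n),
        "boolean": lambda n: [i % 2 == 0 for i in range(n)],
        "bool": lambda n: [i % 2 == 0 for i in range(n)],
        "datetime": lambda n: pd.date_range("2021-01-01", periods=n),
    }
    return pd.DataFrame({name: generators[dtype](count)
                         for name, dtype in columns.items()})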
Example #2
    def test_table_publish_mixed_type_column(self, mock_session_helper,
                                             mock_create_table):
        dataframe = setup_grouped_dataframe()
        bucket, key = self.setup_s3()
        partitions = []
        redshift_params = self.setup_redshift_params()
        msh = mock_session_helper(region=redshift_params['region'],
                                  cluster_id=redshift_params['cluster_id'],
                                  host=redshift_params['host'],
                                  port=redshift_params['port'],
                                  db_name=redshift_params['db_name'])

        msh.configure_session_helper()

        dataframe.iat[5, dataframe.columns.get_loc("text_col")] = 45

        parq.publish(bucket=bucket,
                     key=key,
                     dataframe=dataframe,
                     partitions=partitions,
                     redshift_params=redshift_params)

        df_types = parq._get_dataframe_datatypes(dataframe, partitions)
        partition_types = parq._get_dataframe_datatypes(
            dataframe, partitions, True)

        mock_create_table.assert_called_once_with(
            redshift_params['table_name'], redshift_params['schema_name'],
            df_types, partition_types, parq.s3_url(bucket, key), msh)
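
The Redshift publish tests read a fixed set of keys out of setup_redshift_params. A hypothetical fixture of that shape is sketched below; the keys are the ones the tests actually access, while every value is a placeholder.

# Hypothetical stand-in for the setup_redshift_params fixture used by the
# publish tests; every value here is an invented placeholder.
def setup_redshift_params():
    return {
        "region": "us-east-1",
        "cluster_id": "example-cluster",
        "host": "example-cluster.abc123.us-east-1.redshift.amazonaws.com",
        "port": "5439",
        "db_name": "example_db",
        "schema_name": "example_schema",
        "table_name": "example_table",
        "iam_role": "arn:aws:iam::123456789012:role/example-redshift-role",
    }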
Example #3
    def test_table_publish(self, mock_session_helper, mock_create_table):
        with get_s3_client() as s3_client:
            dataframe = setup_grouped_dataframe()
            bucket, key = self.setup_s3(s3_client)
            partitions = ["text_col", "int_col", "float_col"]
            redshift_params = self.setup_redshift_params()
            msh = mock_session_helper(region=redshift_params['region'],
                                      cluster_id=redshift_params['cluster_id'],
                                      host=redshift_params['host'],
                                      port=redshift_params['port'],
                                      db_name=redshift_params['db_name'])

            msh.configure_session_helper()
            parq.publish(bucket=bucket,
                         key=key,
                         dataframe=dataframe,
                         partitions=partitions,
                         redshift_params=redshift_params)

            df_types = parq._get_dataframe_datatypes(dataframe, partitions)
            partition_types = parq._get_dataframe_datatypes(
                dataframe, partitions, True)

            mock_create_table.assert_called_once_with(
                redshift_params['table_name'], redshift_params['schema_name'],
                df_types, partition_types, parq.s3_url(bucket, key), msh)
Example #4
def test_s3_partitioned_parquet_to_dataframe():
    partition_types = {"string_col": "string",
                       "int_col": "integer",
                       "float_col": "float",
                       "bool_col": "boolean",
                       "datetime_col": "datetime"}
    columns = dict(partition_types)
    columns["metrics"] = "int"

    bucket = "foobucket"
    key = "fookey"

    df = setup_grouped_dataframe(count=10, columns=columns)
    bucket, parquet_paths = setup_partitioned_parquet(
        dataframe=df,
        bucket=bucket,
        key=key,
        partition_data_types=partition_types
    )

    first_published_file = parquet_paths[0]
    response = fetch_parq._s3_parquet_to_dataframe(
        bucket=bucket, key=first_published_file, partition_metadata=partition_types)

    assert isinstance(response, pd.DataFrame)
    for col in columns.keys():
        assert (col in response.columns)

    # Read each partition file back and concatenate into a single dataframe
    full_response = pd.concat([
        fetch_parq._s3_parquet_to_dataframe(
            bucket=bucket, key=path, partition_metadata=partition_types)
        for path in parquet_paths
    ])

    assert full_response.shape == df.shape
    sorted_dfs_equal_by_pandas_testing(full_response, df)
Example #5
    def test_works_without_partitions(self):
        dataframe = setup_grouped_dataframe()
        bucket, key = self.setup_s3()
        partitions = []
        parq.publish(bucket=bucket,
                     key=key,
                     dataframe=dataframe,
                     partitions=partitions)
Example #6
    def test_dataframe_sans_partitions(self):
        dataframe = setup_grouped_dataframe()
        partitions = ["text_col", "int_col", "float_col"]
        dataframe_dtypes = dataframe.dtypes.to_dict()
        for part in partitions:
            dataframe_dtypes.pop(part, None)
        assert parq._get_dataframe_datatypes(dataframe,
                                             partitions) == dataframe_dtypes
Example #7
    def test_no_redshift_publish(self):
        dataframe = setup_grouped_dataframe()
        bucket, key = self.setup_s3()
        partitions = []
        parq.publish(bucket=bucket,
                     key=key,
                     dataframe=dataframe,
                     partitions=partitions)
Example #8
    def test_generates_partitions_in_order(self):
        dataframe = setup_grouped_dataframe()
        bucket, key = self.setup_s3()
        partitions = dataframe.columns[:2]
        with patch('s3parq.publish_parq.pq.write_to_dataset',
                   return_value=None) as mock_method:
            parq._gen_parquet_to_s3(bucket, key, dataframe, partitions)
            arg, kwarg = mock_method.call_args
            assert (kwarg['partition_cols'] == partitions).all()
Example #9
    def test_no_redshift_publish(self):
        with get_s3_client() as s3_client:
            dataframe = setup_grouped_dataframe()
            bucket, key = self.setup_s3(s3_client)
            partitions = []
            parq.publish(bucket=bucket,
                         key=key,
                         dataframe=dataframe,
                         partitions=partitions)
Example #10
    def test_partition_datatypes(self):
        dataframe = setup_grouped_dataframe()
        partitions = ["text_col", "int_col", "float_col"]
        dataframe_dtypes = dataframe.dtypes.to_dict()
        part_dtypes = {}
        for part in partitions:
            part_dtypes[part] = dataframe_dtypes.pop(part, None)
        assert parq._get_dataframe_datatypes(dataframe, partitions,
                                             True) == part_dtypes
Example #11
    def test_reject_timedelta_dataframes(self):
        dataframe = setup_grouped_dataframe()
        bucket, key = self.setup_s3()
        partitions = ['text_col']
        dataframe['time_col'] = pd.Timedelta('1 days')
        with pytest.raises(NotImplementedError):
            parq.publish(bucket=bucket,
                         key=key,
                         dataframe=dataframe,
                         partitions=partitions)
Example #12
    def test_input_equals_output(self):
        dataframe = setup_grouped_dataframe()
        bucket, key = self.setup_s3()
        s3_path = f"s3://{bucket}/{key}"
        partitions = [dataframe.columns[0]]
        parq.publish(bucket=bucket,
                     key=key,
                     dataframe=dataframe,
                     partitions=partitions)

        from_s3 = pq.ParquetDataset(s3_path, filesystem=s3fs.S3FileSystem())
        s3pd = from_s3.read().to_pandas()
        # The partition column is read back as a category dtype by default; cast it back to the original dtype
        s3pd[partitions[0]] = s3pd[partitions[0]].astype(
            dataframe[partitions[0]].dtype)

        sorted_dfs_equal_by_pandas_testing(dataframe, s3pd)
Example #13
def test_gets_max():
    key = "safekeyprefixname/safedatasetname"
    bucket = "safebucketname"
    part_types = {"int_col": "int", "float_col": "float"}

    df = setup_grouped_dataframe(count=10, columns=part_types)
    bucket, parquet_paths = setup_partitioned_parquet(
        dataframe=df,
        bucket=bucket,
        key=key,
        partition_data_types={"int_col": "int"})

    fetched_max = fetch_parq.get_max_partition_value(bucket=bucket,
                                                     key=key,
                                                     partition="int_col")

    # Test max of column is max of the fetched partition
    assert df["int_col"].max() == fetched_max
Example #14
def test_gets_max_denies_text():
    key = "safekeyprefixname/safedatasetname"
    bucket = "safebucketname"
    part_types = {"string_col": "string", "bool_col": "bool"}
    col_types = dict(part_types)
    col_types["metrics"] = "int"
    df = setup_grouped_dataframe(count=10, columns=col_types)
    bucket, parquet_paths = setup_partitioned_parquet(
        dataframe=df, bucket=bucket, key=key, partition_data_types=part_types)

    with pytest.raises(ValueError):
        fetched_max = fetch_parq.get_max_partition_value(
            bucket=bucket, key=key, partition="string_col")

    with pytest.raises(ValueError):
        fetched_max = fetch_parq.get_max_partition_value(bucket=bucket,
                                                         key=key,
                                                         partition="bool_col")
Example #15
    def test_set_metadata_correctly(self):
        dataframe = setup_grouped_dataframe()
        bucket, key = self.setup_s3()
        s3_client = boto3.client('s3')
        partitions = ['string_col', 'int_col', 'bool_col']
        parq.publish(bucket=bucket,
                     key=key,
                     dataframe=dataframe,
                     partitions=partitions)
        for obj in s3_client.list_objects(Bucket=bucket)['Contents']:
            if obj['Key'].endswith(".parquet"):
                meta = s3_client.get_object(Bucket=bucket,
                                            Key=obj['Key'])['Metadata']
                assert meta['partition_data_types'] == str({
                    "string_col": "string",
                    "int_col": "integer",
                    "bool_col": "boolean"
                })
Example #16
    def test_schema_publish(self, mock_session_helper, mock_create_schema):
        dataframe = setup_grouped_dataframe()
        bucket, key = self.setup_s3()
        partitions = [dataframe.columns[0]]
        redshift_params = self.setup_redshift_params()
        msh = mock_session_helper(region=redshift_params['region'],
                                  cluster_id=redshift_params['cluster_id'],
                                  host=redshift_params['host'],
                                  port=redshift_params['port'],
                                  db_name=redshift_params['db_name'])

        msh.configure_session_helper()
        parq.publish(bucket=bucket,
                     key=key,
                     dataframe=dataframe,
                     partitions=partitions,
                     redshift_params=redshift_params)

        mock_create_schema.assert_called_once_with(
            redshift_params['schema_name'], redshift_params['db_name'],
            redshift_params['iam_role'], msh)
Example #17
    def test_accepts_valid_partitions(self):
        dataframe = setup_grouped_dataframe()
        parq.check_partitions(dataframe.columns, dataframe)
Example #18
    def test_df_datatypes(self):
        dataframe = setup_grouped_dataframe()
        assert parq._get_dataframe_datatypes(
            dataframe) == dataframe.dtypes.to_dict()
Example #19
    def test_reject_non_column_partitions(self):
        dataframe = setup_grouped_dataframe()
        with pytest.raises(ValueError):
            parq.check_partitions(('banana',), dataframe)
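
Taken together, Examples #17 and #19 pin down the contract of check_partitions: every requested partition must be an existing dataframe column, otherwise a ValueError is raised. A sketch of a check with exactly that behavior follows; it is inferred from these tests and is not s3parq's actual implementation, which may validate more than this.

# Sketch of the behavior asserted by Examples #17 and #19 only.
def check_partitions(partitions, dataframe):
    for partition in partitions:
        if partition not in dataframe.columns:
            raise ValueError(
                f"Cannot partition on '{partition}': not a column of the dataframe")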