Example #1
import boto3
import pyspark.sql
from io import StringIO


def _write_dataframe_to_s3(config, logger, df: pyspark.sql.DataFrame, df_name: str) -> None:
    """
    Converts a PySpark DataFrame to Pandas before writing it out as a CSV file to the
    Amazon S3 bucket named in the given config object.
    """
    logger.warn(f'About to write dataframe: {df_name} as CSV to S3')
    
    # Convert the PySpark DataFrame to Pandas (collects all rows to the driver)
    pd_df = df.toPandas()
    
    # Create an S3 resource using the credentials from the config
    s3 = boto3.resource('s3',
                        aws_access_key_id=config['AWS']['AWS_ACCESS_KEY_ID'],
                        aws_secret_access_key=config['AWS']['AWS_SECRET_ACCESS_KEY'])
    
    # Write the Pandas DataFrame to an in-memory CSV buffer
    csv_buff = StringIO()
    
    pd_df.to_csv(csv_buff, sep=',', index=False)
    
    # Write to S3
    s3.Object(config['S3']['BUCKET_NAME'], f'{df_name}.csv').put(Body=csv_buff.getvalue())
    
    logger.warn(f'Finished writing dataframe: {df_name} as CSV to S3')
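A minimal usage sketch (not part of the original snippet): a plain dict stands in for whatever config object the project actually uses, and the bucket name, credentials, and sample data are placeholders.

import logging

from pyspark.sql import SparkSession

config = {
    'AWS': {'AWS_ACCESS_KEY_ID': '<access-key-id>',
            'AWS_SECRET_ACCESS_KEY': '<secret-access-key>'},
    'S3': {'BUCKET_NAME': 'my-example-bucket'},
}
logger = logging.getLogger(__name__)

spark = SparkSession.builder.appName('s3-write-example').getOrCreate()
df = spark.createDataFrame([(1, 'a'), (2, 'b')], ['id', 'letter'])

# Writes 'my_sample_table.csv' into the configured bucket.
_write_dataframe_to_s3(config, logger, df, df_name='my_sample_table')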
Example #2
import logging

import numpy as np
import pyspark.sql


def assert_test_dfs_equal(expected_df: pyspark.sql.DataFrame,
                          generated_df: pyspark.sql.DataFrame) -> None:
    """
    Used to compare two dataframes (typically in a unit test).
    Better than the direct df1.equals(df2) method, as this function
    allows for tolerances in the floating-point columns and is
    also more descriptive about which parts of the two dataframes
    are in disagreement.

    :param expected_df: First dataframe to compare
    :param generated_df: Second dataframe to compare
    """

    row_limit = 10000

    e_count = expected_df.count()
    g_count = generated_df.count()

    if (e_count > row_limit) or (g_count > row_limit):
        raise Exception(
            f"One or both of the dataframes passed has too many rows (>{row_limit}). "
            "Please limit your test dataframes to fewer rows than this.")

    assert e_count == g_count, "The dataframes have a different number of rows."

    expected_pdf = expected_df.toPandas()
    generated_pdf = generated_df.toPandas()

    assert list(expected_pdf.columns) == list(generated_pdf.columns), \
        "The two dataframes have different columns."

    for col in expected_pdf.columns:
        error_msg = f"The columns with name: `{col}` were not equal."
        if expected_pdf[col].dtype.type == np.object_:
            # Object (e.g. string) columns: compare values directly.
            assert expected_pdf[[col]].equals(generated_pdf[[col]]), error_msg
        else:
            # np.allclose will not equate nulls on both sides, so drop them
            # from each column before comparing (without mutating the full
            # dataframes used for the remaining columns).
            expected_vals = expected_pdf[col].dropna().values
            generated_vals = generated_pdf[col].dropna().values
            try:
                is_close = np.allclose(expected_vals, generated_vals)
            except ValueError:
                logging.error(
                    f"Problem encountered while equating column '{col}'.")
                raise
            assert is_close, error_msg
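A short usage sketch in a pytest-style test (the SparkSession setup, test name, and column data are illustrative, not taken from the original project):

from pyspark.sql import SparkSession


def test_generated_totals_match_expected():
    spark = SparkSession.builder.appName('df-compare-test').getOrCreate()

    expected_df = spark.createDataFrame(
        [('a', 1.0), ('b', 2.0000001)], ['key', 'total'])
    generated_df = spark.createDataFrame(
        [('a', 1.0), ('b', 2.0)], ['key', 'total'])

    # Passes: the string column matches exactly and the float column
    # differs only within np.allclose tolerances.
    assert_test_dfs_equal(expected_df, generated_df)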