def test_read_multiple_csv_s3_storage_opts(storage_options):
    """Check that `read_csv_glob` forwards `storage_options` to the S3 backend.

    Parameters
    ----------
    storage_options : dict
        Options (e.g. credentials) passed through to the underlying
        filesystem layer; supplied by a fixture.
    """
    path = "s3://modin-datasets/testing/multiple_csv/"
    # Test the fact of handling of `storage_options`
    modin_df = pd.read_csv_glob(path, storage_options=storage_options)
    # Build the reference frame with plain pandas (`pandas.concat`, not
    # modin's `pd.concat`) so the comparison baseline never goes through
    # the code under test.
    pandas_df = pandas.concat(
        [
            pandas.read_csv(
                f"{path}test_data{i}.csv",
                storage_options=storage_options,
            )
            for i in range(2)
        ],
    ).reset_index(drop=True)
    df_equals(modin_df, pandas_df)
def test_read_multiple_csv_s3():
    """Compare `read_csv_glob` over an S3 wildcard against per-file pandas reads."""
    # NOTE(review): the uppercase "S3://" scheme looks deliberate —
    # presumably it exercises case-insensitive scheme handling; confirm
    # before normalizing it.
    modin_df = pd.read_csv_glob("S3://noaa-ghcn-pds/csv/178*.csv")
    # We have to specify the columns because the column names are not
    # identical. Since we specified the column names, we also have to skip
    # the original column names.
    pandas_dfs = [
        pandas.read_csv(
            "s3://noaa-ghcn-pds/csv/178{}.csv".format(i),
            names=modin_df.columns,
            skiprows=[0],
        )
        for i in range(10)
    ]
    # Use `pandas.concat` (not modin's `pd.concat`) so the reference frame
    # does not pass through modin's own concat implementation.
    pandas_df = pandas.concat(pandas_dfs)
    # Indexes get messed up when concatting so we reset both.
    pandas_df = pandas_df.reset_index(drop=True)
    modin_df = modin_df.reset_index(drop=True)
    df_equals(modin_df, pandas_df)