예제 #1
0
    def test_read_csv_glob_4373(self):
        """Check that ``read_csv_glob`` matches pandas when ``usecols`` is given.

        A 1x1 frame is written to a scratch CSV file and read back through both
        ``pd.read_csv_glob`` and ``pandas.read_csv`` with identical keyword
        arguments; the two results must compare equal.

        NOTE(review): the ``4373`` suffix presumably refers to an issue number;
        confirm against the project tracker.
        """
        import os

        columns, filename = ["col0"], "1x1.csv"

        try:
            pd.DataFrame([[1]], columns=columns).to_csv(filename)

            kwargs = {"filepath_or_buffer": filename, "usecols": columns}
            modin_df = pd.read_csv_glob(**kwargs)
            pandas_df = pandas.read_csv(**kwargs)
            df_equals(modin_df, pandas_df)
        finally:
            # The scratch file lives in the current working directory; remove
            # it whether or not the comparison passed so that repeated runs do
            # not leave artifacts behind.
            if os.path.exists(filename):
                os.remove(filename)
예제 #2
0
 def test_read_single_csv_with_parse_dates(self, parse_dates):
     """Compare ``read_csv_glob`` with ``pandas.read_csv`` for ``parse_dates``.

     If pandas reads ``time_parsing_csv_path`` successfully, the frame
     produced by ``pd.read_csv_glob`` must be equal to it.  If pandas raises,
     reading and materializing through ``pd.read_csv_glob`` must raise an
     exception that is an instance of the pandas exception's type.
     """
     reader_kwargs = {"parse_dates": parse_dates}

     expected_error = None
     try:
         expected_df = pandas.read_csv(time_parsing_csv_path, **reader_kwargs)
     except Exception as caught:
         # ``caught`` is unbound when the handler exits, so keep a reference.
         expected_error = caught

     if expected_error is None:
         actual_df = pd.read_csv_glob(time_parsing_csv_path, **reader_kwargs)
         df_equals(actual_df, expected_df)
         return

     with pytest.raises(Exception) as raised:
         actual_df = pd.read_csv_glob(time_parsing_csv_path, **reader_kwargs)
         # repr() forces the frame to materialize so a deferred error
         # surfaces inside this context manager.
         repr(actual_df)

     expected_type = type(expected_error)
     actual_type = type(raised.value)
     assert isinstance(raised.value, expected_type), (
         "Got Modin Exception type {}, but pandas Exception type {} was expected"
     ).format(actual_type, expected_type)
예제 #3
0
    def test_read_multiple_small_csv(self):  # noqa: F811
        """Check that globbing several CSV files equals concatenating them.

        The expected frame is built by reading every path in ``pytest.files``
        with pandas and concatenating the results; the actual frame comes from
        a single ``pd.read_csv_glob`` call on ``pytest.glob_path``.
        """
        per_file_frames = [pandas.read_csv(path) for path in pytest.files]
        expected = pandas.concat(per_file_frames)
        actual = pd.read_csv_glob(pytest.glob_path)

        # Concatenation leaves per-file row labels behind, so both sides are
        # renumbered before the comparison.
        expected = expected.reset_index(drop=True)
        actual = actual.reset_index(drop=True)

        df_equals(actual, expected)
예제 #4
0
    def test_read_csv_empty_frame(self):
        """Check ``read_csv_glob`` when the only selected column is the index.

        Both readers get ``usecols=[0]`` together with ``index_col=0`` on the
        first file in ``pytest.files``; the results must compare equal.
        """
        source_file = pytest.files[0]
        reader_options = dict(usecols=[0], index_col=0)

        actual = pd.read_csv_glob(source_file, **reader_options)
        expected = pandas.read_csv(source_file, **reader_options)

        df_equals(actual, expected)
예제 #5
0
    def test_read_multiple_csv_nrows(self, request, nrows):  # noqa: F811
        """Check that ``nrows`` limits rows across the globbed files as a whole.

        The expected frame is the first ``nrows`` rows of the pandas
        concatenation of every file in ``pytest.files``; the actual frame is
        ``pd.read_csv_glob(pytest.glob_path, nrows=nrows)``.
        """
        per_file_frames = [pandas.read_csv(path) for path in pytest.files]
        expected = pandas.concat(per_file_frames).iloc[:nrows, :]

        actual = pd.read_csv_glob(pytest.glob_path, nrows=nrows)

        # Concatenation leaves per-file row labels behind, so both sides are
        # renumbered before the comparison.
        expected = expected.reset_index(drop=True)
        actual = actual.reset_index(drop=True)

        df_equals(actual, expected)
예제 #6
0
def test_read_multiple_csv_s3_storage_opts(storage_options):
    """Check that ``read_csv_glob`` forwards ``storage_options`` for S3 paths.

    Parameters
    ----------
    storage_options
        Passed unchanged to both ``pd.read_csv_glob`` and ``pandas.read_csv``.
    """
    path = "s3://modin-datasets/testing/multiple_csv/"
    # Test the fact of handling of `storage_options`
    modin_df = pd.read_csv_glob(path, storage_options=storage_options)

    # Build the reference with stock pandas (not ``pd.concat``) so that the
    # comparison below really is "frame under test" versus "pandas frame",
    # consistent with the other multi-file tests in this file.
    pandas_df = pandas.concat(
        [
            pandas.read_csv(
                f"{path}test_data{i}.csv",
                storage_options=storage_options,
            )
            for i in range(2)
        ],
    ).reset_index(drop=True)

    df_equals(modin_df, pandas_df)
예제 #7
0
def test_read_multiple_csv_s3():
    """Check ``read_csv_glob`` on a wildcard S3 path against per-file pandas reads.

    The globbed frame is compared with the pandas concatenation of the ten
    files ``1780.csv`` .. ``1789.csv`` read individually.
    """
    modin_df = pd.read_csv_glob("S3://noaa-ghcn-pds/csv/178*.csv")

    # We have to specify the columns because the column names are not
    # identical. Since we specified the column names, we also have to skip
    # the original column names.
    pandas_dfs = [
        pandas.read_csv(
            f"s3://noaa-ghcn-pds/csv/178{i}.csv",
            names=modin_df.columns,
            skiprows=[0],
        )
        for i in range(10)
    ]
    # Concatenate with stock pandas (not ``pd.concat``) so the reference
    # frame is a genuine pandas object, consistent with the other multi-file
    # tests in this file.
    pandas_df = pandas.concat(pandas_dfs)

    # Indexes get messed up when concatting so we reset both.
    pandas_df = pandas_df.reset_index(drop=True)
    modin_df = modin_df.reset_index(drop=True)

    df_equals(modin_df, pandas_df)
예제 #8
0
 def test_read_csv_without_glob(self):
     """Check the behaviour of ``read_csv_glob`` on a path with no wildcard.

     The call is expected to emit a ``UserWarning`` whose message matches
     "Shell-style wildcard" and to raise ``FileNotFoundError``.
     """
     wildcard_free_path = "s3://nyc-tlc/trip data/yellow_tripdata_2020-"
     # The warning check is the outer context and the exception check the
     # inner one, exactly as with two nested ``with`` statements.
     with pytest.warns(
         UserWarning, match=r"Shell-style wildcard"
     ), pytest.raises(FileNotFoundError):
         pd.read_csv_glob(wildcard_free_path)