def parse_file(logger, reader_format: str, url, reader_options: dict) -> List:
    """Parse the file at *url* with the pandas reader selected by *reader_format*.

    Args:
        logger: logger used to report an unsupported format before raising.
        reader_format: one of "csv", "flat_json", "html", "excel", "feather",
            "parquet", "orc", "pickle".
        url: path/URL/buffer accepted by the corresponding pandas reader.
        reader_options: extra keyword arguments forwarded verbatim to the reader,
            e.g. pandas.read_csv options — see
            https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html

    Returns:
        A list of DataFrames (a single element for every format except "html",
        where pandas already returns one DataFrame per table found).

    Raises:
        Exception: when *reader_format* is not supported.
    """
    # Readers that yield exactly one DataFrame. "flat_json" could later call
    # pd.json_normalize to flatten semi-structured JSON if the user specifies
    # how to flatten nested columns.
    single_frame_readers = {
        "csv": pd.read_csv,
        "flat_json": pd.read_json,
        "excel": pd.read_excel,
        "feather": pd.read_feather,
        "parquet": pd.read_parquet,
        "orc": pd.read_orc,
        "pickle": pd.read_pickle,
    }
    if reader_format == "html":
        # pd.read_html already returns a list of DataFrames.
        return list(pd.read_html(url, **reader_options))
    if reader_format in single_frame_readers:
        return [single_frame_readers[reader_format](url, **reader_options)]
    # BUG FIX: the original message appended traceback.format_exc(), but no
    # exception is active on this path, so it always rendered "NoneType: None".
    reason = f"Reader {reader_format} is not supported"
    logger.error(reason)
    raise Exception(reason)
def test_orc_reader_boolean_type(datadir, orc_file):
    """Boolean columns read by cudf must match the pandas reference read."""
    path = datadir / orc_file
    expected = pd.read_orc(path)
    actual = cudf.read_orc(path).to_pandas()
    assert_eq(expected, actual)
def test_orc_reader_basic(dirpath):
    """Primitive ORC column types round-trip against a pandas reference frame."""
    data = {
        "boolean1": np.array([False, True], dtype="bool"),
        "byte1": np.array([1, 100], dtype="int8"),
        "short1": np.array([1024, 2048], dtype="int16"),
        "int1": np.array([65536, 65536], dtype="int32"),
        "long1": np.array(
            [9223372036854775807, 9223372036854775807], dtype="int64"
        ),
        "float1": np.array([1.0, 2.0], dtype="float32"),
        "double1": np.array([-15.0, -5.0], dtype="float64"),
        "bytes1": np.array([b"\x00\x01\x02\x03\x04", b""], dtype="object"),
        "string1": np.array(["hi", "bye"], dtype="object"),
    }
    inputfile = os.path.join(dirpath, "TestOrcFile.test1.orc")
    got = read_orc(inputfile, columns=list(data))
    expected = pd.DataFrame(data)
    tm.assert_equal(expected, got)
def test_orc_reader_decimal(dirpath):
    """Decimal column values survive the ORC read (first 10 rows only)."""
    from decimal import Decimal

    # Only testing the first 10 rows of area_data
    literals = (
        "-1000.50000",
        "-999.60000",
        "-998.70000",
        "-997.80000",
        "-996.90000",
        "-995.10000",
        "-994.11000",
        "-993.12000",
        "-992.13000",
        "-991.14000",
    )
    column = np.array([Decimal(text) for text in literals], dtype="object")
    expected = pd.DataFrame({"_col0": column})
    inputfile = os.path.join(dirpath, "TestOrcFile.decimal.orc")
    got = read_orc(inputfile).iloc[:10]
    tm.assert_equal(expected, got)
def test_orc_reader_empty(dirpath):
    """An empty ORC file yields a zero-row frame with the expected dtypes."""
    schema = {
        "boolean1": "bool",
        "byte1": "int8",
        "short1": "int16",
        "int1": "int32",
        "long1": "int64",
        "float1": "float32",
        "double1": "float64",
        "bytes1": "object",
        "string1": "object",
    }
    expected = pd.DataFrame(
        {name: pd.Series(dtype=dtype) for name, dtype in schema.items()},
        index=pd.RangeIndex(0),
    )
    inputfile = os.path.join(dirpath, "TestOrcFile.emptyFile.orc")
    got = read_orc(inputfile, columns=list(schema))
    tm.assert_equal(expected, got)
def test_orc_reader_multiple_files(datadir, num_rows):
    """Reading the same file twice must equal a pandas concat of two reads."""
    path = datadir / "TestOrcFile.testSnappy.orc"
    # Build the pandas reference by stacking two full reads of the file.
    expected = pd.concat([pd.read_orc(path), pd.read_orc(path)], ignore_index=True)
    got = cudf.read_orc([path, path], num_rows=num_rows).to_pandas()
    # Slice rows out of the whole dataframe for comparison as PyArrow doesn't
    # have an API to read a subsection of rows from the file
    expected = expected[:num_rows].reset_index(drop=True)
    assert_eq(expected, got)
def test_orc_timestamp_read(datadir):
    """Timestamps from the regression file agree between pandas and cudf."""
    source = datadir / "TestOrcFile.timestamp.issue.orc"
    expected = pd.read_orc(source)
    got = cudf.read_orc(source)
    assert_eq(expected, got)
def _read_orc_file(self):
    """Read every ORC file matched under ``self.orc_dir_name`` and return all rows.

    Returns:
        list: one pandas Series per row, in glob order, files concatenated.
    """
    # BUG FIX: the original wrapped os.path.join inside another os.path.join
    # with a single argument, which is a no-op.
    # NOTE(review): '**/*' without recursive=True only matches one directory
    # level deep ('**' behaves like '*'); kept as-is to preserve behavior —
    # confirm whether a recursive walk was intended.
    pattern = os.path.join(self.orc_dir_name, '**/*')
    records = []
    for file in glob.glob(pattern):
        df = pd.read_orc(file)
        # df.loc[i] yields row i as a Series (fresh read_orc frames use a RangeIndex).
        records.extend(df.loc[i] for i in range(df.shape[0]))
    return records
def parse_file(logger, reader_format: str, url, reader_options: dict) -> List:
    """Parse the file at *url* with the pandas reader selected by *reader_format*.

    Args:
        logger: logger used to report an unsupported format before raising.
        reader_format: one of "csv", "json", "html", "excel", "feather",
            "parquet", "orc", "pickle".
        url: path/URL/buffer accepted by the corresponding pandas reader.
        reader_options: keyword arguments forwarded verbatim to the reader.

    Returns:
        A list of DataFrames (one element for every format except "html",
        where pandas returns one DataFrame per table found).

    Raises:
        Exception: when *reader_format* is not supported.
    """
    result = []
    if reader_format == "csv":
        # pandas.read_csv additional arguments can be passed to customize how to parse csv.
        # see https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html
        result.append(pd.read_csv(url, **reader_options))
    elif reader_format == "json":
        result.append(pd.read_json(url, **reader_options))
    elif reader_format == "html":
        # read_html already returns a list of DataFrames, so extend instead of append.
        result += pd.read_html(url, **reader_options)
    elif reader_format == "excel":
        result.append(pd.read_excel(url, **reader_options))
    elif reader_format == "feather":
        result.append(pd.read_feather(url, **reader_options))
    elif reader_format == "parquet":
        result.append(pd.read_parquet(url, **reader_options))
    elif reader_format == "orc":
        result.append(pd.read_orc(url, **reader_options))
    elif reader_format == "pickle":
        result.append(pd.read_pickle(url, **reader_options))
    else:
        # BUG FIX: the original appended traceback.format_exc() here, but no
        # exception is in flight on this path, so it always rendered
        # "NoneType: None" into the error message.
        reason = f"Reader {reader_format} is not supported"
        logger.error(reason)
        raise Exception(reason)
    return result
def read_orc(path, columns: Optional[List[str]] = None, **kwargs) -> DataFrame:  # noqa: PR01, RT01, D200
    """
    Load an ORC object from the file path, returning a DataFrame.
    """
    # There is no native distributed ORC reader here: warn that we default to
    # pandas, make sure an engine is subscribed, then wrap the pandas result.
    ErrorMessage.default_to_pandas("read_orc")
    Engine.subscribe(_update_engine)
    pandas_frame = pandas.read_orc(path, columns, **kwargs)
    return DataFrame(pandas_frame)
def test_orc_reader_decimal_type(datadir, orc_file):
    """Decimal column values agree between the pandas and cudf readers."""
    path = datadir / orc_file
    expected = pd.read_orc(path)
    got = cudf.read_orc(path).to_pandas()
    # Converting to strings since pandas keeps it in decimal
    for frame in (expected, got):
        frame["col8"] = frame["col8"].astype("str")
    assert_eq(expected, got)
def test_orc_writer_decimal(tmpdir, scale):
    """A cudf-written decimal column must read back identically via pandas."""
    np.random.seed(0)
    target = tmpdir / "decimal.orc"
    source = cudf.DataFrame({"dec_val": gen_rand_series("i", 100)})
    source["dec_val"] = source["dec_val"].astype(Decimal64Dtype(7, scale))
    source.to_orc(target)
    roundtripped = pd.read_orc(target)
    assert_eq(source.to_pandas()["dec_val"], roundtripped["dec_val"])
def test_empty_string_columns(data):
    """Empty/None string columns survive an in-memory ORC round trip."""
    buffer = BytesIO()
    written = cudf.DataFrame({"string": data}, dtype="str")
    written.to_orc(buffer)
    via_pandas = pd.read_orc(buffer)
    via_cudf = cudf.read_orc(buffer)
    assert_eq(written, via_cudf)
    assert_eq(via_pandas, via_cudf)
def test_empty_dataframe():
    """An empty frame round-trips through ORC; bogus column selection raises."""
    buffer = BytesIO()
    source = cudf.DataFrame()
    source.to_orc(buffer)
    # Raise error if column name is mentioned, but it doesn't exist.
    with pytest.raises(RuntimeError):
        cudf.read_orc(buffer, columns=["a"])
    via_cudf = cudf.read_orc(buffer)
    via_pandas = pd.read_orc(buffer)
    assert_eq(source, via_cudf)
    assert_eq(via_pandas, via_cudf)
def test_empty_string_columns(data):
    """String columns of empty/None values round-trip, honoring nullable dtype."""
    buffer = BytesIO()
    written = cudf.DataFrame({"string": data}, dtype="str")
    written.to_orc(buffer)
    pandas_view = pd.read_orc(buffer)
    cudf_view = cudf.read_orc(buffer)
    assert_eq(written, cudf_view)
    # Match pandas' nullable string dtype when that is what the read produced.
    if pandas_view["string"].dtype == pd.StringDtype():
        assert_eq(pandas_view, cudf_view.to_pandas(nullable=True))
    else:
        assert_eq(pandas_view, cudf_view)
def test_orc_reader_snappy_compressed(dirpath):
    """Snappy-compressed ORC decodes correctly (first 10 rows checked)."""
    int_values = [
        -1160101563,
        1181413113,
        2065821249,
        -267157795,
        172111193,
        1752363137,
        1406072123,
        1911809390,
        -1308542224,
        -467100286,
    ]
    str_values = [
        "f50dcb8",
        "382fdaaa",
        "90758c6",
        "9e8caf3f",
        "ee97332b",
        "d634da1",
        "2bea4396",
        "d67d89e8",
        "ad71007e",
        "e8c82066",
    ]
    expected = pd.DataFrame(
        {
            "int1": np.array(int_values, dtype="int32"),
            "string1": np.array(str_values, dtype="object"),
        }
    )
    inputfile = os.path.join(dirpath, "TestOrcFile.testSnappy.orc")
    got = read_orc(inputfile).iloc[:10]
    tm.assert_equal(expected, got)
def test_orc_reader_date_high(dirpath):
    """Post-2038 timestamp and date values are read back correctly."""
    # Timestamps step by 100 microseconds: .100000, .100100, ..., .100900
    timestamps = [f"2038-05-05 12:34:56.100{i}00" for i in range(10)]
    expected = pd.DataFrame(
        {
            "time": np.array(timestamps, dtype="datetime64[ns]"),
            "date": np.array([datetime.date(2038, 12, 25)] * 10, dtype="object"),
        }
    )
    inputfile = os.path.join(dirpath, "TestOrcFile.testDate2038.orc")
    got = read_orc(inputfile).iloc[:10]
    tm.assert_equal(expected, got)
def read_orc(path: FilePathOrBuffer, columns: Optional[List[str]] = None, **kwargs) -> DataFrame:
    """Load an ORC object via pandas and wrap the result in a DataFrame."""
    # Warn that ORC reading defaults to pandas before delegating.
    ErrorMessage.default_to_pandas("read_orc")
    pandas_frame = pandas.read_orc(path, columns, **kwargs)
    return DataFrame(pandas_frame)
def read_orc(data_fp):
    """Thin wrapper around ``pandas.read_orc`` for the given path or buffer."""
    frame = pd.read_orc(data_fp)
    return frame
for j in range(num_ints): bits = np_arr[i, 32 * j:32 * j + 32] # int_val = int(''.join(map(str, arr)), base=2) int_val = 0 for digit in bits: int_val = (int_val << 1) + digit out_sub[j] = int_val out[i, :] = out_sub return out if __name__ == '__main__': basepath = "/run/media/sharwinbobde/ExtraStorage/2M-scaled-array-1.orc/" # we need a sample file to get the number of features. sample_file = "part-00012-7d53a446-d692-475a-853f-9e55ccc8e9fa-c000.snappy.orc" df = pd.read_orc(basepath + sample_file) df = df.rename(columns={"FeatureVector_all_features": "vec"}) num_records = df.shape[0] num_features = len(df['vec'][0]) print(num_features) print(f"num_records = {num_records}") print(f"num_features = {num_features}") LSH_NUM_BITS = int(2**13) LSH = LSHBias(feature_dim=num_features, bits=LSH_NUM_BITS) W = np.array(LSH.W, dtype=np.float32) b_gpu = gpuarray.to_gpu(W) # reuse this every time
def test_orc_reader_multi_file_multi_stripe(datadir):
    """Per-file stripe selection across a multi-file read matches pandas."""
    path = datadir / "TestOrcFile.testStripeLevelStats.orc"
    # Stripes 0-1 come from the first copy and stripe 2 from the second;
    # the combined result is compared against a full pandas read.
    got = cudf.read_orc([path, path], engine="cudf", stripes=[[0, 1], [2]])
    expected = pd.read_orc(path)
    assert_eq(expected, got)
def read_orc(path: FilePathOrBuffer, columns: Optional[List[str]] = None, **kwargs) -> DataFrame:  # noqa: D103
    # ORC reading defaults to pandas: warn, subscribe the engine, then wrap.
    ErrorMessage.default_to_pandas("read_orc")
    Engine.subscribe(_update_engine)
    pandas_result = pandas.read_orc(path, columns, **kwargs)
    return DataFrame(pandas_result)
def read_orc(filepath: str, **kwargs) -> pd.DataFrame:
    """Read an ORC file into a pandas DataFrame, forwarding *kwargs*."""
    frame = pd.read_orc(filepath, **kwargs)
    return frame