def test_orcfile_readwrite():
    from pyarrow import orc

    buffer_output_stream = pa.BufferOutputStream()
    a = pa.array([1, None, 3, None])
    b = pa.array([None, "Arrow", None, "ORC"])
    table = pa.table({"int64": a, "utf8": b})
    orc.write_table(table, buffer_output_stream)
    buffer_reader = pa.BufferReader(buffer_output_stream.getvalue())
    orc_file = orc.ORCFile(buffer_reader)
    output_table = orc_file.read()
    assert table.equals(output_table)
    # Check for default WriteOptions
    assert orc_file.compression == 'UNCOMPRESSED'
    assert orc_file.file_version == '0.12'
    assert orc_file.row_index_stride == 10000
    assert orc_file.compression_size == 65536

    # deprecated keyword order
    buffer_output_stream = pa.BufferOutputStream()
    with pytest.warns(FutureWarning):
        orc.write_table(buffer_output_stream, table)
    buffer_reader = pa.BufferReader(buffer_output_stream.getvalue())
    orc_file = orc.ORCFile(buffer_reader)
    output_table = orc_file.read()
    assert table.equals(output_table)
    # Check for default WriteOptions
    assert orc_file.compression == 'UNCOMPRESSED'
    assert orc_file.file_version == '0.12'
    assert orc_file.row_index_stride == 10000
    assert orc_file.compression_size == 65536

def _read_orc(self, filename):
    if 'CPU' in self.compute_type:
        if filename.startswith('gs://'):
            fs = gcsfs.GCSFileSystem()
            with fs.open(filename, mode='rb') as file:
                dataset = pyarrow_orc.ORCFile(file).read().to_pandas()
        else:
            with open(filename, mode='rb') as file:
                dataset = pyarrow_orc.ORCFile(file).read().to_pandas()
    elif 'GPU' in self.compute_type:
        dataset = cudf.read_orc(filename)
    return dataset

def test_orcfile_empty(datadir):
    from pyarrow import orc

    table = orc.ORCFile(datadir / 'TestOrcFile.emptyFile.orc').read()
    assert table.num_rows == 0

    expected_schema = pa.schema([
        ('boolean1', pa.bool_()),
        ('byte1', pa.int8()),
        ('short1', pa.int16()),
        ('int1', pa.int32()),
        ('long1', pa.int64()),
        ('float1', pa.float32()),
        ('double1', pa.float64()),
        ('bytes1', pa.binary()),
        ('string1', pa.string()),
        ('middle', pa.struct([
            ('list', pa.list_(pa.struct([
                ('int1', pa.int32()),
                ('string1', pa.string()),
            ]))),
        ])),
        ('list', pa.list_(pa.struct([
            ('int1', pa.int32()),
            ('string1', pa.string()),
        ]))),
        ('map', pa.list_(pa.struct([
            ('key', pa.string()),
            ('value', pa.struct([
                ('int1', pa.int32()),
                ('string1', pa.string()),
            ])),
        ]))),
    ])
    assert table.schema == expected_schema

def read_orc(
    filepath_or_buffer,
    engine="cudf",
    columns=None,
    stripe=None,
    skip_rows=None,
    num_rows=None,
    use_index=True,
    **kwargs,
):
    """{docstring}"""

    filepath_or_buffer, compression = ioutils.get_filepath_or_buffer(
        filepath_or_buffer, None, **kwargs
    )
    if compression is not None:
        # The original constructed the exception without raising it
        raise ValueError("URL content-encoding decompression is not supported")

    if engine == "cudf":
        df = libcudf.orc.read_orc(
            filepath_or_buffer, columns, stripe, skip_rows, num_rows, use_index
        )
    else:
        warnings.warn("Using CPU via PyArrow to read ORC dataset.")
        orc_file = orc.ORCFile(filepath_or_buffer)
        pa_table = orc_file.read(columns=columns)
        df = cudf.DataFrame.from_arrow(pa_table)

    return df

def test_orcfile_empty(datadir):
    from pyarrow import orc

    table = orc.ORCFile(datadir / "TestOrcFile.emptyFile.orc").read()
    assert table.num_rows == 0

    expected_schema = pa.schema([
        ("boolean1", pa.bool_()),
        ("byte1", pa.int8()),
        ("short1", pa.int16()),
        ("int1", pa.int32()),
        ("long1", pa.int64()),
        ("float1", pa.float32()),
        ("double1", pa.float64()),
        ("bytes1", pa.binary()),
        ("string1", pa.string()),
        ("middle", pa.struct([("list", pa.list_(
            pa.struct([("int1", pa.int32()), ("string1", pa.string())])))])),
        ("list", pa.list_(pa.struct([("int1", pa.int32()),
                                     ("string1", pa.string())]))),
        ("map", pa.map_(pa.string(), pa.struct([("int1", pa.int32()),
                                                ("string1", pa.string())]))),
    ])
    assert table.schema == expected_schema

def check_example_file(orc_path, expected_df, need_fix=False):
    """
    Check an ORC file against the expected columns dictionary.
    """
    from pyarrow import orc

    orc_file = orc.ORCFile(orc_path)

    # Exercise ORCFile.read()
    table = orc_file.read()
    assert isinstance(table, pa.Table)
    table.validate()

    # This workaround is needed because of ARROW-3080
    orc_df = pd.DataFrame(table.to_pydict())

    assert set(expected_df.columns) == set(orc_df.columns)

    # reorder columns if necessary
    if not orc_df.columns.equals(expected_df.columns):
        expected_df = expected_df.reindex(columns=orc_df.columns)

    if need_fix:
        fix_example_values(orc_df, expected_df)

    check_example_values(orc_df, expected_df)

    # Exercise ORCFile.read_stripe()
    json_pos = 0
    for i in range(orc_file.nstripes):
        batch = orc_file.read_stripe(i)
        check_example_values(pd.DataFrame(batch.to_pydict()),
                             expected_df,
                             start=json_pos,
                             stop=json_pos + len(batch))
        json_pos += len(batch)
    assert json_pos == orc_file.nrows

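# The stripe loop above is the general pattern for reading an ORC file
# incrementally: ORCFile.read_stripe(i) returns one pyarrow.RecordBatch per
# stripe, and the batch lengths sum to ORCFile.nrows. A minimal standalone
# sketch of that pattern; the file path "example.orc" is hypothetical.

import pyarrow as pa
from pyarrow import orc

orc_file = orc.ORCFile("example.orc")  # hypothetical path; any ORC file works

rows_seen = 0
for i in range(orc_file.nstripes):
    batch = orc_file.read_stripe(i)  # one RecordBatch per stripe
    rows_seen += len(batch)

# Every row belongs to exactly one stripe.
assert rows_seen == orc_file.nrows
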
def read_orc(path, columns=None, **kwargs):
    """{docstring}"""

    warnings.warn("Using CPU via PyArrow to read ORC dataset, this will "
                  "be GPU accelerated in the future")
    orc_file = orc.ORCFile(path)
    pa_table = orc_file.read(columns=columns)
    return DataFrame.from_arrow(pa_table)

def _read_orc_stripe(fs, path, stripe, columns=None):
    """Pull out specific data from a specific part of an ORC file"""
    from pyarrow import orc

    with fs.open(path, "rb") as f:
        o = orc.ORCFile(f)
        table = o.read_stripe(stripe, columns)
    return table.to_pandas(date_as_object=False)

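# A brief usage sketch of the helper above, assuming an fsspec filesystem
# (the same kind Dask passes in). The filesystem choice and the file path
# are hypothetical, for illustration only.

import fsspec

fs = fsspec.filesystem("file")  # local filesystem; any fsspec fs works
pdf = _read_orc_stripe(fs, "/tmp/example.orc", stripe=0, columns=None)
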
def test_orcfile_readwrite_with_writeoptions():
    from pyarrow import orc

    buffer_output_stream = pa.BufferOutputStream()
    a = pa.array([1, None, 3, None])
    b = pa.array([None, "Arrow", None, "ORC"])
    table = pa.table({"int64": a, "utf8": b})
    orc.write_table(
        table,
        buffer_output_stream,
        compression='snappy',
        file_version='0.11',
        row_index_stride=5000,
        compression_block_size=32768,
    )
    buffer_reader = pa.BufferReader(buffer_output_stream.getvalue())
    orc_file = orc.ORCFile(buffer_reader)
    output_table = orc_file.read()
    assert table.equals(output_table)
    # Check for modified WriteOptions
    assert orc_file.compression == 'SNAPPY'
    assert orc_file.file_version == '0.11'
    assert orc_file.row_index_stride == 5000
    assert orc_file.compression_size == 32768

    # deprecated keyword order
    buffer_output_stream = pa.BufferOutputStream()
    with pytest.warns(FutureWarning):
        orc.write_table(
            buffer_output_stream,
            table,
            compression='uncompressed',
            file_version='0.11',
            row_index_stride=20000,
            compression_block_size=16384,
        )
    buffer_reader = pa.BufferReader(buffer_output_stream.getvalue())
    orc_file = orc.ORCFile(buffer_reader)
    output_table = orc_file.read()
    assert table.equals(output_table)
    # Check for modified WriteOptions
    assert orc_file.compression == 'UNCOMPRESSED'
    assert orc_file.file_version == '0.11'
    assert orc_file.row_index_stride == 20000
    assert orc_file.compression_size == 16384

def test_orcfile_readwrite():
    from pyarrow import orc

    buffer_output_stream = pa.BufferOutputStream()
    a = pa.array([1, None, 3, None])
    b = pa.array([None, "Arrow", None, "ORC"])
    table = pa.table({"int64": a, "utf8": b})
    orc.write_table(table, buffer_output_stream)
    buffer_reader = pa.BufferReader(buffer_output_stream.getvalue())
    output_table = orc.ORCFile(buffer_reader).read()
    assert table.equals(output_table)

    # deprecated keyword order
    buffer_output_stream = pa.BufferOutputStream()
    with pytest.warns(FutureWarning):
        orc.write_table(buffer_output_stream, table)
    buffer_reader = pa.BufferReader(buffer_output_stream.getvalue())
    output_table = orc.ORCFile(buffer_reader).read()
    assert table.equals(output_table)

def get_product_df():
    import pyarrow.orc as orc

    target_path = '/backup/ND/GC Product Master/part-00000-efc19fcb-f673-4438-af65-02560d10d5ac-c000.snappy.orc'
    with open(target_path, 'rb') as file:
        data = orc.ORCFile(file)
        product = data.read().to_pandas()
    if 'product_info.csv' not in os.listdir('/home/yezhipeng/info/'):
        product.to_csv('/home/yezhipeng/info/product_info.csv')
        print('copy saved to /home/yezhipeng/info/')
    return product

def read_orc_metadata(path):
    """{docstring}"""

    orc_file = orc.ORCFile(path)

    num_rows = orc_file.nrows
    num_stripes = orc_file.nstripes
    col_names = orc_file.schema.names

    return num_rows, num_stripes, col_names

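# A quick sketch of how this metadata helper might be used to size up a file
# before reading it. The file name "data.orc" is hypothetical.

num_rows, num_stripes, col_names = read_orc_metadata("data.orc")
print(f"{num_rows} rows in {num_stripes} stripes; columns: {col_names}")
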
def run_task(sync, files):
    """sync main flow"""
    if isinstance(files, str):
        files = [files]
    for file in files:
        sync_task = sync.sync_task
        adb_table = sync_task["adb_table"]
        tmp_mysql_table = adb_table
        if sync.dml_operator == 'INSERT INTO':
            tmp_mysql_table = f'{adb_table}_tmp'
        file_parts = file.split('/')
        suffix = file_parts[-1]
        local_name = f'{file_addr}/{suffix}'
        start = time.time()
        download(file, local_name)
        logger.info('oss read done')
        with open(local_name, 'rb') as f:
            data = orc.ORCFile(f)
            table = data.read()
        df = table.to_pandas()
        special_columns = sync_task.get('special_columns')
        if special_columns:
            sp = [c for c in special_columns if c in df.columns]
            if sp:
                df = df.drop(columns=sp)
        num_columns = table.num_columns
        logger.info(table.num_rows)
        logger.info(num_columns)
        # find partitions
        for path in file_parts:
            if '=' in path:
                df[path.split('=')[0]] = path.split('=')[1]
        total = df.shape[0]
        logger.info(total)
        base = 10000
        rn = total // base
        all_tasks = []
        with ThreadPoolExecutor(5 * multiprocessing.cpu_count()) as executor:
            for i in range(rn):
                task = executor.submit(data_frame_to_mysql,
                                       df[i * base:(i + 1) * base],
                                       tmp_mysql_table)
                all_tasks.append(task)
            task = executor.submit(data_frame_to_mysql,
                                   df[rn * base:total],
                                   tmp_mysql_table)
            all_tasks.append(task)
            wait(all_tasks, return_when=ALL_COMPLETED)
        os.remove(local_name)
        logger.info('sync done!')
        logger.info(time.time() - start)

def read_orc(
    filepath_or_buffer,
    engine="cudf",
    columns=None,
    stripes=None,
    skiprows=None,
    num_rows=None,
    use_index=True,
    decimals_as_float=True,
    force_decimal_scale=None,
    timestamp_type=None,
    **kwargs,
):
    """{docstring}"""
    from cudf import DataFrame

    filepath_or_buffer, compression = ioutils.get_filepath_or_buffer(
        path_or_data=filepath_or_buffer, compression=None, **kwargs
    )
    if compression is not None:
        # The original constructed the exception without raising it
        raise ValueError("URL content-encoding decompression is not supported")

    if engine == "cudf":
        df = DataFrame._from_table(
            libcudf.orc.read_orc(
                filepath_or_buffer,
                columns,
                stripes,
                skiprows,
                num_rows,
                use_index,
                decimals_as_float,
                force_decimal_scale,
                timestamp_type,
            )
        )
    else:

        def read_orc_stripe(orc_file, stripe, columns):
            pa_table = orc_file.read_stripe(stripe, columns)
            if isinstance(pa_table, pa.RecordBatch):
                pa_table = pa.Table.from_batches([pa_table])
            return pa_table

        warnings.warn("Using CPU via PyArrow to read ORC dataset.")
        orc_file = orc.ORCFile(filepath_or_buffer)
        if stripes is not None and len(stripes) > 0:
            pa_tables = [
                read_orc_stripe(orc_file, i, columns) for i in stripes
            ]
            pa_table = pa.concat_tables(pa_tables)
        else:
            pa_table = orc_file.read(columns=columns)
        df = cudf.DataFrame.from_arrow(pa_table)

    return df

def _make_empty_df(filepath_or_buffer, columns):
    orc_file = orc.ORCFile(filepath_or_buffer)
    schema = orc_file.schema
    col_names = schema.names if columns is None else columns
    return cudf.DataFrame(
        {
            col_name: cudf.core.column.column_empty(
                row_count=0,
                dtype=schema.field(col_name).type.to_pandas_dtype(),
            )
            for col_name in col_names
        }
    )

def test_orcfile_readwrite():
    from pyarrow import orc

    buffer_output_stream = pa.BufferOutputStream()
    a = pa.array([1, None, 3, None])
    b = pa.array([None, "Arrow", None, "ORC"])
    table = pa.table({"int64": a, "utf8": b})
    orc.write_table(buffer_output_stream, table)
    buffer_reader = pa.BufferReader(buffer_output_stream.getvalue())
    output_table = orc.ORCFile(buffer_reader).read()
    assert table.equals(output_table)

def load_data(self, filename='dataset.orc', col_labels=airline_cols,
              y_label='ArrDelayBinary'):
    target_filename = self.CSP_paths['train_data'] + '/' + filename
    self.log_to_file(f'\n> loading dataset from {target_filename}...\n')

    with PerfTimer() as ingestion_timer:
        if 'CPU' in self.compute_type:
            if 'ORC' in self.data_type:
                if target_filename.startswith('gs://'):
                    fs = gcsfs.GCSFileSystem()
                    with fs.open(target_filename, mode='rb') as file:
                        dataset = pyarrow_orc.ORCFile(file).read().to_pandas()
                else:
                    with open(target_filename, mode='rb') as file:
                        dataset = pyarrow_orc.ORCFile(file).read().to_pandas()
            elif 'CSV' in self.data_type:
                dataset = pd.read_csv(target_filename, names=col_labels)
        elif 'GPU' in self.compute_type:
            if 'ORC' in self.data_type:
                dataset = cudf.read_orc(target_filename)
            elif 'CSV' in self.data_type:
                dataset = cudf.read_csv(target_filename, names=col_labels)

    self.log_to_file(f'\t ingestion completed in {ingestion_timer.duration}')
    self.log_to_file(
        f'dataset descriptors: {dataset.shape}\n'
        f' {dataset.dtypes}\n {dataset.columns}\n'
    )
    self.query_memory()  # TODO: if mem_free below a threshold issue a warning [?]
    return dataset, col_labels, y_label, ingestion_timer.duration

def _read_orc_stripes(fs, path, stripes, schema, columns):
    # Construct a list of RecordBatch objects.
    # Each ORC stripe will correspond to a single RecordBatch.
    if columns is None:
        columns = list(schema)
    batches = []
    with fs.open(path, "rb") as f:
        o = orc.ORCFile(f)
        _stripes = range(o.nstripes) if stripes is None else stripes
        for stripe in _stripes:
            batches.append(o.read_stripe(stripe, columns))
    return batches

def test_bytesio_readwrite():
    from io import BytesIO

    from pyarrow import orc

    buf = BytesIO()
    a = pa.array([1, None, 3, None])
    b = pa.array([None, "Arrow", None, "ORC"])
    table = pa.table({"int64": a, "utf8": b})
    orc.write_table(table, buf)
    buf.seek(0)
    orc_file = orc.ORCFile(buf)
    output_table = orc_file.read()
    assert table.equals(output_table)

def test_read_orc(datadir, hdfs, test_url):
    fname = datadir / "orc" / "TestOrcFile.testSnappy.orc"
    # Read from local file system as buffer
    with open(fname, mode="rb") as f:
        buffer = BytesIO(f.read())
    # Write to hdfs
    hdfs.upload(basedir + "/file.orc", buffer)

    if test_url:
        hd_fpath = f"hdfs://{host}:{port}{basedir}/file.orc"
    else:
        hd_fpath = f"hdfs://{basedir}/file.orc"

    got = cudf.read_orc(hd_fpath)
    expect = orc.ORCFile(buffer).read().to_pandas()
    assert_eq(expect, got)

def transform():
    parser = argparse.ArgumentParser(
        description='Returns total lines of data records')
    parser.add_argument("--fileinput", type=str, required=True)
    # parser.add_argument("--output_path", type=str, required=True)
    parser.add_argument("--totallines", type=str, required=True)
    args = parser.parse_args()

    filename = args.fileinput
    with open(filename, 'rb') as file:
        data = orc.ORCFile(file)
        df = data.read().to_pandas()

    Path(args.totallines).parent.mkdir(parents=True, exist_ok=True)
    with open(args.totallines, 'w') as f:
        # len(df) is an int; write() requires a str
        f.write(str(len(df)))

def test_write_orc(pdf, hdfs, test_url):
    # ORC writer doesn't support writing unsigned ints
    pdf["Integer2"] = pdf["Integer2"].astype("int64")
    gdf = cudf.from_pandas(pdf)

    if test_url:
        hd_fpath = "hdfs://{}:{}{}/test_orc_writer.orc".format(
            host, port, basedir)
    else:
        hd_fpath = "hdfs://{}/test_orc_writer.orc".format(basedir)

    gdf.to_orc(hd_fpath)

    assert hdfs.exists(f"{basedir}/test_orc_writer.orc")
    with hdfs.open(f"{basedir}/test_orc_writer.orc", mode="rb") as f:
        got = orc.ORCFile(f).read().to_pandas()

    assert_eq(pdf, got)

def read_orc(path, engine='cudf', columns=None, skip_rows=None,
             num_rows=None):
    """{docstring}"""

    if engine == 'cudf':
        df = cpp_read_orc(
            path,
            columns,
            skip_rows,
            num_rows
        )
    else:
        warnings.warn("Using CPU via PyArrow to read ORC dataset.")
        orc_file = orc.ORCFile(path)
        pa_table = orc_file.read(columns=columns)
        df = DataFrame.from_arrow(pa_table)

    return df

def read_orc(
    filepath_or_buffer,
    engine="cudf",
    columns=None,
    stripe=None,
    skip_rows=None,
    num_rows=None,
    use_index=True,
    decimals_as_float=True,
    force_decimal_scale=None,
    **kwargs,
):
    """{docstring}"""

    filepath_or_buffer, compression = ioutils.get_filepath_or_buffer(
        filepath_or_buffer, None, **kwargs
    )
    if compression is not None:
        # The original constructed the exception without raising it
        raise ValueError("URL content-encoding decompression is not supported")

    if engine == "cudf":
        df = libcudf.orc.read_orc(
            filepath_or_buffer,
            columns,
            stripe,
            skip_rows,
            num_rows,
            use_index,
            decimals_as_float,
            force_decimal_scale,
        )
    else:
        warnings.warn("Using CPU via PyArrow to read ORC dataset.")
        orc_file = orc.ORCFile(filepath_or_buffer)
        if stripe is not None:
            pa_table = orc_file.read_stripe(stripe, columns)
            if isinstance(pa_table, pa.RecordBatch):
                pa_table = pa.Table.from_batches([pa_table])
        else:
            pa_table = orc_file.read(columns=columns)
        df = cudf.DataFrame.from_arrow(pa_table)

    return df

def test_column_selection(tempdir):
    from pyarrow import orc

    # create a table with nested types
    inner = pa.field('inner', pa.int64())
    middle = pa.field('middle', pa.struct([inner]))
    fields = [
        pa.field('basic', pa.int32()),
        pa.field('list', pa.list_(pa.field('item', pa.int32()))),
        pa.field('struct',
                 pa.struct([middle, pa.field('inner2', pa.int64())])),
        pa.field('list-struct',
                 pa.list_(pa.field('item', pa.struct([
                     pa.field('inner1', pa.int64()),
                     pa.field('inner2', pa.int64()),
                 ])))),
        pa.field('basic2', pa.int64()),
    ]
    arrs = [
        [0],
        [[1, 2]],
        [{"middle": {"inner": 3}, "inner2": 4}],
        [[{"inner1": 5, "inner2": 6}, {"inner1": 7, "inner2": 8}]],
        [9],
    ]
    table = pa.table(arrs, schema=pa.schema(fields))

    path = str(tempdir / 'test.orc')
    orc.write_table(table, path)
    orc_file = orc.ORCFile(path)

    # default selecting all columns
    result1 = orc_file.read()
    assert result1.equals(table)

    # selecting with column names
    result2 = orc_file.read(columns=["basic", "basic2"])
    assert result2.equals(table.select(["basic", "basic2"]))

    result3 = orc_file.read(columns=["list", "struct", "basic2"])
    assert result3.equals(table.select(["list", "struct", "basic2"]))

    # using dotted paths
    result4 = orc_file.read(columns=["struct.middle.inner"])
    expected4 = pa.table({"struct": [{"middle": {"inner": 3}}]})
    assert result4.equals(expected4)

    result5 = orc_file.read(columns=["struct.inner2"])
    expected5 = pa.table({"struct": [{"inner2": 4}]})
    assert result5.equals(expected5)

    result6 = orc_file.read(
        columns=["list", "struct.middle.inner", "struct.inner2"])
    assert result6.equals(table.select(["list", "struct"]))

    result7 = orc_file.read(columns=["list-struct.inner1"])
    expected7 = pa.table({"list-struct": [[{"inner1": 5}, {"inner1": 7}]]})
    assert result7.equals(expected7)

    # selecting with (Arrow-based) field indices
    result2 = orc_file.read(columns=[0, 4])
    assert result2.equals(table.select(["basic", "basic2"]))

    result3 = orc_file.read(columns=[1, 2, 3])
    assert result3.equals(table.select(["list", "struct", "list-struct"]))

    # error on non-existing name or index
    with pytest.raises(IOError):
        # liborc returns ParseError, which gets translated into IOError
        # instead of ValueError
        orc_file.read(columns=["wrong"])
    with pytest.raises(ValueError):
        orc_file.read(columns=[5])

def read_orc(path, columns=None, filters=None, storage_options=None,
             **kwargs):
    """Read cudf dataframe from ORC file(s).

    Note that this function is mostly borrowed from upstream Dask.

    Parameters
    ----------
    path: str or list(str)
        Location of file(s), which can be a full URL with protocol specifier,
        and may include glob character if a single string.
    columns: None or list(str)
        Columns to load. If None, loads all.
    filters : None or list of tuple or list of lists of tuples
        If not None, specifies a filter predicate used to filter out row
        groups using statistics stored for each row group as Parquet
        metadata. Row groups that do not match the given filter predicate
        are not read. The predicate is expressed in disjunctive normal form
        (DNF) like `[[('x', '=', 0), ...], ...]`. DNF allows arbitrary
        boolean logical combinations of single column predicates. The
        innermost tuples each describe a single column predicate. The list
        of inner predicates is interpreted as a conjunction (AND), forming
        a more selective and multiple column predicate. Finally, the
        outermost list combines these filters as a disjunction (OR).
        Predicates may also be passed as a list of tuples. This form is
        interpreted as a single conjunction. To express OR in predicates,
        one must use the (preferred) notation of list of lists of tuples.
    storage_options: None or dict
        Further parameters to pass to the bytes backend.

    Returns
    -------
    cudf.DataFrame
    """

    storage_options = storage_options or {}
    fs, fs_token, paths = get_fs_token_paths(
        path, mode="rb", storage_options=storage_options
    )

    schema = None
    nstripes_per_file = []
    for path in paths:
        with fs.open(path, "rb") as f:
            o = orc.ORCFile(f)
            if schema is None:
                schema = o.schema
            elif schema != o.schema:
                raise ValueError(
                    "Incompatible schemas while parsing ORC files"
                )
            nstripes_per_file.append(o.nstripes)
    schema = _get_pyarrow_dtypes(schema, categories=None)
    if columns is not None:
        ex = set(columns) - set(schema)
        if ex:
            raise ValueError(
                "Requested columns (%s) not in schema (%s)"
                % (ex, set(schema))
            )
    else:
        columns = list(schema)

    with fs.open(paths[0], "rb") as f:
        meta = cudf.read_orc(f, stripes=[0], columns=columns, **kwargs)

    name = "read-orc-" + tokenize(fs_token, path, columns, **kwargs)
    dsk = {}
    N = 0
    for path, n in zip(paths, nstripes_per_file):
        for stripe in (
            range(n)
            if filters is None
            else cudf.io.orc._filter_stripes(filters, path)
        ):
            dsk[(name, N)] = (
                _read_orc_stripe,
                fs,
                path,
                stripe,
                columns,
                kwargs,
            )
            N += 1

    divisions = [None] * (len(dsk) + 1)
    return dd.core.new_dd_object(dsk, name, meta, divisions)

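# A brief usage sketch of the DNF filter form described in the docstring
# above. The paths and column names are hypothetical, for illustration only.

# Single conjunction: year == 2020 AND month > 6
ddf = read_orc("data/*.orc", filters=[("year", "=", 2020), ("month", ">", 6)])

# Disjunction of conjunctions:
# (year == 2020 AND month > 6) OR (year == 2021)
ddf = read_orc(
    "data/*.orc",
    filters=[
        [("year", "=", 2020), ("month", ">", 6)],
        [("year", "=", 2021)],
    ],
)
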
def read_orc(path, columns=None, storage_options=None, **kwargs):
    """Read cudf dataframe from ORC file(s).

    Note that this function is mostly borrowed from upstream Dask.

    Parameters
    ----------
    path: str or list(str)
        Location of file(s), which can be a full URL with protocol specifier,
        and may include glob character if a single string.
    columns: None or list(str)
        Columns to load. If None, loads all.
    storage_options: None or dict
        Further parameters to pass to the bytes backend.

    Returns
    -------
    cudf.DataFrame
    """

    storage_options = storage_options or {}
    fs, fs_token, paths = get_fs_token_paths(
        path, mode="rb", storage_options=storage_options
    )

    schema = None
    nstripes_per_file = []
    for path in paths:
        with fs.open(path, "rb") as f:
            o = orc.ORCFile(f)
            if schema is None:
                schema = o.schema
            elif schema != o.schema:
                raise ValueError(
                    "Incompatible schemas while parsing ORC files"
                )
            nstripes_per_file.append(o.nstripes)
    schema = _get_pyarrow_dtypes(schema, categories=None)
    if columns is not None:
        ex = set(columns) - set(schema)
        if ex:
            raise ValueError(
                "Requested columns (%s) not in schema (%s)"
                % (ex, set(schema))
            )
    else:
        columns = list(schema)

    with fs.open(paths[0], "rb") as f:
        meta = cudf.read_orc(f, stripe=0, columns=columns, **kwargs)

    name = "read-orc-" + tokenize(fs_token, path, columns, **kwargs)
    dsk = {}
    N = 0
    for path, n in zip(paths, nstripes_per_file):
        for stripe in range(n):
            dsk[(name, N)] = (
                _read_orc_stripe,
                fs,
                path,
                stripe,
                columns,
                kwargs,
            )
            N += 1

    divisions = [None] * (len(dsk) + 1)
    return dd.core.new_dd_object(dsk, name, meta, divisions)

def read_orc(
    filepath_or_buffer,
    engine="cudf",
    columns=None,
    filters=None,
    stripes=None,
    skiprows=None,
    num_rows=None,
    use_index=True,
    decimal_cols_as_float=None,
    timestamp_type=None,
    use_python_file_object=True,
    **kwargs,
):
    """{docstring}"""
    if decimal_cols_as_float is not None:
        warnings.warn(
            "`decimal_cols_as_float` is deprecated and will be removed in "
            "the future",
            FutureWarning,
        )
    from cudf import DataFrame

    # Multiple sources are passed as a list. If a single source is passed,
    # wrap it in a list for unified processing downstream.
    if not is_list_like(filepath_or_buffer):
        filepath_or_buffer = [filepath_or_buffer]

    # Each source must have a correlating stripe list. If a single stripe list
    # is provided rather than a list of list of stripes then extrapolate that
    # stripe list across all input sources
    if stripes is not None:
        if any(not isinstance(stripe, list) for stripe in stripes):
            stripes = [stripes]

        # Must ensure a stripe for each source is specified, unless None
        if not len(stripes) == len(filepath_or_buffer):
            raise ValueError(
                "A list of stripes must be provided for each input source"
            )

    filepaths_or_buffers = []
    for source in filepath_or_buffer:
        if ioutils.is_directory(source, **kwargs):
            fs = ioutils._ensure_filesystem(
                passed_filesystem=None,
                path=source,
                **kwargs,
            )
            source = stringify_path(source)
            source = fs.sep.join([source, "*.orc"])

        tmp_source, compression = ioutils.get_filepath_or_buffer(
            path_or_data=source,
            compression=None,
            use_python_file_object=use_python_file_object,
            **kwargs,
        )
        if compression is not None:
            raise ValueError(
                "URL content-encoding decompression is not supported"
            )
        if isinstance(tmp_source, list):
            filepaths_or_buffers.extend(tmp_source)
        else:
            filepaths_or_buffers.append(tmp_source)

    if filters is not None:
        selected_stripes = _filter_stripes(
            filters, filepaths_or_buffers, stripes, skiprows, num_rows
        )

        # Return empty if everything was filtered
        if len(selected_stripes) == 0:
            return _make_empty_df(filepaths_or_buffers[0], columns)
        else:
            stripes = selected_stripes

    if engine == "cudf":
        return DataFrame._from_data(
            *liborc.read_orc(
                filepaths_or_buffers,
                columns,
                stripes,
                skiprows,
                num_rows,
                use_index,
                decimal_cols_as_float,
                timestamp_type,
            )
        )
    else:

        def read_orc_stripe(orc_file, stripe, columns):
            pa_table = orc_file.read_stripe(stripe, columns)
            if isinstance(pa_table, pa.RecordBatch):
                pa_table = pa.Table.from_batches([pa_table])
            return pa_table

        warnings.warn("Using CPU via PyArrow to read ORC dataset.")
        if len(filepath_or_buffer) > 1:
            raise NotImplementedError(
                "Using CPU via PyArrow only supports a single input source"
            )

        orc_file = orc.ORCFile(filepath_or_buffer[0])
        if stripes is not None and len(stripes) > 0:
            for stripe_source_file in stripes:
                pa_tables = [
                    read_orc_stripe(orc_file, i, columns)
                    for i in stripe_source_file
                ]
                pa_table = pa.concat_tables(pa_tables)
        else:
            pa_table = orc_file.read(columns=columns)

        df = cudf.DataFrame.from_arrow(pa_table)

    return df

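# A usage sketch of the stripes convention handled in the function above:
# one stripe list per input source, or a flat list when there is a single
# source. The file names are hypothetical, for illustration only.

# Stripes 0 and 2 from the first file, stripe 1 from the second.
df = read_orc(
    ["part-0.orc", "part-1.orc"],
    stripes=[[0, 2], [1]],
)

# A flat list is wrapped into a single stripe list, so it must line up with
# exactly one input source.
df = read_orc("part-0.orc", stripes=[0, 2])
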
def read_metadata(
    cls, fs, paths, columns, index, split_stripes, aggregate_files, **kwargs
):
    # Convert root directory to file list.
    # TODO: Handle hive-partitioned data
    if len(paths) == 1 and not fs.isfile(paths[0]):
        paths = fs.find(paths[0])

    schema = None
    parts = []

    def _get_schema(_o, schema):
        if schema is None:
            schema = _o.schema
        elif schema != _o.schema:
            raise ValueError("Incompatible schemas while parsing ORC files")
        return schema

    if split_stripes:
        offset = 0
        for path in paths:
            with fs.open(path, "rb") as f:
                o = orc.ORCFile(f)
                schema = _get_schema(o, schema)
                _stripes = list(range(o.nstripes))
                if offset:
                    parts.append([(path, _stripes[0:offset])])
                while offset < o.nstripes:
                    parts.append(
                        [(path, _stripes[offset : offset + int(split_stripes)])]
                    )
                    offset += int(split_stripes)
                if aggregate_files and int(split_stripes) > 1:
                    offset -= o.nstripes
                else:
                    offset = 0
    else:
        for path in paths:
            if schema is None:
                with fs.open(paths[0], "rb") as f:
                    o = orc.ORCFile(f)
                    schema = o.schema
            parts.append([(path, None)])

    schema = _get_pyarrow_dtypes(schema, categories=None)
    if columns is not None:
        ex = set(columns) - set(schema)
        if ex:
            raise ValueError(
                "Requested columns (%s) not in schema (%s)" % (ex, set(schema))
            )

    # Check if we can aggregate adjacent parts together
    parts = cls._aggregate_files(aggregate_files, split_stripes, parts)

    columns = list(schema) if columns is None else columns
    index = [index] if isinstance(index, str) else index
    meta = _meta_from_dtypes(columns, schema, index, [])
    return parts, schema, meta

if isinstance(json, dict):
    if len(json) > 1:
        DataFrame = json_normalize(flatten(json))
    else:
        DataFrame = json_normalize(list(json.values())[0])
else:
    FlattenedData = (flatten(_json) for _json in json)
    DataFrame = pd.DataFrame(FlattenedData)

import pandas as pd
import pyarrow.orc as orc
import pyarrow.parquet as parquet

# ORC is a binary format, so the file must be opened in binary mode;
# the original text-mode open with encoding="utf-16" would fail.
with open("SampleORC", "rb") as file:
    data = orc.ORCFile(file)
    df = data.read().to_pandas()

data = orc.ORCFile("SampleORC.orc")
df = data.read().to_pandas()

data2 = parquet.ParquetFile("SampleParquet.parquet")
df2 = data2.read().to_pandas()

from fastavro import reader

with open('SampleAvro.avro', 'rb') as fo:
    for record in reader(fo):
        print(record)

import numpy as np