def test_file_pathlib(file_fixture, tmpdir):
    import pathlib
    _, batches = file_fixture.write_batches()
    source = file_fixture.get_source()
    path = tmpdir.join('file.arrow').strpath
    with open(path, 'wb') as f:
        f.write(source)
    t1 = pa.open_file(pathlib.Path(path)).read_all()
    t2 = pa.open_file(pa.OSFile(path)).read_all()
    assert t1.equals(t2)

def test_open_file_from_buffer(file_fixture):
    # ARROW-2859; APIs accept the buffer protocol
    _, batches = file_fixture.write_batches()
    source = file_fixture.get_source()

    reader1 = pa.open_file(source)
    reader2 = pa.open_file(pa.BufferReader(source))
    reader3 = pa.RecordBatchFileReader(source)

    result1 = reader1.read_all()
    result2 = reader2.read_all()
    result3 = reader3.read_all()

    assert result1.equals(result2)
    assert result1.equals(result3)

def test_deprecated_pyarrow_ns_apis():
    table = pa.table([pa.array([1, 2, 3, 4])], names=['a'])
    sink = pa.BufferOutputStream()
    with pa.ipc.new_stream(sink, table.schema) as writer:
        writer.write(table)
    with pytest.warns(FutureWarning,
                      match="please use pyarrow.ipc.open_stream"):
        pa.open_stream(sink.getvalue())

    sink = pa.BufferOutputStream()
    with pa.ipc.new_file(sink, table.schema) as writer:
        writer.write(table)
    with pytest.warns(FutureWarning,
                      match="please use pyarrow.ipc.open_file"):
        pa.open_file(sink.getvalue())

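For reference, a minimal sketch of the same file roundtrip spelled through the pa.ipc namespace that the deprecation warnings above point to; the table contents here are placeholders:

import pyarrow as pa

table = pa.table([pa.array([1, 2, 3, 4])], names=['a'])

sink = pa.BufferOutputStream()
with pa.ipc.new_file(sink, table.schema) as writer:
    writer.write_table(table)

# pa.ipc.open_file is the non-deprecated spelling of pa.open_file
reader = pa.ipc.open_file(sink.getvalue())
assert reader.read_all().equals(table)
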
def test_read_all(self):
    _, batches = self.write_batches()
    file_contents = pa.BufferReader(self._get_source())

    reader = pa.open_file(file_contents)

    result = reader.read_all()
    expected = pa.Table.from_batches(batches)
    assert result.equals(expected)

def test_read_pandas(self):
    frames, _ = self.write_batches()
    file_contents = pa.BufferReader(self._get_source())

    reader = pa.open_file(file_contents)
    result = reader.read_pandas()

    expected = pd.concat(frames)
    assert_frame_equal(result, expected)

def test_file_read_all(sink_factory):
    fixture = FileFormatFixture(sink_factory)
    _, batches = fixture.write_batches()
    file_contents = pa.BufferReader(fixture.get_source())

    reader = pa.open_file(file_contents)

    result = reader.read_all()
    expected = pa.Table.from_batches(batches)
    assert result.equals(expected)

def test_simple_roundtrip(self):
    # write_batches returns (frames, batches), as in the other tests here;
    # only the record batches are needed for the comparison below
    _, batches = self.write_batches()
    file_contents = self._get_source()

    reader = pa.open_file(file_contents)

    assert reader.num_record_batches == len(batches)

    for i, batch in enumerate(batches):
        # it works. Must convert back to DataFrame
        batch = reader.get_batch(i)
        assert batches[i].equals(batch)

def _check_roundtrip(self, as_table=False):
    _, batches = self.write_batches(as_table=as_table)
    file_contents = pa.BufferReader(self._get_source())

    reader = pa.open_file(file_contents)

    assert reader.num_record_batches == len(batches)

    for i, batch in enumerate(batches):
        # it works. Must convert back to DataFrame
        batch = reader.get_batch(i)
        assert batches[i].equals(batch)
        assert reader.schema.equals(batches[0].schema)

def func():
    df = table.to_pandas()
    batch = pa.RecordBatch.from_pandas(df)

    sink = io.BytesIO()
    writer = pa.RecordBatchFileWriter(sink, batch.schema)
    writer.write_batch(batch)
    writer.close()

    buf_reader = pa.BufferReader(sink.getvalue())
    reader = pa.open_file(buf_reader)
    reader.read_all()

def leak2():
    data = [pa.array(np.concatenate([np.random.randn(100000)] * 10))]
    table = pa.Table.from_arrays(data, ['foo'])
    while True:
        print('calling to_pandas')
        print('memory_usage: {0}'.format(memory_profiler.memory_usage()))
        df = table.to_pandas()

        batch = pa.RecordBatch.from_pandas(df)

        sink = io.BytesIO()
        writer = pa.RecordBatchFileWriter(sink, batch.schema)
        writer.write_batch(batch)
        writer.close()

        buf_reader = pa.BufferReader(sink.getvalue())
        reader = pa.open_file(buf_reader)
        reader.read_all()

        gc.collect()

def o(fp, fptype=None):
    """
    Opens the file given by the path into a python object.

    :param str fp: The path of the file
    """
    fp = xv(fp)  # , dtype={'ID': object}

    import re

    if re.match(r".*\.conllu", fp) is not None:
        import conllu

        sentences = conllu.parse(cat(fp))
        # sys.stdout.write(str(type(ret)))
        return sentences

    if re.match(r".*\.h5", fp) is not None:
        # /usr/local/lib/python3.6/dist-packages/h5py
        sps("ttyify show_h5 " + q(fp))
        # from show_h5 import print_h5
        # print_h5(fp, section=None, show_attrs=False, show_data=False)

        # import required libraries
        import h5py as h5
        import numpy as np
        import matplotlib.pyplot as plt

        # Read H5 file
        ret = h5.File(fp, "r")
        return ret

    if re.match(r".*\.narrow", fp) is not None:
        import pyarrow as pa

        try:
            ret = pa.open_file(fp)
            return ret
        except Exception:
            pass

    if re.match(r".*\.npy", fp) is not None:
        import numpy as np

        try:
            ret = np.load(fp)
            return ret
        except Exception:
            pass
        try:
            ret = np.load(fp, encoding="latin1")
            return ret
        except Exception:
            pass
        try:
            ret = np.load(fp, encoding="bytes")
            return ret
        except Exception:
            pass

    if re.match(r".*\.xml", fp) is not None:
        import pandas as pd

        ret = read_csv_smart(fp)
        # sys.stdout.write(str(type(ret)))
        return ret

    if re.match(r".*\.csv", fp) is not None:
        if fptype == "np":
            import numpy as np

            ret = np.genfromtxt(fp, delimiter=",")
        else:
            import pandas as pd

            ret = read_csv_smart(fp)
            # sys.stdout.write(str(type(ret)))
        return ret

    if re.match(r".*\.xls", fp) is not None:
        import pandas as pd

        ret = pd.read_excel(fp)
        # sys.stdout.write(str(type(ret)))
        return ret

    if re.match(r".*\.pkl", fp) is not None:
        import pandas as pd

        ret = pd.read_pickle(fp)
        return ret

    if re.match(r".*\.gensim", fp) is not None:
        import gensim

        # Load pre-trained Word2Vec model.
        try:
            ret = gensim.models.LdaModel.load(fp)
        except Exception:
            try:
                ret = gensim.models.Word2Vec.load(fp)
            except Exception:
                ret = None
        return ret

    if re.match(r".*\.pickle$", fp) is not None or re.match(r".*\.p$", fp) is not None:
        import pickle

        # 'rUb' is not a valid mode in Python 3; binary pickles need plain 'rb'
        with open(fp, "rb") as f:
            data = f.read()
        ret = pickle.loads(data)
        return ret

    # Just open the file as an unknown binary
    # Use python-magic
    import magic

    mimestr = magic.from_file(fp)

    # https://stackoverflow.com/q/4980146
    gre = Re()
    if gre.match(r"gzip compressed data", mimestr):
        # do something with gre.last_match
        import gzip

        f = gzip.open(fp)
        return f.read()
    # elif gre.match(r'bar', mimestr):
    #     # do something with gre.last_match
    #     return mimestr

    return mimestr

def test_empty_file(self):
    buf = io.BytesIO(b'')
    with pytest.raises(pa.ArrowInvalid):
        pa.open_file(buf)

def read_file(source):
    reader = pa.open_file(source)
    return [reader.get_batch(i)
            for i in range(reader.num_record_batches)]

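A usage sketch for read_file above, writing a one-batch file first; the batch contents are invented for illustration:

import pyarrow as pa

batch = pa.record_batch([pa.array([1, 2, 3])], names=['a'])

sink = pa.BufferOutputStream()
writer = pa.RecordBatchFileWriter(sink, batch.schema)
writer.write_batch(batch)
writer.close()

# read_file recovers the batches one by one via get_batch
batches = read_file(pa.BufferReader(sink.getvalue()))
assert len(batches) == 1 and batches[0].equals(batch)
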
def test_empty_file():
    buf = b''
    with pytest.raises(pa.ArrowInvalid):
        pa.open_file(pa.BufferReader(buf))

#!/usr/bin/env python

import os

import pyarrow as pa

times = int(os.getenv('TIMES', 1))
for time in range(times):
    reader = pa.open_file('/tmp/python.arrow')
    schema = reader.schema
    for i in range(reader.num_record_batches):
        record_batch = reader.get_batch(i)
        # when looping for timing purposes, skip the printing below
        if times > 1:
            break
        print('=' * 48)
        print(f'record-batch[{i}]:')
        for j, column in enumerate(schema):
            values = record_batch.column(j).to_numpy()
            print(f'  {column.name}: {values}')
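
The script assumes /tmp/python.arrow already exists; a minimal sketch of producing such a file (the schema and values here are placeholders, not data from any actual benchmark):

import pyarrow as pa

table = pa.table({'a': [1, 2, 3], 'b': ['x', 'y', 'z']})
with pa.OSFile('/tmp/python.arrow', 'wb') as sink:
    with pa.ipc.new_file(sink, table.schema) as writer:
        writer.write_table(table)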