def orc_to_pandas(file_name=None, file_io_obj=None, stripes=None):
    if file_name is not None:
        f = open(file_name, "rb")
    elif file_io_obj is not None:
        f = file_io_obj

    reader = pyorc.Reader(f)
    dtypes = {
        col: ORC_TO_PANDAS_TYPES[pyorc_type.name]
        for col, pyorc_type in reader.schema.fields.items()
    }

    if stripes is None:
        df = pd.DataFrame.from_records(
            reader, columns=reader.schema.fields.keys()
        )
    else:
        records = [
            record for i in stripes for record in list(reader.read_stripe(i))
        ]
        df = pd.DataFrame.from_records(
            records, columns=reader.schema.fields.keys()
        )

    # Type-cast to the `dtypes` extracted from the pyorc schema, because a
    # fully empty / all-<NA> column can otherwise get an incorrect dtype
    # from pandas.
    df = df.astype(dtypes)

    return df
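# A minimal usage sketch for orc_to_pandas above; the file name "example.orc"
# and the stripe index are illustrative, and ORC_TO_PANDAS_TYPES is assumed to
# be defined elsewhere as a mapping from pyorc type names to pandas dtypes.
df_full = orc_to_pandas(file_name="example.orc")
df_stripe0 = orc_to_pandas(file_name="example.orc", stripes=[0])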
def set_rand_params(self, params):
    params_dict = {}
    for param, values in params.items():
        if values == ALL_POSSIBLE_VALUES:
            if param == "columns":
                col_size = self._rand(len(self._df.columns))
                params_dict[param] = list(
                    np.unique(np.random.choice(self._df.columns, col_size))
                )
            elif param == "stripes":
                f = io.BytesIO(self._current_buffer)
                reader = pyorc.Reader(f)
                stripes = [i for i in range(reader.num_of_stripes)]
                params_dict[param] = np.random.choice(
                    [
                        None,
                        list(
                            map(
                                int,
                                np.unique(
                                    np.random.choice(
                                        stripes, reader.num_of_stripes
                                    )
                                ),
                            )
                        ),
                    ]
                )
            elif param == "use_index":
                params_dict[param] = np.random.choice([True, False])
            elif param in ("skiprows", "num_rows"):
                params_dict[param] = np.random.choice(
                    [None, self._rand(len(self._df))]
                )
        else:
            if not isinstance(values, list):
                raise TypeError("values must be of type list")
            params_dict[param] = np.random.choice(values)
    self._current_params["test_kwargs"] = self.process_kwargs(params_dict)
def read_hdfs_orc(path, hdfs, writer):
    chunk_rows = 1024 * 256
    with hdfs.open(path, 'rb') as f:
        reader = pyorc.Reader(f)
        fields = reader.schema.fields
        schema = []
        for c in fields:
            schema.append((c, arrow_type(fields[c])))
        pa_struct = pa.struct(schema)
        while True:
            rows = reader.read(num=chunk_rows)
            if not rows:
                break
            rb = pa.RecordBatch.from_struct_array(
                pa.array(rows, type=pa_struct))
            sink = pa.BufferOutputStream()
            rb_writer = pa.ipc.new_stream(sink, rb.schema)
            rb_writer.write_batch(rb)
            rb_writer.close()
            buf = sink.getvalue()
            chunk = writer.next(buf.size)
            buf_writer = pa.FixedSizeBufferWriter(chunk)
            buf_writer.write(buf)
            buf_writer.close()
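# Several snippets here rely on a helper `arrow_type` that maps a pyorc field
# type to a pyarrow DataType. A rough sketch of what such a helper might look
# like, assuming only a handful of scalar kinds need to be handled (the real
# helper may cover more):
import pyarrow as pa
import pyorc

def arrow_type(field):
    # pyorc exposes the ORC kind of a type via TypeDescription.kind,
    # whose values correspond to the pyorc.TypeKind enum.
    kind_map = {
        pyorc.TypeKind.BOOLEAN: pa.bool_(),
        pyorc.TypeKind.INT: pa.int32(),
        pyorc.TypeKind.LONG: pa.int64(),
        pyorc.TypeKind.FLOAT: pa.float32(),
        pyorc.TypeKind.DOUBLE: pa.float64(),
        pyorc.TypeKind.STRING: pa.string(),
    }
    return kind_map[field.kind]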
def iter_orc(path):
    with open(path, 'rb') as f:
        reader = pyorc.Reader(f)
        column_names = reader.schema.fields.keys()
        for item in reader:
            d = OrderedDict(zip(column_names, item))
            yield d
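# Usage sketch for iter_orc above: it streams each row as an OrderedDict
# keyed by column name. The path "example.orc" is hypothetical.
for record in iter_orc("example.orc"):
    print(record)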
def read(self, ctx, file, range_tracker):
    pos = -1

    def split_points_unclaimed(stop_pos):
        if pos >= stop_pos:
            return 0
        return RangeTracker.SPLIT_POINTS_UNKNOWN

    range_tracker.set_split_points_unclaimed_callback(split_points_unclaimed)

    start_offset = range_tracker.start_position()
    if start_offset is None:
        start_offset = 0

    idx = 0
    reader = pyorc.Reader(file, **self.pyorc_options)
    if reader.num_of_stripes > 0:
        pos = 0
    else:
        pos = range_tracker.stop_position()
    while range_tracker.try_claim(pos):
        stripe = reader.read_stripe(idx)
        if idx + 1 < reader.num_of_stripes:
            idx = idx + 1
            pos = stripe.bytes_offset
        else:
            pos = range_tracker.stop_position()
        yield stripe.read()
def flush(self):
    self._buffer.seek(0)
    # TODO: optimize this
    reader = pyorc.Reader(self._buffer)
    columns = reader.schema.fields.keys()
    return (
        ','.join(columns) + '\n' +
        '\n'.join(','.join([str(c) for c in row]) for row in reader)
    ).encode('utf-8') + b'\n'
def GetData(self, request, context):
    with open('./data.orc', 'rb') as dataf:
        reader = pyorc.Reader(dataf)
        print(reader.schema)
        # data = [cbor.dumps(it) for it in reader]
        chunk = []
        for idx, row in enumerate(reader):
            # chunk.append(cbor.dumps(row))
            chunk.append(row)
            if idx % 10000 == 0:
                yield server_pb2.DataResponse(data=[cbor.dumps(chunk)])
                chunk = []
        yield server_pb2.DataResponse(data=[cbor.dumps(chunk)])
def export_type_schema(self, itemExporter):
    itemExporter.start_exporting()

    # create and write some test data
    num_records = 10
    for i in range(num_records):
        l = ItemLoader(TestItem())
        l.add_value('ftext', 'this is a test text')
        l.add_value('ftext_array', ['test1', 'test2', 'test3', 'test4'])
        l.add_value('ffloat', float(2.5))
        l.add_value('fint', int(10))
        l.add_value('fbool', False)
        datetime_str = '2020-02-29T11:12:13'
        datetime_fmt = '%Y-%m-%dT%H:%M:%S'
        datetime_obj = datetime.datetime.strptime(datetime_str, datetime_fmt)
        datetime_obj = datetime_obj.replace(tzinfo=timezone.utc)
        l.add_value('fdatetime', datetime_obj)
        itemExporter.export_item(l.load_item())
    itemExporter.finish_exporting()
    self.file.close()

    # reread file and compare results
    with open(self.filename, 'rb') as f:
        reader = pyorc.Reader(f, struct_repr=pyorc.StructRepr.DICT)
        self.assertEqual(
            len(reader), 10,
            msg="Number of records read corresponds to number of records written"
        )
        for i in range(len(reader)):
            record = next(reader)
            self.assertEqual('this is a test text', record.get('ftext', None),
                             msg="String data is read correctly")
            self.assertEqual(['test1', 'test2', 'test3', 'test4'],
                             record.get('ftext_array', None),
                             msg="String array data is read correctly")
            self.assertEqual(float(2.5), record.get('ffloat', None),
                             msg="Float data is read correctly")
            self.assertEqual(int(10), record.get('fint', None),
                             msg="Int data is read correctly")
            self.assertFalse(record.get('fbool', None),
                             msg="Bool data is read correctly")
            self.assertEqual(datetime_obj, record.get('fdatetime', None),
                             msg="DateTime data is read correctly")
def read_single_orc(path, fs, writer):
    chunk_rows = 1024 * 256
    with fs.open(path, "rb") as f:
        reader = pyorc.Reader(f)
        fields = reader.schema.fields
        schema = []
        for c in fields:
            schema.append((c, arrow_type(fields[c])))
        pa_struct = pa.struct(schema)
        while True:
            rows = reader.read(num=chunk_rows)
            if not rows:
                break
            batch = pa.RecordBatch.from_struct_array(
                pa.array(rows, type=pa_struct))
            writer.write(batch)
def read_hdfs_orc(vineyard_socket, path, proc_num, proc_index):
    if proc_index:
        raise ValueError('Parallel reading of ORC files is not supported yet')
    client = vineyard.connect(vineyard_socket)
    builder = DataframeStreamBuilder(client)
    stream = builder.seal(client)
    client.persist(stream)

    ret = {'type': 'return'}
    ret['content'] = repr(stream.id)
    print(json.dumps(ret), flush=True)

    chunk_rows = 1024 * 256
    writer = stream.open_writer(client)

    host, port = urlparse(path).netloc.split(':')
    hdfs = HDFileSystem(host=host, port=int(port),
                        pars={"dfs.client.read.shortcircuit": "false"})
    path = urlparse(path).path

    with hdfs.open(path, 'rb') as f:
        reader = pyorc.Reader(f)
        fields = reader.schema.fields
        schema = []
        for c in fields:
            schema.append((c, arrow_type(fields[c])))
        pa_struct = pa.struct(schema)
        while True:
            rows = reader.read(num=chunk_rows)
            if not rows:
                break
            rb = pa.RecordBatch.from_struct_array(pa.array(rows, type=pa_struct))
            sink = pa.BufferOutputStream()
            rb_writer = pa.ipc.new_stream(sink, rb.schema)
            rb_writer.write_batch(rb)
            rb_writer.close()
            buf = sink.getvalue()
            chunk = writer.next(buf.size)
            buf_writer = pa.FixedSizeBufferWriter(chunk)
            buf_writer.write(buf)
            buf_writer.close()
    writer.finish()
def orc_to_pandas(file_name=None, file_io_obj=None, stripes=None):
    if file_name is not None:
        f = open(file_name, "rb")
    elif file_io_obj is not None:
        f = file_io_obj

    reader = pyorc.Reader(f)

    if stripes is None:
        df = pd.DataFrame.from_records(
            reader, columns=reader.schema.fields.keys()
        )
    else:
        records = [
            record for i in stripes for record in list(reader.read_stripe(i))
        ]
        df = pd.DataFrame.from_records(
            records, columns=reader.schema.fields.keys()
        )

    return df
def read_local_orc(vineyard_socket, path, proc_num, proc_index):
    if proc_index:
        return
    client = vineyard.connect(vineyard_socket)
    builder = DataframeStreamBuilder(client)
    stream = builder.seal(client)

    ret = {'type': 'return'}
    ret['content'] = repr(stream.id)
    print(json.dumps(ret))

    writer = stream.open_writer(client)
    with open(path, 'rb') as f:
        reader = pyorc.Reader(f)
        fields = reader.schema.fields
        schema = []
        for c in fields:
            schema.append((c, arrow_type(fields[c])))
        pa_struct = pa.struct(schema)
        while True:
            rows = reader.read(num=1024)
            if not rows:
                break
            rb = pa.RecordBatch.from_struct_array(pa.array(rows, type=pa_struct))
            sink = pa.BufferOutputStream()
            rb_writer = pa.ipc.new_stream(sink, rb.schema)
            rb_writer.write_batch(rb)
            rb_writer.close()
            buf = sink.getvalue()
            chunk = writer.next(buf.size)
            buf_writer = pa.FixedSizeBufferWriter(chunk)
            buf_writer.write(buf)
            buf_writer.close()
    writer.finish()
def _last_datetime(self, category, date):
    if self.conn.status(f"/krwordcloud/add-article/{date}")['length'] == 0:
        return config.min_date
    with tempfile.NamedTemporaryFile("wb") as tf:
        # Download the remote ORC file into a local temporary file.
        with self.conn.read(f"/krwordcloud/add-article/{date}",
                            chunk_size=8096) as hf:
            for chunk in hf:
                tf.write(chunk)
        tf.flush()  # ensure buffered writes are on disk before re-reading
        with open(tf.name, 'rb') as rf:
            reader = pyorc.Reader(rf)
            maximum = datetime.datetime \
                .strptime(f"{date} GMT+0900", "%Y-%m-%d.orc GMT%z")
            for row in reader:
                if row[0] > maximum and row[1] == category:
                    maximum = row[0]
        # The temporary file is removed automatically when the context
        # manager exits, so no explicit os.unlink() is needed.
        if maximum < config.min_date:
            return config.min_date
        elif maximum > datetime.datetime.now().replace(tzinfo=KST):
            return datetime.datetime.now().replace(tzinfo=KST)
        else:
            return maximum
def read_orc_inventory_file(filename, keys):
    with open(filename, "rb") as data:
        reader = pyorc.Reader(data)
        for row in reader:
            record = {keys[i].lower(): v for i, v in enumerate(row)}
            yield record
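# Usage sketch for read_orc_inventory_file above: `keys` supplies the column
# names, which are lower-cased in the yielded dicts. The path and key names
# here are hypothetical.
for record in read_orc_inventory_file("inventory.orc", ["Bucket", "Key", "Size"]):
    print(record["bucket"], record["key"], record["size"])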
def decode_orc(filename: str):
    with open(filename, 'rb') as data:
        reader = pyorc.Reader(data)
        return '\n'.join(','.join([str(c) for c in row]) for row in reader)
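# Usage sketch for decode_orc above, dumping the comma-separated rows to a
# text file; both paths are hypothetical.
with open("example.csv", "w") as out:
    out.write(decode_orc("example.orc"))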
#!/usr/local/bin/python3
from concurrent import futures
import logging

import grpc
import server_pb2
import server_pb2_grpc
import pyorc
import cbor2

with open('./data.orc', 'rb') as data:
    reader = pyorc.Reader(data)
    print(reader.schema)
    for row in reader:
        pass  # print(row)
import csv

import pyorc

with open("./actions_20201208_0001.orc", "rb") as example:
    reader = pyorc.Reader(example)
    columns = reader.schema.fields.keys()
    rows = reader.read()

with open('orc.csv', 'w', newline='') as out:
    csv_out = csv.writer(out)
    csv_out.writerow(columns)
    csv_out.writerows(rows)
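# Most snippets above assume an ORC file already exists. A minimal sketch for
# producing one with pyorc.Writer (the path, schema, and rows here are
# illustrative):
import pyorc

with open("example.orc", "wb") as data:
    with pyorc.Writer(data, "struct<col0:int,col1:string>") as writer:
        writer.write((1, "ORC"))
        writer.write((2, "files"))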