Example #1
def orc_to_pandas(file_name=None, file_io_obj=None, stripes=None):
    if file_name is not None:
        f = open(file_name, "rb")
    elif file_io_obj is not None:
        f = file_io_obj

    reader = pyorc.Reader(f)

    dtypes = {
        col: ORC_TO_PANDAS_TYPES[pyorc_type.name]
        for col, pyorc_type in reader.schema.fields.items()
    }

    if stripes is None:
        df = pd.DataFrame.from_records(
            reader, columns=reader.schema.fields.keys()
        )
    else:
        records = [
            record for i in stripes for record in list(reader.read_stripe(i))
        ]
        df = pd.DataFrame.from_records(
            records, columns=reader.schema.fields.keys()
        )

    # Cast to the `dtypes` extracted from the pyorc schema because a fully
    # empty / all-<NA> column can otherwise be assigned an incorrect dtype by pandas.
    df = df.astype(dtypes)

    return df
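A minimal usage sketch for the helper above, assuming pandas, pyorc, and the module-level ORC_TO_PANDAS_TYPES mapping are importable; "example.orc" is only a placeholder file name:

# Hypothetical calls; "example.orc" stands in for any local ORC file.
df_all = orc_to_pandas(file_name="example.orc")                   # read the whole file
df_stripe0 = orc_to_pandas(file_name="example.orc", stripes=[0])  # read only stripe 0
print(df_all.dtypes)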
Example #2
 def set_rand_params(self, params):
     params_dict = {}
     for param, values in params.items():
         if values == ALL_POSSIBLE_VALUES:
             if param == "columns":
                 col_size = self._rand(len(self._df.columns))
                 params_dict[param] = list(
                     np.unique(np.random.choice(self._df.columns,
                                                col_size)))
             elif param == "stripes":
                 f = io.BytesIO(self._current_buffer)
                 reader = pyorc.Reader(f)
                 stripes = [i for i in range(reader.num_of_stripes)]
                 params_dict[param] = np.random.choice([
                     None,
                     list(
                         map(
                             int,
                             np.unique(
                                 np.random.choice(stripes,
                                                  reader.num_of_stripes)),
                         )),
                 ])
             elif param == "use_index":
                 params_dict[param] = np.random.choice([True, False])
             elif param in ("skiprows", "num_rows"):
                 params_dict[param] = np.random.choice(
                     [None, self._rand(len(self._df))])
         else:
             if not isinstance(values, list):
                 raise TypeError("values must be of type list")
             params_dict[param] = np.random.choice(values)
     self._current_params["test_kwargs"] = self.process_kwargs(params_dict)
Example #3
def read_hdfs_orc(path, hdfs, writer):
    chunk_rows = 1024 * 256

    with hdfs.open(path, 'rb') as f:
        reader = pyorc.Reader(f)
        fields = reader.schema.fields
        schema = []
        for c in fields:
            schema.append((c, arrow_type(fields[c])))
        pa_struct = pa.struct(schema)
        while True:
            rows = reader.read(num=chunk_rows)
            if not rows:
                break
            rb = pa.RecordBatch.from_struct_array(
                pa.array(rows, type=pa_struct))
            sink = pa.BufferOutputStream()
            rb_writer = pa.ipc.new_stream(sink, rb.schema)
            rb_writer.write_batch(rb)
            rb_writer.close()
            buf = sink.getvalue()
            chunk = writer.next(buf.size)
            buf_writer = pa.FixedSizeBufferWriter(chunk)
            buf_writer.write(buf)
            buf_writer.close()
Example #4
def iter_orc(path):
    with open(path, 'rb') as f:
        reader = pyorc.Reader(f)
        column_names = reader.schema.fields.keys()
        for item in reader:
            d = OrderedDict(zip(column_names, item))
            yield d
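A brief usage sketch for the generator above; the path is a placeholder:

# Each yielded item is an OrderedDict mapping column names to row values.
for record in iter_orc("example.orc"):
    print(record)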
Example #5
File: orc.py Project: ornew/beamx
    def read(self, ctx, file, range_tracker):
        pos = -1

        def split_points_unclaimed(stop_pos):
            if pos >= stop_pos:
                return 0
            return RangeTracker.SPLIT_POINTS_UNKNOWN

        range_tracker.set_split_points_unclaimed_callback(
            split_points_unclaimed)

        start_offset = range_tracker.start_position()
        if start_offset is None:
            start_offset = 0

        idx = 0
        reader = pyorc.Reader(file, **self.pyorc_options)
        if reader.num_of_stripes > 0:
            pos = 0
        else:
            pos = range_tracker.stop_position()

        while range_tracker.try_claim(pos):
            stripe = reader.read_stripe(idx)
            if idx + 1 < reader.num_of_stripes:
                idx = idx + 1
                pos = stripe.bytes_offset
            else:
                pos = range_tracker.stop_position()

            yield stripe.read()
Example #6
 def flush(self):
     self._buffer.seek(0)
     # TODO: optimize this
     reader = pyorc.Reader(self._buffer)
     columns = reader.schema.fields.keys()
     return (','.join(columns) + '\n' +
             '\n'.join(','.join([str(c) for c in row])
                       for row in reader)).encode('utf-8') + b'\n'
Example #7
    def GetData(self, request, context):
        with open('./data.orc', 'rb') as dataf:
            reader = pyorc.Reader(dataf)
            print(reader.schema)

            #data = [cbor.dumps(it) for it in reader]
            chunk = []
            for idx, row in enumerate(reader):
                #chunk.append(cbor.dumps(row))
                chunk.append(row)
                if idx % 10000 == 0:
                    yield server_pb2.DataResponse(data=[cbor.dumps(chunk)])
                    chunk = []
        yield server_pb2.DataResponse(data=[cbor.dumps(chunk)])
Example #8
 def export_type_schema(self, itemExporter):
     itemExporter.start_exporting()
     # create and write some test data
     num_records = 10
     for i in range(num_records):
         l = ItemLoader(TestItem())
         l.add_value('ftext', 'this is a test text')
         l.add_value('ftext_array', ['test1', 'test2', 'test3', 'test4'])
         l.add_value('ffloat', float(2.5))
         l.add_value('fint', int(10))
         l.add_value('fbool', False)
         datetime_str = '2020-02-29T11:12:13'
         datetime_fmt = '%Y-%m-%dT%H:%M:%S'
         datetime_obj = datetime.datetime.strptime(datetime_str,
                                                   datetime_fmt)
         datetime_obj = datetime_obj.replace(tzinfo=timezone.utc)
         l.add_value('fdatetime', datetime_obj)
         itemExporter.export_item(l.load_item())
     itemExporter.finish_exporting()
     self.file.close()
     # reread file and compare results
     with open(self.filename, 'rb') as f:
         reader = pyorc.Reader(f, struct_repr=pyorc.StructRepr.DICT)
         self.assertEqual(
             len(reader),
             10,
             msg="Number of records read corresponds to number of records written")
         for i in range(len(reader)):
             record = next(reader)
             self.assertEqual("this is a test text",
                              record.get('ftext', None),
                              msg="String data is read correctly")
             self.assertEqual(['test1', 'test2', 'test3', 'test4'],
                              record.get('ftext_array', None),
                              msg="String array data is read correctly")
             self.assertEqual(float(2.5),
                              record.get('ffloat', None),
                              msg="Float data is read correctly")
             self.assertEqual(int(10),
                              record.get('fint', None),
                              msg="Int data is read correctly")
             self.assertFalse(record.get('fbool', None),
                              msg="Bool data is read correctly")
             self.assertEqual(datetime_obj,
                              record.get('fdatetime', None),
                              msg="DateTime data is read correctly")
Example #9
def read_single_orc(path, fs, writer):
    chunk_rows = 1024 * 256

    with fs.open(path, "rb") as f:
        reader = pyorc.Reader(f)
        fields = reader.schema.fields
        schema = []
        for c in fields:
            schema.append((c, arrow_type(fields[c])))
        pa_struct = pa.struct(schema)
        while True:
            rows = reader.read(num=chunk_rows)
            if not rows:
                break
            batch = pa.RecordBatch.from_struct_array(
                pa.array(rows, type=pa_struct))
            writer.write(batch)
Example #10
def read_hdfs_orc(vineyard_socket, path, proc_num, proc_index):
    if proc_index:
        raise ValueError("Parallel reading of ORC isn't supported yet")
    client = vineyard.connect(vineyard_socket)
    builder = DataframeStreamBuilder(client)

    stream = builder.seal(client)
    client.persist(stream)
    ret = {'type': 'return'}
    ret['content'] = repr(stream.id)
    print(json.dumps(ret), flush=True)

    chunk_rows = 1024 * 256

    writer = stream.open_writer(client)

    host, port = urlparse(path).netloc.split(':')
    hdfs = HDFileSystem(host=host, port=int(port), pars={"dfs.client.read.shortcircuit": "false"})
    path = urlparse(path).path

    with hdfs.open(path, 'rb') as f:
        reader = pyorc.Reader(f)
        fields = reader.schema.fields
        schema = []
        for c in fields:
            schema.append((c, arrow_type(fields[c])))
        pa_struct = pa.struct(schema)
        while True:
            rows = reader.read(num=chunk_rows)
            if not rows:
                break
            rb = pa.RecordBatch.from_struct_array(pa.array(rows, type=pa_struct))
            sink = pa.BufferOutputStream()
            rb_writer = pa.ipc.new_stream(sink, rb.schema)
            rb_writer.write_batch(rb)
            rb_writer.close()
            buf = sink.getvalue()
            chunk = writer.next(buf.size)
            buf_writer = pa.FixedSizeBufferWriter(chunk)
            buf_writer.write(buf)
            buf_writer.close()

    writer.finish()
Example #11
def orc_to_pandas(file_name=None, file_io_obj=None, stripes=None):
    if file_name is not None:
        f = open(file_name, "rb")
    elif file_io_obj is not None:
        f = file_io_obj

    reader = pyorc.Reader(f)

    if stripes is None:
        df = pd.DataFrame.from_records(reader,
                                       columns=reader.schema.fields.keys())
    else:
        records = [
            record for i in stripes for record in list(reader.read_stripe(i))
        ]
        df = pd.DataFrame.from_records(records,
                                       columns=reader.schema.fields.keys())

    return df
Example #12
def read_local_orc(vineyard_socket, path, proc_num, proc_index):
    if proc_index:
        return 
    client = vineyard.connect(vineyard_socket)
    builder = DataframeStreamBuilder(client)

    stream = builder.seal(client)
    ret = {'type': 'return'}
    ret['content'] = repr(stream.id)
    print(json.dumps(ret))

    writer = stream.open_writer(client)

    with open(path, 'rb') as f:
        reader = pyorc.Reader(f)
        fields = reader.schema.fields
        schema = []
        for c in fields:
            schema.append((c, arrow_type(fields[c])))
        pa_struct = pa.struct(schema)
        while True:
            rows = reader.read(num=1024)
            if not rows:
                break
            rb = pa.RecordBatch.from_struct_array(pa.array(rows, type=pa_struct))
            sink = pa.BufferOutputStream()
            rb_writer = pa.ipc.new_stream(sink, rb.schema)
            rb_writer.write_batch(rb)
            rb_writer.close()
            buf = sink.getvalue()
            chunk = writer.next(buf.size)
            buf_writer = pa.FixedSizeBufferWriter(chunk)
            buf_writer.write(buf)
            buf_writer.close()

    writer.finish()
Example #13
 def _last_datetime(self, category, date):
     if self.conn.status(f"/krwordcloud/add-article/{date}")['length'] == 0:
         return config.min_date
     tfname = ''
     with tempfile.NamedTemporaryFile("wb") as tf:
         tfname = tf.name
         with self.conn.read(f"/krwordcloud/add-article/{date}",
                             chunk_size=8096) as hf:
             for chunk in hf:
                 tf.write(chunk)
         with open(tfname, 'rb') as tf:
             reader = pyorc.Reader(tf)
             maximum = datetime.datetime \
                 .strptime(f"{date} GMT+0900", "%Y-%m-%d.orc GMT%z")
             for row in reader:
                 if row[0] > maximum and row[1] == category:
                     maximum = row[0]
             if (maximum < config.min_date):
                 return config.min_date
             elif maximum > datetime.datetime.now().replace(tzinfo=KST):
                 return datetime.datetime.now().replace(tzinfo=KST)
             else:
                 return maximum
     os.unlink(tfname)
Example #14
def read_orc_inventory_file(filename, keys):
    with open(filename, "rb") as data:
        reader = pyorc.Reader(data)
        for row in reader:
            record = {keys[i].lower(): v for i, v in enumerate(row)}
            yield record
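A short usage sketch, assuming the caller supplies the column names in file order; the key list and path below are hypothetical:

keys = ["bucket", "key", "size"]  # hypothetical column names
for record in read_orc_inventory_file("inventory.orc", keys):
    print(record["bucket"], record["size"])  # keys are lower-cased by the function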
Example #15
def decode_orc(filename: str):
    with open(filename, 'rb') as data:
        reader = pyorc.Reader(data)
        return '\n'.join(','.join([str(c) for c in row]) for row in reader)
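A short usage sketch; the file names are placeholders:

csv_text = decode_orc("example.orc")
with open("example.csv", "w") as out:
    out.write(csv_text + "\n")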
Example #16
#!/usr/local/bin/python3

import pyorc

from concurrent import futures
import logging
import grpc

import server_pb2
import server_pb2_grpc

import cbor2

with open('./data.orc', 'rb') as data:
    reader = pyorc.Reader(data)
    print(reader.schema)
    for row in reader:
        pass  #print(row)
Example #17
import pyorc
import csv

# Read every row up front, then dump the header and rows to CSV;
# `with` blocks ensure both file handles are closed.
with open("./actions_20201208_0001.orc", "rb") as example:
    reader = pyorc.Reader(example)
    rows = reader.read()
    with open("orc.csv", "w", newline="") as out:
        csv_out = csv.writer(out)
        csv_out.writerow(reader.schema.fields.keys())
        csv_out.writerows(rows)
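Since pyorc.Reader is itself iterable over rows (as the earlier examples show), a streaming variant can skip the intermediate rows list; a sketch under that assumption, with hypothetical output file name:

import csv
import pyorc

# Rows are written as they are decoded instead of being materialized by reader.read().
with open("./actions_20201208_0001.orc", "rb") as src, \
        open("orc_streamed.csv", "w", newline="") as out:
    reader = pyorc.Reader(src)
    csv_out = csv.writer(out)
    csv_out.writerow(reader.schema.fields.keys())
    csv_out.writerows(reader)  # the reader yields one tuple per row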