def pandas_to_orc(df, file_name=None, file_io_obj=None, stripe_size=67108864):
    schema = get_orc_schema(df)
    tuple_list = _preprocess_to_orc_tuple(df)
    if file_name is not None:
        with open(file_name, "wb") as data:
            with pyorc.Writer(data, schema, stripe_size=stripe_size) as writer:
                writer.writerows(tuple_list)
    elif file_io_obj is not None:
        with pyorc.Writer(file_io_obj, schema, stripe_size=stripe_size) as writer:
            writer.writerows(tuple_list)
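# A minimal usage sketch for pandas_to_orc, assuming the get_orc_schema and
# _preprocess_to_orc_tuple helpers defined alongside it; "example.orc" is an
# illustrative file name. Writing to an in-memory buffer works the same way
# via file_io_obj.
import io

import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
pandas_to_orc(df, file_name="example.orc")

buf = io.BytesIO()
pandas_to_orc(df, file_io_obj=buf)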
def write_orc(
    vineyard_socket,
    path,
    stream_id,
    storage_options,
    write_options,
    proc_num,
    proc_index,
):
    client = vineyard.connect(vineyard_socket)
    streams = client.get(stream_id)
    if len(streams) != proc_num or streams[proc_index] is None:
        raise ValueError(
            f"Fetch stream error with proc_num={proc_num},proc_index={proc_index}"
        )
    instream: DataframeStream = streams[proc_index]
    reader = instream.open_reader(client)
    writer = None
    path += f"_{proc_index}"
    with fsspec.open(path, "wb", **storage_options) as f:
        while True:
            try:
                batch = reader.next()
            except (StopIteration, vineyard.StreamDrainedException):
                break
            if writer is None:
                # Build the ORC schema from the first batch's Arrow schema.
                schema = {}
                for field in batch.schema:
                    schema[field.name] = orc_type(field.type)
                writer = pyorc.Writer(f, pyorc.Struct(**schema))
            writer.writerows(batch.to_pandas().itertuples(False, None))
        # Close once, and only if at least one batch arrived.
        if writer is not None:
            writer.close()
def write_hdfs_orc(vineyard_socket, stream_id, path, proc_num, proc_index):
    client = vineyard.connect(vineyard_socket)
    streams = client.get(stream_id)
    if len(streams) != proc_num or streams[proc_index] is None:
        raise ValueError(
            f'Fetch stream error with proc_num={proc_num},proc_index={proc_index}'
        )
    instream = streams[proc_index]
    reader = instream.open_reader(client)
    host, port = urlparse(path).netloc.split(':')
    hdfs = HDFileSystem(host=host, port=int(port))
    path = urlparse(path).path
    writer = None
    with hdfs.open(path, 'wb') as f:
        while True:
            try:
                buf = reader.next()
            except (StopIteration, vineyard.StreamDrainedException):
                break
            buf_reader = pa.ipc.open_stream(buf)
            if writer is None:
                # Build the ORC schema from the first chunk's Arrow schema.
                schema = {}
                for field in buf_reader.schema:
                    schema[field.name] = orc_type(field.type)
                writer = pyorc.Writer(f, pyorc.Struct(**schema))
            for batch in buf_reader:
                df = batch.to_pandas()
                writer.writerows(df.itertuples(False, None))
        if writer is not None:
            writer.close()
def write_local_orc(vineyard_socket, stream_id, path, proc_num, proc_index):
    client = vineyard.connect(vineyard_socket)
    streams = client.get(stream_id)
    if len(streams) != proc_num or streams[proc_index] is None:
        raise ValueError(
            f'Fetch stream error with proc_num={proc_num},proc_index={proc_index}'
        )
    instream = streams[proc_index]
    reader = instream.open_reader(client)
    writer = None
    with open(path, 'wb') as f:
        while True:
            try:
                buf = reader.next()
            except vineyard.StreamDrainedException:
                break
            buf_reader = pa.ipc.open_stream(buf)
            if writer is None:
                # Build the ORC schema from the first chunk's Arrow schema.
                schema = {}
                for field in buf_reader.schema:
                    schema[field.name] = orc_type(field.type)
                writer = pyorc.Writer(f, pyorc.Struct(**schema))
            while True:
                try:
                    batch = buf_reader.read_next_batch()
                except StopIteration:
                    break
                df = batch.to_pandas()
                writer.writerows(df.itertuples(False, None))
        if writer is not None:
            writer.close()
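# The stream writers above call an orc_type helper that is not shown here.
# A minimal sketch of such a mapping from pyarrow field types to pyorc types
# (an assumption about its shape, not the original implementation):
import pyarrow as pa
import pyorc

def orc_type(field_type):
    if pa.types.is_boolean(field_type):
        return pyorc.Boolean()
    if pa.types.is_int32(field_type):
        return pyorc.Int()
    if pa.types.is_int64(field_type):
        return pyorc.BigInt()
    if pa.types.is_float64(field_type):
        return pyorc.Double()
    if pa.types.is_string(field_type):
        return pyorc.String()
    if pa.types.is_timestamp(field_type):
        return pyorc.Timestamp()
    raise NotImplementedError(f'Unsupported arrow type: {field_type}')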
def encode_orc(filename: str,
               compression: str = None,
               columns: Iterable[str] = None,
               column_types: Iterable[str] = None,
               skip_header=True):
    buffer = io.BytesIO()
    with open(filename, 'rt') as fd:
        reader = csv.reader(fd)
        # If columns are not provided, try to read the header from the file.
        if skip_header or columns is None:
            columns = next(reader)
            column_types = ['string'] * len(columns)
        struct = 'struct<{columns}>'.format(columns=','.join(
            name + ':' + (col_type if col_type else 'string')
            for name, col_type in zip_longest(columns, column_types)))
        if compression in (None, 'zlib', 'zstd'):
            compression_type = getattr(pyorc.CompressionKind,
                                       str(compression).upper())
        else:
            compression_type = pyorc.CompressionKind.NONE
        with pyorc.Writer(buffer, struct,
                          compression=compression_type) as writer:
            for row in reader:
                writer.write(tuple(row))
    if compression in (None, 'zlib', 'zstd'):
        return buffer.getvalue()
    buffer.seek(0)
    return compress(buffer, compression)
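# A hedged usage sketch for encode_orc, assuming a CSV file with a header row
# ("people.csv" and "people.orc" are made-up names). With zlib the codec is
# handled natively by pyorc, so the returned bytes are a complete ORC file:
orc_bytes = encode_orc('people.csv', compression='zlib')
with open('people.orc', 'wb') as out:
    out.write(orc_bytes)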
def test_read(self):
    schema = 'struct<a:int,b:struct<x:string,y:boolean>>'
    files = []
    with tempfile.NamedTemporaryFile() as f1, \
            tempfile.NamedTemporaryFile() as f2:
        files.append(f1.name)
        with pyorc.Writer(f1, schema) as writer:
            writer.write((1, ('x', True)))
        files.append(f2.name)
        with pyorc.Writer(f2, schema) as writer:
            writer.write((2, ('y', False)))
            writer.write((3, ('z', False)))
        with TestPipeline() as p:
            pc = (p | Read(
                FileSource(
                    file_patterns=files,
                    reader=OrcReader(pyorc_options={
                        'struct_repr': pyorc.StructRepr.DICT,
                    }))))
            assert_that(
                pc,
                equal_to([
                    {'a': 1, 'b': {'x': 'x', 'y': True}},
                    {'a': 2, 'b': {'x': 'y', 'y': False}},
                    {'a': 3, 'b': {'x': 'z', 'y': False}},
                ]))
def write_with_compression(df, schema, compression):
    with open(OUTPUT_FILE_PATH, "wb") as f:
        with pyorc.Writer(
                f,
                schema,
                compression=compression,
                compression_strategy=pyorc.CompressionStrategy.COMPRESSION,
        ) as writer:
            start = timer()
            for i in range(len(df)):
                writer.write(tuple(df.iloc[i, :]))
            end = timer()
    print('Time to write orc with {} compression: {} seconds'.format(
        compression, end - start))
    print('Resulting size: {}'.format(
        util.get_readable_file_size(OUTPUT_FILE_PATH)))
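# A minimal sketch of benchmarking several codecs with write_with_compression,
# assuming a df whose columns match the (illustrative) schema string below.
# The compression argument is passed straight through to pyorc.Writer, so any
# pyorc.CompressionKind value works:
schema = 'struct<id:int,name:string>'
for kind in (pyorc.CompressionKind.NONE,
             pyorc.CompressionKind.ZLIB,
             pyorc.CompressionKind.ZSTD):
    write_with_compression(df, schema, kind)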
def write(self, srt_type: dict = None):
    headers = self.get_header()
    cols = []
    for key in headers:
        ctype = srt_type[key]
        cols.append(f'{key}:{ctype}')
    str_cols = ",".join(cols)
    struct_col = f"struct<{str_cols}>"
    with self.filepath().open("wb") as f:
        with pyorc.Writer(f, struct_col) as writer:
            for r in self.content:
                writer.write(tuple(r.values()))
def _hdfs_flush(self, date, data):
    with self.conn.write(f"/krwordcloud/add-article/{date}.orc",
                         overwrite=True) as hf:
        tfname = ''
        with tempfile.NamedTemporaryFile(mode="wb+", delete=False) as tf:
            tfname = tf.name
            with pyorc.Writer(
                tf,
                schema="struct<field0:timestamp,field1:string,"
                + "field2:string,field3:string>",
            ) as of:
                of.writerows(data)
        with open(tfname, 'rb') as tf:
            for line in tf:
                hf.write(line)
        os.unlink(tfname)
def test_empty_statistics():
    buff = BytesIO()
    orc_schema = po.Struct(
        a=po.BigInt(),
        b=po.Double(),
        c=po.String(),
        d=po.Decimal(11, 2),
        e=po.Date(),
        f=po.Timestamp(),
        g=po.Boolean(),
        h=po.Binary(),
        # One column with a non-null value, else cudf/pyorc readers crash.
        i=po.BigInt(),
    )
    data = tuple([None] * (len(orc_schema.fields) - 1) + [1])
    with po.Writer(buff, orc_schema) as writer:
        writer.write(data)

    got = cudf.io.orc.read_orc_statistics([buff])

    # Check both file-level and stripe-level statistics.
    for stats in got:
        # Similar expected stats for the first 6 columns in this case.
        for col_name in ascii_lowercase[:6]:
            assert stats[0][col_name].get("number_of_values") == 0
            assert stats[0][col_name].get("has_null") is True
            assert stats[0][col_name].get("minimum") is None
            assert stats[0][col_name].get("maximum") is None
        for col_name in ascii_lowercase[:3]:
            assert stats[0][col_name].get("sum") == 0
        # Sum for the decimal column is reported as a string.
        assert stats[0]["d"].get("sum") == "0"

        assert stats[0]["g"].get("number_of_values") == 0
        assert stats[0]["g"].get("has_null") is True
        assert stats[0]["g"].get("true_count") == 0
        assert stats[0]["g"].get("false_count") == 0

        assert stats[0]["h"].get("number_of_values") == 0
        assert stats[0]["h"].get("has_null") is True
        assert stats[0]["h"].get("sum") == 0

        assert stats[0]["i"].get("number_of_values") == 1
        assert stats[0]["i"].get("has_null") is False
        assert stats[0]["i"].get("minimum") == 1
        assert stats[0]["i"].get("maximum") == 1
        assert stats[0]["i"].get("sum") == 1
def test_statistics_sum_overflow():
    maxint64 = np.iinfo(np.int64).max
    minint64 = np.iinfo(np.int64).min

    buff = BytesIO()
    with po.Writer(
        buff, po.Struct(a=po.BigInt(), b=po.BigInt(), c=po.BigInt())
    ) as writer:
        writer.write((maxint64, minint64, minint64))
        writer.write((1, -1, 1))

    file_stats, stripe_stats = cudf.io.orc.read_orc_statistics([buff])
    # Sums that overflow int64 are omitted; "c" stays in range.
    assert file_stats[0]["a"].get("sum") is None
    assert file_stats[0]["b"].get("sum") is None
    assert file_stats[0]["c"].get("sum") == minint64 + 1

    assert stripe_stats[0]["a"].get("sum") is None
    assert stripe_stats[0]["b"].get("sum") is None
    assert stripe_stats[0]["c"].get("sum") == minint64 + 1
def start_exporting(self):
    """
    Triggered when Scrapy starts exporting. Useful to configure headers etc.
    """
    if not SUPPORTED_EXPORTERS['orc']:
        raise RuntimeError(
            "Error: Cannot export to orc. Cannot import pyorc. "
            "Have you installed it?"
        )
    self.orcwriter = pyorc.Writer(
        self.file,
        schema=self.orc_schemastring,
        batch_size=self.orc_batchsize,
        stripe_size=self.orc_stripesize,
        compression=self.orc_compression,
        compression_strategy=self.orc_compressionstrategy,
        compression_block_size=self.orc_blocksize,
        bloom_filter_columns=self.orc_bloomfiltercolumns,
        bloom_filter_fpp=self.orc_bloomfilterfpp,
        struct_repr=pyorc.StructRepr.DICT,
        converters=self.orc_converters,
    )
def test_orc_read_skiprows(tmpdir):
    buff = BytesIO()
    df = pd.DataFrame(
        {"a": [1, 0, 1, 0, None, 1, 1, 1, 0, None, 0, 0, 1, 1, 1, 1]},
        dtype=pd.BooleanDtype(),
    )
    writer = pyorc.Writer(buff, pyorc.Struct(a=pyorc.Boolean()))
    tuples = list(
        map(
            lambda x: (None, ) if x[0] is pd.NA else x,
            list(df.itertuples(index=False, name=None)),
        ))
    writer.writerows(tuples)
    writer.close()

    skiprows = 10
    expected = cudf.read_orc(buff)[skiprows:].reset_index(drop=True)
    got = cudf.read_orc(buff, skiprows=skiprows)
    assert_eq(expected, got)
ORC_FILE = 'Orc/output/nodes.orc'
ORC_SNAPPY_FILE = 'Orc/output/snappy_nodes.orc'
ORC_ZLIB_FILE = 'Orc/output/zlib_nodes.orc'
JSON_FILE = 'Orc/output/nodes.json'

# Define the data schema.
schema = "struct<id:int,longitude:float,latitude:float,username:string>"

nodes = []
tree = ET.parse(open(SOURCE_FILE))
for node in tree.iterfind('node'):
    nodes.append((int(node.get('id')), float(node.get('lon')),
                  float(node.get('lat')), node.get('user')))

with open(ORC_FILE, "wb") as data:
    with pyorc.Writer(data, schema,
                      compression=pyorc.CompressionKind.NONE) as writer:
        for node in nodes:
            writer.write(node)

## Looks like SNAPPY and LZO compression aren't supported by ORC yet?
#
# with open(ORC_SNAPPY_FILE, "wb") as data:
#     with pyorc.Writer(data, schema,
#                       compression=pyorc.CompressionKind.SNAPPY) as writer:
#         for node in nodes:
#             writer.write(node)
##

with open(ORC_ZLIB_FILE, "wb") as data:
    with pyorc.Writer(data, schema,
                      compression=pyorc.CompressionKind.ZLIB) as writer:
        for node in nodes:
            writer.write(node)
#!/usr/local/bin/python3
import pyorc
from uuid import uuid4

with open('./data.orc', 'wb') as data:
    with pyorc.Writer(
        data,
        'struct<col0:int,col1:string,col2:string,col3:string,col4:string>',
    ) as writer:
        for idx in range(10000000):
            uuid = str(uuid4())
            writer.write((idx, uuid + '1', uuid + '2', uuid + '3', uuid + '4'))
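# A minimal read-back sketch for the file written above, using pyorc.Reader.
# With the default struct_repr, rows come back as tuples:
with open('./data.orc', 'rb') as data:
    reader = pyorc.Reader(data)
    print(str(reader.schema))  # the schema embedded in the ORC file
    for row in reader:
        print(row)
        break  # just peek at the first row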
def gen_map_buff(size=10000):
    from string import ascii_letters as al

    rd = random.Random(1)
    np.random.seed(seed=1)

    buff = BytesIO()

    schema = {
        "lvl1_map": po.Map(key=po.String(), value=po.BigInt()),
        "lvl2_map": po.Map(key=po.String(), value=po.Array(po.BigInt())),
        "lvl2_struct_map": po.Map(
            key=po.String(),
            value=po.Struct(**{"a": po.BigInt(), "b": po.BigInt()}),
        ),
    }
    schema = po.Struct(**schema)

    lvl1_map = [
        rd.choice([
            None,
            [(
                rd.choice(al),
                rd.choice([None, np.random.randint(1, 1500)]),
            ) for y in range(2)],
        ]) for x in range(size)
    ]
    lvl2_map = [
        rd.choice([
            None,
            [(
                rd.choice(al),
                rd.choice([
                    None,
                    [
                        rd.choice([None, np.random.randint(1, 1500)])
                        for z in range(5)
                    ],
                ]),
            ) for y in range(2)],
        ]) for x in range(size)
    ]
    lvl2_struct_map = [
        rd.choice([
            None,
            [(
                rd.choice(al),
                rd.choice([
                    None,
                    (
                        rd.choice([None, np.random.randint(1, 1500)]),
                        rd.choice([None, np.random.randint(1, 1500)]),
                    ),
                ]),
            ) for y in range(2)],
        ]) for x in range(size)
    ]

    pdf = pd.DataFrame({
        "lvl1_map": lvl1_map,
        "lvl2_map": lvl2_map,
        "lvl2_struct_map": lvl2_struct_map,
    })

    writer = po.Writer(buff,
                       schema,
                       stripe_size=1024,
                       compression=po.CompressionKind.NONE)
    tuples = list(
        map(
            lambda x: (None, ) if x[0] is pd.NA else x,
            list(pdf.itertuples(index=False, name=None)),
        ))
    writer.writerows(tuples)
    writer.close()

    return buff
def generate_list_struct_buff(size=28000):
    rd = random.Random(1)
    np.random.seed(seed=1)

    buff = BytesIO()

    schema = {
        "lvl3_list": po.Array(po.Array(po.Array(po.BigInt()))),
        "lvl1_list": po.Array(po.BigInt()),
        "lvl1_struct": po.Struct(**{"a": po.BigInt(), "b": po.BigInt()}),
        "lvl2_struct": po.Struct(
            **{
                "a": po.BigInt(),
                "lvl1_struct": po.Struct(
                    **{"c": po.BigInt(), "d": po.BigInt()}
                ),
            }
        ),
        "list_nests_struct": po.Array(
            po.Array(po.Struct(**{"a": po.BigInt(), "b": po.BigInt()}))
        ),
        "struct_nests_list": po.Struct(
            **{
                "struct": po.Struct(**{"a": po.BigInt(), "b": po.BigInt()}),
                "list": po.Array(po.BigInt()),
            }
        ),
    }
    schema = po.Struct(**schema)

    lvl3_list = [
        rd.choice(
            [
                None,
                [
                    [
                        [
                            rd.choice([None, np.random.randint(1, 3)])
                            for z in range(np.random.randint(1, 3))
                        ]
                        for z in range(np.random.randint(0, 3))
                    ]
                    for y in range(np.random.randint(0, 3))
                ],
            ]
        )
        for x in range(size)
    ]
    lvl1_list = [
        [
            rd.choice([None, np.random.randint(0, 3)])
            for y in range(np.random.randint(1, 4))
        ]
        for x in range(size)
    ]
    lvl1_struct = [
        rd.choice([None, (np.random.randint(0, 3), np.random.randint(0, 3))])
        for x in range(size)
    ]
    lvl2_struct = [
        rd.choice(
            [
                None,
                (
                    rd.choice([None, np.random.randint(0, 3)]),
                    (
                        rd.choice([None, np.random.randint(0, 3)]),
                        np.random.randint(0, 3),
                    ),
                ),
            ]
        )
        for x in range(size)
    ]
    list_nests_struct = [
        [
            [rd.choice(lvl1_struct), rd.choice(lvl1_struct)]
            for y in range(np.random.randint(1, 4))
        ]
        for x in range(size)
    ]
    struct_nests_list = [(lvl1_struct[x], lvl1_list[x]) for x in range(size)]

    df = pd.DataFrame(
        {
            "lvl3_list": lvl3_list,
            "lvl1_list": lvl1_list,
            "lvl1_struct": lvl1_struct,
            "lvl2_struct": lvl2_struct,
            "list_nests_struct": list_nests_struct,
            "struct_nests_list": struct_nests_list,
        }
    )

    writer = po.Writer(buff, schema, stripe_size=1024)
    tuples = list(
        map(
            lambda x: (None,) if x[0] is pd.NA else x,
            list(df.itertuples(index=False, name=None)),
        )
    )
    writer.writerows(tuples)
    writer.close()

    return buff
response = detectlanguage.detect(df["text"].values.tolist())
first_languages = list(
    map(
        lambda x: x[0] if x else {
            'isReliable': False,
            'confidence': 0,
            'language': ''
        }, response))
new_df = pd.concat([df, pd.DataFrame(first_languages)], axis=1)

orc_file = ORC_FILE.format(datetime.now().strftime("%y%m%d"))
with open(orc_file, "wb") as data:
    with pyorc.Writer(
            data,
            "struct<text:string,isSpam:boolean,language:string,"
            "isReliable:boolean,confidence:float>",
            compression=pyorc.CompressionKind.ZLIB) as writer:
        for index, row in new_df.iterrows():
            writer.write((row['text'], row['isSpam'], row['language'],
                          row['isReliable'], row['confidence']))

new_df.to_csv(index=True)
print(f"Saved {len(new_df)} messages in {orc_file}.")

## For the future, to read the dataset
# with open(ORC_FILE, 'rb') as orc_file:
#     reader = pyorc.Reader(orc_file)
#     # Read embedded schema
#     print(str(reader.schema))