import contextlib

import pyarrow


# Generator-based context manager: the @contextlib.contextmanager decorator
# (and the contextlib import) is required for the yield/try/finally pattern
# below to work as a `with` block.
@contextlib.contextmanager
def pyarrow_cpu_count(cpu_count=pyarrow.cpu_count()):
    old_cpu_count = pyarrow.cpu_count()
    pyarrow.set_cpu_count(cpu_count)
    try:
        yield
    finally:
        pyarrow.set_cpu_count(old_cpu_count)

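# A minimal usage sketch for the context manager above: temporarily cap
# Arrow's thread pool for one block of work; the previous value is restored
# on exit, even if an exception is raised.
with pyarrow_cpu_count(2):
    assert pyarrow.cpu_count() == 2
    # ... do some Arrow work with reduced parallelism ...
# The previous CPU count is back in effect here.
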
import pyarrow as pa


def test_cpu_count():
    n = pa.cpu_count()
    assert n > 0
    try:
        pa.set_cpu_count(n + 5)
        assert pa.cpu_count() == n + 5
    finally:
        # Restore the original setting so other tests are unaffected.
        pa.set_cpu_count(n)

def get_dataframe(parameters, use_threads):
    # Initialize seeds
    if parameters.seed is not None:
        np.random.seed(parameters.seed)

    # For each column, use a generic Mimesis producer to create an Iterable
    # for generating data
    for i, column_params in enumerate(parameters.column_parameters):
        if column_params.dtype is None:
            column_params.generator = column_params.generator(
                Generic("en", seed=parameters.seed))
        else:
            column_params.generator = column_params.generator()

    # Get schema for each column
    schema = pa.schema([
        pa.field(
            name=str(i),
            type=pa.dictionary(
                index_type=pa.int64(),
                value_type=pa.from_numpy_dtype(
                    type(next(iter(column_params.generator)))),
            ) if isinstance(column_params.dtype, str)
            and column_params.dtype == "category"
            else pa.from_numpy_dtype(
                type(next(iter(column_params.generator)))
                if column_params.dtype is None
                else column_params.dtype),
            nullable=column_params.null_frequency > 0,
        )
        for i, column_params in enumerate(parameters.column_parameters)
    ])

    # Initialize column data and which columns should be sorted
    column_data = [None] * len(parameters.column_parameters)
    columns_to_sort = [
        str(i)
        for i, column_params in enumerate(parameters.column_parameters)
        if column_params.is_sorted
    ]

    # Generate data
    if not use_threads:
        for i, column_params in enumerate(parameters.column_parameters):
            column_data[i] = _generate_column(column_params,
                                              parameters.num_rows)
    else:
        pool = Pool(pa.cpu_count())
        column_data = pool.starmap(
            _generate_column,
            [(column_params, parameters.num_rows)
             for i, column_params in enumerate(parameters.column_parameters)],
        )
        pool.close()
        pool.join()

    # Convert to a PyArrow Table, round-tripping through pandas only when
    # sorting is requested
    tbl = pa.Table.from_arrays(
        column_data,
        schema=schema,
    )
    if columns_to_sort:
        tbl = tbl.to_pandas()
        tbl = tbl.sort_values(columns_to_sort)
        tbl = pa.Table.from_pandas(tbl, schema)
    return tbl

def dataframe_to_arrays(df, schema, preserve_index, nthreads=1):
    names = []
    index_columns = []
    type = None

    if preserve_index:
        n = len(getattr(df.index, 'levels', [df.index]))
        index_columns.extend(df.index.get_level_values(i) for i in range(n))

    columns_to_convert = []
    convert_types = []
    for name in df.columns:
        col = df[name]
        if not isinstance(name, six.string_types):
            name = str(_column_name_to_strings(name))

        if schema is not None:
            field = schema.field_by_name(name)
            type = getattr(field, "type", None)

        columns_to_convert.append(col)
        convert_types.append(type)
        names.append(name)

    for i, column in enumerate(index_columns):
        columns_to_convert.append(column)
        convert_types.append(None)
        names.append(index_level_name(column, i))

    # NOTE(wesm): If nthreads=None, then we use a heuristic to decide whether
    # using a thread pool is worth it. Currently the heuristic is whether the
    # nrows > 100 * ncols.
    if nthreads is None:
        nrows, ncols = len(df), len(df.columns)
        if nrows > ncols * 100:
            nthreads = pa.cpu_count()
        else:
            nthreads = 1

    def convert_column(col, ty):
        return pa.array(col, from_pandas=True, type=ty)

    if nthreads == 1:
        arrays = [convert_column(c, t)
                  for c, t in zip(columns_to_convert, convert_types)]
    else:
        from concurrent import futures
        with futures.ThreadPoolExecutor(nthreads) as executor:
            arrays = list(executor.map(convert_column, columns_to_convert,
                                       convert_types))

    types = [x.type for x in arrays]

    metadata = construct_metadata(
        df, names, index_columns, preserve_index, types
    )
    return names, arrays, metadata

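# The nthreads heuristic above, isolated as a standalone sketch (hypothetical
# helper name): fall back to a single thread for small or very wide frames,
# where thread-pool overhead tends to outweigh the per-column conversion work.
import pyarrow as pa


def choose_nthreads(nrows, ncols):
    # Mirrors the NOTE(wesm) heuristic: parallelize only when
    # nrows > 100 * ncols.
    if nrows > ncols * 100:
        return pa.cpu_count()
    return 1


assert choose_nthreads(10000, 5) == pa.cpu_count()
assert choose_nthreads(100, 5) == 1
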
def dataframe_to_arrays(df, schema, preserve_index, nthreads=1, columns=None,
                        safe=True):
    names, column_names, index_columns, index_column_names, \
        columns_to_convert, convert_types = _get_columns_to_convert(
            df, schema, preserve_index, columns
        )

    # NOTE(wesm): If nthreads=None, then we use a heuristic to decide whether
    # using a thread pool is worth it. Currently the heuristic is whether the
    # nrows > 100 * ncols.
    if nthreads is None:
        nrows, ncols = len(df), len(df.columns)
        if nrows > ncols * 100:
            nthreads = pa.cpu_count()
        else:
            nthreads = 1

    def convert_column(col, ty):
        try:
            return pa.array(col, type=ty, from_pandas=True, safe=safe)
        except (pa.ArrowInvalid,
                pa.ArrowNotImplementedError,
                pa.ArrowTypeError) as e:
            e.args += (
                "Conversion failed for column {0!s} with type {1!s}".format(
                    col.name, col.dtype),
            )
            raise e

    if nthreads == 1:
        arrays = [
            convert_column(c, t)
            for c, t in zip(columns_to_convert, convert_types)
        ]
    else:
        from concurrent import futures
        with futures.ThreadPoolExecutor(nthreads) as executor:
            arrays = list(
                executor.map(convert_column, columns_to_convert,
                             convert_types))

    types = [x.type for x in arrays]

    metadata = construct_metadata(df, column_names, index_columns,
                                  index_column_names, preserve_index, types)

    return names, arrays, metadata

def dataframe_to_arrays(df, schema, preserve_index, nthreads=1, columns=None,
                        safe=True):
    (all_names,
     column_names,
     index_descriptors,
     index_columns,
     columns_to_convert,
     convert_types) = _get_columns_to_convert(df, schema, preserve_index,
                                              columns)

    # NOTE(wesm): If nthreads=None, then we use a heuristic to decide whether
    # using a thread pool is worth it. Currently the heuristic is whether the
    # nrows > 100 * ncols.
    if nthreads is None:
        nrows, ncols = len(df), len(df.columns)
        if nrows > ncols * 100:
            nthreads = pa.cpu_count()
        else:
            nthreads = 1

    def convert_column(col, ty):
        try:
            return pa.array(col, type=ty, from_pandas=True, safe=safe)
        except (pa.ArrowInvalid,
                pa.ArrowNotImplementedError,
                pa.ArrowTypeError) as e:
            e.args += ("Conversion failed for column {0!s} with type {1!s}"
                       .format(col.name, col.dtype),)
            raise e

    if nthreads == 1:
        arrays = [convert_column(c, t)
                  for c, t in zip(columns_to_convert, convert_types)]
    else:
        from concurrent import futures
        with futures.ThreadPoolExecutor(nthreads) as executor:
            arrays = list(executor.map(convert_column, columns_to_convert,
                                       convert_types))

    types = [x.type for x in arrays]

    metadata = construct_metadata(df, column_names, index_columns,
                                  index_descriptors, preserve_index, types)

    return all_names, arrays, metadata

def dataframe_to_arrays(df, schema, preserve_index, nthreads=1, columns=None,
                        safe=True):
    (all_names,
     column_names,
     index_column_names,
     index_descriptors,
     index_columns,
     columns_to_convert,
     convert_fields) = _get_columns_to_convert(df, schema, preserve_index,
                                               columns)

    # NOTE(wesm): If nthreads=None, then we use a heuristic to decide whether
    # using a thread pool is worth it. Currently the heuristic is whether
    # nrows > 100 * ncols and ncols > 1.
    if nthreads is None:
        nrows, ncols = len(df), len(df.columns)
        if nrows > ncols * 100 and ncols > 1:
            nthreads = pa.cpu_count()
        else:
            nthreads = 1

    def convert_column(col, field):
        if field is None:
            field_nullable = True
            type_ = None
        else:
            field_nullable = field.nullable
            type_ = field.type

        try:
            result = pa.array(col, type=type_, from_pandas=True, safe=safe)
        except (pa.ArrowInvalid,
                pa.ArrowNotImplementedError,
                pa.ArrowTypeError) as e:
            e.args += ("Conversion failed for column {!s} with type {!s}"
                       .format(col.name, col.dtype),)
            raise e
        if not field_nullable and result.null_count > 0:
            raise ValueError("Field {} was non-nullable but pandas column "
                             "had {} null values".format(str(field),
                                                         result.null_count))
        return result

    def _can_definitely_zero_copy(arr):
        return (isinstance(arr, np.ndarray) and
                arr.flags.contiguous and
                issubclass(arr.dtype.type, np.integer))

    if nthreads == 1:
        arrays = [convert_column(c, f)
                  for c, f in zip(columns_to_convert, convert_fields)]
    else:
        from concurrent import futures
        arrays = []
        with futures.ThreadPoolExecutor(nthreads) as executor:
            for c, f in zip(columns_to_convert, convert_fields):
                if _can_definitely_zero_copy(c.values):
                    # Cheap, copy-free conversions run inline; everything
                    # else goes to the thread pool.
                    arrays.append(convert_column(c, f))
                else:
                    arrays.append(executor.submit(convert_column, c, f))
        for i, maybe_fut in enumerate(arrays):
            if isinstance(maybe_fut, futures.Future):
                arrays[i] = maybe_fut.result()

    types = [x.type for x in arrays]

    if schema is None:
        fields = []
        for name, type_ in zip(all_names, types):
            name = name if name is not None else 'None'
            fields.append(pa.field(name, type_))
        schema = pa.schema(fields)

    pandas_metadata = construct_metadata(columns_to_convert, df, column_names,
                                         index_columns, index_descriptors,
                                         preserve_index, types)
    metadata = deepcopy(schema.metadata) if schema.metadata else dict()
    metadata.update(pandas_metadata)
    schema = schema.with_metadata(metadata)

    return arrays, schema

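# A small standalone check of the `_can_definitely_zero_copy` test above:
# only contiguous integer ndarrays pass, so only those conversions run
# inline instead of being submitted to the thread pool.
import numpy as np

contiguous_ints = np.arange(10)
strided_view = np.arange(20)[::2]     # integer dtype, but not contiguous
floats = np.arange(10, dtype='f8')    # contiguous, but not an integer dtype

for arr in (contiguous_ints, strided_view, floats):
    print(arr.flags.contiguous and issubclass(arr.dtype.type, np.integer))
# -> True, False, False
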
def dataframe_to_arrays(df, schema, preserve_index, nthreads=1, columns=None,
                        safe=True):
    (all_names,
     column_names,
     index_column_names,
     index_descriptors,
     index_columns,
     columns_to_convert,
     convert_fields) = _get_columns_to_convert(df, schema, preserve_index,
                                               columns)

    # NOTE(wesm): If nthreads=None, then we use a heuristic to decide whether
    # using a thread pool is worth it. Currently the heuristic is whether the
    # nrows > 100 * ncols.
    if nthreads is None:
        nrows, ncols = len(df), len(df.columns)
        if nrows > ncols * 100:
            nthreads = pa.cpu_count()
        else:
            nthreads = 1

    def convert_column(col, field):
        if field is None:
            field_nullable = True
            type_ = None
        else:
            field_nullable = field.nullable
            type_ = field.type

        try:
            result = pa.array(col, type=type_, from_pandas=True, safe=safe)
        except (pa.ArrowInvalid,
                pa.ArrowNotImplementedError,
                pa.ArrowTypeError) as e:
            e.args += (
                "Conversion failed for column {0!s} with type {1!s}".format(
                    col.name, col.dtype),
            )
            raise e
        if not field_nullable and result.null_count > 0:
            raise ValueError("Field {} was non-nullable but pandas column "
                             "had {} null values".format(str(field),
                                                         result.null_count))
        return result

    if nthreads == 1:
        arrays = [
            convert_column(c, f)
            for c, f in zip(columns_to_convert, convert_fields)
        ]
    else:
        from concurrent import futures
        with futures.ThreadPoolExecutor(nthreads) as executor:
            arrays = list(
                executor.map(convert_column, columns_to_convert,
                             convert_fields))

    types = [x.type for x in arrays]

    if schema is not None:
        # add index columns
        index_types = types[len(column_names):]
        for name, type_ in zip(index_column_names, index_types):
            name = name if name is not None else 'None'
            schema = schema.append(pa.field(name, type_))
    else:
        fields = []
        for name, type_ in zip(all_names, types):
            name = name if name is not None else 'None'
            fields.append(pa.field(name, type_))
        schema = pa.schema(fields)

    metadata = construct_metadata(df, column_names, index_columns,
                                  index_descriptors, preserve_index, types)
    schema = schema.with_metadata(metadata)

    return arrays, schema

import concurrent.futures
import json
import os

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import scipy.sparse
from anndata import AnnData

from cirrocumulus.abstract_dataset import AbstractDataset

max_workers = min(12, pa.cpu_count())
executor = concurrent.futures.ThreadPoolExecutor(max_workers=max_workers)


def read_table(path, filesystem, columns=None):
    # Per-file reads are single-threaded; parallelism comes from the
    # module-level thread pool used in read_tables.
    return pq.read_table(path, filesystem=filesystem, columns=columns,
                         use_threads=False)


def read_tables(paths, filesystem, columns=None):
    futures = []
    for path in paths:
        future = executor.submit(read_table, path, filesystem=filesystem,
                                 columns=columns)
        futures.append(future)
    # Wait for all reads to finish and return the loaded tables in order.
    return [future.result() for future in futures]

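# A minimal usage sketch for the helpers above (hypothetical shard names;
# filesystem=None means local files): read several parquet shards
# concurrently, then combine them into one table.
paths = ['shard-0.parquet', 'shard-1.parquet']
for i, path in enumerate(paths):
    pq.write_table(pa.table({'x': [i]}), path)

tables = read_tables(paths, filesystem=None)
combined = pa.concat_tables(tables)
print(combined.num_rows)  # 2
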
opath = DOWNLOAD_DIR + '/' + f['filename']
if not vitaldb.api.download(f['filename'], opath):
    print('failed')
else:
    print('done')
quit()

import os

import boto3
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from joblib import Parallel, delayed

# Set the number of CPU cores
ncpu = pa.cpu_count()
print('{} cpu cores'.format(ncpu))
pa.set_cpu_count(ncpu)
pa.set_io_thread_count(ncpu)

bucket_name = 'vitaldb-parquets'
prefix = 'vitaldb2017/1608/D1'
track_names = ['Solar8000/HR', 'SNUADC/ECG_II']

odir = 'vital_files'
if not os.path.exists(odir):
    os.mkdir(odir)

# Parallel processing
s3 = boto3.resource('s3')

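# A hedged sketch of the parallel download step this script is setting up
# with joblib (hypothetical helper and key listing; assumes each worker
# builds its own boto3 resource, since boto3 sessions should not be shared
# across processes):
def download_one(key):
    import boto3
    s3 = boto3.resource('s3')
    local_path = os.path.join(odir, os.path.basename(key))
    s3.Bucket(bucket_name).download_file(key, local_path)
    return local_path

keys = [obj.key
        for obj in s3.Bucket(bucket_name).objects.filter(Prefix=prefix)]
results = Parallel(n_jobs=ncpu)(delayed(download_one)(k) for k in keys)
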
def dataframe_to_arrays(df, schema, preserve_index, nthreads=1, columns=None):
    if columns is None:
        columns = df.columns

    column_names = []
    index_columns = []
    index_column_names = []
    type = None

    if preserve_index:
        n = len(getattr(df.index, 'levels', [df.index]))
        index_columns.extend(df.index.get_level_values(i) for i in range(n))

    columns_to_convert = []
    convert_types = []

    if not df.columns.is_unique:
        raise ValueError(
            'Duplicate column names found: {}'.format(list(df.columns))
        )

    for name in columns:
        col = df[name]
        name = _column_name_to_strings(name)

        if schema is not None:
            field = schema.field_by_name(name)
            type = getattr(field, "type", None)

        columns_to_convert.append(col)
        convert_types.append(type)
        column_names.append(name)

    for i, column in enumerate(index_columns):
        columns_to_convert.append(column)
        convert_types.append(None)
        name = _index_level_name(column, i, column_names)
        index_column_names.append(name)

    # NOTE(wesm): If nthreads=None, then we use a heuristic to decide whether
    # using a thread pool is worth it. Currently the heuristic is whether the
    # nrows > 100 * ncols.
    if nthreads is None:
        nrows, ncols = len(df), len(df.columns)
        if nrows > ncols * 100:
            nthreads = pa.cpu_count()
        else:
            nthreads = 1

    def convert_column(col, ty):
        try:
            return pa.array(col, from_pandas=True, type=ty)
        except (pa.ArrowInvalid,
                pa.ArrowNotImplementedError,
                pa.ArrowTypeError) as e:
            e.args += ("Conversion failed for column {0!s} with type {1!s}"
                       .format(col.name, col.dtype),)
            raise e

    if nthreads == 1:
        arrays = [convert_column(c, t)
                  for c, t in zip(columns_to_convert, convert_types)]
    else:
        from concurrent import futures
        with futures.ThreadPoolExecutor(nthreads) as executor:
            arrays = list(executor.map(convert_column, columns_to_convert,
                                       convert_types))

    types = [x.type for x in arrays]

    metadata = construct_metadata(
        df, column_names, index_columns, index_column_names,
        preserve_index, types
    )

    names = column_names + index_column_names

    return names, arrays, metadata

def dataframe_to_arrays(df, schema, preserve_index, nthreads=1, columns=None):
    if columns is None:
        columns = df.columns

    column_names = []
    index_columns = []
    index_column_names = []
    type = None

    if preserve_index:
        n = len(getattr(df.index, 'levels', [df.index]))
        index_columns.extend(df.index.get_level_values(i) for i in range(n))

    columns_to_convert = []
    convert_types = []

    if not df.columns.is_unique:
        raise ValueError('Duplicate column names found: {}'.format(
            list(df.columns)))

    for name in columns:
        col = df[name]
        name = _column_name_to_strings(name)

        if schema is not None:
            field = schema.field_by_name(name)
            type = getattr(field, "type", None)

        columns_to_convert.append(col)
        convert_types.append(type)
        column_names.append(name)

    for i, column in enumerate(index_columns):
        columns_to_convert.append(column)
        convert_types.append(None)
        name = _index_level_name(column, i, column_names)
        index_column_names.append(name)

    # NOTE(wesm): If nthreads=None, then we use a heuristic to decide whether
    # using a thread pool is worth it. Currently the heuristic is whether the
    # nrows > 100 * ncols.
    if nthreads is None:
        nrows, ncols = len(df), len(df.columns)
        if nrows > ncols * 100:
            nthreads = pa.cpu_count()
        else:
            nthreads = 1

    def convert_column(col, ty):
        try:
            return pa.array(col, from_pandas=True, type=ty)
        except (pa.ArrowInvalid, pa.ArrowTypeError) as e:
            e.args += ("Conversion failed for column %s" % col.name, )
            raise e

    if nthreads == 1:
        arrays = [
            convert_column(c, t)
            for c, t in zip(columns_to_convert, convert_types)
        ]
    else:
        from concurrent import futures
        with futures.ThreadPoolExecutor(nthreads) as executor:
            arrays = list(
                executor.map(convert_column, columns_to_convert,
                             convert_types))

    types = [x.type for x in arrays]

    metadata = construct_metadata(df, column_names, index_columns,
                                  index_column_names, preserve_index, types)

    names = column_names + index_column_names

    return names, arrays, metadata

def generate(
    path,
    parameters,
    format={
        "name": "parquet",
        "row_group_size": 64
    },
    use_threads=True,
):
    """
    Generate dataset using given parameters and write to given format

    Parameters
    ----------
    path : str or file-like object
        Path to write to
    parameters : Parameters
        Parameters specifying how to randomly generate data
    format : Dict
        Format to write
    use_threads : bool
        Whether to generate the columns in parallel
    """
    # Initialize seeds
    if parameters.seed is not None:
        np.random.seed(parameters.seed)
    column_seeds = np.arange(len(parameters.column_parameters))
    np.random.shuffle(column_seeds)

    # For each column, use a generic Mimesis producer to create an Iterable
    # for generating data
    for i, column_params in enumerate(parameters.column_parameters):
        column_params.generator = column_params.generator(
            Generic("en", seed=column_seeds[i]))

    # Get schema for each column
    schema = pa.schema([
        pa.field(
            name=str(i),
            type=pa.from_numpy_dtype(
                type(next(iter(column_params.generator)))),
            nullable=column_params.null_frequency > 0,
        )
        for i, column_params in enumerate(parameters.column_parameters)
    ])

    # Initialize column data and which columns should be sorted
    column_data = [None] * len(parameters.column_parameters)
    columns_to_sort = [
        str(i)
        for i, column_params in enumerate(parameters.column_parameters)
        if column_params.is_sorted
    ]

    # Generate data
    if not use_threads:
        for i, column_params in enumerate(parameters.column_parameters):
            column_data[i] = _generate_column(column_params,
                                              parameters.num_rows)
    else:
        pool = Pool(pa.cpu_count())
        column_data = pool.starmap(
            _generate_column,
            [(column_params, parameters.num_rows)
             for i, column_params in enumerate(parameters.column_parameters)],
        )
        pool.close()
        pool.join()

    # Convert to a PyArrow Table, round-tripping through pandas only when
    # sorting is requested
    tbl = pa.Table.from_arrays(
        column_data,
        schema=schema,
    )
    if columns_to_sort:
        tbl = tbl.to_pandas()
        tbl = tbl.sort_values(columns_to_sort)
        tbl = pa.Table.from_pandas(tbl, schema)

    # Write
    _write(tbl, path, format)