Example #1
import contextlib

import pyarrow

@contextlib.contextmanager
def pyarrow_cpu_count(cpu_count=pyarrow.cpu_count()):
    # Temporarily override pyarrow's CPU (thread pool) count, restoring it on exit.
    old_cpu_count = pyarrow.cpu_count()
    pyarrow.set_cpu_count(cpu_count)
    try:
        yield
    finally:
        pyarrow.set_cpu_count(old_cpu_count)
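
A minimal usage sketch for the context manager above (illustrative, not from the source project): the override applies only inside the with block, and the previous value is restored afterwards.

with pyarrow_cpu_count(2):
    # Work executed here sees a thread pool of size 2.
    assert pyarrow.cpu_count() == 2
# Outside the block the original cpu_count is back in effect.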
Example #2
def test_cpu_count():
    n = pa.cpu_count()
    assert n > 0
    try:
        pa.set_cpu_count(n + 5)
        assert pa.cpu_count() == n + 5
    finally:
        pa.set_cpu_count(n)
Example #3
def get_dataframe(parameters, use_threads):
    # Initialize seeds
    if parameters.seed is not None:
        np.random.seed(parameters.seed)

    # For each column, use a generic Mimesis producer to create an Iterable
    # for generating data
    for i, column_params in enumerate(parameters.column_parameters):
        if column_params.dtype is None:
            column_params.generator = column_params.generator(
                Generic("en", seed=parameters.seed))
        else:
            column_params.generator = column_params.generator()

    # Get schema for each column
    schema = pa.schema([
        pa.field(
            name=str(i),
            type=pa.dictionary(
                index_type=pa.int64(),
                value_type=pa.from_numpy_dtype(
                    type(next(iter(column_params.generator)))),
            ) if isinstance(column_params.dtype, str)
            and column_params.dtype == "category" else pa.from_numpy_dtype(
                type(next(iter(column_params.generator)))
                if column_params.dtype is None else column_params.dtype),
            nullable=column_params.null_frequency > 0,
        ) for i, column_params in enumerate(parameters.column_parameters)
    ])

    # Initialize column data and which columns should be sorted
    column_data = [None] * len(parameters.column_parameters)
    columns_to_sort = [
        str(i) for i, column_params in enumerate(parameters.column_parameters)
        if column_params.is_sorted
    ]
    # Generate data
    if not use_threads:
        for i, column_params in enumerate(parameters.column_parameters):
            column_data[i] = _generate_column(column_params,
                                              parameters.num_rows)
    else:
        pool = Pool(pa.cpu_count())
        column_data = pool.starmap(
            _generate_column,
            [(column_params, parameters.num_rows)
             for i, column_params in enumerate(parameters.column_parameters)],
        )
        pool.close()
        pool.join()
    # Convert to Pandas DataFrame and sort columns appropriately
    tbl = pa.Table.from_arrays(
        column_data,
        schema=schema,
    )
    if columns_to_sort:
        tbl = tbl.to_pandas()
        tbl = tbl.sort_values(columns_to_sort)
        tbl = pa.Table.from_pandas(tbl, schema)
    return tbl
Example #4
def dataframe_to_arrays(df, schema, preserve_index, nthreads=1):
    names = []
    index_columns = []
    type = None

    if preserve_index:
        n = len(getattr(df.index, 'levels', [df.index]))
        index_columns.extend(df.index.get_level_values(i) for i in range(n))

    columns_to_convert = []
    convert_types = []
    for name in df.columns:
        col = df[name]
        if not isinstance(name, six.string_types):
            name = str(_column_name_to_strings(name))

        if schema is not None:
            field = schema.field_by_name(name)
            type = getattr(field, "type", None)

        columns_to_convert.append(col)
        convert_types.append(type)
        names.append(name)

    for i, column in enumerate(index_columns):
        columns_to_convert.append(column)
        convert_types.append(None)
        names.append(index_level_name(column, i))

    # NOTE(wesm): If nthreads=None, then we use a heuristic to decide whether
    # using a thread pool is worth it. Currently the heuristic is whether the
    # nrows > 100 * ncols.
    if nthreads is None:
        nrows, ncols = len(df), len(df.columns)
        if nrows > ncols * 100:
            nthreads = pa.cpu_count()
        else:
            nthreads = 1

    def convert_column(col, ty):
        return pa.array(col, from_pandas=True, type=ty)

    if nthreads == 1:
        arrays = [convert_column(c, t)
                  for c, t in zip(columns_to_convert,
                                  convert_types)]
    else:
        from concurrent import futures
        with futures.ThreadPoolExecutor(nthreads) as executor:
            arrays = list(executor.map(convert_column,
                                       columns_to_convert,
                                       convert_types))

    types = [x.type for x in arrays]

    metadata = construct_metadata(
        df, names, index_columns, preserve_index, types
    )
    return names, arrays, metadata
Example #5
def dataframe_to_arrays(df, schema, preserve_index, nthreads=1):
    names = []
    index_columns = []
    type = None

    if preserve_index:
        n = len(getattr(df.index, 'levels', [df.index]))
        index_columns.extend(df.index.get_level_values(i) for i in range(n))

    columns_to_convert = []
    convert_types = []
    for name in df.columns:
        col = df[name]
        if not isinstance(name, six.string_types):
            name = str(_column_name_to_strings(name))

        if schema is not None:
            field = schema.field_by_name(name)
            type = getattr(field, "type", None)

        columns_to_convert.append(col)
        convert_types.append(type)
        names.append(name)

    for i, column in enumerate(index_columns):
        columns_to_convert.append(column)
        convert_types.append(None)
        names.append(index_level_name(column, i))

    # NOTE(wesm): If nthreads=None, then we use a heuristic to decide whether
    # using a thread pool is worth it. Currently the heuristic is whether the
    # nrows > 100 * ncols.
    if nthreads is None:
        nrows, ncols = len(df), len(df.columns)
        if nrows > ncols * 100:
            nthreads = pa.cpu_count()
        else:
            nthreads = 1

    def convert_column(col, ty):
        return pa.array(col, from_pandas=True, type=ty)

    if nthreads == 1:
        arrays = [
            convert_column(c, t)
            for c, t in zip(columns_to_convert, convert_types)
        ]
    else:
        from concurrent import futures
        with futures.ThreadPoolExecutor(nthreads) as executor:
            arrays = list(
                executor.map(convert_column, columns_to_convert,
                             convert_types))

    types = [x.type for x in arrays]

    metadata = construct_metadata(df, names, index_columns, preserve_index,
                                  types)
    return names, arrays, metadata
Example #6
def dataframe_to_arrays(df,
                        schema,
                        preserve_index,
                        nthreads=1,
                        columns=None,
                        safe=True):
    names, column_names, index_columns, index_column_names, \
        columns_to_convert, convert_types = _get_columns_to_convert(
            df, schema, preserve_index, columns
        )

    # NOTE(wesm): If nthreads=None, then we use a heuristic to decide whether
    # using a thread pool is worth it. Currently the heuristic is whether the
    # nrows > 100 * ncols.
    if nthreads is None:
        nrows, ncols = len(df), len(df.columns)
        if nrows > ncols * 100:
            nthreads = pa.cpu_count()
        else:
            nthreads = 1

    def convert_column(col, ty):
        try:
            return pa.array(col, type=ty, from_pandas=True, safe=safe)
        except (pa.ArrowInvalid, pa.ArrowNotImplementedError,
                pa.ArrowTypeError) as e:
            e.args += (
                "Conversion failed for column {0!s} with type {1!s}".format(
                    col.name, col.dtype), )
            raise e

    if nthreads == 1:
        arrays = [
            convert_column(c, t)
            for c, t in zip(columns_to_convert, convert_types)
        ]
    else:
        from concurrent import futures
        with futures.ThreadPoolExecutor(nthreads) as executor:
            arrays = list(
                executor.map(convert_column, columns_to_convert,
                             convert_types))

    types = [x.type for x in arrays]

    metadata = construct_metadata(df, column_names, index_columns,
                                  index_column_names, preserve_index, types)

    return names, arrays, metadata
Example #7
def dataframe_to_arrays(df, schema, preserve_index, nthreads=1, columns=None,
                        safe=True):
    (all_names,
     column_names,
     index_descriptors,
     index_columns,
     columns_to_convert,
     convert_types) = _get_columns_to_convert(df, schema, preserve_index,
                                              columns)

    # NOTE(wesm): If nthreads=None, then we use a heuristic to decide whether
    # using a thread pool is worth it. Currently the heuristic is whether the
    # nrows > 100 * ncols.
    if nthreads is None:
        nrows, ncols = len(df), len(df.columns)
        if nrows > ncols * 100:
            nthreads = pa.cpu_count()
        else:
            nthreads = 1

    def convert_column(col, ty):
        try:
            return pa.array(col, type=ty, from_pandas=True, safe=safe)
        except (pa.ArrowInvalid,
                pa.ArrowNotImplementedError,
                pa.ArrowTypeError) as e:
            e.args += ("Conversion failed for column {0!s} with type {1!s}"
                       .format(col.name, col.dtype),)
            raise e

    if nthreads == 1:
        arrays = [convert_column(c, t)
                  for c, t in zip(columns_to_convert,
                                  convert_types)]
    else:
        from concurrent import futures
        with futures.ThreadPoolExecutor(nthreads) as executor:
            arrays = list(executor.map(convert_column,
                                       columns_to_convert,
                                       convert_types))

    types = [x.type for x in arrays]

    metadata = construct_metadata(df, column_names, index_columns,
                                  index_descriptors, preserve_index,
                                  types)
    return all_names, arrays, metadata
Example #8
def dataframe_to_arrays(df,
                        schema,
                        preserve_index,
                        nthreads=1,
                        columns=None,
                        safe=True):
    (all_names, column_names, index_column_names, index_descriptors,
     index_columns, columns_to_convert,
     convert_fields) = _get_columns_to_convert(df, schema, preserve_index,
                                               columns)

    # NOTE(wesm): If nthreads=None, then we use a heuristic to decide whether
    # using a thread pool is worth it. Currently the heuristic is whether the
    # nrows > 100 * ncols and ncols > 1.
    if nthreads is None:
        nrows, ncols = len(df), len(df.columns)
        if nrows > ncols * 100 and ncols > 1:
            nthreads = pa.cpu_count()
        else:
            nthreads = 1

    def convert_column(col, field):
        if field is None:
            field_nullable = True
            type_ = None
        else:
            field_nullable = field.nullable
            type_ = field.type

        try:
            result = pa.array(col, type=type_, from_pandas=True, safe=safe)
        except (pa.ArrowInvalid, pa.ArrowNotImplementedError,
                pa.ArrowTypeError) as e:
            e.args += (
                "Conversion failed for column {!s} with type {!s}".format(
                    col.name, col.dtype), )
            raise e
        if not field_nullable and result.null_count > 0:
            raise ValueError("Field {} was non-nullable but pandas column "
                             "had {} null values".format(
                                 str(field), result.null_count))
        return result

    def _can_definitely_zero_copy(arr):
        return (isinstance(arr, np.ndarray) and arr.flags.contiguous
                and issubclass(arr.dtype.type, np.integer))

    if nthreads == 1:
        arrays = [
            convert_column(c, f)
            for c, f in zip(columns_to_convert, convert_fields)
        ]
    else:
        from concurrent import futures

        arrays = []
        with futures.ThreadPoolExecutor(nthreads) as executor:
            for c, f in zip(columns_to_convert, convert_fields):
                if _can_definitely_zero_copy(c.values):
                    arrays.append(convert_column(c, f))
                else:
                    arrays.append(executor.submit(convert_column, c, f))

        for i, maybe_fut in enumerate(arrays):
            if isinstance(maybe_fut, futures.Future):
                arrays[i] = maybe_fut.result()

    types = [x.type for x in arrays]

    if schema is None:
        fields = []
        for name, type_ in zip(all_names, types):
            name = name if name is not None else 'None'
            fields.append(pa.field(name, type_))
        schema = pa.schema(fields)

    pandas_metadata = construct_metadata(columns_to_convert, df, column_names,
                                         index_columns, index_descriptors,
                                         preserve_index, types)
    metadata = deepcopy(schema.metadata) if schema.metadata else dict()
    metadata.update(pandas_metadata)
    schema = schema.with_metadata(metadata)

    return arrays, schema
Example #9
def dataframe_to_arrays(df,
                        schema,
                        preserve_index,
                        nthreads=1,
                        columns=None,
                        safe=True):
    (all_names, column_names, index_column_names, index_descriptors,
     index_columns, columns_to_convert,
     convert_fields) = _get_columns_to_convert(df, schema, preserve_index,
                                               columns)

    # NOTE(wesm): If nthreads=None, then we use a heuristic to decide whether
    # using a thread pool is worth it. Currently the heuristic is whether the
    # nrows > 100 * ncols.
    if nthreads is None:
        nrows, ncols = len(df), len(df.columns)
        if nrows > ncols * 100:
            nthreads = pa.cpu_count()
        else:
            nthreads = 1

    def convert_column(col, field):
        if field is None:
            field_nullable = True
            type_ = None
        else:
            field_nullable = field.nullable
            type_ = field.type

        try:
            result = pa.array(col, type=type_, from_pandas=True, safe=safe)
        except (pa.ArrowInvalid, pa.ArrowNotImplementedError,
                pa.ArrowTypeError) as e:
            e.args += (
                "Conversion failed for column {0!s} with type {1!s}".format(
                    col.name, col.dtype), )
            raise e
        if not field_nullable and result.null_count > 0:
            raise ValueError("Field {} was non-nullable but pandas column "
                             "had {} null values".format(
                                 str(field), result.null_count))
        return result

    if nthreads == 1:
        arrays = [
            convert_column(c, f)
            for c, f in zip(columns_to_convert, convert_fields)
        ]
    else:
        from concurrent import futures
        with futures.ThreadPoolExecutor(nthreads) as executor:
            arrays = list(
                executor.map(convert_column, columns_to_convert,
                             convert_fields))

    types = [x.type for x in arrays]

    if schema is not None:
        # add index columns
        index_types = types[len(column_names):]
        for name, type_ in zip(index_column_names, index_types):
            name = name if name is not None else 'None'
            schema = schema.append(pa.field(name, type_))
    else:
        fields = []
        for name, type_ in zip(all_names, types):
            name = name if name is not None else 'None'
            fields.append(pa.field(name, type_))
        schema = pa.schema(fields)

    metadata = construct_metadata(df, column_names, index_columns,
                                  index_descriptors, preserve_index, types)
    schema = schema.with_metadata(metadata)

    return arrays, schema
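
The nthreads heuristic in these dataframe_to_arrays variants is internal to pyarrow's pandas conversion; from user code the same machinery is typically reached through pa.Table.from_pandas, with the pool size controlled by pa.set_cpu_count. A minimal sketch (the DataFrame here is made up for illustration):

import numpy as np
import pandas as pd
import pyarrow as pa

df = pd.DataFrame({"a": np.arange(1_000_000), "b": np.random.rand(1_000_000)})

pa.set_cpu_count(4)                                 # size of pyarrow's CPU thread pool
tbl = pa.Table.from_pandas(df, nthreads=None)       # None: let the heuristic decide
tbl_single = pa.Table.from_pandas(df, nthreads=1)   # force single-threaded conversion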
Example #10
import concurrent.futures
import json
import os

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import scipy.sparse
from anndata import AnnData
from cirrocumulus.abstract_dataset import AbstractDataset

max_workers = min(12, pa.cpu_count())
executor = concurrent.futures.ThreadPoolExecutor(max_workers=max_workers)


def read_table(path, filesystem, columns=None):
    return pq.read_table(path,
                         filesystem=filesystem,
                         columns=columns,
                         use_threads=False)


def read_tables(paths, filesystem, columns=None):
    futures = []
    for path in paths:
        future = executor.submit(read_table,
                                 path,
                                 filesystem=filesystem,
                                 columns=columns)
        futures.append(future)
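
The snippet above is cut off after the submit loop. A hypothetical way to finish such a helper (not taken from the original project) is to block on the futures and concatenate the per-file tables:

def read_tables_combined(paths, filesystem, columns=None):
    # Hypothetical helper: one read per file on the shared executor,
    # then wait for the results and stitch them into a single table.
    futures = [
        executor.submit(read_table, path, filesystem=filesystem, columns=columns)
        for path in paths
    ]
    return pa.concat_tables([future.result() for future in futures])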
Example #11
        opath = DOWNLOAD_DIR + '/' + f['filename']
        if not vitaldb.api.download(f['filename'], opath):
            print('failed')
        else:
            print('done')

quit()

import boto3
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from joblib import Parallel, delayed

# Set the number of CPU cores
ncpu = pa.cpu_count()
print('{} cpu core'.format(ncpu))
pa.set_cpu_count(ncpu)
pa.set_io_thread_count(ncpu)

bucket_name = 'vitaldb-parquets'
prefix = 'vitaldb2017/1608/D1'
track_names = ['Solar8000/HR', 'SNUADC/ECG_II']

odir = 'vital_files'
if not os.path.exists(odir):
    os.mkdir(odir)

# Parallel processing
s3 = boto3.resource('s3')
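
The script is truncated here. A hypothetical sketch of the parallel step it sets up (the worker function, bucket listing, and joblib call are illustrative, not from the original):

def download_key(key, bucket, out_dir):
    # Hypothetical worker: each call creates its own client so it can run in a
    # separate joblib worker process.
    boto3.client('s3').download_file(bucket, key,
                                     os.path.join(out_dir, os.path.basename(key)))

keys = [obj.key for obj in s3.Bucket(bucket_name).objects.filter(Prefix=prefix)]
Parallel(n_jobs=ncpu)(delayed(download_key)(k, bucket_name, odir) for k in keys)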
Example #12
def dataframe_to_arrays(df, schema, preserve_index, nthreads=1, columns=None):
    if columns is None:
        columns = df.columns
    column_names = []
    index_columns = []
    index_column_names = []
    type = None

    if preserve_index:
        n = len(getattr(df.index, 'levels', [df.index]))
        index_columns.extend(df.index.get_level_values(i) for i in range(n))

    columns_to_convert = []
    convert_types = []

    if not df.columns.is_unique:
        raise ValueError(
            'Duplicate column names found: {}'.format(list(df.columns))
        )

    for name in columns:
        col = df[name]
        name = _column_name_to_strings(name)

        if schema is not None:
            field = schema.field_by_name(name)
            type = getattr(field, "type", None)

        columns_to_convert.append(col)
        convert_types.append(type)
        column_names.append(name)

    for i, column in enumerate(index_columns):
        columns_to_convert.append(column)
        convert_types.append(None)
        name = _index_level_name(column, i, column_names)
        index_column_names.append(name)

    # NOTE(wesm): If nthreads=None, then we use a heuristic to decide whether
    # using a thread pool is worth it. Currently the heuristic is whether the
    # nrows > 100 * ncols.
    if nthreads is None:
        nrows, ncols = len(df), len(df.columns)
        if nrows > ncols * 100:
            nthreads = pa.cpu_count()
        else:
            nthreads = 1

    def convert_column(col, ty):
        try:
            return pa.array(col, from_pandas=True, type=ty)
        except (pa.ArrowInvalid,
                pa.ArrowNotImplementedError,
                pa.ArrowTypeError) as e:
            e.args += ("Conversion failed for column {0!s} with type {1!s}"
                       .format(col.name, col.dtype),)
            raise e

    if nthreads == 1:
        arrays = [convert_column(c, t)
                  for c, t in zip(columns_to_convert,
                                  convert_types)]
    else:
        from concurrent import futures
        with futures.ThreadPoolExecutor(nthreads) as executor:
            arrays = list(executor.map(convert_column,
                                       columns_to_convert,
                                       convert_types))

    types = [x.type for x in arrays]

    metadata = construct_metadata(
        df, column_names, index_columns, index_column_names, preserve_index,
        types
    )
    names = column_names + index_column_names
    return names, arrays, metadata
Example #13
def dataframe_to_arrays(df, schema, preserve_index, nthreads=1, columns=None):
    if columns is None:
        columns = df.columns
    column_names = []
    index_columns = []
    index_column_names = []
    type = None

    if preserve_index:
        n = len(getattr(df.index, 'levels', [df.index]))
        index_columns.extend(df.index.get_level_values(i) for i in range(n))

    columns_to_convert = []
    convert_types = []

    if not df.columns.is_unique:
        raise ValueError('Duplicate column names found: {}'.format(
            list(df.columns)))

    for name in columns:
        col = df[name]
        name = _column_name_to_strings(name)

        if schema is not None:
            field = schema.field_by_name(name)
            type = getattr(field, "type", None)

        columns_to_convert.append(col)
        convert_types.append(type)
        column_names.append(name)

    for i, column in enumerate(index_columns):
        columns_to_convert.append(column)
        convert_types.append(None)
        name = _index_level_name(column, i, column_names)
        index_column_names.append(name)

    # NOTE(wesm): If nthreads=None, then we use a heuristic to decide whether
    # using a thread pool is worth it. Currently the heuristic is whether the
    # nrows > 100 * ncols.
    if nthreads is None:
        nrows, ncols = len(df), len(df.columns)
        if nrows > ncols * 100:
            nthreads = pa.cpu_count()
        else:
            nthreads = 1

    def convert_column(col, ty):
        try:
            return pa.array(col, from_pandas=True, type=ty)
        except (pa.ArrowInvalid, pa.ArrowTypeError) as e:
            e.args += ("Conversion failed for column %s" % col.name, )
            raise e

    if nthreads == 1:
        arrays = [
            convert_column(c, t)
            for c, t in zip(columns_to_convert, convert_types)
        ]
    else:
        from concurrent import futures
        with futures.ThreadPoolExecutor(nthreads) as executor:
            arrays = list(
                executor.map(convert_column, columns_to_convert,
                             convert_types))

    types = [x.type for x in arrays]

    metadata = construct_metadata(df, column_names, index_columns,
                                  index_column_names, preserve_index, types)
    names = column_names + index_column_names
    return names, arrays, metadata
Example #14
def generate(
    path,
    parameters,
    format={
        "name": "parquet",
        "row_group_size": 64
    },
    use_threads=True,
):
    """
    Generate dataset using given parameters and write to given format

    Parameters
    ----------
    path : str or file-like object
        Path to write to
    parameters : Parameters
        Parameters specifying how to randomly generate data
    format : Dict
        Format to write
    use_threads : bool
        Whether to generate columns in parallel with a pool of workers
    """

    # Initialize seeds
    if parameters.seed is not None:
        np.random.seed(parameters.seed)
    column_seeds = np.arange(len(parameters.column_parameters))
    np.random.shuffle(column_seeds)

    # For each column, use a generic Mimesis producer to create an Iterable
    # for generating data
    for i, column_params in enumerate(parameters.column_parameters):
        column_params.generator = column_params.generator(
            Generic("en", seed=column_seeds[i]))

    # Get schema for each column
    schema = pa.schema([
        pa.field(
            name=str(i),
            type=pa.from_numpy_dtype(type(next(iter(
                column_params.generator)))),
            nullable=column_params.null_frequency > 0,
        ) for i, column_params in enumerate(parameters.column_parameters)
    ])

    # Initialize column data and which columns should be sorted
    column_data = [None] * len(parameters.column_parameters)
    columns_to_sort = [
        str(i) for i, column_params in enumerate(parameters.column_parameters)
        if column_params.is_sorted
    ]

    # Generate data
    if not use_threads:
        for i, column_params in enumerate(parameters.column_parameters):
            column_data[i] = _generate_column(column_params,
                                              parameters.num_rows)
    else:
        pool = Pool(pa.cpu_count())
        column_data = pool.starmap(
            _generate_column,
            [(column_params, parameters.num_rows)
             for i, column_params in enumerate(parameters.column_parameters)],
        )
        pool.close()
        pool.join()

    # Convert to Pandas DataFrame and sort columns appropriately
    tbl = pa.Table.from_arrays(
        column_data,
        schema=schema,
    )
    if columns_to_sort:
        tbl = tbl.to_pandas()
        tbl = tbl.sort_values(columns_to_sort)
        tbl = pa.Table.from_pandas(tbl, schema)

    # Write
    _write(tbl, path, format)