Example #1
class MySerializable(Serializable):
    _id = IdentityField('id')
    _any_val = AnyField('any_val')
    _bool_val = BoolField('bool_val')
    _int8_val = Int8Field('int8_val')
    _int16_val = Int16Field('int16_val')
    _int32_val = Int32Field('int32_val')
    _int64_val = Int64Field('int64_val')
    _uint8_val = UInt8Field('uint8_val')
    _uint16_val = UInt16Field('uint16_val')
    _uint32_val = UInt32Field('uint32_val')
    _uint64_val = UInt64Field('uint64_val')
    _float16_val = Float16Field('float16_val')
    _float32_val = Float32Field('float32_val',
                                on_serialize=lambda x: x + 1,
                                on_deserialize=lambda x: x - 1)
    _float64_val = Float64Field('float64_val')
    _complex64_val = Complex64Field('complex64_val')
    _complex128_val = Complex128Field('complex128_val')
    _string_val = StringField('string_val')
    _bytes_val = BytesField('bytes_val')
    _key_val = KeyField('key_val')
    _ndarray_val = NDArrayField('ndarray_val')
    _datetime64_val = Datetime64Field('datetime64_val')
    _timedelta64_val = Timedelta64Field('timedelta64_val')
    _datatype_val = DataTypeField('datatype_val')
    _index_val = IndexField('index_val')
    _series_val = SeriesField('series_val')
    _dataframe_val = DataFrameField('dataframe_val')
    _interval_array_val = IntervalArrayField('interval_array_val')
    _slice_val = SliceField('slice_val')
    _function_val = FunctionField('function_val')
    _named_tuple_val = NamedTupleField('named_tuple_val')
    _tzinfo_val = TZInfoField('tzinfo_val')
    _list_val = ListField('list_val', FieldTypes.int64)
    _tuple_val = TupleField('tuple_val', FieldTypes.string)
    _dict_val = DictField('dict_val', FieldTypes.string, FieldTypes.bytes)
    _ref_val = ReferenceField('ref_val', 'self')
    _ref_val2 = ReferenceField('ref_val2', MySimpleSerializable)
    _oneof_val = OneOfField('ref_val',
                            oneof1_val=f'{__name__}.MySerializable',
                            oneof2_val=MySimpleSerializable)
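A minimal usage sketch, under two assumptions not stated in the example itself: the Serializable base accepts field values as `_<attribute>=value` keyword arguments (the same convention the operand classes in the later examples use), and the field descriptors hand the stored values back on plain attribute access.

# Hypothetical instantiation; the field names come from the class definition above.
obj = MySerializable(_int64_val=42,
                     _string_val='hello',
                     _list_val=[1, 2, 3],
                     _dict_val={'key': b'value'})
assert obj._int64_val == 42
assert obj._list_val == [1, 2, 3]
# The on_serialize/on_deserialize hooks on _float32_val are applied by the
# serialization framework during (de)serialization, not on attribute access.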
Example #2
class DataFrameWriteTableCommit(DataFrameOperand, DataFrameOperandMixin):
    _op_type_ = 123462

    _dtypes = SeriesField('dtypes')

    _odps_params = DictField('odps_params')
    _table_name = StringField('table_name')
    _overwrite = BoolField('overwrite')
    _blocks = DictField('blocks')
    _cupid_handle = StringField('cupid_handle')
    _is_terminal = BoolField('is_terminal')

    def __init__(self,
                 dtypes=None,
                 odps_params=None,
                 table_name=None,
                 blocks=None,
                 cupid_handle=None,
                 overwrite=False,
                 is_terminal=None,
                 **kw):
        kw.update(_output_type_kw)
        super(DataFrameWriteTableCommit,
              self).__init__(_dtypes=dtypes,
                             _odps_params=odps_params,
                             _table_name=table_name,
                             _blocks=blocks,
                             _overwrite=overwrite,
                             _cupid_handle=cupid_handle,
                             _is_terminal=is_terminal,
                             **kw)

    @property
    def dtypes(self):
        return self._dtypes

    @property
    def table_name(self):
        return self._table_name

    @property
    def blocks(self):
        return self._blocks

    @property
    def overwrite(self):
        return self._overwrite

    @property
    def cupid_handle(self):
        return self._cupid_handle

    @property
    def odps_params(self):
        return self._odps_params

    @property
    def is_terminal(self):
        return self._is_terminal

    @classmethod
    def execute(cls, ctx, op):
        import pandas as pd
        from ..cupid_service import CupidServiceClient

        if op.is_terminal:
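            # Only the terminal commit chunk (is_terminal=True, created last in
            # DataFrameWriteTable._tile_cupid, Example #3) commits via the Cupid
            # service; intermediate commit chunks just yield an empty frame.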
            odps_params = op.odps_params.copy()
            project = os.environ.get('ODPS_PROJECT_NAME', None)
            if project:
                odps_params['project'] = project

            client = CupidServiceClient()
            client.commit_table_upload_session(odps_params, op.table_name,
                                               op.cupid_handle, op.blocks,
                                               op.overwrite)

        ctx[op.outputs[0].key] = pd.DataFrame()
Example #3
class DataFrameWriteTable(DataFrameOperand, DataFrameOperandMixin):
    _op_type_ = 123460

    _dtypes = SeriesField('dtypes')

    _odps_params = DictField('odps_params')
    _table_name = StringField('table_name')
    _partition_spec = StringField('partition_spec')
    _overwrite = BoolField('overwrite')
    _write_batch_size = Int64Field('write_batch_size')
    _unknown_as_string = BoolField('unknown_as_string')

    def __init__(self,
                 dtypes=None,
                 odps_params=None,
                 table_name=None,
                 partition_spec=None,
                 unknown_as_string=None,
                 over_write=None,
                 write_batch_size=None,
                 **kw):
        kw.update(_output_type_kw)
        super(DataFrameWriteTable,
              self).__init__(_dtypes=dtypes,
                             _odps_params=odps_params,
                             _table_name=table_name,
                             _partition_spec=partition_spec,
                             _unknown_as_string=unknown_as_string,
                             _overwrite=over_write,
                             _write_batch_size=write_batch_size,
                             **kw)

    @property
    def retryable(self):
        return False

    @property
    def dtypes(self):
        return self._dtypes

    @property
    def unknown_as_string(self):
        return self._unknown_as_string

    @property
    def odps_params(self):
        return self._odps_params

    @property
    def table_name(self):
        return self._table_name

    @property
    def partition_spec(self):
        return self._partition_spec

    @property
    def overwrite(self):
        return self._overwrite

    @property
    def write_batch_size(self):
        return self._write_batch_size

    def __call__(self, x):
        shape = (0, ) * len(x.shape)
        index_value = parse_index(x.index_value.to_pandas()[:0], x.key,
                                  'index')
        columns_value = parse_index(x.columns_value.to_pandas()[:0],
                                    x.key,
                                    'columns',
                                    store_data=True)
        return self.new_dataframe([x],
                                  shape=shape,
                                  dtypes=x.dtypes[:0],
                                  index_value=index_value,
                                  columns_value=columns_value)

    @classmethod
    def _tile_cupid(cls, op):
        from mars.dataframe.utils import build_concatenated_rows_frame

        cupid_client = CupidServiceClient()
        upload_handle = cupid_client.create_table_upload_session(
            op.odps_params, op.table_name)

        input_df = build_concatenated_rows_frame(op.inputs[0])
        out_df = op.outputs[0]

        out_chunks = []
        out_chunk_shape = (0, ) * len(input_df.shape)
        blocks = {}
        for chunk in input_df.chunks:
            block_id = str(int(time.time())) + '_' + str(uuid.uuid4()).replace(
                '-', '')
            chunk_op = DataFrameWriteTableSplit(
                dtypes=op.dtypes,
                table_name=op.table_name,
                odps_params=op.odps_params,
                unknown_as_string=op.unknown_as_string,
                partition_spec=op.partition_spec,
                cupid_handle=to_str(upload_handle),
                block_id=block_id,
                write_batch_size=op.write_batch_size)
            out_chunk = chunk_op.new_chunk([chunk],
                                           shape=out_chunk_shape,
                                           index=chunk.index,
                                           index_value=out_df.index_value,
                                           dtypes=chunk.dtypes)
            out_chunks.append(out_chunk)
            blocks[block_id] = op.partition_spec

        # build commit tree
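        # Reduce the per-chunk write results into intermediate commit chunks of
        # at most `combine_size` inputs each; only the terminal commit chunk
        # created below performs the actual upload-session commit.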
        combine_size = 8
        chunks = out_chunks
        while len(chunks) >= combine_size:
            new_chunks = []
            for i in range(0, len(chunks), combine_size):
                chks = chunks[i:i + combine_size]
                if len(chks) == 1:
                    chk = chks[0]
                else:
                    chk_op = DataFrameWriteTableCommit(dtypes=op.dtypes,
                                                       is_terminal=False)
                    chk = chk_op.new_chunk(chks,
                                           shape=out_chunk_shape,
                                           index_value=out_df.index_value,
                                           dtypes=op.dtypes)
                new_chunks.append(chk)
            chunks = new_chunks

        assert len(chunks) < combine_size

        commit_table_op = DataFrameWriteTableCommit(
            dtypes=op.dtypes,
            table_name=op.table_name,
            blocks=blocks,
            cupid_handle=to_str(upload_handle),
            overwrite=op.overwrite,
            odps_params=op.odps_params,
            is_terminal=True)
        commit_table_chunk = commit_table_op.new_chunk(
            chunks,
            shape=out_chunk_shape,
            dtypes=op.dtypes,
            index_value=out_df.index_value,
            index=(0, ) * len(out_chunk_shape))

        new_op = op.copy()
        return new_op.new_dataframes(op.inputs,
                                     shape=out_df.shape,
                                     index_value=out_df.index_value,
                                     dtypes=out_df.dtypes,
                                     columns_value=out_df.columns_value,
                                     chunks=[commit_table_chunk],
                                     nsplits=((0, ), ) * len(out_chunk_shape))

    @classmethod
    def _tile_tunnel(cls, op):
        from mars.dataframe.utils import build_concatenated_rows_frame

        out_df = op.outputs[0]
        in_df = build_concatenated_rows_frame(op.inputs[0])

        out_chunks = []
        for chunk in in_df.chunks:
            chunk_op = DataFrameWriteTableSplit(
                dtypes=op.dtypes,
                table_name=op.table_name,
                odps_params=op.odps_params,
                partition_spec=op.partition_spec)
            index_value = parse_index(chunk.index_value.to_pandas()[:0], chunk)
            out_chunk = chunk_op.new_chunk([chunk],
                                           shape=(0, 0),
                                           index_value=index_value,
                                           columns_value=out_df.columns_value,
                                           dtypes=out_df.dtypes,
                                           index=chunk.index)
            out_chunks.append(out_chunk)

        new_op = op.copy()
        params = out_df.params.copy()
        params.update(
            dict(chunks=out_chunks,
                 nsplits=((0, ) * in_df.chunk_shape[0], (0, ))))
        return new_op.new_tileables([in_df], **params)

    @classmethod
    def tile(cls, op):
        if 'CUPID_SERVICE_SOCKET' in os.environ:
            return cls._tile_cupid(op)
        else:
            return cls._tile_tunnel(op)
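A hypothetical driver sketch (the library's public write helpers, which normally create this operand, are not part of these examples): build the operand from the target table information, then call it on the source Mars DataFrame; executing the returned zero-row DataFrame performs the write through the tiled split and commit chunks shown above.

def write_table_sketch(mars_df, table_name, odps_params, overwrite=False):
    # `over_write` is the constructor's parameter spelling; it is stored as `_overwrite`.
    op = DataFrameWriteTable(dtypes=mars_df.dtypes,
                             odps_params=odps_params,
                             table_name=table_name,
                             over_write=overwrite)
    return op(mars_df)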
Example #4
class DataFrameWriteTableSplit(DataFrameOperand, DataFrameOperandMixin):
    _op_type_ = 123461

    _dtypes = SeriesField('dtypes')

    _table_name = StringField('table_name')
    _partition_spec = StringField('partition_spec')
    _cupid_handle = StringField('cupid_handle')
    _block_id = StringField('block_id')
    _write_batch_size = Int64Field('write_batch_size')
    _unknown_as_string = BoolField('unknown_as_string')

    # for tunnel
    _odps_params = DictField('odps_params')

    def __init__(self,
                 dtypes=None,
                 table_name=None,
                 odps_params=None,
                 partition_spec=None,
                 cupid_handle=None,
                 unknown_as_string=None,
                 block_id=None,
                 write_batch_size=None,
                 **kw):
        kw.update(_output_type_kw)
        super(DataFrameWriteTableSplit,
              self).__init__(_dtypes=dtypes,
                             _table_name=table_name,
                             _odps_params=odps_params,
                             _partition_spec=partition_spec,
                             _unknown_as_string=unknown_as_string,
                             _cupid_handle=cupid_handle,
                             _block_id=block_id,
                             _write_batch_size=write_batch_size,
                             **kw)

    @property
    def retryable(self):
        return False

    @property
    def dtypes(self):
        return self._dtypes

    @property
    def table_name(self):
        return self._table_name

    @property
    def odps_params(self):
        return self._odps_params

    @property
    def unknown_as_string(self):
        return self._unknown_as_string

    @property
    def partition_spec(self):
        return self._partition_spec

    @property
    def cupid_handle(self):
        return self._cupid_handle

    @property
    def block_id(self):
        return self._block_id

    @property
    def write_batch_size(self):
        return self._write_batch_size

    @classmethod
    def _execute_in_cupid(cls, ctx, op):
        import os

        import pandas as pd
        from odps import ODPS
        from odps.accounts import BearerTokenAccount

        cupid_client = CupidServiceClient()
        to_store_data = ctx[op.inputs[0].key]

        bearer_token = cupid_client.get_bearer_token()
        account = BearerTokenAccount(bearer_token)
        project = os.environ.get('ODPS_PROJECT_NAME', None)
        odps_params = op.odps_params.copy()
        if project:
            odps_params['project'] = project
        endpoint = os.environ.get(
            'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
        o = ODPS(None,
                 None,
                 account=account,
                 project=odps_params['project'],
                 endpoint=endpoint)
        odps_schema = o.get_table(op.table_name).schema
        project_name, table_name = op.table_name.split('.')

        writer_config = dict(_table_name=table_name,
                             _project_name=project_name,
                             _table_schema=odps_schema,
                             _partition_spec=op.partition_spec,
                             _block_id=op.block_id,
                             _handle=op.cupid_handle)
        cupid_client.write_table_data(writer_config, to_store_data,
                                      op.write_batch_size)
        ctx[op.outputs[0].key] = pd.DataFrame()

    @classmethod
    def _execute_arrow_tunnel(cls, ctx, op):
        from odps import ODPS
        from odps.tunnel import TableTunnel
        import pyarrow as pa
        import pandas as pd

        project = os.environ.get('ODPS_PROJECT_NAME', None)
        odps_params = op.odps_params.copy()
        if project:
            odps_params['project'] = project
        endpoint = os.environ.get(
            'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
        o = ODPS(odps_params['access_id'],
                 odps_params['secret_access_key'],
                 project=odps_params['project'],
                 endpoint=endpoint)

        t = o.get_table(op.table_name)
        tunnel = TableTunnel(o, project=t.project)
        retry_times = options.retry_times

        retries = 0
        while True:
            try:
                if op.partition_spec is not None:
                    upload_session = tunnel.create_upload_session(
                        t.name, partition_spec=op.partition_spec)
                else:
                    upload_session = tunnel.create_upload_session(t.name)
                break
            except:
                if retries >= retry_times:
                    raise
                retries += 1
                time.sleep(1)

        logger.debug('Start writing table %s split index: %s', op.table_name,
                     op.inputs[0].index)

        retries = 0
        while True:
            try:
                writer = upload_session.open_arrow_writer(0)
                arrow_rb = pa.RecordBatch.from_pandas(ctx[op.inputs[0].key])
                writer.write(arrow_rb)
                writer.close()
                break
            except:
                if retries >= retry_times:
                    raise
                retries += 1
                time.sleep(1)

        upload_session.commit([0])
        logger.debug('Finish writing table %s split index: %s', op.table_name,
                     op.inputs[0].index)

        ctx[op.outputs[0].key] = pd.DataFrame()

    @classmethod
    def execute(cls, ctx, op):
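        # A Cupid handle is only assigned when the write was tiled under the
        # Cupid service (DataFrameWriteTable._tile_cupid); otherwise fall back
        # to the Arrow tunnel writer.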
        if op.cupid_handle is not None:
            cls._execute_in_cupid(ctx, op)
        else:
            cls._execute_arrow_tunnel(ctx, op)
Example #5
class DataFrameReadTable(*_BASE):
    _op_type_ = 123450

    _odps_params = DictField('odps_params')
    _table_name = StringField('table_name')
    _partition_spec = StringField('partition_spec')
    _dtypes = SeriesField('dtypes')
    _incremental_index = BoolField('incremental_index')
    _columns = AnyField('columns')
    _nrows = Int64Field('nrows')
    _use_arrow_dtype = BoolField('use_arrow_dtype')
    _string_as_binary = BoolField('string_as_binary')
    _append_partitions = BoolField('append_partitions')
    _last_modified_time = Int64Field('last_modified_time')
    _with_split_meta_on_tile = BoolField('with_split_meta_on_tile')
    _retry_times = Int64Field('retry_times')

    def __init__(self,
                 odps_params=None,
                 table_name=None,
                 partition_spec=None,
                 columns=None,
                 dtypes=None,
                 nrows=None,
                 sparse=None,
                 incremental_index=True,
                 use_arrow_dtype=None,
                 string_as_binary=None,
                 memory_scale=None,
                 append_partitions=None,
                 last_modified_time=None,
                 with_split_meta_on_tile=False,
                 retry_times=None,
                 **kw):
        super(DataFrameReadTable,
              self).__init__(_odps_params=odps_params,
                             _table_name=table_name,
                             _partition_spec=partition_spec,
                             _columns=columns,
                             _dtypes=dtypes,
                             _nrows=nrows,
                             _sparse=sparse,
                             _use_arrow_dtype=use_arrow_dtype,
                             _string_as_binary=string_as_binary,
                             _incremental_index=incremental_index,
                             _append_partitions=append_partitions,
                             _last_modified_time=last_modified_time,
                             _memory_scale=memory_scale,
                             _with_split_meta_on_tile=with_split_meta_on_tile,
                             _retry_times=retry_times,
                             _output_types=[OutputType.dataframe],
                             **kw)

    @property
    def retryable(self):
        return False

    @property
    def odps_params(self):
        return self._odps_params

    @property
    def table_name(self):
        return self._table_name

    @property
    def partition(self):
        return getattr(self, '_partition_spec', None)

    @property
    def columns(self):
        return self._columns

    @property
    def dtypes(self):
        return self._dtypes

    @property
    def nrows(self):
        return self._nrows

    @property
    def use_arrow_dtype(self):
        return self._use_arrow_dtype

    @property
    def string_as_binary(self):
        return self._string_as_binary

    @property
    def incremental_index(self):
        return self._incremental_index

    @property
    def append_partitions(self):
        return self._append_partitions

    @property
    def with_split_meta_on_tile(self):
        return self._with_split_meta_on_tile

    @property
    def retry_times(self):
        return self._retry_times

    def get_columns(self):
        return self._columns

    def set_pruned_columns(self,
                           columns,
                           *,
                           keep_order=None):  # pragma: no cover
        self._columns = columns

    def __call__(self, shape, chunk_bytes=None, chunk_size=None):
        import numpy as np
        import pandas as pd

        if np.isnan(shape[0]):
            index_value = parse_index(pd.RangeIndex(0))
        else:
            index_value = parse_index(pd.RangeIndex(shape[0]))
        columns_value = parse_index(self.dtypes.index, store_data=True)
        return self.new_dataframe(None,
                                  shape,
                                  dtypes=self.dtypes,
                                  index_value=index_value,
                                  columns_value=columns_value,
                                  chunk_bytes=chunk_bytes,
                                  chunk_size=chunk_size)

    @classmethod
    def _tile_cupid(cls, op):
        import numpy as np
        import pandas as pd
        from mars.core.context import get_context

        df = op.outputs[0]
        split_size = df.extra_params.chunk_bytes or CHUNK_BYTES_LIMIT

        out_dtypes = df.dtypes
        out_shape = df.shape
        out_columns_value = df.columns_value
        if op.columns is not None:
            out_dtypes = out_dtypes[op.columns]
            out_shape = (df.shape[0], len(op.columns))
            out_columns_value = parse_index(out_dtypes.index, store_data=True)

        mars_context = get_context()
        if mars_context is not None:
            worker_count = len(mars_context.get_worker_addresses())
        else:
            worker_count = None

        cupid_client = CupidServiceClient()
        try:
            parts = cupid_client.enum_table_partitions(op.odps_params,
                                                       op.table_name,
                                                       op.partition)
            if parts is None:
                parts = [None]

            out_chunks = []
            chunk_idx = 0

            for partition_spec in parts:
                splits, split_size = cupid_client.create_table_download_session(
                    op.odps_params, op.table_name, partition_spec, op.columns,
                    worker_count, split_size, MAX_CHUNK_NUM,
                    op.with_split_meta_on_tile)

                logger.debug('%s table splits have been created.',
                             str(len(splits)))

                meta_chunk_rows = [split.meta_row_count for split in splits]
                if np.isnan(out_shape[0]):
                    est_chunk_rows = meta_chunk_rows
                else:
                    sp_file_sizes = np.array([
                        sp.split_file_end - sp.split_file_start
                        for sp in splits
                    ])
                    total_size = sp_file_sizes.sum()
                    ratio_chunk_rows = (sp_file_sizes * out_shape[0] //
                                        total_size).tolist()
                    est_chunk_rows = [
                        mr if mr is not None else rr
                        for mr, rr in zip(meta_chunk_rows, ratio_chunk_rows)
                    ]

                logger.warning('Estimated chunk rows: %r', est_chunk_rows)

                if len(splits) == 0:
                    logger.debug('Table %s has no data', op.table_name)
                    chunk_op = DataFrameReadTableSplit()
                    index_value = parse_index(pd.RangeIndex(0))
                    columns_value = parse_index(out_dtypes.index,
                                                store_data=True)
                    out_chunk = chunk_op.new_chunk(None,
                                                   shape=(np.nan,
                                                          out_shape[1]),
                                                   dtypes=op.dtypes,
                                                   index_value=index_value,
                                                   columns_value=columns_value,
                                                   index=(chunk_idx, 0))
                    out_chunks.append(out_chunk)
                    chunk_idx += 1
                else:
                    for idx, split in enumerate(splits):
                        chunk_op = DataFrameReadTableSplit(
                            cupid_handle=to_str(split.handle),
                            split_index=split.split_index,
                            split_file_start=split.split_file_start,
                            split_file_end=split.split_file_end,
                            schema_file_start=split.schema_file_start,
                            schema_file_end=split.schema_file_end,
                            incremental_index=op.incremental_index,
                            dtypes=out_dtypes,
                            sparse=op.sparse,
                            split_size=split_size,
                            string_as_binary=op.string_as_binary,
                            use_arrow_dtype=op.use_arrow_dtype,
                            estimate_rows=est_chunk_rows[idx],
                            partition_spec=partition_spec,
                            append_partitions=op.append_partitions,
                            meta_raw_size=split.meta_raw_size,
                            nrows=meta_chunk_rows[idx] or op.nrows,
                            memory_scale=op.memory_scale,
                            extra_params=op.extra_params)
                        # the chunk shape is unknown
                        index_value = parse_index(pd.RangeIndex(0))
                        columns_value = parse_index(out_dtypes.index,
                                                    store_data=True)
                        out_chunk = chunk_op.new_chunk(
                            None,
                            shape=(np.nan, out_shape[1]),
                            dtypes=out_dtypes,
                            index_value=index_value,
                            columns_value=columns_value,
                            index=(chunk_idx, 0))
                        chunk_idx += 1
                        out_chunks.append(out_chunk)
        finally:
            cupid_client.close()

        if op.incremental_index and _NEED_STANDARDIZE:
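            # standardize_range_index is assumed to re-number the per-chunk
            # RangeIndexes so the final index increases consecutively across
            # the result chunks when an incremental index is requested.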
            out_chunks = standardize_range_index(out_chunks)
        new_op = op.copy()
        nsplits = ((np.nan, ) * len(out_chunks), (out_shape[1], ))
        return new_op.new_dataframes(None,
                                     shape=out_shape,
                                     dtypes=op.dtypes,
                                     index_value=df.index_value,
                                     columns_value=out_columns_value,
                                     chunks=out_chunks,
                                     nsplits=nsplits)

    @classmethod
    def _tile_tunnel(cls, op):
        import pandas as pd
        from odps import ODPS

        project = os.environ.get('ODPS_PROJECT_NAME', None)
        odps_params = op.odps_params.copy()
        if project:
            odps_params['project'] = project
        endpoint = os.environ.get(
            'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
        o = ODPS(odps_params['access_id'],
                 odps_params['secret_access_key'],
                 project=odps_params['project'],
                 endpoint=endpoint)

        table_obj = o.get_table(op.table_name)
        if not table_obj.schema.partitions:
            data_srcs = [table_obj]
        elif op.partition is not None and check_partition_exist(
                table_obj, op.partition):
            data_srcs = [table_obj.get_partition(op.partition)]
        else:
            data_srcs = list(table_obj.partitions)
            if op.partition is not None:
                data_srcs = filter_partitions(o, data_srcs, op.partition)

        out_chunks = []
        row_nsplits = []
        index_start = 0
        df = op.outputs[0]

        out_dtypes = df.dtypes
        out_shape = df.shape
        out_columns_value = df.columns_value
        if op.columns is not None:
            out_dtypes = out_dtypes[op.columns]
            out_shape = (df.shape[0], len(op.columns))
            out_columns_value = parse_index(out_dtypes.index, store_data=True)

        if len(data_srcs) == 0:
            # no partitions are selected
            chunk_op = DataFrameReadTableSplit()
            index_value = parse_index(pd.RangeIndex(0))
            columns_value = parse_index(out_dtypes.index, store_data=True)
            out_chunk = chunk_op.new_chunk(None,
                                           shape=(0, out_shape[1]),
                                           dtypes=op.dtypes,
                                           index_value=index_value,
                                           columns_value=columns_value,
                                           index=(index_start, 0))
            out_chunks.append(out_chunk)
        else:
            retry_times = op.retry_times or options.retry_times
            for data_src in data_srcs:
                data_store_size = data_src.size

                retries = 0
                while True:
                    try:
                        with data_src.open_reader() as reader:
                            record_count = reader.count
                        break
                    except:
                        if retries >= retry_times:
                            raise
                        retries += 1
                        time.sleep(1)
                if data_store_size == 0:
                    # empty table
                    chunk_op = DataFrameReadTableSplit()
                    index_value = parse_index(pd.RangeIndex(0))
                    columns_value = parse_index(out_dtypes.index,
                                                store_data=True)
                    out_chunk = chunk_op.new_chunk(None,
                                                   shape=(0, out_shape[1]),
                                                   dtypes=op.dtypes,
                                                   index_value=index_value,
                                                   columns_value=columns_value,
                                                   index=(index_start, 0))
                    out_chunks.append(out_chunk)
                    index_start += 1
                    continue
                chunk_size = df.extra_params.chunk_size

                partition_spec = str(data_src.partition_spec) \
                    if getattr(data_src, 'partition_spec', None) else None

                if chunk_size is None:
                    chunk_bytes = df.extra_params.chunk_bytes or CHUNK_BYTES_LIMIT
                    chunk_count = data_store_size // chunk_bytes + (
                        data_store_size % chunk_bytes != 0)
                    chunk_size = ceildiv(record_count, chunk_count)
                    split_size = chunk_bytes
                else:
                    chunk_count = ceildiv(record_count, chunk_size)
                    split_size = data_store_size // chunk_count

                for i in range(chunk_count):
                    start_index = chunk_size * i
                    end_index = min(chunk_size * (i + 1), record_count)
                    row_size = end_index - start_index
                    chunk_op = DataFrameReadTableSplit(
                        table_name=op.table_name,
                        partition_spec=partition_spec,
                        start_index=start_index,
                        end_index=end_index,
                        nrows=op.nrows,
                        odps_params=op.odps_params,
                        columns=op.columns,
                        incremental_index=op.incremental_index,
                        dtypes=out_dtypes,
                        sparse=op.sparse,
                        split_size=split_size,
                        use_arrow_dtype=op.use_arrow_dtype,
                        estimate_rows=row_size,
                        append_partitions=op.append_partitions,
                        memory_scale=op.memory_scale,
                        retry_times=op.retry_times,
                        extra_params=op.extra_params)
                    index_value = parse_index(
                        pd.RangeIndex(start_index, end_index))
                    columns_value = parse_index(out_dtypes.index,
                                                store_data=True)
                    out_chunk = chunk_op.new_chunk(None,
                                                   shape=(row_size,
                                                          out_shape[1]),
                                                   dtypes=out_dtypes,
                                                   index_value=index_value,
                                                   columns_value=columns_value,
                                                   index=(index_start + i, 0))
                    row_nsplits.append(row_size)
                    out_chunks.append(out_chunk)

                index_start += chunk_count

        if op.incremental_index and _NEED_STANDARDIZE:
            out_chunks = standardize_range_index(out_chunks)

        new_op = op.copy()
        nsplits = (tuple(row_nsplits), (out_shape[1], ))
        return new_op.new_dataframes(None,
                                     shape=out_shape,
                                     dtypes=op.dtypes,
                                     index_value=df.index_value,
                                     columns_value=out_columns_value,
                                     chunks=out_chunks,
                                     nsplits=nsplits)

    @classmethod
    def _tile(cls, op):
        if 'CUPID_SERVICE_SOCKET' in os.environ:
            return cls._tile_cupid(op)
        else:
            return cls._tile_tunnel(op)
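A hypothetical construction sketch (the operand is normally created by the library's own read helpers, which are not shown in these examples): supply the table's dtypes and connection parameters, then call the operand with the expected global shape; tiling later chooses the Cupid or tunnel path.

import numpy as np
import pandas as pd

dtypes = pd.Series([np.dtype('int64'), np.dtype('object')], index=['id', 'name'])
op = DataFrameReadTable(odps_params={'access_id': '...',
                                     'secret_access_key': '...',
                                     'project': 'my_project',
                                     'endpoint': '...'},
                        table_name='my_table',
                        dtypes=dtypes)
df = op(shape=(np.nan, len(dtypes)), chunk_bytes=64 * 1024 ** 2)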
Example #6
class DataFrameReadTableSplit(*_BASE):
    _op_type_ = 123451

    # for cupid
    _cupid_handle = StringField('cupid_handle')
    _split_index = Int64Field('split_index')
    _split_file_start = Int64Field('split_file_start')
    _split_file_end = Int64Field('split_file_end')
    _schema_file_start = Int64Field('schema_file_start')
    _schema_file_end = Int64Field('schema_file_end')
    _use_arrow_dtype = BoolField('use_arrow_dtype')
    _string_as_binary = BoolField('string_as_binary')
    _dtypes = SeriesField('dtypes')
    _nrows = Int64Field('nrows')
    _incremental_index = BoolField('incremental_index')

    # for tunnel
    _table_name = StringField('table_name')
    _partition_spec = StringField('partition_spec')
    _start_index = Int64Field('start_index')
    _end_index = Int64Field('end_index')
    _odps_params = DictField('odps_params')
    _columns = AnyField('columns')

    _split_size = Int64Field('split_size')
    _append_partitions = BoolField('append_partitions')
    _estimate_rows = Int64Field('estimate_rows')
    _meta_raw_size = Int64Field('meta_raw_size')
    _retry_times = Int64Field('retry_times')

    def __init__(self,
                 cupid_handle=None,
                 split_index=None,
                 split_file_start=None,
                 split_file_end=None,
                 schema_file_start=None,
                 schema_file_end=None,
                 table_name=None,
                 partition_spec=None,
                 start_index=None,
                 end_index=None,
                 odps_params=None,
                 columns=None,
                 nrows=None,
                 incremental_index=None,
                 dtypes=None,
                 string_as_binary=None,
                 split_size=None,
                 use_arrow_dtype=None,
                 memory_scale=None,
                 estimate_rows=None,
                 meta_raw_size=None,
                 retry_times=None,
                 append_partitions=None,
                 sparse=None,
                 **kw):
        super(DataFrameReadTableSplit,
              self).__init__(_cupid_handle=cupid_handle,
                             _split_index=split_index,
                             _split_file_start=split_file_start,
                             _split_file_end=split_file_end,
                             _schema_file_start=schema_file_start,
                             _schema_file_end=schema_file_end,
                             _table_name=table_name,
                             _partition_spec=partition_spec,
                             _columns=columns,
                             _start_index=start_index,
                             _end_index=end_index,
                             _odps_params=odps_params,
                             _use_arrow_dtype=use_arrow_dtype,
                             _string_as_binary=string_as_binary,
                             _nrows=nrows,
                             _incremental_index=incremental_index,
                             _estimate_rows=estimate_rows,
                             _split_size=split_size,
                             _dtypes=dtypes,
                             _append_partitions=append_partitions,
                             _sparse=sparse,
                             _meta_raw_size=meta_raw_size,
                             _memory_scale=memory_scale,
                             _retry_times=retry_times,
                             _output_types=[OutputType.dataframe],
                             **kw)

    @property
    def retryable(self):
        return False

    @property
    def output_limit(self):
        return 1

    @property
    def cupid_handle(self):
        return self._cupid_handle

    @property
    def split_index(self):
        return self._split_index

    @property
    def split_file_start(self):
        return self._split_file_start

    @property
    def split_file_end(self):
        return self._split_file_end

    @property
    def schema_file_start(self):
        return self._schema_file_start

    @property
    def schema_file_end(self):
        return self._schema_file_end

    @property
    def table_name(self):
        return self._table_name

    @property
    def partition_spec(self):
        return self._partition_spec

    @property
    def start_index(self):
        return self._start_index

    @property
    def end_index(self):
        return self._end_index

    @property
    def odps_params(self):
        return self._odps_params

    @property
    def columns(self):
        return self._columns

    @property
    def nrows(self):
        return self._nrows

    @property
    def incremental_index(self):
        return self._incremental_index

    @property
    def dtypes(self):
        return self._dtypes

    @property
    def split_size(self):
        return self._split_size

    @property
    def estimate_rows(self):
        return self._estimate_rows

    @property
    def use_arrow_dtype(self):
        return self._use_arrow_dtype

    @property
    def string_as_binary(self):
        return self._string_as_binary

    @property
    def append_partitions(self):
        return self._append_partitions

    @property
    def meta_raw_size(self):
        return self._meta_raw_size

    @property
    def retry_times(self):
        return self._retry_times

    def set_pruned_columns(self,
                           columns,
                           *,
                           keep_order=None):  # pragma: no cover
        if isinstance(columns, str):
            columns = [columns]
        self._columns = list(columns)

    def get_columns(self):
        return self._columns

    @classmethod
    def estimate_size(cls, ctx, op):
        import numpy as np

        def is_object_dtype(dtype):
            try:
                return np.issubdtype(dtype, np.object_) \
                       or np.issubdtype(dtype, np.unicode_) \
                       or np.issubdtype(dtype, np.bytes_)
            except TypeError:  # pragma: no cover
                return False

        if op.split_size is None:
            ctx[op.outputs[0].key] = (0, 0)
            return

        arrow_size = (op.memory_scale or ORC_COMPRESSION_RATIO) * op.split_size
        if op.meta_raw_size is not None:
            raw_arrow_size = (op.memory_scale or 1) * op.meta_raw_size
            arrow_size = max(arrow_size, raw_arrow_size)

        n_strings = len([dt for dt in op.dtypes if is_object_dtype(dt)])
        if op.estimate_rows or op.nrows:
            rows = op.nrows if op.nrows is not None else op.estimate_rows
            pd_size = arrow_size + n_strings * rows * STRING_FIELD_OVERHEAD
            logger.debug('Estimate pandas memory cost: %r', pd_size)
        else:
            pd_size = arrow_size * 10 if n_strings else arrow_size

        ctx[op.outputs[0].key] = (pd_size, pd_size + arrow_size)

    @classmethod
    def _cast_string_to_binary(cls, arrow_table):
        import pyarrow as pa

        new_schema = []
        for field in arrow_table.schema:
            if field.type == pa.string():
                new_schema.append(pa.field(field.name, pa.binary()))
            else:
                new_schema.append(field)

        return arrow_table.cast(pa.schema(new_schema))

    @classmethod
    def _append_partition_values(cls, arrow_table, op):
        import pyarrow as pa

        if op.append_partitions and op.partition_spec:
            from odps.types import PartitionSpec
            spec = PartitionSpec(op.partition_spec)

            for col_name, pt_val in spec.items():
                arrow_table = arrow_table.append_column(
                    col_name,
                    pa.array([pt_val] * arrow_table.num_rows, pa.string()))

        return arrow_table

    @staticmethod
    def _align_columns(data, expected_dtypes):
        data_columns = data.dtypes.index
        expected_columns = expected_dtypes.index
        if not data_columns.equals(expected_columns):
            logger.debug(
                "Data columns differs from output columns, "
                "data columns: %s, output columns: %s", data_columns,
                expected_columns)
            data.columns = expected_columns[:len(data.columns)]
            for extra_col in expected_columns[len(data.columns):]:
                data[extra_col] = pd.Series([],
                                            dtype=expected_dtypes[extra_col])
            if not data.dtypes.index.equals(expected_columns):
                data = data[expected_columns]
        return data

    @classmethod
    def _align_output_data(cls, op, data):
        if isinstance(op.outputs[0], DATAFRAME_CHUNK_TYPE):
            dtypes = op.outputs[0].dtypes
            data = cls._align_columns(data, dtypes)
        else:
            dtypes = pd.Series([op.outputs[0].dtype],
                               index=[op.outputs[0].name])
            data = cls._align_columns(data, dtypes)
            data = data[op.outputs[0].name]
        return data

    @classmethod
    def _build_empty_df(cls, out):
        empty_df = pd.DataFrame()
        for name, dtype in out.dtypes.items():
            empty_df[name] = pd.Series(dtype=dtype)
        return empty_df

    @classmethod
    def _execute_in_cupid(cls, ctx, op):
        out = op.outputs[0]

        if op.cupid_handle is None:
            ctx[out.key] = cls._build_empty_df(out)
            return

        split_config = dict(
            _handle=op.cupid_handle,
            _split_index=op.split_index,
            _split_file_start=op.split_file_start,
            _split_file_end=op.split_file_end,
            _schema_file_start=op.schema_file_start,
            _schema_file_end=op.schema_file_end,
        )
        cupid_client = CupidServiceClient()
        try:
            pa_table = cupid_client.read_table_data(split_config, op.nrows)
        finally:
            cupid_client.close()
            cupid_client = None
        pa_table = cls._append_partition_values(pa_table, op)

        if op.string_as_binary:
            pa_table = cls._cast_string_to_binary(pa_table)
        data = arrow_table_to_pandas_dataframe(
            pa_table, use_arrow_dtype=op.use_arrow_dtype)[:op.nrows]

        data = cls._align_output_data(op, data)

        logger.debug('Read split table finished, split index: %s',
                     op.split_index)
        logger.debug('Split data shape is %s', data.shape)
        ctx[out.key] = data

    @classmethod
    def _execute_arrow_tunnel(cls, ctx, op):
        from odps import ODPS
        from odps.tunnel import TableTunnel

        out = op.outputs[0]

        if op.table_name is None:
            # is empty table
            ctx[out.key] = cls._build_empty_df(out)
            return

        project = os.environ.get('ODPS_PROJECT_NAME', None)
        odps_params = op.odps_params.copy()
        if project:
            odps_params['project'] = project
        endpoint = os.environ.get(
            'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
        o = ODPS(odps_params['access_id'],
                 odps_params['secret_access_key'],
                 project=odps_params['project'],
                 endpoint=endpoint)

        t = o.get_table(op.table_name)
        tunnel = TableTunnel(o, project=t.project)
        retry_times = op.retry_times or options.retry_times

        retries = 0
        while True:
            try:
                if op.partition_spec is not None:
                    download_session = tunnel.create_download_session(
                        t.name, partition_spec=op.partition_spec)
                else:
                    download_session = tunnel.create_download_session(t.name)
                break
            except:
                if retries >= retry_times:
                    raise
                retries += 1
                time.sleep(1)

        logger.debug('Start reading table %s(%s) split from %s to %s',
                     op.table_name, op.partition_spec, op.start_index,
                     op.end_index)
        if op.nrows is None:
            count = op.end_index - op.start_index
        else:
            count = op.nrows

        retries = 0
        while True:
            try:
                with download_session.open_arrow_reader(
                        op.start_index, count, columns=op.columns) as reader:
                    table = reader.read()
                break
            except:
                if retries >= retry_times:
                    raise
                retries += 1
                time.sleep(1)

        table = cls._append_partition_values(table, op)
        if op.string_as_binary:
            table = cls._cast_string_to_binary(table)
        data = arrow_table_to_pandas_dataframe(
            table, use_arrow_dtype=op.use_arrow_dtype)
        data = cls._align_output_data(op, data)

        logger.debug('Finish reading table %s(%s) split from %s to %s',
                     op.table_name, op.partition_spec, op.start_index,
                     op.end_index)
        ctx[op.outputs[0].key] = data

    @classmethod
    def execute(cls, ctx, op):
        if 'CUPID_SERVICE_SOCKET' in os.environ:
            cls._execute_in_cupid(ctx, op)
        else:
            cls._execute_arrow_tunnel(ctx, op)
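A standalone sketch of the arithmetic in estimate_size above; the constants here are illustrative placeholders, not the library's actual ORC_COMPRESSION_RATIO or STRING_FIELD_OVERHEAD values.

split_size = 64 * 1024 ** 2    # compressed split size in bytes
compression_ratio = 5          # placeholder for ORC_COMPRESSION_RATIO (or memory_scale)
string_overhead = 50           # placeholder for STRING_FIELD_OVERHEAD, per string cell
n_string_cols = 3              # number of object-dtype columns
estimate_rows = 1_000_000

arrow_size = compression_ratio * split_size
pd_size = arrow_size + n_string_cols * estimate_rows * string_overhead
# Mirrors ctx[op.outputs[0].key] = (pd_size, pd_size + arrow_size)
estimated = (pd_size, pd_size + arrow_size)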