Example #1
class MySerializable(Serializable):
    _id = IdentityField('id')
    _any_val = AnyField('any_val')
    _bool_val = BoolField('bool_val')
    _int8_val = Int8Field('int8_val')
    _int16_val = Int16Field('int16_val')
    _int32_val = Int32Field('int32_val')
    _int64_val = Int64Field('int64_val')
    _uint8_val = UInt8Field('uint8_val')
    _uint16_val = UInt16Field('uint16_val')
    _uint32_val = UInt32Field('uint32_val')
    _uint64_val = UInt64Field('uint64_val')
    _float16_val = Float16Field('float16_val')
    _float32_val = Float32Field('float32_val',
                                on_serialize=lambda x: x + 1,
                                on_deserialize=lambda x: x - 1)
    _float64_val = Float64Field('float64_val')
    _complex64_val = Complex64Field('complex64_val')
    _complex128_val = Complex128Field('complex128_val')
    _string_val = StringField('string_val')
    _bytes_val = BytesField('bytes_val')
    _key_val = KeyField('key_val')
    _ndarray_val = NDArrayField('ndarray_val')
    _datetime64_val = Datetime64Field('datetime64_val')
    _timedelta64_val = Timedelta64Field('timedelta64_val')
    _datatype_val = DataTypeField('datatype_val')
    _index_val = IndexField('index_val')
    _series_val = SeriesField('series_val')
    _dataframe_val = DataFrameField('dataframe_val')
    _interval_array_val = IntervalArrayField('interval_array_val')
    _slice_val = SliceField('slice_val')
    _function_val = FunctionField('function_val')
    _named_tuple_val = NamedTupleField('named_tuple_val')
    _tzinfo_val = TZInfoField('tzinfo_val')
    _list_val = ListField('list_val', FieldTypes.int64)
    _tuple_val = TupleField('tuple_val', FieldTypes.string)
    _dict_val = DictField('dict_val', FieldTypes.string, FieldTypes.bytes)
    _ref_val = ReferenceField('ref_val', 'self')
    _ref_val2 = ReferenceField('ref_val2', MySimpleSerializable)
    _oneof_val = OneOfField('ref_val',
                            oneof1_val=f'{__name__}.MySerializable',
                            oneof2_val=MySimpleSerializable)
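A minimal usage sketch, under two assumptions not stated in the example itself: the Serializable base accepts field values as `_<attribute>=value` keyword arguments (the same convention the operand classes in the later examples use), and the field descriptors hand the stored values back on plain attribute access.

# Hypothetical instantiation; the field names come from the class definition above.
obj = MySerializable(_int64_val=42,
                     _string_val='hello',
                     _list_val=[1, 2, 3],
                     _dict_val={'key': b'value'})
assert obj._int64_val == 42
assert obj._list_val == [1, 2, 3]
# The on_serialize/on_deserialize hooks on _float32_val are applied by the
# serialization framework during (de)serialization, not on attribute access.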
Example #2
class DataFrameWriteTableCommit(DataFrameOperand, DataFrameOperandMixin):
    _op_type_ = 123462

    _dtypes = SeriesField('dtypes')

    _odps_params = DictField('odps_params')
    _table_name = StringField('table_name')
    _overwrite = BoolField('overwrite')
    _blocks = DictField('blocks')
    _cupid_handle = StringField('cupid_handle')
    _is_terminal = BoolField('is_terminal')

    def __init__(self,
                 dtypes=None,
                 odps_params=None,
                 table_name=None,
                 blocks=None,
                 cupid_handle=None,
                 overwrite=False,
                 is_terminal=None,
                 **kw):
        kw.update(_output_type_kw)
        super(DataFrameWriteTableCommit,
              self).__init__(_dtypes=dtypes,
                             _odps_params=odps_params,
                             _table_name=table_name,
                             _blocks=blocks,
                             _overwrite=overwrite,
                             _cupid_handle=cupid_handle,
                             _is_terminal=is_terminal,
                             **kw)

    @property
    def dtypes(self):
        return self._dtypes

    @property
    def table_name(self):
        return self._table_name

    @property
    def blocks(self):
        return self._blocks

    @property
    def overwrite(self):
        return self._overwrite

    @property
    def cupid_handle(self):
        return self._cupid_handle

    @property
    def odps_params(self):
        return self._odps_params

    @property
    def is_terminal(self):
        return self._is_terminal

    @classmethod
    def execute(cls, ctx, op):
        import pandas as pd
        from ..cupid_service import CupidServiceClient

        if op.is_terminal:
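            # Only the terminal commit chunk (is_terminal=True, created last in
            # DataFrameWriteTable._tile_cupid, Example #3) commits via the Cupid
            # service; intermediate commit chunks just yield an empty frame.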
            odps_params = op.odps_params.copy()
            project = os.environ.get('ODPS_PROJECT_NAME', None)
            if project:
                odps_params['project'] = project

            client = CupidServiceClient()
            client.commit_table_upload_session(odps_params, op.table_name,
                                               op.cupid_handle, op.blocks,
                                               op.overwrite)

        ctx[op.outputs[0].key] = pd.DataFrame()
Example #3
class DataFrameWriteTable(DataFrameOperand, DataFrameOperandMixin):
    _op_type_ = 123460

    _dtypes = SeriesField('dtypes')

    _odps_params = DictField('odps_params')
    _table_name = StringField('table_name')
    _partition_spec = StringField('partition_spec')
    _overwrite = BoolField('overwrite')
    _write_batch_size = Int64Field('write_batch_size')
    _unknown_as_string = BoolField('unknown_as_string')

    def __init__(self,
                 dtypes=None,
                 odps_params=None,
                 table_name=None,
                 partition_spec=None,
                 unknown_as_string=None,
                 over_write=None,
                 write_batch_size=None,
                 **kw):
        kw.update(_output_type_kw)
        super(DataFrameWriteTable,
              self).__init__(_dtypes=dtypes,
                             _odps_params=odps_params,
                             _table_name=table_name,
                             _partition_spec=partition_spec,
                             _unknown_as_string=unknown_as_string,
                             _overwrite=over_write,
                             _write_batch_size=write_batch_size,
                             **kw)

    @property
    def retryable(self):
        return False

    @property
    def dtypes(self):
        return self._dtypes

    @property
    def unknown_as_string(self):
        return self._unknown_as_string

    @property
    def odps_params(self):
        return self._odps_params

    @property
    def table_name(self):
        return self._table_name

    @property
    def partition_spec(self):
        return self._partition_spec

    @property
    def overwrite(self):
        return self._overwrite

    @property
    def write_batch_size(self):
        return self._write_batch_size

    def __call__(self, x):
        shape = (0, ) * len(x.shape)
        index_value = parse_index(x.index_value.to_pandas()[:0], x.key,
                                  'index')
        columns_value = parse_index(x.columns_value.to_pandas()[:0],
                                    x.key,
                                    'columns',
                                    store_data=True)
        return self.new_dataframe([x],
                                  shape=shape,
                                  dtypes=x.dtypes[:0],
                                  index_value=index_value,
                                  columns_value=columns_value)

    @classmethod
    def _tile_cupid(cls, op):
        from mars.dataframe.utils import build_concatenated_rows_frame

        cupid_client = CupidServiceClient()
        upload_handle = cupid_client.create_table_upload_session(
            op.odps_params, op.table_name)

        input_df = build_concatenated_rows_frame(op.inputs[0])
        out_df = op.outputs[0]

        out_chunks = []
        out_chunk_shape = (0, ) * len(input_df.shape)
        blocks = {}
        for chunk in input_df.chunks:
            block_id = str(int(time.time())) + '_' + str(uuid.uuid4()).replace(
                '-', '')
            chunk_op = DataFrameWriteTableSplit(
                dtypes=op.dtypes,
                table_name=op.table_name,
                odps_params=op.odps_params,
                unknown_as_string=op.unknown_as_string,
                partition_spec=op.partition_spec,
                cupid_handle=to_str(upload_handle),
                block_id=block_id,
                write_batch_size=op.write_batch_size)
            out_chunk = chunk_op.new_chunk([chunk],
                                           shape=out_chunk_shape,
                                           index=chunk.index,
                                           index_value=out_df.index_value,
                                           dtypes=chunk.dtypes)
            out_chunks.append(out_chunk)
            blocks[block_id] = op.partition_spec

        # build commit tree
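        # Reduce the per-chunk write results into intermediate commit chunks of
        # at most `combine_size` inputs each; only the terminal commit chunk
        # created below performs the actual upload-session commit.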
        combine_size = 8
        chunks = out_chunks
        while len(chunks) >= combine_size:
            new_chunks = []
            for i in range(0, len(chunks), combine_size):
                chks = chunks[i:i + combine_size]
                if len(chks) == 1:
                    chk = chks[0]
                else:
                    chk_op = DataFrameWriteTableCommit(dtypes=op.dtypes,
                                                       is_terminal=False)
                    chk = chk_op.new_chunk(chks,
                                           shape=out_chunk_shape,
                                           index_value=out_df.index_value,
                                           dtypes=op.dtypes)
                new_chunks.append(chk)
            chunks = new_chunks

        assert len(chunks) < combine_size

        commit_table_op = DataFrameWriteTableCommit(
            dtypes=op.dtypes,
            table_name=op.table_name,
            blocks=blocks,
            cupid_handle=to_str(upload_handle),
            overwrite=op.overwrite,
            odps_params=op.odps_params,
            is_terminal=True)
        commit_table_chunk = commit_table_op.new_chunk(
            chunks,
            shape=out_chunk_shape,
            dtypes=op.dtypes,
            index_value=out_df.index_value,
            index=(0, ) * len(out_chunk_shape))

        new_op = op.copy()
        return new_op.new_dataframes(op.inputs,
                                     shape=out_df.shape,
                                     index_value=out_df.index_value,
                                     dtypes=out_df.dtypes,
                                     columns_value=out_df.columns_value,
                                     chunks=[commit_table_chunk],
                                     nsplits=((0, ), ) * len(out_chunk_shape))

    @classmethod
    def _tile_tunnel(cls, op):
        from mars.dataframe.utils import build_concatenated_rows_frame

        out_df = op.outputs[0]
        in_df = build_concatenated_rows_frame(op.inputs[0])

        out_chunks = []
        for chunk in in_df.chunks:
            chunk_op = DataFrameWriteTableSplit(
                dtypes=op.dtypes,
                table_name=op.table_name,
                odps_params=op.odps_params,
                partition_spec=op.partition_spec)
            index_value = parse_index(chunk.index_value.to_pandas()[:0], chunk)
            out_chunk = chunk_op.new_chunk([chunk],
                                           shape=(0, 0),
                                           index_value=index_value,
                                           columns_value=out_df.columns_value,
                                           dtypes=out_df.dtypes,
                                           index=chunk.index)
            out_chunks.append(out_chunk)

        new_op = op.copy()
        params = out_df.params.copy()
        params.update(
            dict(chunks=out_chunks,
                 nsplits=((0, ) * in_df.chunk_shape[0], (0, ))))
        return new_op.new_tileables([in_df], **params)

    @classmethod
    def tile(cls, op):
        if 'CUPID_SERVICE_SOCKET' in os.environ:
            return cls._tile_cupid(op)
        else:
            return cls._tile_tunnel(op)
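A hypothetical driver sketch (the library's public write helpers, which normally create this operand, are not part of these examples): build the operand from the target table information, then call it on the source Mars DataFrame; executing the returned zero-row DataFrame performs the write through the tiled split and commit chunks shown above.

def write_table_sketch(mars_df, table_name, odps_params, overwrite=False):
    # `over_write` is the constructor's parameter spelling; it is stored as `_overwrite`.
    op = DataFrameWriteTable(dtypes=mars_df.dtypes,
                             odps_params=odps_params,
                             table_name=table_name,
                             over_write=overwrite)
    return op(mars_df)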
Example #4
class DataFrameWriteTableSplit(DataFrameOperand, DataFrameOperandMixin):
    _op_type_ = 123461

    _dtypes = SeriesField('dtypes')

    _table_name = StringField('table_name')
    _partition_spec = StringField('partition_spec')
    _cupid_handle = StringField('cupid_handle')
    _block_id = StringField('block_id')
    _write_batch_size = Int64Field('write_batch_size')
    _unknown_as_string = BoolField('unknown_as_string')

    # for tunnel
    _odps_params = DictField('odps_params')

    def __init__(self,
                 dtypes=None,
                 table_name=None,
                 odps_params=None,
                 partition_spec=None,
                 cupid_handle=None,
                 unknown_as_string=None,
                 block_id=None,
                 write_batch_size=None,
                 **kw):
        kw.update(_output_type_kw)
        super(DataFrameWriteTableSplit,
              self).__init__(_dtypes=dtypes,
                             _table_name=table_name,
                             _odps_params=odps_params,
                             _partition_spec=partition_spec,
                             _unknown_as_string=unknown_as_string,
                             _cupid_handle=cupid_handle,
                             _block_id=block_id,
                             _write_batch_size=write_batch_size,
                             **kw)

    @property
    def retryable(self):
        return False

    @property
    def dtypes(self):
        return self._dtypes

    @property
    def table_name(self):
        return self._table_name

    @property
    def odps_params(self):
        return self._odps_params

    @property
    def unknown_as_string(self):
        return self._unknown_as_string

    @property
    def partition_spec(self):
        return self._partition_spec

    @property
    def cupid_handle(self):
        return self._cupid_handle

    @property
    def block_id(self):
        return self._block_id

    @property
    def write_batch_size(self):
        return self._write_batch_size

    @classmethod
    def _execute_in_cupid(cls, ctx, op):
        import os

        import pandas as pd
        from odps import ODPS
        from odps.accounts import BearerTokenAccount

        cupid_client = CupidServiceClient()
        to_store_data = ctx[op.inputs[0].key]

        bearer_token = cupid_client.get_bearer_token()
        account = BearerTokenAccount(bearer_token)
        project = os.environ.get('ODPS_PROJECT_NAME', None)
        odps_params = op.odps_params.copy()
        if project:
            odps_params['project'] = project
        endpoint = os.environ.get(
            'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
        o = ODPS(None,
                 None,
                 account=account,
                 project=odps_params['project'],
                 endpoint=endpoint)
        odps_schema = o.get_table(op.table_name).schema
        project_name, table_name = op.table_name.split('.')

        writer_config = dict(_table_name=table_name,
                             _project_name=project_name,
                             _table_schema=odps_schema,
                             _partition_spec=op.partition_spec,
                             _block_id=op.block_id,
                             _handle=op.cupid_handle)
        cupid_client.write_table_data(writer_config, to_store_data,
                                      op.write_batch_size)
        ctx[op.outputs[0].key] = pd.DataFrame()

    @classmethod
    def _execute_arrow_tunnel(cls, ctx, op):
        from odps import ODPS
        from odps.tunnel import TableTunnel
        import pyarrow as pa
        import pandas as pd

        project = os.environ.get('ODPS_PROJECT_NAME', None)
        odps_params = op.odps_params.copy()
        if project:
            odps_params['project'] = project
        endpoint = os.environ.get(
            'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
        o = ODPS(odps_params['access_id'],
                 odps_params['secret_access_key'],
                 project=odps_params['project'],
                 endpoint=endpoint)

        t = o.get_table(op.table_name)
        tunnel = TableTunnel(o, project=t.project)
        retry_times = options.retry_times

        retries = 0
        while True:
            try:
                if op.partition_spec is not None:
                    upload_session = tunnel.create_upload_session(
                        t.name, partition_spec=op.partition_spec)
                else:
                    upload_session = tunnel.create_upload_session(t.name)
                break
            except:
                if retries >= retry_times:
                    raise
                retries += 1
                time.sleep(1)

        logger.debug('Start writing table %s split index: %s', op.table_name,
                     op.inputs[0].index)

        retries = 0
        while True:
            try:
                writer = upload_session.open_arrow_writer(0)
                arrow_rb = pa.RecordBatch.from_pandas(ctx[op.inputs[0].key])
                writer.write(arrow_rb)
                writer.close()
                break
            except:
                if retries >= retry_times:
                    raise
                retries += 1
                time.sleep(1)

        upload_session.commit([0])
        logger.debug('Finish writing table %s split index: %s', op.table_name,
                     op.inputs[0].index)

        ctx[op.outputs[0].key] = pd.DataFrame()

    @classmethod
    def execute(cls, ctx, op):
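        # A Cupid handle is only assigned when the write was tiled under the
        # Cupid service (DataFrameWriteTable._tile_cupid); otherwise fall back
        # to the Arrow tunnel writer.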
        if op.cupid_handle is not None:
            cls._execute_in_cupid(ctx, op)
        else:
            cls._execute_arrow_tunnel(ctx, op)
Example #5
class DataFrameReadTable(*_BASE):
    _op_type_ = 123450

    _odps_params = DictField('odps_params')
    _table_name = StringField('table_name')
    _partition_spec = StringField('partition_spec')
    _dtypes = SeriesField('dtypes')
    _incremental_index = BoolField('incremental_index')
    _columns = AnyField('columns')
    _nrows = Int64Field('nrows')
    _use_arrow_dtype = BoolField('use_arrow_dtype')
    _string_as_binary = BoolField('string_as_binary')
    _append_partitions = BoolField('append_partitions')
    _last_modified_time = Int64Field('last_modified_time')
    _with_split_meta_on_tile = BoolField('with_split_meta_on_tile')
    _retry_times = Int64Field('retry_times')

    def __init__(self,
                 odps_params=None,
                 table_name=None,
                 partition_spec=None,
                 columns=None,
                 dtypes=None,
                 nrows=None,
                 sparse=None,
                 incremental_index=True,
                 use_arrow_dtype=None,
                 string_as_binary=None,
                 memory_scale=None,
                 append_partitions=None,
                 last_modified_time=None,
                 with_split_meta_on_tile=False,
                 retry_times=None,
                 **kw):
        super(DataFrameReadTable,
              self).__init__(_odps_params=odps_params,
                             _table_name=table_name,
                             _partition_spec=partition_spec,
                             _columns=columns,
                             _dtypes=dtypes,
                             _nrows=nrows,
                             _sparse=sparse,
                             _use_arrow_dtype=use_arrow_dtype,
                             _string_as_binary=string_as_binary,
                             _incremental_index=incremental_index,
                             _append_partitions=append_partitions,
                             _last_modified_time=last_modified_time,
                             _memory_scale=memory_scale,
                             _with_split_meta_on_tile=with_split_meta_on_tile,
                             _retry_times=retry_times,
                             _output_types=[OutputType.dataframe],
                             **kw)

    @property
    def retryable(self):
        return False

    @property
    def odps_params(self):
        return self._odps_params

    @property
    def table_name(self):
        return self._table_name

    @property
    def partition(self):
        return getattr(self, '_partition_spec', None)

    @property
    def columns(self):
        return self._columns

    @property
    def dtypes(self):
        return self._dtypes

    @property
    def nrows(self):
        return self._nrows

    @property
    def use_arrow_dtype(self):
        return self._use_arrow_dtype

    @property
    def string_as_binary(self):
        return self._string_as_binary

    @property
    def incremental_index(self):
        return self._incremental_index

    @property
    def append_partitions(self):
        return self._append_partitions

    @property
    def with_split_meta_on_tile(self):
        return self._with_split_meta_on_tile

    @property
    def retry_times(self):
        return self._retry_times

    def get_columns(self):
        return self._columns

    def set_pruned_columns(self,
                           columns,
                           *,
                           keep_order=None):  # pragma: no cover
        self._columns = columns

    def __call__(self, shape, chunk_bytes=None, chunk_size=None):
        import numpy as np
        import pandas as pd

        if np.isnan(shape[0]):
            index_value = parse_index(pd.RangeIndex(0))
        else:
            index_value = parse_index(pd.RangeIndex(shape[0]))
        columns_value = parse_index(self.dtypes.index, store_data=True)
        return self.new_dataframe(None,
                                  shape,
                                  dtypes=self.dtypes,
                                  index_value=index_value,
                                  columns_value=columns_value,
                                  chunk_bytes=chunk_bytes,
                                  chunk_size=chunk_size)

    @classmethod
    def _tile_cupid(cls, op):
        import numpy as np
        import pandas as pd
        from mars.core.context import get_context

        df = op.outputs[0]
        split_size = df.extra_params.chunk_bytes or CHUNK_BYTES_LIMIT

        out_dtypes = df.dtypes
        out_shape = df.shape
        out_columns_value = df.columns_value
        if op.columns is not None:
            out_dtypes = out_dtypes[op.columns]
            out_shape = (df.shape[0], len(op.columns))
            out_columns_value = parse_index(out_dtypes.index, store_data=True)

        mars_context = get_context()
        if mars_context is not None:
            worker_count = len(mars_context.get_worker_addresses())
        else:
            worker_count = None

        cupid_client = CupidServiceClient()
        try:
            parts = cupid_client.enum_table_partitions(op.odps_params,
                                                       op.table_name,
                                                       op.partition)
            if parts is None:
                parts = [None]

            out_chunks = []
            chunk_idx = 0

            for partition_spec in parts:
                splits, split_size = cupid_client.create_table_download_session(
                    op.odps_params, op.table_name, partition_spec, op.columns,
                    worker_count, split_size, MAX_CHUNK_NUM,
                    op.with_split_meta_on_tile)

                logger.debug('%s table splits have been created.',
                             str(len(splits)))

                meta_chunk_rows = [split.meta_row_count for split in splits]
                if np.isnan(out_shape[0]):
                    est_chunk_rows = meta_chunk_rows
                else:
                    sp_file_sizes = np.array([
                        sp.split_file_end - sp.split_file_start
                        for sp in splits
                    ])
                    total_size = sp_file_sizes.sum()
                    ratio_chunk_rows = (sp_file_sizes * out_shape[0] //
                                        total_size).tolist()
                    est_chunk_rows = [
                        mr if mr is not None else rr
                        for mr, rr in zip(meta_chunk_rows, ratio_chunk_rows)
                    ]

                logger.warning('Estimated chunk rows: %r', est_chunk_rows)

                if len(splits) == 0:
                    logger.debug('Table %s has no data', op.table_name)
                    chunk_op = DataFrameReadTableSplit()
                    index_value = parse_index(pd.RangeIndex(0))
                    columns_value = parse_index(out_dtypes.index,
                                                store_data=True)
                    out_chunk = chunk_op.new_chunk(None,
                                                   shape=(np.nan,
                                                          out_shape[1]),
                                                   dtypes=op.dtypes,
                                                   index_value=index_value,
                                                   columns_value=columns_value,
                                                   index=(chunk_idx, 0))
                    out_chunks.append(out_chunk)
                    chunk_idx += 1
                else:
                    for idx, split in enumerate(splits):
                        chunk_op = DataFrameReadTableSplit(
                            cupid_handle=to_str(split.handle),
                            split_index=split.split_index,
                            split_file_start=split.split_file_start,
                            split_file_end=split.split_file_end,
                            schema_file_start=split.schema_file_start,
                            schema_file_end=split.schema_file_end,
                            incremental_index=op.incremental_index,
                            dtypes=out_dtypes,
                            sparse=op.sparse,
                            split_size=split_size,
                            string_as_binary=op.string_as_binary,
                            use_arrow_dtype=op.use_arrow_dtype,
                            estimate_rows=est_chunk_rows[idx],
                            partition_spec=partition_spec,
                            append_partitions=op.append_partitions,
                            meta_raw_size=split.meta_raw_size,
                            nrows=meta_chunk_rows[idx] or op.nrows,
                            memory_scale=op.memory_scale,
                            extra_params=op.extra_params)
                        # the chunk shape is unknown
                        index_value = parse_index(pd.RangeIndex(0))
                        columns_value = parse_index(out_dtypes.index,
                                                    store_data=True)
                        out_chunk = chunk_op.new_chunk(
                            None,
                            shape=(np.nan, out_shape[1]),
                            dtypes=out_dtypes,
                            index_value=index_value,
                            columns_value=columns_value,
                            index=(chunk_idx, 0))
                        chunk_idx += 1
                        out_chunks.append(out_chunk)
        finally:
            cupid_client.close()

        if op.incremental_index and _NEED_STANDARDIZE:
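            # standardize_range_index is assumed to re-number the per-chunk
            # RangeIndexes so the final index increases consecutively across
            # the result chunks when an incremental index is requested.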
            out_chunks = standardize_range_index(out_chunks)
        new_op = op.copy()
        nsplits = ((np.nan, ) * len(out_chunks), (out_shape[1], ))
        return new_op.new_dataframes(None,
                                     shape=out_shape,
                                     dtypes=op.dtypes,
                                     index_value=df.index_value,
                                     columns_value=out_columns_value,
                                     chunks=out_chunks,
                                     nsplits=nsplits)

    @classmethod
    def _tile_tunnel(cls, op):
        import pandas as pd
        from odps import ODPS

        project = os.environ.get('ODPS_PROJECT_NAME', None)
        odps_params = op.odps_params.copy()
        if project:
            odps_params['project'] = project
        endpoint = os.environ.get(
            'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
        o = ODPS(odps_params['access_id'],
                 odps_params['secret_access_key'],
                 project=odps_params['project'],
                 endpoint=endpoint)

        table_obj = o.get_table(op.table_name)
        if not table_obj.schema.partitions:
            data_srcs = [table_obj]
        elif op.partition is not None and check_partition_exist(
                table_obj, op.partition):
            data_srcs = [table_obj.get_partition(op.partition)]
        else:
            data_srcs = list(table_obj.partitions)
            if op.partition is not None:
                data_srcs = filter_partitions(o, data_srcs, op.partition)

        out_chunks = []
        row_nsplits = []
        index_start = 0
        df = op.outputs[0]

        out_dtypes = df.dtypes
        out_shape = df.shape
        out_columns_value = df.columns_value
        if op.columns is not None:
            out_dtypes = out_dtypes[op.columns]
            out_shape = (df.shape[0], len(op.columns))
            out_columns_value = parse_index(out_dtypes.index, store_data=True)

        if len(data_srcs) == 0:
            # no partitions are selected
            chunk_op = DataFrameReadTableSplit()
            index_value = parse_index(pd.RangeIndex(0))
            columns_value = parse_index(out_dtypes.index, store_data=True)
            out_chunk = chunk_op.new_chunk(None,
                                           shape=(0, out_shape[1]),
                                           dtypes=op.dtypes,
                                           index_value=index_value,
                                           columns_value=columns_value,
                                           index=(index_start, 0))
            out_chunks.append(out_chunk)
        else:
            retry_times = op.retry_times or options.retry_times
            for data_src in data_srcs:
                data_store_size = data_src.size

                retries = 0
                while True:
                    try:
                        with data_src.open_reader() as reader:
                            record_count = reader.count
                        break
                    except:
                        if retries >= retry_times:
                            raise
                        retries += 1
                        time.sleep(1)
                if data_store_size == 0:
                    # empty table
                    chunk_op = DataFrameReadTableSplit()
                    index_value = parse_index(pd.RangeIndex(0))
                    columns_value = parse_index(out_dtypes.index,
                                                store_data=True)
                    out_chunk = chunk_op.new_chunk(None,
                                                   shape=(0, out_shape[1]),
                                                   dtypes=op.dtypes,
                                                   index_value=index_value,
                                                   columns_value=columns_value,
                                                   index=(index_start, 0))
                    out_chunks.append(out_chunk)
                    index_start += 1
                    continue
                chunk_size = df.extra_params.chunk_size

                partition_spec = str(data_src.partition_spec) \
                    if getattr(data_src, 'partition_spec', None) else None

                if chunk_size is None:
                    chunk_bytes = df.extra_params.chunk_bytes or CHUNK_BYTES_LIMIT
                    chunk_count = data_store_size // chunk_bytes + (
                        data_store_size % chunk_bytes != 0)
                    chunk_size = ceildiv(record_count, chunk_count)
                    split_size = chunk_bytes
                else:
                    chunk_count = ceildiv(record_count, chunk_size)
                    split_size = data_store_size // chunk_count

                for i in range(chunk_count):
                    start_index = chunk_size * i
                    end_index = min(chunk_size * (i + 1), record_count)
                    row_size = end_index - start_index
                    chunk_op = DataFrameReadTableSplit(
                        table_name=op.table_name,
                        partition_spec=partition_spec,
                        start_index=start_index,
                        end_index=end_index,
                        nrows=op.nrows,
                        odps_params=op.odps_params,
                        columns=op.columns,
                        incremental_index=op.incremental_index,
                        dtypes=out_dtypes,
                        sparse=op.sparse,
                        split_size=split_size,
                        use_arrow_dtype=op.use_arrow_dtype,
                        estimate_rows=row_size,
                        append_partitions=op.append_partitions,
                        memory_scale=op.memory_scale,
                        retry_times=op.retry_times,
                        extra_params=op.extra_params)
                    index_value = parse_index(
                        pd.RangeIndex(start_index, end_index))
                    columns_value = parse_index(out_dtypes.index,
                                                store_data=True)
                    out_chunk = chunk_op.new_chunk(None,
                                                   shape=(row_size,
                                                          out_shape[1]),
                                                   dtypes=out_dtypes,
                                                   index_value=index_value,
                                                   columns_value=columns_value,
                                                   index=(index_start + i, 0))
                    row_nsplits.append(row_size)
                    out_chunks.append(out_chunk)

                index_start += chunk_count

        if op.incremental_index and _NEED_STANDARDIZE:
            out_chunks = standardize_range_index(out_chunks)

        new_op = op.copy()
        nsplits = (tuple(row_nsplits), (out_shape[1], ))
        return new_op.new_dataframes(None,
                                     shape=out_shape,
                                     dtypes=op.dtypes,
                                     index_value=df.index_value,
                                     columns_value=out_columns_value,
                                     chunks=out_chunks,
                                     nsplits=nsplits)

    @classmethod
    def _tile(cls, op):
        if 'CUPID_SERVICE_SOCKET' in os.environ:
            return cls._tile_cupid(op)
        else:
            return cls._tile_tunnel(op)
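A hypothetical construction sketch (the operand is normally created by the library's own read helpers, which are not shown in these examples): supply the table's dtypes and connection parameters, then call the operand with the expected global shape; tiling later chooses the Cupid or tunnel path.

import numpy as np
import pandas as pd

dtypes = pd.Series([np.dtype('int64'), np.dtype('object')], index=['id', 'name'])
op = DataFrameReadTable(odps_params={'access_id': '...',
                                     'secret_access_key': '...',
                                     'project': 'my_project',
                                     'endpoint': '...'},
                        table_name='my_table',
                        dtypes=dtypes)
df = op(shape=(np.nan, len(dtypes)), chunk_bytes=64 * 1024 ** 2)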
Example #6
class DataFrameReadTableSplit(*_BASE):
    _op_type_ = 123451

    # for cupid
    _cupid_handle = StringField('cupid_handle')
    _split_index = Int64Field('split_index')
    _split_file_start = Int64Field('split_file_start')
    _split_file_end = Int64Field('split_file_end')
    _schema_file_start = Int64Field('schema_file_start')
    _schema_file_end = Int64Field('schema_file_end')
    _use_arrow_dtype = BoolField('use_arrow_dtype')
    _string_as_binary = BoolField('string_as_binary')
    _dtypes = SeriesField('dtypes')
    _nrows = Int64Field('nrows')
    _incremental_index = BoolField('incremental_index')

    # for tunnel
    _table_name = StringField('table_name')
    _partition_spec = StringField('partition_spec')
    _start_index = Int64Field('start_index')
    _end_index = Int64Field('end_index')
    _odps_params = DictField('odps_params')
    _columns = AnyField('columns')

    _split_size = Int64Field('split_size')
    _append_partitions = BoolField('append_partitions')
    _estimate_rows = Int64Field('estimate_rows')
    _meta_raw_size = Int64Field('meta_raw_size')
    _retry_times = Int64Field('retry_times')

    def __init__(self,
                 cupid_handle=None,
                 split_index=None,
                 split_file_start=None,
                 split_file_end=None,
                 schema_file_start=None,
                 schema_file_end=None,
                 table_name=None,
                 partition_spec=None,
                 start_index=None,
                 end_index=None,
                 odps_params=None,
                 columns=None,
                 nrows=None,
                 incremental_index=None,
                 dtypes=None,
                 string_as_binary=None,
                 split_size=None,
                 use_arrow_dtype=None,
                 memory_scale=None,
                 estimate_rows=None,
                 meta_raw_size=None,
                 retry_times=None,
                 append_partitions=None,
                 sparse=None,
                 **kw):
        super(DataFrameReadTableSplit,
              self).__init__(_cupid_handle=cupid_handle,
                             _split_index=split_index,
                             _split_file_start=split_file_start,
                             _split_file_end=split_file_end,
                             _schema_file_start=schema_file_start,
                             _schema_file_end=schema_file_end,
                             _table_name=table_name,
                             _partition_spec=partition_spec,
                             _columns=columns,
                             _start_index=start_index,
                             _end_index=end_index,
                             _odps_params=odps_params,
                             _use_arrow_dtype=use_arrow_dtype,
                             _string_as_binary=string_as_binary,
                             _nrows=nrows,
                             _incremental_index=incremental_index,
                             _estimate_rows=estimate_rows,
                             _split_size=split_size,
                             _dtypes=dtypes,
                             _append_partitions=append_partitions,
                             _sparse=sparse,
                             _meta_raw_size=meta_raw_size,
                             _memory_scale=memory_scale,
                             _retry_times=retry_times,
                             _output_types=[OutputType.dataframe],
                             **kw)

    @property
    def retryable(self):
        return False

    @property
    def output_limit(self):
        return 1

    @property
    def cupid_handle(self):
        return self._cupid_handle

    @property
    def split_index(self):
        return self._split_index

    @property
    def split_file_start(self):
        return self._split_file_start

    @property
    def split_file_end(self):
        return self._split_file_end

    @property
    def schema_file_start(self):
        return self._schema_file_start

    @property
    def schema_file_end(self):
        return self._schema_file_end

    @property
    def table_name(self):
        return self._table_name

    @property
    def partition_spec(self):
        return self._partition_spec

    @property
    def start_index(self):
        return self._start_index

    @property
    def end_index(self):
        return self._end_index

    @property
    def odps_params(self):
        return self._odps_params

    @property
    def columns(self):
        return self._columns

    @property
    def nrows(self):
        return self._nrows

    @property
    def incremental_index(self):
        return self._incremental_index

    @property
    def dtypes(self):
        return self._dtypes

    @property
    def split_size(self):
        return self._split_size

    @property
    def estimate_rows(self):
        return self._estimate_rows

    @property
    def use_arrow_dtype(self):
        return self._use_arrow_dtype

    @property
    def string_as_binary(self):
        return self._string_as_binary

    @property
    def append_partitions(self):
        return self._append_partitions

    @property
    def meta_raw_size(self):
        return self._meta_raw_size

    @property
    def retry_times(self):
        return self._retry_times

    def set_pruned_columns(self,
                           columns,
                           *,
                           keep_order=None):  # pragma: no cover
        if isinstance(columns, str):
            columns = [columns]
        self._columns = list(columns)

    def get_columns(self):
        return self._columns

    @classmethod
    def estimate_size(cls, ctx, op):
        import numpy as np

        def is_object_dtype(dtype):
            try:
                return np.issubdtype(dtype, np.object_) \
                       or np.issubdtype(dtype, np.unicode_) \
                       or np.issubdtype(dtype, np.bytes_)
            except TypeError:  # pragma: no cover
                return False

        if op.split_size is None:
            ctx[op.outputs[0].key] = (0, 0)
            return

        arrow_size = (op.memory_scale or ORC_COMPRESSION_RATIO) * op.split_size
        if op.meta_raw_size is not None:
            raw_arrow_size = (op.memory_scale or 1) * op.meta_raw_size
            arrow_size = max(arrow_size, raw_arrow_size)

        n_strings = len([dt for dt in op.dtypes if is_object_dtype(dt)])
        if op.estimate_rows or op.nrows:
            rows = op.nrows if op.nrows is not None else op.estimate_rows
            pd_size = arrow_size + n_strings * rows * STRING_FIELD_OVERHEAD
            logger.debug('Estimate pandas memory cost: %r', pd_size)
        else:
            pd_size = arrow_size * 10 if n_strings else arrow_size

        ctx[op.outputs[0].key] = (pd_size, pd_size + arrow_size)

    @classmethod
    def _cast_string_to_binary(cls, arrow_table):
        import pyarrow as pa

        new_schema = []
        for field in arrow_table.schema:
            if field.type == pa.string():
                new_schema.append(pa.field(field.name, pa.binary()))
            else:
                new_schema.append(field)

        return arrow_table.cast(pa.schema(new_schema))

    @classmethod
    def _append_partition_values(cls, arrow_table, op):
        import pyarrow as pa

        if op.append_partitions and op.partition_spec:
            from odps.types import PartitionSpec
            spec = PartitionSpec(op.partition_spec)

            for col_name, pt_val in spec.items():
                arrow_table = arrow_table.append_column(
                    col_name,
                    pa.array([pt_val] * arrow_table.num_rows, pa.string()))

        return arrow_table

    @staticmethod
    def _align_columns(data, expected_dtypes):
        data_columns = data.dtypes.index
        expected_columns = expected_dtypes.index
        if not data_columns.equals(expected_columns):
            logger.debug(
                "Data columns differs from output columns, "
                "data columns: %s, output columns: %s", data_columns,
                expected_columns)
            data.columns = expected_columns[:len(data.columns)]
            for extra_col in expected_columns[len(data.columns):]:
                data[extra_col] = pd.Series([],
                                            dtype=expected_dtypes[extra_col])
            if not data.dtypes.index.equals(expected_columns):
                data = data[expected_columns]
        return data

    @classmethod
    def _align_output_data(cls, op, data):
        if isinstance(op.outputs[0], DATAFRAME_CHUNK_TYPE):
            dtypes = op.outputs[0].dtypes
            data = cls._align_columns(data, dtypes)
        else:
            dtypes = pd.Series([op.outputs[0].dtype],
                               index=[op.outputs[0].name])
            data = cls._align_columns(data, dtypes)
            data = data[op.outputs[0].name]
        return data

    @classmethod
    def _build_empty_df(cls, out):
        empty_df = pd.DataFrame()
        for name, dtype in out.dtypes.items():
            empty_df[name] = pd.Series(dtype=dtype)
        return empty_df

    @classmethod
    def _execute_in_cupid(cls, ctx, op):
        out = op.outputs[0]

        if op.cupid_handle is None:
            ctx[out.key] = cls._build_empty_df(out)
            return

        split_config = dict(
            _handle=op.cupid_handle,
            _split_index=op.split_index,
            _split_file_start=op.split_file_start,
            _split_file_end=op.split_file_end,
            _schema_file_start=op.schema_file_start,
            _schema_file_end=op.schema_file_end,
        )
        cupid_client = CupidServiceClient()
        try:
            pa_table = cupid_client.read_table_data(split_config, op.nrows)
        finally:
            cupid_client.close()
            cupid_client = None
        pa_table = cls._append_partition_values(pa_table, op)

        if op.string_as_binary:
            pa_table = cls._cast_string_to_binary(pa_table)
        data = arrow_table_to_pandas_dataframe(
            pa_table, use_arrow_dtype=op.use_arrow_dtype)[:op.nrows]

        data = cls._align_output_data(op, data)

        logger.debug('Read split table finished, split index: %s',
                     op.split_index)
        logger.debug('Split data shape is %s', data.shape)
        ctx[out.key] = data

    @classmethod
    def _execute_arrow_tunnel(cls, ctx, op):
        from odps import ODPS
        from odps.tunnel import TableTunnel

        out = op.outputs[0]

        if op.table_name is None:
            # is empty table
            ctx[out.key] = cls._build_empty_df(out)
            return

        project = os.environ.get('ODPS_PROJECT_NAME', None)
        odps_params = op.odps_params.copy()
        if project:
            odps_params['project'] = project
        endpoint = os.environ.get(
            'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
        o = ODPS(odps_params['access_id'],
                 odps_params['secret_access_key'],
                 project=odps_params['project'],
                 endpoint=endpoint)

        t = o.get_table(op.table_name)
        tunnel = TableTunnel(o, project=t.project)
        retry_times = op.retry_times or options.retry_times

        retries = 0
        while True:
            try:
                if op.partition_spec is not None:
                    download_session = tunnel.create_download_session(
                        t.name, partition_spec=op.partition_spec)
                else:
                    download_session = tunnel.create_download_session(t.name)
                break
            except:
                if retries >= retry_times:
                    raise
                retries += 1
                time.sleep(1)

        logger.debug('Start reading table %s(%s) split from %s to %s',
                     op.table_name, op.partition_spec, op.start_index,
                     op.end_index)
        if op.nrows is None:
            count = op.end_index - op.start_index
        else:
            count = op.nrows

        retries = 0
        while True:
            try:
                with download_session.open_arrow_reader(
                        op.start_index, count, columns=op.columns) as reader:
                    table = reader.read()
                break
            except:
                if retries >= retry_times:
                    raise
                retries += 1
                time.sleep(1)

        table = cls._append_partition_values(table, op)
        if op.string_as_binary:
            table = cls._cast_string_to_binary(table)
        data = arrow_table_to_pandas_dataframe(
            table, use_arrow_dtype=op.use_arrow_dtype)
        data = cls._align_output_data(op, data)

        logger.debug('Finish reading table %s(%s) split from %s to %s',
                     op.table_name, op.partition_spec, op.start_index,
                     op.end_index)
        ctx[op.outputs[0].key] = data

    @classmethod
    def execute(cls, ctx, op):
        if 'CUPID_SERVICE_SOCKET' in os.environ:
            cls._execute_in_cupid(ctx, op)
        else:
            cls._execute_arrow_tunnel(ctx, op)
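A standalone sketch of the arithmetic in estimate_size above; the constants here are illustrative placeholders, not the library's actual ORC_COMPRESSION_RATIO or STRING_FIELD_OVERHEAD values.

split_size = 64 * 1024 ** 2    # compressed split size in bytes
compression_ratio = 5          # placeholder for ORC_COMPRESSION_RATIO (or memory_scale)
string_overhead = 50           # placeholder for STRING_FIELD_OVERHEAD, per string cell
n_string_cols = 3              # number of object-dtype columns
estimate_rows = 1_000_000

arrow_size = compression_ratio * split_size
pd_size = arrow_size + n_string_cols * estimate_rows * string_overhead
# Mirrors ctx[op.outputs[0].key] = (pd_size, pd_size + arrow_size)
estimated = (pd_size, pd_size + arrow_size)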