class DataFrameWriteTableCommit(DataFrameOperand, DataFrameOperandMixin):
    """Commit operand of a cupid table upload.

    Every instance stores an empty ``pandas.DataFrame`` as its result. Only an
    instance flagged with ``is_terminal`` additionally commits the cupid upload
    session identified by ``cupid_handle`` together with the recorded
    ``blocks``.
    """
    _op_type_ = 123462

    _dtypes = SeriesField('dtypes')
    _odps_params = DictField('odps_params')
    _table_name = StringField('table_name')
    _overwrite = BoolField('overwrite')
    _blocks = DictField('blocks')
    _cupid_handle = StringField('cupid_handle')
    _is_terminal = BoolField('is_terminal')

    def __init__(self, dtypes=None, odps_params=None, table_name=None,
                 blocks=None, cupid_handle=None, overwrite=False,
                 is_terminal=None, **kw):
        super(DataFrameWriteTableCommit, self).__init__(
            _dtypes=dtypes,
            _odps_params=odps_params,
            _table_name=table_name,
            _blocks=blocks,
            _overwrite=overwrite,
            _cupid_handle=cupid_handle,
            _is_terminal=is_terminal,
            _object_type=ObjectType.dataframe,
            **kw)

    @property
    def dtypes(self):
        return self._dtypes

    @property
    def table_name(self):
        return self._table_name

    @property
    def blocks(self):
        return self._blocks

    @property
    def overwrite(self):
        return self._overwrite

    @property
    def cupid_handle(self):
        return self._cupid_handle

    @property
    def odps_params(self):
        return self._odps_params

    @property
    def is_terminal(self):
        return self._is_terminal

    @classmethod
    def execute(cls, ctx, op):
        """Commit the upload session when ``op`` is terminal, then emit an empty frame.

        ``op.table_name`` must have the form ``'<project>.<table>'``; any other
        number of dot-separated parts makes the unpacking raise ``ValueError``.
        """
        import pandas as pd
        from odps import ODPS
        from odps.accounts import BearerTokenAccount
        from cupid import CupidSession, context
        from cupid.io.table import CupidTableUploadSession

        if op.is_terminal:
            # Authenticate with the bearer token of the running cupid context.
            token_account = BearerTokenAccount(context().get_bearer_token())
            odps_entry = ODPS(None, None, account=token_account,
                              **op.odps_params)
            session = CupidSession(odps_entry)

            project_name, table_name = op.table_name.split('.')
            CupidTableUploadSession(
                session=session,
                table_name=table_name,
                project_name=project_name,
                handle=op.cupid_handle,
                blocks=op.blocks,
            ).commit(overwrite=op.overwrite)

        # The write has no data result; an empty frame is stored for the output.
        ctx[op.outputs[0].key] = pd.DataFrame()
class DataFrameWriteTable(DataFrameOperand, DataFrameOperandMixin):
    """Operand that writes a DataFrame into an ODPS table through cupid.

    Tiling creates one ``DataFrameWriteTableSplit`` chunk per input row chunk,
    then a tree of ``DataFrameWriteTableCommit`` chunks whose single terminal
    node commits the upload session.
    """
    _op_type_ = 123460

    _dtypes = SeriesField('dtypes')
    _odps_params = DictField('odps_params')
    _table_name = StringField('table_name')
    _partition_spec = StringField('partition_spec')
    _overwrite = BoolField('overwrite')
    _write_batch_size = Int64Field('write_batch_size')

    def __init__(self, dtypes=None, odps_params=None, table_name=None,
                 partition_spec=None, over_write=None, write_batch_size=None,
                 **kw):
        super(DataFrameWriteTable, self).__init__(
            _dtypes=dtypes, _odps_params=odps_params, _table_name=table_name,
            _partition_spec=partition_spec, _overwrite=over_write,
            _write_batch_size=write_batch_size,
            _object_type=ObjectType.dataframe, **kw)

    @property
    def retryable(self):
        return False

    @property
    def dtypes(self):
        return self._dtypes

    @property
    def odps_params(self):
        return self._odps_params

    @property
    def table_name(self):
        return self._table_name

    @property
    def partition_spec(self):
        return self._partition_spec

    @property
    def overwrite(self):
        return self._overwrite

    @property
    def write_batch_size(self):
        return self._write_batch_size

    def __call__(self, x):
        """Build the output tileable for input ``x``; its shape is all zeros."""
        shape = (0, ) * len(x.shape)
        return self.new_dataframe([x], shape=shape)

    @classmethod
    def tile(cls, op):
        """Expand ``op`` into per-chunk write operands followed by a commit tree.

        Returns the list produced by ``new_dataframes`` whose only chunk is the
        terminal commit chunk.
        """
        from odps import ODPS
        from odps.accounts import BearerTokenAccount
        from cupid import CupidSession, context
        from mars.dataframe.utils import build_concatenated_rows_frame

        bearer_token = context().get_bearer_token()
        account = BearerTokenAccount(bearer_token)

        # ODPS_PROJECT_NAME from the environment, when set, overrides the
        # project stored on the operand.
        project = os.environ.get('ODPS_PROJECT_NAME', None)
        odps_params = op.odps_params.copy()
        if project:
            odps_params['project'] = project

        # Use the overridden copy; passing op.odps_params here would silently
        # drop the project override computed above.
        o = ODPS(None, None, account=account, **odps_params)
        cupid_session = CupidSession(o)

        data_src = o.get_table(op.table_name)

        logger.debug('Start creating upload session from cupid.')
        upload_session = cupid_session.create_upload_session(data_src)
        handle = to_str(upload_session.handle)

        input_df = build_concatenated_rows_frame(op.inputs[0])

        out_chunks = []
        out_chunk_shape = (0, ) * len(input_df.shape)
        blocks = {}
        for chunk in input_df.chunks:
            # Block id: current epoch second plus a dash-free uuid4.
            block_id = str(int(time.time())) + '_' + \
                str(uuid.uuid4()).replace('-', '')
            chunk_op = DataFrameWriteTableSplit(
                dtypes=op.dtypes, table_name=op.table_name,
                partition_spec=op.partition_spec, cupid_handle=handle,
                block_id=block_id, write_batch_size=op.write_batch_size)
            out_chunk = chunk_op.new_chunk([chunk], shape=out_chunk_shape,
                                           index=chunk.index,
                                           dtypes=chunk.dtypes)
            out_chunks.append(out_chunk)
            blocks[block_id] = op.partition_spec

        # build commit tree
        combine_size = 8
        chunks = out_chunks
        # ">=" (not ">") so that exactly ``combine_size`` chunks are also
        # combined; otherwise the assertion below fails for that count.
        while len(chunks) >= combine_size:
            new_chunks = []
            for i in range(0, len(chunks), combine_size):
                chks = chunks[i:i + combine_size]
                if len(chks) == 1:
                    chk = chks[0]
                else:
                    chk_op = DataFrameWriteTableCommit(dtypes=op.dtypes,
                                                       is_terminal=False)
                    chk = chk_op.new_chunk(chks, shape=out_chunk_shape,
                                           dtypes=op.dtypes)
                new_chunks.append(chk)
            chunks = new_chunks

        assert len(chunks) < combine_size

        commit_table_op = DataFrameWriteTableCommit(
            dtypes=op.dtypes, table_name=op.table_name, blocks=blocks,
            cupid_handle=handle, overwrite=op.overwrite,
            odps_params=op.odps_params, is_terminal=True)
        commit_table_chunk = commit_table_op.new_chunk(
            chunks, shape=out_chunk_shape, dtypes=op.dtypes)

        out_df = op.outputs[0]
        new_op = op.copy()
        return new_op.new_dataframes(op.inputs, shape=out_df.shape,
                                     dtypes=out_df.dtypes,
                                     chunks=[commit_table_chunk],
                                     nsplits=((0, ), ) * len(out_chunk_shape))
class DataFrameWriteTableSplit(DataFrameOperand, DataFrameOperandMixin):
    """Write a single input chunk into an ODPS table.

    When ``cupid_handle`` is set the chunk is written as one cupid block;
    otherwise it is uploaded through a table tunnel session created from
    ``odps_params``. Either way an empty ``pandas.DataFrame`` is stored as the
    result.
    """
    _op_type_ = 123461

    _dtypes = SeriesField('dtypes')
    _table_name = StringField('table_name')
    _partition_spec = StringField('partition_spec')
    _cupid_handle = StringField('cupid_handle')
    _block_id = StringField('block_id')
    _write_batch_size = Int64Field('write_batch_size')
    _unknown_as_string = BoolField('unknown_as_string')

    # for tunnel
    _odps_params = DictField('odps_params')

    def __init__(self, dtypes=None, table_name=None, odps_params=None,
                 partition_spec=None, cupid_handle=None,
                 unknown_as_string=None, block_id=None, write_batch_size=None,
                 **kw):
        kw.update(_output_type_kw)
        super(DataFrameWriteTableSplit, self).__init__(
            _dtypes=dtypes, _table_name=table_name, _odps_params=odps_params,
            _partition_spec=partition_spec,
            _unknown_as_string=unknown_as_string, _cupid_handle=cupid_handle,
            _block_id=block_id, _write_batch_size=write_batch_size, **kw)

    @property
    def retryable(self):
        return False

    @property
    def dtypes(self):
        return self._dtypes

    @property
    def table_name(self):
        return self._table_name

    @property
    def odps_params(self):
        return self._odps_params

    @property
    def unknown_as_string(self):
        return self._unknown_as_string

    @property
    def partition_spec(self):
        return self._partition_spec

    @property
    def cupid_handle(self):
        return self._cupid_handle

    @property
    def block_id(self):
        return self._block_id

    @property
    def write_batch_size(self):
        return self._write_batch_size

    @classmethod
    def _execute_in_cupid(cls, ctx, op):
        """Write the input chunk as one cupid block, in Arrow record batches.

        ``op.table_name`` must have the form ``'<project>.<table>'``. Rows are
        written ``op.write_batch_size`` at a time (1024 when unset).
        """
        import pyarrow as pa
        import pandas as pd
        from ....df.backends.pd.types import pd_to_df_schema
        from cupid.io.table.core import BlockWriter

        to_store_data = ctx[op.inputs[0].key]
        odps_schema = pd_to_df_schema(
            to_store_data, unknown_as_string=op.unknown_as_string)
        project_name, table_name = op.table_name.split('.')
        block_writer = BlockWriter(_table_name=table_name,
                                   _project_name=project_name,
                                   _table_schema=odps_schema,
                                   _partition_spec=op.partition_spec,
                                   _block_id=op.block_id,
                                   _handle=op.cupid_handle)
        logger.debug('Start writing table block, block id: %s', op.block_id)
        with block_writer.open_arrow_writer() as cupid_writer:
            sink = pa.BufferOutputStream()
            batch_size = op.write_batch_size or 1024

            def _load_batch(idx):
                # Run the conversion helper on every batch, not only the first
                # one, so all record batches are normalized the same way.
                rows = to_store_data[batch_size * idx:batch_size * (idx + 1)]
                return convert_pandas_object_to_string(rows)

            batch_idx = 0
            batch_data = _load_batch(batch_idx)
            # The stream schema is taken from the first row of the input.
            schema = pa.RecordBatch.from_pandas(to_store_data[:1],
                                                preserve_index=False).schema
            arrow_writer = pa.RecordBatchStreamWriter(sink, schema)
            while len(batch_data) > 0:
                batch = pa.RecordBatch.from_pandas(batch_data,
                                                   preserve_index=False)
                arrow_writer.write_batch(batch)
                batch_idx += 1
                batch_data = _load_batch(batch_idx)
            arrow_writer.close()
            cupid_writer.write(sink.getvalue())
        logger.debug('Write table block finished, block id: %s', op.block_id)

        block_writer.commit()
        ctx[op.outputs[0].key] = pd.DataFrame()

    @classmethod
    def _execute_arrow_tunnel(cls, ctx, op):
        """Upload the input chunk through its own tunnel upload session.

        The ``ODPS_PROJECT_NAME`` and ``ODPS_RUNTIME_ENDPOINT`` environment
        variables, when set, take precedence over the values in
        ``op.odps_params``.
        """
        from odps import ODPS
        from odps.tunnel import TableTunnel
        import pyarrow as pa
        import pandas as pd

        project = os.environ.get('ODPS_PROJECT_NAME', None)
        odps_params = op.odps_params.copy()
        if project:
            odps_params['project'] = project
        endpoint = os.environ.get(
            'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
        o = ODPS(odps_params['access_id'], odps_params['secret_access_key'],
                 project=odps_params['project'], endpoint=endpoint)

        t = o.get_table(op.table_name)
        tunnel = TableTunnel(o, project=t.project)
        if op.partition_spec is not None:
            upload_session = tunnel.create_upload_session(
                t.name, partition_spec=op.partition_spec)
        else:
            upload_session = tunnel.create_upload_session(t.name)

        logger.debug('Start writing table %s split index: %s',
                     op.table_name, op.inputs[0].index)
        # Each chunk owns its upload session, so a fixed block id 0 is used.
        writer = upload_session.open_arrow_writer(0)
        try:
            # NOTE(review): unlike the cupid path this conversion does not pass
            # preserve_index=False - confirm that is intended.
            arrow_rb = pa.RecordBatch.from_pandas(ctx[op.inputs[0].key])
            writer.write(arrow_rb)
        finally:
            # Always release the writer, even when conversion or write fails.
            writer.close()
        upload_session.commit([0])
        logger.debug('Finish writing table %s split index: %s',
                     op.table_name, op.inputs[0].index)
        ctx[op.outputs[0].key] = pd.DataFrame()

    @classmethod
    def execute(cls, ctx, op):
        """Dispatch to the cupid writer when a handle is present, else the tunnel."""
        if op.cupid_handle is not None:
            cls._execute_in_cupid(ctx, op)
        else:
            cls._execute_arrow_tunnel(ctx, op)
class DataFrameWriteTable(DataFrameOperand, DataFrameOperandMixin):
    """Operand that writes a DataFrame into an ODPS table.

    Tiling goes through cupid when ``RuntimeContext.is_context_ready()`` is
    true, and through per-chunk tunnel uploads otherwise.
    """
    _op_type_ = 123460

    _dtypes = SeriesField('dtypes')
    _odps_params = DictField('odps_params')
    _table_name = StringField('table_name')
    _partition_spec = StringField('partition_spec')
    _overwrite = BoolField('overwrite')
    _write_batch_size = Int64Field('write_batch_size')
    _unknown_as_string = BoolField('unknown_as_string')

    def __init__(self, dtypes=None, odps_params=None, table_name=None,
                 partition_spec=None, unknown_as_string=None, over_write=None,
                 write_batch_size=None, **kw):
        kw.update(_output_type_kw)
        super(DataFrameWriteTable, self).__init__(
            _dtypes=dtypes,
            _odps_params=odps_params,
            _table_name=table_name,
            _partition_spec=partition_spec,
            _unknown_as_string=unknown_as_string,
            _overwrite=over_write,
            _write_batch_size=write_batch_size,
            **kw)

    @property
    def retryable(self):
        return False

    @property
    def dtypes(self):
        return self._dtypes

    @property
    def unknown_as_string(self):
        return self._unknown_as_string

    @property
    def odps_params(self):
        return self._odps_params

    @property
    def table_name(self):
        return self._table_name

    @property
    def partition_spec(self):
        return self._partition_spec

    @property
    def overwrite(self):
        return self._overwrite

    @property
    def write_batch_size(self):
        return self._write_batch_size

    def __call__(self, x):
        """Create the output tileable for ``x``: zero-sized shape, empty index, columns and dtypes."""
        empty_shape = (0, ) * len(x.shape)
        empty_index = parse_index(x.index_value.to_pandas()[:0], x.key, 'index')
        empty_columns = parse_index(x.columns_value.to_pandas()[:0], x.key,
                                    'columns', store_data=True)
        return self.new_dataframe([x],
                                  shape=empty_shape,
                                  dtypes=x.dtypes[:0],
                                  index_value=empty_index,
                                  columns_value=empty_columns)

    @classmethod
    def _tile_cupid(cls, op):
        """Tile into cupid block writers followed by a commit tree.

        Raises ``SystemError`` when the cupid runtime context is not ready.
        """
        from odps import ODPS
        from odps.accounts import BearerTokenAccount
        from cupid import CupidSession, context
        from cupid.runtime import RuntimeContext

        if not RuntimeContext.is_context_ready():
            raise SystemError(
                'No Mars cluster found, please create via `o.create_mars_cluster`.'
            )
        cupid_ctx = context()

        token_account = BearerTokenAccount(cupid_ctx.get_bearer_token())

        # Environment values, when present, win over the operand's parameters.
        env_project = os.environ.get('ODPS_PROJECT_NAME', None)
        odps_params = op.odps_params.copy()
        if env_project:
            odps_params['project'] = env_project
        endpoint = os.environ.get(
            'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
        o = ODPS(None, None, account=token_account,
                 project=odps_params['project'], endpoint=endpoint)
        cupid_session = CupidSession(o)

        data_src = o.get_table(op.table_name)

        logger.debug('Start creating upload session from cupid.')
        upload_session = cupid_session.create_upload_session(data_src)

        input_df = build_concatenated_rows_frame(op.inputs[0])
        out_df = op.outputs[0]

        write_chunks = []
        out_chunk_shape = (0, ) * len(input_df.shape)
        blocks = {}
        for in_chunk in input_df.chunks:
            # Block id: current epoch second plus a dash-free uuid4.
            unique_part = str(uuid.uuid4()).replace('-', '')
            block_id = str(int(time.time())) + '_' + unique_part
            split_op = DataFrameWriteTableSplit(
                dtypes=op.dtypes,
                table_name=op.table_name,
                unknown_as_string=op.unknown_as_string,
                partition_spec=op.partition_spec,
                cupid_handle=to_str(upload_session.handle),
                block_id=block_id,
                write_batch_size=op.write_batch_size)
            write_chunks.append(
                split_op.new_chunk([in_chunk],
                                   shape=out_chunk_shape,
                                   index=in_chunk.index,
                                   index_value=out_df.index_value,
                                   dtypes=in_chunk.dtypes))
            blocks[block_id] = op.partition_spec

        # build commit tree
        combine_size = 8
        level = write_chunks
        while len(level) >= combine_size:
            next_level = []
            for start in range(0, len(level), combine_size):
                group = level[start:start + combine_size]
                if len(group) == 1:
                    # A lone leftover chunk is carried up unchanged.
                    next_level.append(group[0])
                    continue
                merge_op = DataFrameWriteTableCommit(dtypes=op.dtypes,
                                                     is_terminal=False)
                next_level.append(
                    merge_op.new_chunk(group,
                                       shape=out_chunk_shape,
                                       index_value=out_df.index_value,
                                       dtypes=op.dtypes))
            level = next_level

        assert len(level) < combine_size

        commit_table_op = DataFrameWriteTableCommit(
            dtypes=op.dtypes,
            table_name=op.table_name,
            blocks=blocks,
            cupid_handle=to_str(upload_session.handle),
            overwrite=op.overwrite,
            odps_params=op.odps_params,
            is_terminal=True)
        commit_table_chunk = commit_table_op.new_chunk(
            level,
            shape=out_chunk_shape,
            dtypes=op.dtypes,
            index_value=out_df.index_value)

        new_op = op.copy()
        return new_op.new_dataframes(
            op.inputs,
            shape=out_df.shape,
            index_value=out_df.index_value,
            dtypes=out_df.dtypes,
            columns_value=out_df.columns_value,
            chunks=[commit_table_chunk],
            nsplits=((0, ), ) * len(out_chunk_shape))

    @classmethod
    def _tile_tunnel(cls, op):
        """Tile into one tunnel-based split writer per input row chunk."""
        out_df = op.outputs[0]
        in_df = build_concatenated_rows_frame(op.inputs[0])

        out_chunks = []
        for in_chunk in in_df.chunks:
            split_op = DataFrameWriteTableSplit(
                dtypes=op.dtypes,
                table_name=op.table_name,
                odps_params=op.odps_params,
                partition_spec=op.partition_spec)
            empty_index = parse_index(in_chunk.index_value.to_pandas()[:0],
                                      in_chunk)
            out_chunks.append(
                split_op.new_chunk([in_chunk],
                                   shape=(0, 0),
                                   index_value=empty_index,
                                   columns_value=out_df.columns_value,
                                   dtypes=out_df.dtypes,
                                   index=in_chunk.index))

        new_op = op.copy()
        params = out_df.params.copy()
        params['chunks'] = out_chunks
        params['nsplits'] = ((0, ) * in_df.chunk_shape[0], (0, ))
        return new_op.new_tileables([in_df], **params)

    @classmethod
    def tile(cls, op):
        """Pick the cupid or the tunnel tiling depending on the runtime context."""
        from cupid.runtime import RuntimeContext

        if RuntimeContext.is_context_ready():
            tiler = cls._tile_cupid
        else:
            tiler = cls._tile_tunnel
        return tiler(op)
class TensorStoreCOO(TensorDataStore):
    """Store a tensor as COO-style parquet files, one file per chunk.

    Each file holds one column per entry of ``dim_cols`` with the element
    coordinates inside the chunk, optional ``global_<col>`` columns with the
    chunk's axis offset added, and ``value_col`` with the element values.
    """
    _op_type_ = OperandDef.STORE_COO

    _input = KeyField('input')
    _path = StringField('path')
    _dim_cols = ListField('dim_cols', ValueType.string)
    _value_col = StringField('value_col')
    _storage_options = StringField('storage_options')
    _global_index = BoolField('global_index', default=False)
    _axis_offsets = TupleField('axis_offsets')

    def __init__(self, dtype=None, path=None, dim_cols=None, value_col=None,
                 storage_options=None, sparse=True, global_index=False, **kw):
        super(TensorStoreCOO, self).__init__(
            _path=path, _dim_cols=dim_cols, _value_col=value_col,
            _dtype=dtype, _storage_options=storage_options,
            _global_index=global_index, _sparse=sparse, **kw)

    @property
    def input(self):
        return self._input

    @property
    def path(self):
        return self._path

    @property
    def dim_cols(self):
        return self._dim_cols

    @property
    def value_col(self):
        return self._value_col

    @property
    def storage_options(self):
        return self._storage_options

    @property
    def global_index(self):
        return self._global_index

    @property
    def axis_offsets(self):
        return self._axis_offsets

    def _set_inputs(self, inputs):
        super(TensorStoreCOO, self)._set_inputs(inputs)
        self._input = self._inputs[0]

    def calc_shape(self, *inputs_shape):
        """Return an all-zero shape with the same rank as the first input."""
        return (0,) * len(inputs_shape[0])

    @classmethod
    def tile(cls, op):
        """Create one store chunk per input chunk.

        Each chunk operand gets its own ``<path>/<i,j,...>.parquet`` target and
        the start offset of the chunk along every axis.
        """
        in_tensor = op.input

        out_chunks = []
        out_chunk_shape = (0,) * in_tensor.ndim
        # Start offset of each chunk along every axis: [0, n0, n0 + n1, ...].
        axis_offsets = [[0] + np.cumsum(ns)[:-1].tolist()
                        for ns in in_tensor.nsplits]
        for chunk in in_tensor.chunks:
            chunk_op = op.copy().reset_key()
            chunk_path = '%s/%s.parquet' % (
                chunk_op.path, ','.join(str(j) for j in chunk.index))
            chunk_op._path = chunk_path
            chunk_op._axis_offsets = tuple(
                axis_offsets[axis][idx]
                for axis, idx in enumerate(chunk.index))
            out_chunk = chunk_op.new_chunk([chunk], shape=out_chunk_shape,
                                           index=chunk.index)
            out_chunks.append(out_chunk)

        new_op = op.copy()
        # Materialize nsplits as a tuple: a generator can be consumed only
        # once and supports neither len() nor indexing.
        nsplits = tuple((0,) * len(ns) for ns in in_tensor.nsplits)
        return new_op.new_tensors(op.inputs, op.outputs[0].shape,
                                  chunks=out_chunks, nsplits=nsplits)

    @classmethod
    def execute(cls, ctx, op):
        """Write the input chunk to ``op.path`` as parquet and emit an empty array."""
        import numpy as np
        import pandas as pd
        import pyarrow as pa
        import pyarrow.parquet as pq
        from ..io import open as fs_open

        to_store_data = ctx[op.inputs[0].key]
        # storage_options defaults to None in __init__; treat unset as "no
        # options" rather than failing inside json.loads.
        storage_opts = json.loads(op.storage_options) \
            if op.storage_options else {}
        axis_offsets = op.axis_offsets
        store_global_index = op.global_index
        dim_cols = op.dim_cols
        col_to_array = {}

        if isinstance(to_store_data, SparseNDArray):
            # sparse, convert to coo matrix
            matrix = to_store_data.raw.tocoo(copy=False)
            ndim = matrix.ndim

            if len(dim_cols) > 1:
                col_to_array[dim_cols[0]] = matrix.row
                if store_global_index:
                    # global index
                    col_to_array['global_' + dim_cols[0]] = \
                        matrix.row + axis_offsets[0]
                col_to_array[dim_cols[1]] = matrix.col
                if store_global_index:
                    col_to_array['global_' + dim_cols[1]] = \
                        matrix.col + axis_offsets[1]
            else:
                col_to_array[dim_cols[0]] = matrix.col
                if store_global_index:
                    col_to_array['global_' + dim_cols[0]] = \
                        matrix.col + axis_offsets[0]
            col_to_array[op.value_col] = matrix.data
        else:
            # dense, convert to numpy array
            arr = as_np_array(to_store_data)
            ndim = arr.ndim
            # Row k of ``index`` holds the axis-k coordinate of every element
            # in the same row-major order as ``arr.ravel()``. The earlier
            # meshgrid/transpose construction only produced that order for
            # arrays with at most two dimensions.
            index = np.indices(arr.shape).reshape(arr.ndim, -1)
            for j, (col, ind) in enumerate(zip(dim_cols, index)):
                col_to_array[col] = ind
                if store_global_index:
                    col_to_array['global_' + col] = ind + axis_offsets[j]
            col_to_array[op.value_col] = arr.ravel()

        df = pd.DataFrame(col_to_array)
        # Dimension columns beyond the data's rank are stored as nulls.
        if len(op.dim_cols) > ndim:
            for col in op.dim_cols[ndim:]:
                df[col] = None

        table = pa.Table.from_pandas(df)
        bio = BytesIO()
        pq.write_table(table, bio)
        bio.seek(0)

        # write oss
        with fs_open(op.path, 'wb', **storage_opts) as out_file:
            out_file.write(bio.read())

        ctx[op.outputs[0].key] = np.empty((0,) * to_store_data.ndim)
class DataFrameWhere(DataFrameOperand, DataFrameOperandMixin):
    """Operand behind ``where`` / ``mask`` on DataFrames and Series.

    ``replace_true`` selects ``mask`` at execution time; otherwise ``where``
    is used. ``cond`` and ``other`` may be plain values or tileables; when they
    are tileables they are also inputs of the operand.
    """
    _op_type_ = opcodes.WHERE

    _input = AnyField('input')
    _cond = AnyField('cond')
    _other = AnyField('other')
    _axis = Int32Field('axis')
    _level = AnyField('level')
    _errors = StringField('errors')
    _try_cast = BoolField('try_cast')
    _replace_true = BoolField('replace_true')

    def __init__(self, input=None, cond=None, other=None,  # pylint: disable=redefined-builtin
                 axis=None, level=None, errors=None, try_cast=None,
                 replace_true=None, **kw):
        super().__init__(_input=input, _cond=cond, _other=other, _axis=axis,
                         _level=level, _errors=errors, _try_cast=try_cast,
                         _replace_true=replace_true, **kw)

    @property
    def input(self):
        return self._input

    @property
    def cond(self):
        return self._cond

    @property
    def other(self):
        return self._other

    @property
    def axis(self):
        return self._axis

    @property
    def level(self):
        return self._level

    @property
    def errors(self):
        return self._errors

    @property
    def try_cast(self):
        return self._try_cast

    @property
    def replace_true(self):
        return self._replace_true

    def __call__(self, df_or_series):
        """Build the output tileable for ``df_or_series``.

        Raises ``NotImplementedError`` when ``cond`` or ``other`` is a tileable
        whose index (or columns) key differs from that of ``df_or_series``.
        Result dtypes are inferred by running pandas ``where`` on mock objects.
        """
        def _check_input_index(obj, axis=None):
            axis = axis if axis is not None else self.axis
            if isinstance(obj, DATAFRAME_TYPE):
                mismatched = (
                    df_or_series.columns_value.key != obj.columns_value.key
                    or df_or_series.index_value.key != obj.index_value.key
                )
            elif isinstance(obj, SERIES_TYPE):
                mismatched = \
                    df_or_series.axes[axis].index_value.key != obj.index_value.key
            else:
                mismatched = False
            if mismatched:
                raise NotImplementedError('Aligning different indices not supported')

        _check_input_index(self.cond, axis=0)
        _check_input_index(self.other)

        is_frame = isinstance(df_or_series, DATAFRAME_TYPE)
        mock_obj = build_df(df_or_series) if is_frame \
            else build_series(df_or_series)

        other = self.other
        if isinstance(other, (pd.DataFrame, DATAFRAME_TYPE)):
            mock_other = build_df(other)
        elif isinstance(other, (pd.Series, SERIES_TYPE)):
            mock_other = build_series(other)
        else:
            mock_other = other

        # An all-False condition is enough to obtain the result dtypes.
        all_false = np.zeros(mock_obj.shape).astype(bool)
        result_df = mock_obj.where(all_false, other=mock_other,
                                   axis=self.axis, level=self.level,
                                   errors=self.errors, try_cast=self.try_cast)

        inputs = filter_inputs([df_or_series, self.cond, self.other])
        if is_frame:
            return self.new_dataframe(inputs, shape=df_or_series.shape,
                                      dtypes=result_df.dtypes,
                                      index_value=df_or_series.index_value,
                                      columns_value=df_or_series.columns_value)
        return self.new_series(inputs, shape=df_or_series.shape,
                               name=df_or_series.name, dtype=result_df.dtype,
                               index_value=df_or_series.index_value)

    def _set_inputs(self, inputs):
        super()._set_inputs(inputs)
        remaining = iter(self._inputs)
        self._input = next(remaining)
        # cond / other consume further inputs only when they are tileables.
        if isinstance(self._cond, (Base, Entity)):
            self._cond = next(remaining)
        if isinstance(self._other, (Base, Entity)):
            self._other = next(remaining)

    @classmethod
    def tile(cls, op: "DataFrameWhere"):
        """Rechunk ``cond`` / ``other`` like the input and emit one chunk op per input chunk."""
        def rechunk_input(inp, axis=None):
            axis = axis if axis is not None else op.axis
            if isinstance(inp, DATAFRAME_TYPE):
                return inp.rechunk(op.input.nsplits)._inplace_tile()
            if isinstance(inp, SERIES_TYPE):
                return inp.rechunk({0: op.input.nsplits[axis]})._inplace_tile()
            return inp

        def get_tiled_chunk(obj, index, axis=None):
            if isinstance(obj, DATAFRAME_TYPE):
                return obj.cix[(index[0], index[1])]
            if isinstance(obj, SERIES_TYPE):
                axis = axis if axis is not None else op.axis
                return obj.cix[(index[axis],)]
            return obj

        # TODO support axis alignment for three objects
        cond = rechunk_input(op.cond, axis=0)
        other = rechunk_input(op.other)

        chunks = []
        for in_chunk in op.input.chunks:
            cond_chunk = get_tiled_chunk(cond, in_chunk.index, axis=0)
            other_chunk = get_tiled_chunk(other, in_chunk.index)

            chunk_op = op.copy().reset_key()
            chunk_op._cond = cond_chunk
            chunk_op._other = other_chunk
            chunk_inputs = filter_inputs([in_chunk, cond_chunk, other_chunk])
            chunks.append(chunk_op.new_chunk(chunk_inputs, **in_chunk.params))

        new_op = op.copy().reset_key()
        return new_op.new_tileables(op.inputs, chunks=chunks,
                                    nsplits=op.input.nsplits,
                                    **op.input.params)

    @classmethod
    def execute(cls, ctx, op: "DataFrameWhere"):
        """Resolve ``cond`` / ``other`` from ``ctx`` and run pandas ``mask`` or ``where``."""
        out_obj = op.outputs[0]
        input_data = ctx[op.input.key]

        cond = op.cond
        if isinstance(cond, (Base, Entity)):
            cond = ctx[cond.key]

        other = op.other
        if isinstance(other, (Base, Entity)):
            other = ctx[other.key]

        apply = input_data.mask if op.replace_true else input_data.where
        ctx[out_obj.key] = apply(cond, other, axis=op.axis, level=op.level,
                                 errors=op.errors, try_cast=op.try_cast)
class DataFrameReadTable(DataFrameOperand, DataFrameOperandMixin):
    """Operand that reads an ODPS table (or one partition) through cupid.

    Tiling opens a cupid download session and creates one
    ``DataFrameReadTableSplit`` chunk per split of that session.
    """
    _op_type_ = 123450

    _odps_params = DictField('odps_params')
    _table_name = StringField('table_name')
    _partition_spec = StringField('partition_spec')
    _dtypes = SeriesField('dtypes')
    _add_offset = BoolField('add_offset')
    _columns = ListField('columns')

    def __init__(self, odps_params=None, table_name=None, partition_spec=None,
                 columns=None, dtypes=None, sparse=None, add_offset=True,
                 **kw):
        super(DataFrameReadTable, self).__init__(
            _odps_params=odps_params,
            _table_name=table_name,
            _partition_spec=partition_spec,
            _columns=columns,
            _dtypes=dtypes,
            _sparse=sparse,
            _add_offset=add_offset,
            _object_type=ObjectType.dataframe,
            **kw)

    @property
    def retryable(self):
        return False

    @property
    def odps_params(self):
        return self._odps_params

    @property
    def table_name(self):
        return self._table_name

    @property
    def partition(self):
        # Falls back to None when the field was never set.
        return getattr(self, '_partition_spec', None)

    @property
    def columns(self):
        return self._columns

    @property
    def dtypes(self):
        return self._dtypes

    @property
    def add_offset(self):
        return self._add_offset

    def __call__(self, shape, chunk_bytes=None):
        """Create the output DataFrame tileable of the given ``shape``.

        A NaN row count yields an empty ``RangeIndex``; otherwise the index is
        ``RangeIndex(shape[0])``.
        """
        import numpy as np
        import pandas as pd

        row_count = 0 if np.isnan(shape[0]) else shape[0]
        index_value = parse_index(pd.RangeIndex(row_count))
        columns_value = parse_index(self.dtypes.index, store_data=True)
        return self.new_dataframe(None, shape,
                                  dtypes=self.dtypes,
                                  index_value=index_value,
                                  columns_value=columns_value,
                                  chunk_bytes=chunk_bytes)

    @classmethod
    def tile(cls, op):
        """Split the table into cupid download splits, one output chunk each.

        While creating the download session raises ``CupidError`` the split
        size is doubled and the call retried; once the size has reached
        ``MAX_CHUNK_SIZE`` the error is re-raised.
        """
        import numpy as np
        import pandas as pd
        from odps import ODPS
        from odps.accounts import BearerTokenAccount
        from cupid import CupidSession, context

        token_account = BearerTokenAccount(context().get_bearer_token())

        # ODPS_PROJECT_NAME from the environment, when set, overrides the
        # project stored on the operand.
        env_project = os.environ.get('ODPS_PROJECT_NAME', None)
        odps_params = op.odps_params.copy()
        if env_project:
            odps_params['project'] = env_project
        o = ODPS(None, None, account=token_account, **odps_params)
        cupid_session = CupidSession(o)

        df = op.outputs[0]
        split_size = df.extra_params.chunk_bytes or CHUNK_LIMIT

        data_src = o.get_table(op.table_name)
        if op.partition is not None:
            data_src = data_src.get_partition(op.partition)

        logger.debug('Start creating download session from cupid.')
        while True:
            try:
                download_session = cupid_session.create_download_session(
                    data_src, split_size=split_size, columns=op.columns)
            except CupidError:
                logger.debug(
                    'The number of splits exceeds 100000, split_size is {}'.
                    format(split_size))
                if split_size >= MAX_CHUNK_SIZE:
                    raise
                split_size *= 2
            else:
                break

        logger.debug('%s table splits have been created.',
                     str(len(download_session.splits)))

        out_chunks = []
        # Ignore add_offset at this time.
        op._add_offset = False

        for position, split in enumerate(download_session.splits):
            split_op = DataFrameReadTableSplit(
                cupid_handle=to_str(split.handle),
                split_index=split.split_index,
                split_file_start=split.split_file_start,
                split_file_end=split.split_file_end,
                schema_file_start=split.schema_file_start,
                schema_file_end=split.schema_file_end,
                add_offset=op.add_offset,
                dtypes=op.dtypes,
                sparse=op.sparse)
            # the chunk shape is unknown
            chunk_index_value = parse_index(pd.RangeIndex(0))
            chunk_columns_value = parse_index(df.dtypes.index, store_data=True)
            out_chunks.append(
                split_op.new_chunk(None,
                                   shape=(np.nan, df.shape[1]),
                                   dtypes=op.dtypes,
                                   index_value=chunk_index_value,
                                   columns_value=chunk_columns_value,
                                   index=(position, 0)))

        if op.add_offset:
            out_chunks = standardize_range_index(out_chunks)

        new_op = op.copy()
        nsplits = ((np.nan, ) * len(out_chunks), (df.shape[1], ))
        return new_op.new_dataframes(None,
                                     shape=df.shape,
                                     dtypes=op.dtypes,
                                     index_value=df.index_value,
                                     columns_value=df.columns_value,
                                     chunks=out_chunks,
                                     nsplits=nsplits)
class DataFrameWriteTableSplit(DataFrameOperand, DataFrameOperandMixin):
    """Write a single input chunk into an ODPS table as one cupid block.

    The chunk is serialized as an Arrow record-batch stream and handed to a
    cupid ``BlockWriter``; an empty ``pandas.DataFrame`` is stored as the
    result.
    """
    _op_type_ = 123461

    _dtypes = SeriesField('dtypes')
    _table_name = StringField('table_name')
    _partition_spec = StringField('partition_spec')
    _cupid_handle = StringField('cupid_handle')
    _block_id = StringField('block_id')
    _write_batch_size = Int64Field('write_batch_size')
    _unknown_as_string = BoolField('unknown_as_string')

    def __init__(self, dtypes=None, table_name=None, partition_spec=None,
                 cupid_handle=None, unknown_as_string=None, block_id=None,
                 write_batch_size=None, **kw):
        kw.update(_output_type_kw)
        super(DataFrameWriteTableSplit, self).__init__(
            _dtypes=dtypes, _table_name=table_name,
            _partition_spec=partition_spec,
            _unknown_as_string=unknown_as_string, _cupid_handle=cupid_handle,
            _block_id=block_id, _write_batch_size=write_batch_size, **kw)

    @property
    def retryable(self):
        return False

    @property
    def dtypes(self):
        return self._dtypes

    @property
    def table_name(self):
        return self._table_name

    @property
    def unknown_as_string(self):
        return self._unknown_as_string

    @property
    def partition_spec(self):
        return self._partition_spec

    @property
    def cupid_handle(self):
        return self._cupid_handle

    @property
    def block_id(self):
        return self._block_id

    @property
    def write_batch_size(self):
        return self._write_batch_size

    @classmethod
    def execute(cls, ctx, op):
        """Write the input chunk as one cupid block, in Arrow record batches.

        ``op.table_name`` must have the form ``'<project>.<table>'``. Rows are
        written ``op.write_batch_size`` at a time (1024 when unset).
        """
        import pyarrow as pa
        import pandas as pd
        from ...df.backends.pd.types import pd_to_df_schema
        from cupid.io.table.core import BlockWriter

        to_store_data = ctx[op.inputs[0].key]
        odps_schema = pd_to_df_schema(
            to_store_data, unknown_as_string=op.unknown_as_string)
        project_name, table_name = op.table_name.split('.')
        block_writer = BlockWriter(_table_name=table_name,
                                   _project_name=project_name,
                                   _table_schema=odps_schema,
                                   _partition_spec=op.partition_spec,
                                   _block_id=op.block_id,
                                   _handle=op.cupid_handle)
        logger.debug('Start writing table block, block id: %s', op.block_id)
        with block_writer.open_arrow_writer() as cupid_writer:
            sink = pa.BufferOutputStream()
            batch_size = op.write_batch_size or 1024

            def _load_batch(idx):
                # Run the conversion helper on every batch, not only the first
                # one, so all record batches are normalized the same way.
                rows = to_store_data[batch_size * idx:batch_size * (idx + 1)]
                return convert_pandas_object_to_string(rows)

            batch_idx = 0
            batch_data = _load_batch(batch_idx)
            # The stream schema is taken from the first row of the input.
            schema = pa.RecordBatch.from_pandas(to_store_data[:1],
                                                preserve_index=False).schema
            arrow_writer = pa.RecordBatchStreamWriter(sink, schema)
            while len(batch_data) > 0:
                batch = pa.RecordBatch.from_pandas(batch_data,
                                                   preserve_index=False)
                arrow_writer.write_batch(batch)
                batch_idx += 1
                batch_data = _load_batch(batch_idx)
            arrow_writer.close()
            cupid_writer.write(sink.getvalue())
        logger.debug('Write table block finished, block id: %s', op.block_id)

        block_writer.commit()
        ctx[op.outputs[0].key] = pd.DataFrame()
class DataFrameReadTable(DataFrameOperand, DataFrameOperandMixin):
    """DataFrame source operand that reads an ODPS table through cupid.

    ``tile`` opens a cupid download session for the table (or partition) and
    emits one ``DataFrameReadTableSplit`` chunk per split.
    """
    _op_type_ = 123450

    _odps_params = DictField('odps_params')
    _table_name = StringField('table_name')
    _partition_spec = StringField('partition_spec')
    _dtypes = SeriesField('dtypes')
    _add_offset = BoolField('add_offset')
    _columns = ListField('columns')
    _nrows = Int64Field('nrows')
    _use_arrow_dtype = BoolField('use_arrow_dtype')

    def __init__(self, odps_params=None, table_name=None, partition_spec=None,
                 columns=None, dtypes=None, nrows=None, sparse=None,
                 add_offset=True, use_arrow_dtype=None, **kw):
        kw.update(_output_type_kw)
        super(DataFrameReadTable, self).__init__(
            _odps_params=odps_params,
            _table_name=table_name,
            _partition_spec=partition_spec,
            _columns=columns,
            _dtypes=dtypes,
            _nrows=nrows,
            _sparse=sparse,
            _use_arrow_dtype=use_arrow_dtype,
            _add_offset=add_offset,
            **kw)

    # ---- read-only accessors for the serialized fields ----

    @property
    def retryable(self):
        return False

    @property
    def odps_params(self):
        return self._odps_params

    @property
    def table_name(self):
        return self._table_name

    @property
    def partition(self):
        # the field may be unset, hence getattr with a default
        return getattr(self, '_partition_spec', None)

    @property
    def columns(self):
        return self._columns

    @property
    def dtypes(self):
        return self._dtypes

    @property
    def nrows(self):
        return self._nrows

    @property
    def use_arrow_dtype(self):
        return self._use_arrow_dtype

    @property
    def add_offset(self):
        return self._add_offset

    def __call__(self, shape, chunk_bytes=None):
        """Create the output DataFrame of the given ``shape``.

        An unknown (NaN) row count yields an empty ``RangeIndex``.
        """
        import numpy as np
        import pandas as pd

        row_count = 0 if np.isnan(shape[0]) else shape[0]
        index_value = parse_index(pd.RangeIndex(row_count))
        columns_value = parse_index(self.dtypes.index, store_data=True)
        return self.new_dataframe(None, shape,
                                  dtypes=self.dtypes,
                                  index_value=index_value,
                                  columns_value=columns_value,
                                  chunk_bytes=chunk_bytes)

    @classmethod
    def tile(cls, op):
        """Split the table read into one chunk per cupid table split."""
        import numpy as np
        import pandas as pd
        from odps import ODPS
        from odps.accounts import BearerTokenAccount
        from cupid import CupidSession, context
        from mars.context import get_context

        cupid_ctx = context()
        if cupid_ctx is None:
            raise SystemError(
                'No Mars cluster found, please create via `o.create_mars_cluster`.'
            )

        account = BearerTokenAccount(cupid_ctx.get_bearer_token())
        env_project = os.environ.get('ODPS_PROJECT_NAME', None)
        odps_params = op.odps_params.copy()
        if env_project:
            # the environment overrides the project recorded on the operand
            odps_params['project'] = env_project
        odps_entry = ODPS(None, None, account=account, **odps_params)
        cupid_session = CupidSession(odps_entry)

        mars_context = get_context()
        out_df = op.outputs[0]
        n_columns = out_df.shape[1]
        split_size = out_df.extra_params.chunk_bytes or READ_CHUNK_LIMIT

        data_src = odps_entry.get_table(op.table_name)
        if op.partition is not None:
            data_src = data_src.get_partition(op.partition)

        try:
            data_store_size = data_src.size
        except ODPSError:
            # fail to get data size, just ignore
            pass
        else:
            if data_store_size < split_size and mars_context is not None:
                # data is too small: spread it over the workers, but keep
                # every split at 1M or more
                worker_count = max(
                    len(mars_context.get_worker_addresses()), 1)
                split_size = max(data_store_size // worker_count,
                                 1 * 1024**2)
                logger.debug(
                    'Input data size is too small, split_size is {}'.format(
                        split_size))

        logger.debug(
            'Start creating download session of table {} from cupid.'.format(
                op.table_name))
        while True:
            try:
                download_session = cupid_session.create_download_session(
                    data_src, split_size=split_size, columns=op.columns)
            except CupidError:
                logger.debug(
                    'The number of splits exceeds 100000, split_size is {}'.
                    format(split_size))
                if split_size >= MAX_CHUNK_SIZE:
                    raise
                # retry with a coarser split
                split_size *= 2
            else:
                break

        splits = download_session.splits
        logger.debug('%s table splits have been created.', str(len(splits)))

        if np.isnan(out_df.shape[0]):
            est_chunk_rows = [None] * len(splits)
        else:
            # distribute the known row count proportionally to split sizes
            sp_file_sizes = np.array(
                [sp.split_file_end - sp.split_file_start for sp in splits])
            est_chunk_rows = \
                sp_file_sizes * out_df.shape[0] // sp_file_sizes.sum()

        logger.warning('Estimated chunk rows: %r', est_chunk_rows)

        # Ignore add_offset at this time.
        op._add_offset = False

        def make_chunk(chunk_op, row_idx):
            # the chunk shape is unknown
            return chunk_op.new_chunk(
                None,
                shape=(np.nan, n_columns),
                dtypes=op.dtypes,
                index_value=parse_index(pd.RangeIndex(0)),
                columns_value=parse_index(out_df.dtypes.index,
                                          store_data=True),
                index=(row_idx, 0))

        if len(splits) == 0:
            logger.debug('Table {} has no data'.format(op.table_name))
            out_chunks = [make_chunk(DataFrameReadTableSplit(), 0)]
        else:
            out_chunks = []
            for idx, split in enumerate(splits):
                split_op = DataFrameReadTableSplit(
                    cupid_handle=to_str(split.handle),
                    split_index=split.split_index,
                    split_file_start=split.split_file_start,
                    split_file_end=split.split_file_end,
                    schema_file_start=split.schema_file_start,
                    schema_file_end=split.schema_file_end,
                    add_offset=op.add_offset,
                    dtypes=op.dtypes,
                    sparse=op.sparse,
                    split_size=split_size,
                    use_arrow_dtype=op.use_arrow_dtype,
                    estimate_rows=est_chunk_rows[idx])
                out_chunks.append(make_chunk(split_op, idx))

        if op.add_offset:
            out_chunks = standardize_range_index(out_chunks)

        nsplits = ((np.nan, ) * len(out_chunks), (n_columns, ))
        return op.copy().new_dataframes(None,
                                        shape=out_df.shape,
                                        dtypes=op.dtypes,
                                        index_value=out_df.index_value,
                                        columns_value=out_df.columns_value,
                                        chunks=out_chunks,
                                        nsplits=nsplits)
class DataFrameReadTableSplit(DataFrameOperand, DataFrameOperandMixin):
    """Chunk operand that reads one cupid table split into a DataFrame."""
    _op_type_ = 123451

    _cupid_handle = StringField('cupid_handle')
    _split_index = Int64Field('split_index')
    _split_file_start = Int64Field('split_file_start')
    _split_file_end = Int64Field('split_file_end')
    _schema_file_start = Int64Field('schema_file_start')
    _schema_file_end = Int64Field('schema_file_end')
    _use_arrow_dtype = BoolField('use_arrow_dtype')
    _dtypes = SeriesField('dtypes')
    _nrows = Int64Field('nrows')
    _split_size = Int64Field('split_size')
    _estimate_rows = Int64Field('estimate_rows')

    def __init__(self, cupid_handle=None, split_index=None,
                 split_file_start=None, split_file_end=None,
                 schema_file_start=None, schema_file_end=None, nrows=None,
                 dtypes=None, split_size=None, use_arrow_dtype=None,
                 estimate_rows=None, sparse=None, **kw):
        kw.update(_output_type_kw)
        super(DataFrameReadTableSplit, self).__init__(
            _cupid_handle=cupid_handle,
            _split_index=split_index,
            _split_file_start=split_file_start,
            _split_file_end=split_file_end,
            _schema_file_start=schema_file_start,
            _schema_file_end=schema_file_end,
            _use_arrow_dtype=use_arrow_dtype,
            _nrows=nrows,
            _estimate_rows=estimate_rows,
            _split_size=split_size,
            _dtypes=dtypes,
            _sparse=sparse,
            **kw)

    @property
    def retryable(self):
        return False

    @property
    def output_limit(self):
        return 1

    @property
    def cupid_handle(self):
        return self._cupid_handle

    @property
    def split_index(self):
        return self._split_index

    @property
    def split_file_start(self):
        return self._split_file_start

    @property
    def split_file_end(self):
        return self._split_file_end

    @property
    def schema_file_start(self):
        return self._schema_file_start

    @property
    def schema_file_end(self):
        return self._schema_file_end

    @property
    def nrows(self):
        return self._nrows

    @property
    def dtypes(self):
        return self._dtypes

    @property
    def split_size(self):
        return self._split_size

    @property
    def estimate_rows(self):
        return self._estimate_rows

    @property
    def use_arrow_dtype(self):
        return self._use_arrow_dtype

    @classmethod
    def estimate_size(cls, ctx, op):
        """Store a ``(pandas size, peak size)`` estimate for the output.

        The estimate is ``(0, 0)`` when ``split_size`` is unset. Otherwise it
        scales ``split_size`` by ``ORC_COMPRESSION_RATIO`` and adds a
        per-row overhead for every object/string/bytes column.
        """
        import numpy as np

        def is_object_dtype(dtype):
            try:
                # np.str_ is used instead of np.unicode_: the latter was an
                # alias of np.str_ and no longer exists in NumPy 2.0, where
                # it raised an AttributeError this handler did not catch.
                return np.issubdtype(dtype, np.object_) \
                    or np.issubdtype(dtype, np.str_) \
                    or np.issubdtype(dtype, np.bytes_)
            except TypeError:  # pragma: no cover
                return False

        if op.split_size is None:
            ctx[op.outputs[0].key] = (0, 0)
            return

        arrow_size = ORC_COMPRESSION_RATIO * op.split_size
        n_strings = len([dt for dt in op.dtypes if is_object_dtype(dt)])
        if op.estimate_rows or op.nrows:
            # an explicit nrows wins over the tile-time estimate
            rows = op.nrows if op.nrows is not None else op.estimate_rows
            pd_size = arrow_size + n_strings * rows * STRING_FIELD_OVERHEAD
            logger.debug('Estimate pandas memory cost: %r', pd_size)
        else:
            pd_size = arrow_size * 10 if n_strings else arrow_size

        ctx[op.outputs[0].key] = (pd_size, pd_size + arrow_size)

    @classmethod
    def execute(cls, ctx, op):
        """Read the split described by ``op`` and store it in ``ctx``.

        Without a cupid handle an empty DataFrame with the output dtypes is
        stored. With ``nrows`` set, batches are read until at least that many
        rows were seen and the result is truncated to ``nrows``.
        """
        # Imported locally like the other third-party modules here, so the
        # empty-handle path does not rely on a module-level pandas import.
        import pandas as pd
        import pyarrow as pa
        from cupid.io.table import TableSplit

        if op.cupid_handle is None:
            empty_df = pd.DataFrame()
            for name, dtype in op.outputs[0].dtypes.items():
                empty_df[name] = pd.Series(dtype=dtype)
            ctx[op.outputs[0].key] = empty_df
            return

        tsp = TableSplit(
            _handle=op.cupid_handle,
            _split_index=op.split_index,
            _split_file_start=op.split_file_start,
            _split_file_end=op.split_file_end,
            _schema_file_start=op.schema_file_start,
            _schema_file_end=op.schema_file_end,
        )
        logger.debug('Read split table, split index: %s', op.split_index)
        reader = tsp.open_arrow_reader()
        if op.nrows is not None:
            nrows = 0
            batches = []
            while nrows < op.nrows:
                try:
                    batch = reader.read_next_batch()
                    nrows += batch.num_rows
                    batches.append(batch)
                except StopIteration:
                    break
            logger.debug('Read %s rows of this split.', op.nrows)
            # NOTE(review): from_batches gets no schema here; confirm the
            # reader always yields at least one batch when nrows > 0.
            data = arrow_table_to_pandas_dataframe(
                pa.Table.from_batches(batches),
                use_arrow_dtype=op.use_arrow_dtype)[:op.nrows]
        else:
            arrow_table = reader.read_all()
            data = arrow_table_to_pandas_dataframe(
                arrow_table, use_arrow_dtype=op.use_arrow_dtype)

        data_columns = data.dtypes.index
        expected_columns = op.outputs[0].dtypes.index
        if not data_columns.equals(expected_columns):
            logger.debug("Data columns differs from output columns, "
                         "data columns: {}, output columns: {}".format(
                             data_columns, expected_columns))
            # rename positionally to the column labels of the output
            data.columns = expected_columns

        logger.debug('Read split table finished, split index: %s',
                     op.split_index)
        logger.debug('Split data shape is {}, size is {}'.format(
            data.shape, data.memory_usage(deep=True).sum()))
        ctx[op.outputs[0].key] = data
class DataFrameReadTable(DataFrameOperand, DataFrameOperandMixin):
    """DataFrame source operand reading an ODPS table through cupid.

    ``tile`` creates one ``DataFrameReadTableSplit`` per cupid split; each
    split op yields a data chunk and a count chunk. With ``add_offset`` every
    data chunk after the first is wrapped in a
    ``DataFrameReadTableWithOffset`` fed by the preceding count chunks.
    """
    _op_type_ = 123450

    _odps_params = DictField('odps_params')
    _table_name = StringField('table_name')
    _partition_spec = StringField('partition_spec')
    _dtypes = SeriesField('dtypes')
    _add_offset = BoolField('add_offset')

    def __init__(self, odps_params=None, table_name=None, partition_spec=None,
                 dtypes=None, sparse=None, add_offset=True, **kw):
        super(DataFrameReadTable, self).__init__(
            _odps_params=odps_params,
            _table_name=table_name,
            _partition_spec=partition_spec,
            _dtypes=dtypes,
            _sparse=sparse,
            _add_offset=add_offset,
            _object_type=ObjectType.dataframe,
            **kw)

    # ---- read-only accessors for the serialized fields ----

    @property
    def odps_params(self):
        return self._odps_params

    @property
    def table_name(self):
        return self._table_name

    @property
    def partition(self):
        # the field may be unset, hence getattr with a default
        return getattr(self, '_partition_spec', None)

    @property
    def dtypes(self):
        return self._dtypes

    @property
    def add_offset(self):
        return self._add_offset

    def __call__(self, shape, chunk_store_limit=None):
        """Create the output DataFrame of the given ``shape``.

        An unknown (NaN) row count yields an empty ``RangeIndex``.
        """
        import numpy as np
        import pandas as pd

        row_count = 0 if np.isnan(shape[0]) else shape[0]
        index_value = parse_index(pd.RangeIndex(row_count))
        columns_value = parse_index(self.dtypes.index, store_data=True)
        return self.new_dataframe(None, shape,
                                  dtypes=self.dtypes,
                                  index_value=index_value,
                                  columns_value=columns_value,
                                  chunk_store_limit=chunk_store_limit)

    @classmethod
    def tile(cls, op):
        """Split the table read into one chunk per cupid table split."""
        import numpy as np
        import pandas as pd
        from odps import ODPS
        from odps.accounts import BearerTokenAccount
        from cupid import CupidSession, context

        account = BearerTokenAccount(context().get_bearer_token())
        odps_entry = ODPS(None, None, account=account, **op.odps_params)
        cupid_session = CupidSession(odps_entry)

        out_df = op.outputs[0]
        n_columns = out_df.shape[1]
        split_size = out_df.extra_params.chunk_store_limit \
            or options.tensor.chunk_store_limit

        data_src = odps_entry.get_table(op.table_name)
        if op.partition is not None:
            data_src = data_src.get_partition(op.partition)

        logger.debug('Start creating download session from cupid.')
        download_session = cupid_session.create_download_session(
            data_src, split_size=split_size)
        splits = download_session.splits
        logger.debug('%s table splits have been created.', str(len(splits)))

        data_chunks = []
        count_chunks = []
        for idx, split in enumerate(splits):
            split_op = DataFrameReadTableSplit(
                cupid_handle=to_str(split.handle),
                split_index=split.split_index,
                split_file_start=split.split_file_start,
                split_file_end=split.split_file_end,
                schema_file_start=split.schema_file_start,
                schema_file_end=split.schema_file_end,
                dtypes=op.dtypes,
                sparse=op.sparse)
            # the chunk shape is unknown
            data_kw = {
                'shape': (np.nan, n_columns),
                'dtypes': op.dtypes,
                'index_value': parse_index(pd.RangeIndex(0)),
                'columns_value': parse_index(out_df.dtypes.index,
                                             store_data=True),
                'index': (idx, )
            }
            count_kw = {'shape': (1, ), 'index': (idx, )}
            data_chunk, count_chunk = split_op.new_chunks(
                None, kws=[data_kw, count_kw])
            data_chunks.append(data_chunk)
            count_chunks.append(count_chunk)

        if not op.add_offset:
            output_chunks = data_chunks
        else:
            # the first chunk needs no offset; later ones depend on the
            # counts of every chunk in front of them
            output_chunks = data_chunks[:1]
            for pos, chunk in enumerate(data_chunks[1:], 1):
                offset_inputs = [chunk] + count_chunks[:pos]
                offset_op = DataFrameReadTableWithOffset(dtypes=chunk.dtypes)
                output_chunks.append(
                    offset_op.new_chunk(offset_inputs,
                                        shape=chunk.shape,
                                        index=chunk.index,
                                        dtypes=chunk.dtypes,
                                        index_value=chunk.index_value,
                                        columns_value=chunk.columns_value))

        nsplits = ((np.nan, ) * len(output_chunks), (n_columns, ))
        return op.copy().new_dataframes(None,
                                        shape=out_df.shape,
                                        dtypes=op.dtypes,
                                        index_value=out_df.index_value,
                                        columns_value=out_df.columns_value,
                                        chunks=output_chunks,
                                        nsplits=nsplits)
class DataFrameReadTable(_Base):
    """DataFrame source operand that reads an ODPS table.

    Tiling goes through cupid when the cupid runtime context is ready
    (``_tile_cupid``) and through the table tunnel otherwise
    (``_tile_tunnel``). Both emit ``DataFrameReadTableSplit`` chunks.
    """
    _op_type_ = 123450

    _odps_params = DictField('odps_params')
    _table_name = StringField('table_name')
    _partition_spec = StringField('partition_spec')
    _dtypes = SeriesField('dtypes')
    _add_offset = BoolField('add_offset')
    _columns = ListField('columns')
    _nrows = Int64Field('nrows')
    _use_arrow_dtype = BoolField('use_arrow_dtype')
    _string_as_binary = BoolField('string_as_binary')
    _append_partitions = BoolField('append_partitions')
    _last_modified_time = Int64Field('last_modified_time')
    _with_split_meta_on_tile = BoolField('with_split_meta_on_tile')

    def __init__(self, odps_params=None, table_name=None, partition_spec=None,
                 columns=None, dtypes=None, nrows=None, sparse=None,
                 add_offset=True, use_arrow_dtype=None, string_as_binary=None,
                 memory_scale=None, append_partitions=None,
                 last_modified_time=None, with_split_meta_on_tile=False,
                 **kw):
        kw.update(_output_type_kw)
        super(DataFrameReadTable, self).__init__(
            _odps_params=odps_params,
            _table_name=table_name,
            _partition_spec=partition_spec,
            _columns=columns,
            _dtypes=dtypes,
            _nrows=nrows,
            _sparse=sparse,
            _use_arrow_dtype=use_arrow_dtype,
            _string_as_binary=string_as_binary,
            _add_offset=add_offset,
            _append_partitions=append_partitions,
            _last_modified_time=last_modified_time,
            _memory_scale=memory_scale,
            _with_split_meta_on_tile=with_split_meta_on_tile,
            **kw)

    @property
    def retryable(self):
        return False

    @property
    def odps_params(self):
        return self._odps_params

    @property
    def table_name(self):
        return self._table_name

    @property
    def partition(self):
        # the field may be unset, hence getattr with a default
        return getattr(self, '_partition_spec', None)

    @property
    def columns(self):
        return self._columns

    @property
    def dtypes(self):
        return self._dtypes

    @property
    def nrows(self):
        return self._nrows

    @property
    def use_arrow_dtype(self):
        return self._use_arrow_dtype

    @property
    def string_as_binary(self):
        return self._string_as_binary

    @property
    def add_offset(self):
        return self._add_offset

    @property
    def append_partitions(self):
        return self._append_partitions

    @property
    def with_split_meta_on_tile(self):
        return self._with_split_meta_on_tile

    def get_columns(self):
        """Return the list of columns to read (``None`` when unset)."""
        return self._columns

    def set_pruned_columns(self, columns):
        """Replace the list of columns to read with ``columns``."""
        self._columns = columns

    def __call__(self, shape, chunk_bytes=None, chunk_size=None):
        """Create the output DataFrame of the given ``shape``.

        An unknown (NaN) row count yields an empty ``RangeIndex``.
        """
        import numpy as np
        import pandas as pd

        if np.isnan(shape[0]):
            index_value = parse_index(pd.RangeIndex(0))
        else:
            index_value = parse_index(pd.RangeIndex(shape[0]))
        columns_value = parse_index(self.dtypes.index, store_data=True)
        return self.new_dataframe(None, shape,
                                  dtypes=self.dtypes,
                                  index_value=index_value,
                                  columns_value=columns_value,
                                  chunk_bytes=chunk_bytes,
                                  chunk_size=chunk_size)

    @classmethod
    def _tile_cupid(cls, op):
        """Tile via cupid: one chunk per download-session split.

        Every table/partition selected by ``op.partition`` gets its own
        download session. A source without splits contributes a single
        handle-less split chunk.
        """
        from odps import ODPS
        from odps.accounts import BearerTokenAccount
        from cupid import CupidSession, context
        from cupid.errors import CupidError
        from mars.context import get_context

        cupid_ctx = context()

        bearer_token = cupid_ctx.get_bearer_token()
        account = BearerTokenAccount(bearer_token)
        project = os.environ.get('ODPS_PROJECT_NAME', None)
        odps_params = op.odps_params.copy()
        if project:
            # the environment overrides the project recorded on the operand
            odps_params['project'] = project
        endpoint = os.environ.get(
            'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
        o = ODPS(None, None, account=account,
                 project=odps_params['project'], endpoint=endpoint)
        cupid_session = CupidSession(o)

        mars_context = get_context()

        df = op.outputs[0]
        split_size = df.extra_params.chunk_bytes or READ_CHUNK_LIMIT

        # narrow the output metadata to the selected columns, if any
        out_dtypes = df.dtypes
        out_shape = df.shape
        out_columns_value = df.columns_value
        if op.columns is not None:
            out_dtypes = out_dtypes[op.columns]
            out_shape = (df.shape[0], len(op.columns))
            out_columns_value = parse_index(out_dtypes.index,
                                            store_data=True)

        table_obj = o.get_table(op.table_name)
        if not table_obj.schema.partitions:
            data_srcs = [table_obj]
        elif op.partition is not None and check_partition_exist(
                table_obj, op.partition):
            data_srcs = [table_obj.get_partition(op.partition)]
        else:
            data_srcs = list(table_obj.partitions)
            if op.partition is not None:
                data_srcs = filter_partitions(o, data_srcs, op.partition)

        out_chunks = []
        chunk_idx = 0

        for data_src in data_srcs:
            # NOTE(review): split_size is shared by all sources, so a value
            # adjusted for one source carries over to the next - confirm.
            try:
                data_store_size = data_src.size
            except ODPSError:
                # fail to get data size, just ignore
                pass
            else:
                if data_store_size < split_size and mars_context is not None:
                    # get worker counts
                    worker_count = max(
                        len(mars_context.get_worker_addresses()), 1)
                    # data is too small, split as many as number of cores
                    split_size = data_store_size // worker_count
                    # at least 1M
                    split_size = max(split_size, 1 * 1024**2)
                    logger.debug(
                        'Input data size is too small, split_size is %s',
                        split_size)

            logger.debug(
                'Start creating download session of table %s from cupid, '
                'columns: %s', op.table_name, op.columns)
            while True:
                try:
                    download_session = cupid_session.create_download_session(
                        data_src,
                        split_size=split_size,
                        columns=op.columns,
                        with_split_meta=op.with_split_meta_on_tile)
                    break
                except CupidError:
                    logger.debug(
                        'The number of splits exceeds 100000, split_size is %s',
                        split_size)
                    if split_size >= MAX_CHUNK_SIZE:
                        raise
                    else:
                        # retry with a coarser split
                        split_size *= 2

            logger.debug('%s table splits have been created.',
                         str(len(download_session.splits)))

            meta_chunk_rows = [
                split.meta_row_count for split in download_session.splits
            ]
            if np.isnan(out_shape[0]):
                est_chunk_rows = meta_chunk_rows
            else:
                # fall back to a size-proportional share of the known row
                # count wherever the split carries no row-count meta
                sp_file_sizes = np.array([
                    sp.split_file_end - sp.split_file_start
                    for sp in download_session.splits
                ])
                total_size = sp_file_sizes.sum()
                ratio_chunk_rows = (sp_file_sizes * out_shape[0] //
                                    total_size).tolist()
                est_chunk_rows = [
                    mr if mr is not None else rr
                    for mr, rr in zip(meta_chunk_rows, ratio_chunk_rows)
                ]

            partition_spec = str(data_src.partition_spec) \
                if getattr(data_src, 'partition_spec', None) else None

            logger.warning('Estimated chunk rows: %r', est_chunk_rows)

            if len(download_session.splits) == 0:
                logger.debug('Table %s has no data', op.table_name)
                chunk_op = DataFrameReadTableSplit()
                index_value = parse_index(pd.RangeIndex(0))
                columns_value = parse_index(out_dtypes.index,
                                            store_data=True)
                out_chunk = chunk_op.new_chunk(None,
                                               shape=(np.nan, out_shape[1]),
                                               dtypes=op.dtypes,
                                               index_value=index_value,
                                               columns_value=columns_value,
                                               index=(chunk_idx, 0))
                out_chunks.append(out_chunk)
                chunk_idx += 1
            else:
                for idx, split in enumerate(download_session.splits):
                    chunk_op = DataFrameReadTableSplit(
                        cupid_handle=to_str(split.handle),
                        split_index=split.split_index,
                        split_file_start=split.split_file_start,
                        split_file_end=split.split_file_end,
                        schema_file_start=split.schema_file_start,
                        schema_file_end=split.schema_file_end,
                        add_offset=op.add_offset,
                        dtypes=out_dtypes,
                        sparse=op.sparse,
                        split_size=split_size,
                        string_as_binary=op.string_as_binary,
                        use_arrow_dtype=op.use_arrow_dtype,
                        estimate_rows=est_chunk_rows[idx],
                        partition_spec=partition_spec,
                        append_partitions=op.append_partitions,
                        meta_raw_size=split.meta_raw_size,
                        nrows=meta_chunk_rows[idx] or op.nrows,
                        memory_scale=op.memory_scale)
                    # the chunk shape is unknown
                    index_value = parse_index(pd.RangeIndex(0))
                    columns_value = parse_index(out_dtypes.index,
                                                store_data=True)
                    out_chunk = chunk_op.new_chunk(
                        None,
                        shape=(np.nan, out_shape[1]),
                        dtypes=out_dtypes,
                        index_value=index_value,
                        columns_value=columns_value,
                        index=(chunk_idx, 0))
                    chunk_idx += 1
                    out_chunks.append(out_chunk)

        if op.add_offset:
            out_chunks = standardize_range_index(out_chunks)

        new_op = op.copy()
        nsplits = ((np.nan, ) * len(out_chunks), (out_shape[1], ))
        return new_op.new_dataframes(None,
                                     shape=out_shape,
                                     dtypes=op.dtypes,
                                     index_value=df.index_value,
                                     columns_value=out_columns_value,
                                     chunks=out_chunks,
                                     nsplits=nsplits)

    @classmethod
    def _tile_tunnel(cls, op):
        """Tile via the table tunnel: row-range chunks of known size.

        Chunks are sized by ``chunk_size`` rows when given, otherwise by
        ``chunk_bytes`` (default ``READ_CHUNK_LIMIT``) of stored data. At
        least one chunk is produced per source, so an empty table or
        partition yields a single zero-row chunk.
        """
        from odps import ODPS

        project = os.environ.get('ODPS_PROJECT_NAME', None)
        odps_params = op.odps_params.copy()
        if project:
            # the environment overrides the project recorded on the operand
            odps_params['project'] = project
        endpoint = os.environ.get(
            'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
        o = ODPS(odps_params['access_id'],
                 odps_params['secret_access_key'],
                 project=odps_params['project'],
                 endpoint=endpoint)

        table_obj = o.get_table(op.table_name)
        if not table_obj.schema.partitions:
            data_srcs = [table_obj]
        elif op.partition is not None and check_partition_exist(
                table_obj, op.partition):
            data_srcs = [table_obj.get_partition(op.partition)]
        else:
            data_srcs = list(table_obj.partitions)
            if op.partition is not None:
                data_srcs = filter_partitions(o, data_srcs, op.partition)

        out_chunks = []
        row_nsplits = []
        index_start = 0
        df = op.outputs[0]

        # narrow the output metadata to the selected columns, if any
        out_dtypes = df.dtypes
        out_shape = df.shape
        out_columns_value = df.columns_value
        if op.columns is not None:
            out_dtypes = out_dtypes[op.columns]
            out_shape = (df.shape[0], len(op.columns))
            out_columns_value = parse_index(out_dtypes.index,
                                            store_data=True)

        for data_src in data_srcs:
            data_store_size = data_src.size
            shape = out_shape
            chunk_size = df.extra_params.chunk_size
            partition_spec = str(data_src.partition_spec) \
                if getattr(data_src, 'partition_spec', None) else None

            if chunk_size is None:
                chunk_bytes = df.extra_params.chunk_bytes or READ_CHUNK_LIMIT
                chunk_count = data_store_size // chunk_bytes + (
                    data_store_size % chunk_bytes != 0)
                # A source of size 0 gave chunk_count == 0 and a
                # ZeroDivisionError in ceildiv below; keep one chunk.
                chunk_count = max(chunk_count, 1)
                chunk_size = ceildiv(shape[0], chunk_count)
                split_size = chunk_bytes
            else:
                # Zero rows gave chunk_count == 0 and a ZeroDivisionError
                # in the division below; keep one (empty) chunk.
                chunk_count = max(ceildiv(shape[0], chunk_size), 1)
                split_size = data_store_size // chunk_count

            for i in range(chunk_count):
                start_index = chunk_size * i
                end_index = min(chunk_size * (i + 1), shape[0])
                row_size = end_index - start_index
                chunk_op = DataFrameReadTableSplit(
                    table_name=op.table_name,
                    partition_spec=partition_spec,
                    start_index=start_index,
                    end_index=end_index,
                    nrows=op.nrows,
                    odps_params=op.odps_params,
                    columns=op.columns,
                    add_offset=op.add_offset,
                    dtypes=out_dtypes,
                    sparse=op.sparse,
                    split_size=split_size,
                    use_arrow_dtype=op.use_arrow_dtype,
                    estimate_rows=row_size,
                    append_partitions=op.append_partitions,
                    memory_scale=op.memory_scale)
                index_value = parse_index(
                    pd.RangeIndex(start_index, end_index))
                columns_value = parse_index(out_dtypes.index,
                                            store_data=True)
                out_chunk = chunk_op.new_chunk(
                    None,
                    shape=(row_size, out_shape[1]),
                    dtypes=out_dtypes,
                    index_value=index_value,
                    columns_value=columns_value,
                    index=(index_start + i, 0))
                row_nsplits.append(row_size)
                out_chunks.append(out_chunk)

            # chunk row indices continue across sources
            index_start += chunk_count

        if op.add_offset:
            out_chunks = standardize_range_index(out_chunks)

        new_op = op.copy()
        nsplits = (tuple(row_nsplits), (out_shape[1], ))
        return new_op.new_dataframes(None,
                                     shape=out_shape,
                                     dtypes=op.dtypes,
                                     index_value=df.index_value,
                                     columns_value=out_columns_value,
                                     chunks=out_chunks,
                                     nsplits=nsplits)

    @classmethod
    def _tile(cls, op):
        """Dispatch to cupid tiling when its runtime context is ready."""
        from cupid.runtime import RuntimeContext

        if RuntimeContext.is_context_ready():
            return cls._tile_cupid(op)
        else:
            return cls._tile_tunnel(op)

    # when head_can_be_opt is falsy, _tile serves directly as tile
    if not head_can_be_opt:
        tile = _tile
class DataFrameReadTableSplit(_Base):
    """Chunk operand that reads one piece of an ODPS table.

    A piece is a cupid split (handle plus file offsets) when the cupid
    runtime context is ready, or a row range fetched through the table
    tunnel otherwise.
    """
    _op_type_ = 123451

    # for cupid
    _cupid_handle = StringField('cupid_handle')
    _split_index = Int64Field('split_index')
    _split_file_start = Int64Field('split_file_start')
    _split_file_end = Int64Field('split_file_end')
    _schema_file_start = Int64Field('schema_file_start')
    _schema_file_end = Int64Field('schema_file_end')
    _use_arrow_dtype = BoolField('use_arrow_dtype')
    _string_as_binary = BoolField('string_as_binary')
    _dtypes = SeriesField('dtypes')
    _nrows = Int64Field('nrows')

    # for tunnel
    _table_name = StringField('table_name')
    _partition_spec = StringField('partition_spec')
    _start_index = Int64Field('start_index')
    _end_index = Int64Field('end_index')
    _odps_params = DictField('odps_params')
    _columns = ListField('columns')

    _split_size = Int64Field('split_size')
    _append_partitions = BoolField('append_partitions')
    _estimate_rows = Int64Field('estimate_rows')
    _meta_raw_size = Int64Field('meta_raw_size')

    def __init__(self, cupid_handle=None, split_index=None,
                 split_file_start=None, split_file_end=None,
                 schema_file_start=None, schema_file_end=None,
                 table_name=None, partition_spec=None, start_index=None,
                 end_index=None, odps_params=None, columns=None, nrows=None,
                 dtypes=None, string_as_binary=None, split_size=None,
                 use_arrow_dtype=None, memory_scale=None, estimate_rows=None,
                 meta_raw_size=None, append_partitions=None, sparse=None,
                 **kw):
        kw.update(_output_type_kw)
        super(DataFrameReadTableSplit, self).__init__(
            _cupid_handle=cupid_handle,
            _split_index=split_index,
            _split_file_start=split_file_start,
            _split_file_end=split_file_end,
            _schema_file_start=schema_file_start,
            _schema_file_end=schema_file_end,
            _table_name=table_name,
            _partition_spec=partition_spec,
            _columns=columns,
            _start_index=start_index,
            _end_index=end_index,
            _odps_params=odps_params,
            _use_arrow_dtype=use_arrow_dtype,
            _string_as_binary=string_as_binary,
            _nrows=nrows,
            _estimate_rows=estimate_rows,
            _split_size=split_size,
            _dtypes=dtypes,
            _append_partitions=append_partitions,
            _sparse=sparse,
            _meta_raw_size=meta_raw_size,
            _memory_scale=memory_scale,
            **kw)

    @property
    def retryable(self):
        return False

    @property
    def output_limit(self):
        return 1

    @property
    def cupid_handle(self):
        return self._cupid_handle

    @property
    def split_index(self):
        return self._split_index

    @property
    def split_file_start(self):
        return self._split_file_start

    @property
    def split_file_end(self):
        return self._split_file_end

    @property
    def schema_file_start(self):
        return self._schema_file_start

    @property
    def schema_file_end(self):
        return self._schema_file_end

    @property
    def table_name(self):
        return self._table_name

    @property
    def partition_spec(self):
        return self._partition_spec

    @property
    def start_index(self):
        return self._start_index

    @property
    def end_index(self):
        return self._end_index

    @property
    def odps_params(self):
        return self._odps_params

    @property
    def columns(self):
        return self._columns

    @property
    def nrows(self):
        return self._nrows

    @property
    def dtypes(self):
        return self._dtypes

    @property
    def split_size(self):
        return self._split_size

    @property
    def estimate_rows(self):
        return self._estimate_rows

    @property
    def use_arrow_dtype(self):
        return self._use_arrow_dtype

    @property
    def string_as_binary(self):
        return self._string_as_binary

    @property
    def append_partitions(self):
        return self._append_partitions

    @property
    def meta_raw_size(self):
        return self._meta_raw_size

    @classmethod
    def estimate_size(cls, ctx, op):
        """Store a ``(pandas size, peak size)`` estimate for the output.

        The estimate is ``(0, 0)`` when ``split_size`` is unset. Otherwise
        ``split_size`` is scaled by ``memory_scale`` (or
        ``ORC_COMPRESSION_RATIO``), raised to the scaled ``meta_raw_size``
        when that is larger, and a per-row overhead is added for every
        object/string/bytes column.
        """
        import numpy as np

        def is_object_dtype(dtype):
            try:
                # np.str_ is used instead of np.unicode_: the latter was an
                # alias of np.str_ and no longer exists in NumPy 2.0, where
                # it raised an AttributeError this handler did not catch.
                return np.issubdtype(dtype, np.object_) \
                    or np.issubdtype(dtype, np.str_) \
                    or np.issubdtype(dtype, np.bytes_)
            except TypeError:  # pragma: no cover
                return False

        if op.split_size is None:
            ctx[op.outputs[0].key] = (0, 0)
            return

        arrow_size = (op.memory_scale or ORC_COMPRESSION_RATIO) \
            * op.split_size
        if op.meta_raw_size is not None:
            raw_arrow_size = (op.memory_scale or 1) * op.meta_raw_size
            arrow_size = max(arrow_size, raw_arrow_size)

        n_strings = len([dt for dt in op.dtypes if is_object_dtype(dt)])
        if op.estimate_rows or op.nrows:
            # an explicit nrows wins over the tile-time estimate
            rows = op.nrows if op.nrows is not None else op.estimate_rows
            pd_size = arrow_size + n_strings * rows * STRING_FIELD_OVERHEAD
            logger.debug('Estimate pandas memory cost: %r', pd_size)
        else:
            pd_size = arrow_size * 10 if n_strings else arrow_size

        ctx[op.outputs[0].key] = (pd_size, pd_size + arrow_size)

    @classmethod
    def _cast_string_to_binary(cls, arrow_table):
        """Return ``arrow_table`` with every string field cast to binary."""
        import pyarrow as pa

        new_schema = []
        for field in arrow_table.schema:
            if field.type == pa.string():
                new_schema.append(pa.field(field.name, pa.binary()))
            else:
                new_schema.append(field)
        return arrow_table.cast(pa.schema(new_schema))

    @classmethod
    def _append_partition_values(cls, arrow_table, op):
        """Append one constant string column per partition key.

        Nothing is appended unless both ``op.append_partitions`` and
        ``op.partition_spec`` are set.
        """
        import pyarrow as pa

        if op.append_partitions and op.partition_spec:
            from odps.types import PartitionSpec
            spec = PartitionSpec(op.partition_spec)

            for col_name, pt_val in spec.items():
                arrow_table = arrow_table.append_column(
                    col_name,
                    pa.array([pt_val] * arrow_table.num_rows, pa.string()))

        return arrow_table

    @staticmethod
    def _align_columns(data, expected_dtypes):
        """Make the columns of ``data`` match ``expected_dtypes.index``.

        Existing columns are renamed positionally, expected columns beyond
        them are added from empty Series of the expected dtype, and the
        frame is reordered if the labels still differ.
        """
        data_columns = data.dtypes.index
        expected_columns = expected_dtypes.index
        if not data_columns.equals(expected_columns):
            logger.debug(
                "Data columns differs from output columns, "
                "data columns: %s, output columns: %s", data_columns,
                expected_columns)
            data.columns = expected_columns[:len(data.columns)]
            for extra_col in expected_columns[len(data.columns):]:
                data[extra_col] = pd.Series(
                    [], dtype=expected_dtypes[extra_col])
            if not data.dtypes.index.equals(expected_columns):
                data = data[expected_columns]
        return data

    @classmethod
    def _execute_in_cupid(cls, ctx, op):
        """Read the cupid split described by ``op`` and store it in ``ctx``.

        Without a cupid handle an empty DataFrame with the output dtypes is
        stored. With ``nrows`` set, batches are read until at least that many
        rows were seen and the result is truncated to ``nrows``.
        """
        import pyarrow as pa
        from cupid.io.table import TableSplit

        out = op.outputs[0]

        if op.cupid_handle is None:
            empty_df = pd.DataFrame()
            for name, dtype in out.dtypes.items():
                empty_df[name] = pd.Series(dtype=dtype)
            ctx[out.key] = empty_df
            return

        tsp = TableSplit(
            _handle=op.cupid_handle,
            _split_index=op.split_index,
            _split_file_start=op.split_file_start,
            _split_file_end=op.split_file_end,
            _schema_file_start=op.schema_file_start,
            _schema_file_end=op.schema_file_end,
        )
        logger.debug('Read split table, split index: %s', op.split_index)
        reader = tsp.open_arrow_reader()
        if op.nrows is None:
            arrow_table = reader.read_all()
        else:
            nrows = 0
            batches = []
            while nrows < op.nrows:
                try:
                    batch = reader.read_next_batch()
                    nrows += batch.num_rows
                    batches.append(batch)
                except StopIteration:
                    break
            logger.debug('Read %s rows of this split.', op.nrows)
            # NOTE(review): from_batches gets no schema here; confirm the
            # reader always yields at least one batch when nrows > 0.
            arrow_table = pa.Table.from_batches(batches)

        arrow_table = cls._append_partition_values(arrow_table, op)

        if op.string_as_binary:
            arrow_table = cls._cast_string_to_binary(arrow_table)
        data = arrow_table_to_pandas_dataframe(
            arrow_table, use_arrow_dtype=op.use_arrow_dtype)
        if op.nrows is not None:
            data = data[:op.nrows]

        data = cls._align_columns(data, out.dtypes)

        logger.debug('Read split table finished, split index: %s',
                     op.split_index)
        logger.debug('Split data shape is %s, size is %s', data.shape,
                     data.memory_usage(deep=True).sum())
        ctx[out.key] = data

    @classmethod
    def _execute_arrow_tunnel(cls, ctx, op):
        """Read the row range described by ``op`` through the table tunnel.

        ``op.nrows`` rows are requested from ``op.start_index`` when set,
        otherwise ``op.end_index - op.start_index`` rows.
        """
        from odps import ODPS
        from odps.tunnel import TableTunnel

        project = os.environ.get('ODPS_PROJECT_NAME', None)
        odps_params = op.odps_params.copy()
        if project:
            # the environment overrides the project recorded on the operand
            odps_params['project'] = project
        endpoint = os.environ.get(
            'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
        o = ODPS(odps_params['access_id'],
                 odps_params['secret_access_key'],
                 project=odps_params['project'],
                 endpoint=endpoint)

        t = o.get_table(op.table_name)
        tunnel = TableTunnel(o, project=t.project)

        if op.partition_spec is not None:
            download_session = tunnel.create_download_session(
                t.name, partition_spec=op.partition_spec)
        else:
            download_session = tunnel.create_download_session(t.name)
        logger.debug('Start reading table %s(%s) split from %s to %s',
                     op.table_name, op.partition_spec, op.start_index,
                     op.end_index)
        if op.nrows is None:
            count = op.end_index - op.start_index
        else:
            count = op.nrows

        with download_session.open_arrow_reader(
                op.start_index, count, columns=op.columns) as reader:
            table = reader.read()

        table = cls._append_partition_values(table, op)
        if op.string_as_binary:
            table = cls._cast_string_to_binary(table)
        data = arrow_table_to_pandas_dataframe(
            table, use_arrow_dtype=op.use_arrow_dtype)

        data = cls._align_columns(data, op.outputs[0].dtypes)

        logger.debug('Finish reading table %s(%s) split from %s to %s',
                     op.table_name, op.partition_spec, op.start_index,
                     op.end_index)
        ctx[op.outputs[0].key] = data

    @classmethod
    def execute(cls, ctx, op):
        """Read through cupid when its runtime context is ready, else tunnel."""
        from cupid.runtime import RuntimeContext

        if RuntimeContext.is_context_ready():
            cls._execute_in_cupid(ctx, op)
        else:
            cls._execute_arrow_tunnel(ctx, op)