def __call__(self, x):
    shape = (0,) * len(x.shape)
    index_value = parse_index(x.index_value.to_pandas()[:0], x.key, 'index')
    columns_value = parse_index(x.columns_value.to_pandas()[:0], x.key,
                                'columns', store_data=True)
    return self.new_dataframe([x], shape=shape, dtypes=x.dtypes[:0],
                              index_value=index_value,
                              columns_value=columns_value)
def __call__(self, shape, chunk_bytes=None):
    import numpy as np
    import pandas as pd

    if np.isnan(shape[0]):
        index_value = parse_index(pd.RangeIndex(0))
    else:
        index_value = parse_index(pd.RangeIndex(shape[0]))
    columns_value = parse_index(self.dtypes.index, store_data=True)
    return self.new_dataframe(None, shape, dtypes=self.dtypes,
                              index_value=index_value,
                              columns_value=columns_value,
                              chunk_bytes=chunk_bytes)
def _tile_tunnel(cls, op):
    out_df = op.outputs[0]
    in_df = build_concatenated_rows_frame(op.inputs[0])

    out_chunks = []
    for chunk in in_df.chunks:
        chunk_op = DataFrameWriteTableSplit(dtypes=op.dtypes,
                                            table_name=op.table_name,
                                            odps_params=op.odps_params,
                                            partition_spec=op.partition_spec)
        index_value = parse_index(chunk.index_value.to_pandas()[:0], chunk)
        out_chunk = chunk_op.new_chunk([chunk], shape=(0, 0),
                                       index_value=index_value,
                                       columns_value=out_df.columns_value,
                                       dtypes=out_df.dtypes,
                                       index=chunk.index)
        out_chunks.append(out_chunk)

    new_op = op.copy()
    params = out_df.params.copy()
    params.update(dict(chunks=out_chunks,
                       nsplits=((0,) * in_df.chunk_shape[0], (0,))))
    return new_op.new_tileables([in_df], **params)
def test_merge_index_value():
    with Timer() as timer:
        index_values = {i: parse_index(pd.RangeIndex(1e7)) for i in range(20)}
        index_value = merge_index_value(index_values)
        pd.testing.assert_index_equal(index_value.to_pandas(),
                                      pd.Index([], dtype=np.int64))
        assert index_value.min_val == 0
        assert index_value.max_val == 1e7 - 1

        # range indexes that are continuous
        index_values = {i: parse_index(pd.RangeIndex(i * 1e7, (i + 1) * 1e7))
                        for i in range(20)}
        index_value = merge_index_value(index_values)
        pd.testing.assert_index_equal(index_value.to_pandas(),
                                      pd.RangeIndex(1e7 * 20))
        assert index_value.min_val == 0
        assert index_value.max_val == 1e7 * 20 - 1
    assert timer.duration < 1
def testParseIndex(self):
    index = pd.Int64Index([])
    parsed_index = parse_index(index)
    self.assertIsInstance(parsed_index.value, IndexValue.Int64Index)
    pd.testing.assert_index_equal(index, parsed_index.to_pandas())

    index = pd.Int64Index([1, 2])
    parsed_index = parse_index(index)  # not parse data
    self.assertIsInstance(parsed_index.value, IndexValue.Int64Index)
    with self.assertRaises(AssertionError):
        pd.testing.assert_index_equal(index, parsed_index.to_pandas())

    parsed_index = parse_index(index, store_data=True)  # parse data
    self.assertIsInstance(parsed_index.value, IndexValue.Int64Index)
    pd.testing.assert_index_equal(index, parsed_index.to_pandas())

    index = pd.RangeIndex(0, 10, 3)
    parsed_index = parse_index(index)
    self.assertIsInstance(parsed_index.value, IndexValue.RangeIndex)
    pd.testing.assert_index_equal(index, parsed_index.to_pandas())

    index = pd.MultiIndex.from_arrays([[0, 1], ['a', 'b']])
    parsed_index = parse_index(index)  # not parse data
    self.assertIsInstance(parsed_index.value, IndexValue.MultiIndex)
    with self.assertRaises(AssertionError):
        pd.testing.assert_index_equal(index, parsed_index.to_pandas())

    parsed_index = parse_index(index, store_data=True)  # parse data
    self.assertIsInstance(parsed_index.value, IndexValue.MultiIndex)
    pd.testing.assert_index_equal(index, parsed_index.to_pandas())
def testFilterIndexValue(self):
    pd_index = pd.RangeIndex(10)
    index_value = parse_index(pd_index)

    min_max = (0, True, 9, True)
    self.assertEqual(filter_index_value(index_value, min_max).to_pandas().tolist(),
                     pd_index[(pd_index >= 0) & (pd_index <= 9)].tolist())

    min_max = (0, False, 9, False)
    self.assertEqual(filter_index_value(index_value, min_max).to_pandas().tolist(),
                     pd_index[(pd_index > 0) & (pd_index < 9)].tolist())

    pd_index = pd.RangeIndex(1, 11, 3)
    index_value = parse_index(pd_index)

    min_max = (2, True, 10, True)
    self.assertEqual(filter_index_value(index_value, min_max).to_pandas().tolist(),
                     pd_index[(pd_index >= 2) & (pd_index <= 10)].tolist())

    min_max = (2, False, 10, False)
    self.assertEqual(filter_index_value(index_value, min_max).to_pandas().tolist(),
                     pd_index[(pd_index > 2) & (pd_index < 10)].tolist())

    pd_index = pd.RangeIndex(9, -1, -1)
    index_value = parse_index(pd_index)

    min_max = (0, True, 9, True)
    self.assertEqual(filter_index_value(index_value, min_max).to_pandas().tolist(),
                     pd_index[(pd_index >= 0) & (pd_index <= 9)].tolist())

    min_max = (0, False, 9, False)
    self.assertEqual(filter_index_value(index_value, min_max).to_pandas().tolist(),
                     pd_index[(pd_index > 0) & (pd_index < 9)].tolist())

    pd_index = pd.RangeIndex(10, 0, -3)
    index_value = parse_index(pd_index, store_data=False)

    min_max = (2, True, 10, True)
    self.assertEqual(filter_index_value(index_value, min_max).to_pandas().tolist(),
                     pd_index[(pd_index >= 2) & (pd_index <= 10)].tolist())

    min_max = (2, False, 10, False)
    self.assertEqual(filter_index_value(index_value, min_max).to_pandas().tolist(),
                     pd_index[(pd_index > 2) & (pd_index < 10)].tolist())

    pd_index = pd.Int64Index([0, 3, 8])
    index_value = parse_index(pd_index, store_data=True)

    min_max = (2, True, 8, False)
    self.assertEqual(filter_index_value(index_value, min_max, store_data=True).to_pandas().tolist(),
                     pd_index[(pd_index >= 2) & (pd_index < 8)].tolist())

    index_value = parse_index(pd_index)

    min_max = (2, True, 8, False)
    filtered = filter_index_value(index_value, min_max)
    self.assertEqual(len(filtered.to_pandas().tolist()), 0)
    self.assertIsInstance(filtered.value, IndexValue.Int64Index)
def testInferIndexValue(self):
    # same range index
    index1 = pd.RangeIndex(1, 3)
    index2 = pd.RangeIndex(1, 3)
    ival1 = parse_index(index1)
    ival2 = parse_index(index2)
    oival = infer_index_value(ival1, ival2)
    self.assertEqual(oival.key, ival1.key)
    self.assertEqual(oival.key, ival2.key)

    # different range index
    index1 = pd.RangeIndex(1, 3)
    index2 = pd.RangeIndex(2, 4)
    ival1 = parse_index(index1)
    ival2 = parse_index(index2)
    oival = infer_index_value(ival1, ival2)
    self.assertIsInstance(oival.value, IndexValue.Int64Index)
    self.assertNotEqual(oival.key, ival1.key)
    self.assertNotEqual(oival.key, ival2.key)

    # same int64 index, all unique
    index1 = pd.Int64Index([1, 2])
    index2 = pd.Int64Index([1, 2])
    ival1 = parse_index(index1)
    ival2 = parse_index(index2)
    oival = infer_index_value(ival1, ival2)
    self.assertIsInstance(oival.value, IndexValue.Int64Index)
    self.assertEqual(oival.key, ival1.key)
    self.assertEqual(oival.key, ival2.key)

    # same int64 index, not all unique
    index1 = pd.Int64Index([1, 2, 2])
    index2 = pd.Int64Index([1, 2, 2])
    ival1 = parse_index(index1)
    ival2 = parse_index(index2)
    oival = infer_index_value(ival1, ival2)
    self.assertIsInstance(oival.value, IndexValue.Int64Index)
    self.assertNotEqual(oival.key, ival1.key)
    self.assertNotEqual(oival.key, ival2.key)

    # different int64 index
    index1 = pd.Int64Index([1, 2])
    index2 = pd.Int64Index([2, 3])
    ival1 = parse_index(index1)
    ival2 = parse_index(index2)
    oival = infer_index_value(ival1, ival2)
    self.assertIsInstance(oival.value, IndexValue.Int64Index)
    self.assertNotEqual(oival.key, ival1.key)
    self.assertNotEqual(oival.key, ival2.key)

    # different index type
    index1 = pd.Int64Index([1, 2])
    index2 = pd.Float64Index([2.0, 3.0])
    ival1 = parse_index(index1)
    ival2 = parse_index(index2)
    oival = infer_index_value(ival1, ival2)
    self.assertIsInstance(oival.value, IndexValue.Float64Index)
    self.assertNotEqual(oival.key, ival1.key)
    self.assertNotEqual(oival.key, ival2.key)

    # range index and other index
    index1 = pd.RangeIndex(1, 4)
    index2 = pd.Float64Index([2, 3, 4])
    ival1 = parse_index(index1)
    ival2 = parse_index(index2)
    oival = infer_index_value(ival1, ival2)
    self.assertIsInstance(oival.value, IndexValue.Float64Index)
    self.assertNotEqual(oival.key, ival1.key)
    self.assertNotEqual(oival.key, ival2.key)

    index1 = pd.DatetimeIndex([])
    index2 = pd.RangeIndex(2)
    ival1 = parse_index(index1)
    ival2 = parse_index(index2)
    oival = infer_index_value(ival1, ival2)
    self.assertIsInstance(oival.value, IndexValue.Index)
    self.assertNotEqual(oival.key, ival1.key)
    self.assertNotEqual(oival.key, ival2.key)
def test_infer_index_value():
    # same range index
    index1 = pd.RangeIndex(1, 3)
    index2 = pd.RangeIndex(1, 3)
    ival1 = parse_index(index1)
    ival2 = parse_index(index2)
    oival = infer_index_value(ival1, ival2)
    assert oival.key == ival1.key
    assert oival.key == ival2.key

    # different range index
    index1 = pd.RangeIndex(1, 3)
    index2 = pd.RangeIndex(2, 4)
    ival1 = parse_index(index1)
    ival2 = parse_index(index2)
    oival = infer_index_value(ival1, ival2)
    assert isinstance(oival.value, IndexValue.Int64Index)
    assert oival.key != ival1.key
    assert oival.key != ival2.key

    # same int64 index, all unique
    index1 = pd.Int64Index([1, 2])
    index2 = pd.Int64Index([1, 2])
    ival1 = parse_index(index1)
    ival2 = parse_index(index2)
    oival = infer_index_value(ival1, ival2)
    assert isinstance(oival.value, IndexValue.Int64Index)
    assert oival.key == ival1.key
    assert oival.key == ival2.key

    # same int64 index, not all unique
    index1 = pd.Int64Index([1, 2, 2])
    index2 = pd.Int64Index([1, 2, 2])
    ival1 = parse_index(index1)
    ival2 = parse_index(index2)
    oival = infer_index_value(ival1, ival2)
    assert isinstance(oival.value, IndexValue.Int64Index)
    assert oival.key != ival1.key
    assert oival.key != ival2.key

    # different int64 index
    index1 = pd.Int64Index([1, 2])
    index2 = pd.Int64Index([2, 3])
    ival1 = parse_index(index1)
    ival2 = parse_index(index2)
    oival = infer_index_value(ival1, ival2)
    assert isinstance(oival.value, IndexValue.Int64Index)
    assert oival.key != ival1.key
    assert oival.key != ival2.key

    # different index type
    index1 = pd.Int64Index([1, 2])
    index2 = pd.Float64Index([2.0, 3.0])
    ival1 = parse_index(index1)
    ival2 = parse_index(index2)
    oival = infer_index_value(ival1, ival2)
    assert isinstance(oival.value, IndexValue.Float64Index)
    assert oival.key != ival1.key
    assert oival.key != ival2.key

    # range index and other index
    index1 = pd.RangeIndex(1, 4)
    index2 = pd.Float64Index([2, 3, 4])
    ival1 = parse_index(index1)
    ival2 = parse_index(index2)
    oival = infer_index_value(ival1, ival2)
    assert isinstance(oival.value, IndexValue.Float64Index)
    assert oival.key != ival1.key
    assert oival.key != ival2.key

    index1 = pd.DatetimeIndex([])
    index2 = pd.RangeIndex(2)
    ival1 = parse_index(index1)
    ival2 = parse_index(index2)
    oival = infer_index_value(ival1, ival2)
    assert isinstance(oival.value, IndexValue.Index)
    assert oival.key != ival1.key
    assert oival.key != ival2.key
def test_filter_index_value():
    pd_index = pd.RangeIndex(10)
    index_value = parse_index(pd_index)

    min_max = (0, True, 9, True)
    assert filter_index_value(index_value, min_max).to_pandas().tolist() \
        == pd_index[(pd_index >= 0) & (pd_index <= 9)].tolist()

    min_max = (0, False, 9, False)
    assert filter_index_value(index_value, min_max).to_pandas().tolist() \
        == pd_index[(pd_index > 0) & (pd_index < 9)].tolist()

    pd_index = pd.RangeIndex(1, 11, 3)
    index_value = parse_index(pd_index)

    min_max = (2, True, 10, True)
    assert filter_index_value(index_value, min_max).to_pandas().tolist() \
        == pd_index[(pd_index >= 2) & (pd_index <= 10)].tolist()

    min_max = (2, False, 10, False)
    assert filter_index_value(index_value, min_max).to_pandas().tolist() \
        == pd_index[(pd_index > 2) & (pd_index < 10)].tolist()

    pd_index = pd.RangeIndex(9, -1, -1)
    index_value = parse_index(pd_index)

    min_max = (0, True, 9, True)
    assert filter_index_value(index_value, min_max).to_pandas().tolist() \
        == pd_index[(pd_index >= 0) & (pd_index <= 9)].tolist()

    min_max = (0, False, 9, False)
    assert filter_index_value(index_value, min_max).to_pandas().tolist() \
        == pd_index[(pd_index > 0) & (pd_index < 9)].tolist()

    pd_index = pd.RangeIndex(10, 0, -3)
    index_value = parse_index(pd_index, store_data=False)

    min_max = (2, True, 10, True)
    assert filter_index_value(index_value, min_max).to_pandas().tolist() \
        == pd_index[(pd_index >= 2) & (pd_index <= 10)].tolist()

    min_max = (2, False, 10, False)
    assert filter_index_value(index_value, min_max).to_pandas().tolist() \
        == pd_index[(pd_index > 2) & (pd_index < 10)].tolist()

    pd_index = pd.Int64Index([0, 3, 8])
    index_value = parse_index(pd_index, store_data=True)

    min_max = (2, True, 8, False)
    assert filter_index_value(index_value, min_max, store_data=True).to_pandas().tolist() \
        == pd_index[(pd_index >= 2) & (pd_index < 8)].tolist()

    index_value = parse_index(pd_index)

    min_max = (2, True, 8, False)
    filtered = filter_index_value(index_value, min_max)
    assert len(filtered.to_pandas().tolist()) == 0
    assert isinstance(filtered.value, IndexValue.Int64Index)
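# The tests above all lean on one contract of parse_index: without
# store_data=True only index metadata (type, dtype, range parameters) is kept,
# so the value cannot round-trip arbitrary labels back to pandas. Below is a
# minimal standalone sketch of that behaviour; the import path is assumed to
# match what these tests use and the variable names are illustrative only.
import pandas as pd
from mars.dataframe.utils import parse_index  # assumed import path

idx = pd.Index([1, 2, 3], dtype='int64')

meta_only = parse_index(idx)                   # keeps metadata only
with_data = parse_index(idx, store_data=True)  # keeps the actual labels

# the metadata-only value does not round-trip the labels ...
try:
    pd.testing.assert_index_equal(idx, meta_only.to_pandas())
    round_tripped = True
except AssertionError:
    round_tripped = False
assert not round_tripped

# ... while the stored one does
pd.testing.assert_index_equal(idx, with_data.to_pandas())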
def tile(cls, op):
    import numpy as np
    import pandas as pd
    from odps import ODPS
    from odps.accounts import BearerTokenAccount
    from odps.errors import ODPSError
    from cupid import CupidSession, context
    from cupid.errors import CupidError
    from mars.context import get_context

    cupid_ctx = context()
    if cupid_ctx is None:
        raise SystemError('No Mars cluster found, please create via `o.create_mars_cluster`.')

    bearer_token = cupid_ctx.get_bearer_token()
    account = BearerTokenAccount(bearer_token)
    project = os.environ.get('ODPS_PROJECT_NAME', None)
    odps_params = op.odps_params.copy()
    if project:
        odps_params['project'] = project
    o = ODPS(None, None, account=account, **odps_params)
    cupid_session = CupidSession(o)
    mars_context = get_context()

    df = op.outputs[0]
    split_size = df.extra_params.chunk_bytes or READ_CHUNK_LIMIT

    data_src = o.get_table(op.table_name)
    if op.partition is not None:
        data_src = data_src.get_partition(op.partition)

    try:
        data_store_size = data_src.size
    except ODPSError:
        # fail to get data size, just ignore
        pass
    else:
        if data_store_size < split_size and mars_context is not None:
            # get worker counts
            worker_count = max(len(mars_context.get_worker_addresses()), 1)
            # data is too small, split as many as number of cores
            split_size = data_store_size // worker_count
            # at least 1M
            split_size = max(split_size, 1 * 1024 ** 2)
            logger.debug('Input data size is too small, split_size is {}'.format(split_size))

    logger.debug('Start creating download session of table {} from cupid.'.format(op.table_name))
    while True:
        try:
            download_session = cupid_session.create_download_session(
                data_src, split_size=split_size, columns=op.columns)
            break
        except CupidError:
            logger.debug('The number of splits exceeds 100000, split_size is {}'.format(split_size))
            if split_size >= MAX_CHUNK_SIZE:
                raise
            else:
                split_size *= 2

    logger.debug('%s table splits have been created.', str(len(download_session.splits)))

    if np.isnan(df.shape[0]):
        est_chunk_rows = [None] * len(download_session.splits)
    else:
        sp_file_sizes = np.array([sp.split_file_end - sp.split_file_start
                                  for sp in download_session.splits])
        total_size = sp_file_sizes.sum()
        est_chunk_rows = sp_file_sizes * df.shape[0] // total_size

    logger.warning('Estimated chunk rows: %r', est_chunk_rows)

    out_chunks = []
    # Ignore add_offset at this time.
    op._add_offset = False

    if len(download_session.splits) == 0:
        logger.debug('Table {} has no data'.format(op.table_name))
        chunk_op = DataFrameReadTableSplit()
        index_value = parse_index(pd.RangeIndex(0))
        columns_value = parse_index(df.dtypes.index, store_data=True)
        out_chunk = chunk_op.new_chunk(None, shape=(np.nan, df.shape[1]),
                                       dtypes=op.dtypes,
                                       index_value=index_value,
                                       columns_value=columns_value,
                                       index=(0, 0))
        out_chunks = [out_chunk]
    else:
        for idx, split in enumerate(download_session.splits):
            chunk_op = DataFrameReadTableSplit(
                cupid_handle=to_str(split.handle),
                split_index=split.split_index,
                split_file_start=split.split_file_start,
                split_file_end=split.split_file_end,
                schema_file_start=split.schema_file_start,
                schema_file_end=split.schema_file_end,
                add_offset=op.add_offset,
                dtypes=op.dtypes,
                sparse=op.sparse,
                split_size=split_size,
                use_arrow_dtype=op.use_arrow_dtype,
                estimate_rows=est_chunk_rows[idx])
            # the chunk shape is unknown
            index_value = parse_index(pd.RangeIndex(0))
            columns_value = parse_index(df.dtypes.index, store_data=True)
            out_chunk = chunk_op.new_chunk(None, shape=(np.nan, df.shape[1]),
                                           dtypes=op.dtypes,
                                           index_value=index_value,
                                           columns_value=columns_value,
                                           index=(idx, 0))
            out_chunks.append(out_chunk)

    if op.add_offset:
        out_chunks = standardize_range_index(out_chunks)

    new_op = op.copy()
    nsplits = ((np.nan,) * len(out_chunks), (df.shape[1],))
    return new_op.new_dataframes(None, shape=df.shape, dtypes=op.dtypes,
                                 index_value=df.index_value,
                                 columns_value=df.columns_value,
                                 chunks=out_chunks, nsplits=nsplits)
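# The split-size handling in tile() above follows a simple pattern: try to
# create a download session and, on a "too many splits" error, double
# split_size until a cap is reached. A generic sketch of just that loop;
# create_session, max_split_size and too_many_splits_error are placeholders,
# not the cupid API.
def create_session_with_backoff(create_session, split_size, max_split_size,
                                too_many_splits_error=Exception):
    while True:
        try:
            # success: return both the session and the split size actually used
            return create_session(split_size), split_size
        except too_many_splits_error:
            if split_size >= max_split_size:
                raise  # give up once the cap is reached
            split_size *= 2  # ask for fewer, larger splits next time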
def tile(cls, op):
    import numpy as np
    import pandas as pd
    from odps import ODPS
    from odps.accounts import BearerTokenAccount
    from cupid import CupidSession, context
    from cupid.errors import CupidError

    bearer_token = context().get_bearer_token()
    account = BearerTokenAccount(bearer_token)
    project = os.environ.get('ODPS_PROJECT_NAME', None)
    odps_params = op.odps_params.copy()
    if project:
        odps_params['project'] = project
    o = ODPS(None, None, account=account, **odps_params)
    cupid_session = CupidSession(o)

    df = op.outputs[0]
    split_size = df.extra_params.chunk_bytes or CHUNK_LIMIT

    data_src = o.get_table(op.table_name)
    if op.partition is not None:
        data_src = data_src.get_partition(op.partition)

    logger.debug('Start creating download session from cupid.')
    while True:
        try:
            download_session = cupid_session.create_download_session(
                data_src, split_size=split_size, columns=op.columns)
            break
        except CupidError:
            logger.debug('The number of splits exceeds 100000, split_size is {}'.format(split_size))
            if split_size >= MAX_CHUNK_SIZE:
                raise
            else:
                split_size *= 2

    logger.debug('%s table splits have been created.', str(len(download_session.splits)))

    out_chunks = []
    # Ignore add_offset at this time.
    op._add_offset = False

    for idx, split in enumerate(download_session.splits):
        chunk_op = DataFrameReadTableSplit(
            cupid_handle=to_str(split.handle),
            split_index=split.split_index,
            split_file_start=split.split_file_start,
            split_file_end=split.split_file_end,
            schema_file_start=split.schema_file_start,
            schema_file_end=split.schema_file_end,
            add_offset=op.add_offset,
            dtypes=op.dtypes,
            sparse=op.sparse)
        # the chunk shape is unknown
        index_value = parse_index(pd.RangeIndex(0))
        columns_value = parse_index(df.dtypes.index, store_data=True)
        out_chunk = chunk_op.new_chunk(None, shape=(np.nan, df.shape[1]),
                                       dtypes=op.dtypes,
                                       index_value=index_value,
                                       columns_value=columns_value,
                                       index=(idx, 0))
        out_chunks.append(out_chunk)

    if op.add_offset:
        out_chunks = standardize_range_index(out_chunks)

    new_op = op.copy()
    nsplits = ((np.nan,) * len(out_chunks), (df.shape[1],))
    return new_op.new_dataframes(None, shape=df.shape, dtypes=op.dtypes,
                                 index_value=df.index_value,
                                 columns_value=df.columns_value,
                                 chunks=out_chunks, nsplits=nsplits)
def _tile_tunnel(cls, op):
    from odps import ODPS

    project = os.environ.get('ODPS_PROJECT_NAME', None)
    odps_params = op.odps_params.copy()
    if project:
        odps_params['project'] = project
    endpoint = os.environ.get('ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
    o = ODPS(odps_params['access_id'], odps_params['secret_access_key'],
             project=odps_params['project'], endpoint=endpoint)

    table_obj = o.get_table(op.table_name)
    if not table_obj.schema.partitions:
        data_srcs = [table_obj]
    elif op.partition is not None and check_partition_exist(table_obj, op.partition):
        data_srcs = [table_obj.get_partition(op.partition)]
    else:
        data_srcs = list(table_obj.partitions)
        if op.partition is not None:
            data_srcs = filter_partitions(o, data_srcs, op.partition)

    out_chunks = []
    row_nsplits = []
    index_start = 0
    df = op.outputs[0]

    out_dtypes = df.dtypes
    out_shape = df.shape
    out_columns_value = df.columns_value
    if op.columns is not None:
        out_dtypes = out_dtypes[op.columns]
        out_shape = (df.shape[0], len(op.columns))
        out_columns_value = parse_index(out_dtypes.index, store_data=True)

    if len(data_srcs) == 0:
        # no partitions are selected
        chunk_op = DataFrameReadTableSplit()
        index_value = parse_index(pd.RangeIndex(0))
        columns_value = parse_index(out_dtypes.index, store_data=True)
        out_chunk = chunk_op.new_chunk(None, shape=(0, out_shape[1]),
                                       dtypes=op.dtypes,
                                       index_value=index_value,
                                       columns_value=columns_value,
                                       index=(index_start, 0))
        out_chunks.append(out_chunk)
    else:
        retry_times = op.retry_times or options.retry_times
        for data_src in data_srcs:
            data_store_size = data_src.size

            retries = 0
            while True:
                try:
                    with data_src.open_reader() as reader:
                        record_count = reader.count
                    break
                except:
                    if retries >= retry_times:
                        raise
                    retries += 1
                    time.sleep(1)

            if data_store_size == 0:
                # empty table
                chunk_op = DataFrameReadTableSplit()
                index_value = parse_index(pd.RangeIndex(0))
                columns_value = parse_index(out_dtypes.index, store_data=True)
                out_chunk = chunk_op.new_chunk(None, shape=(0, out_shape[1]),
                                               dtypes=op.dtypes,
                                               index_value=index_value,
                                               columns_value=columns_value,
                                               index=(index_start, 0))
                out_chunks.append(out_chunk)
                index_start += 1
                continue

            chunk_size = df.extra_params.chunk_size
            partition_spec = str(data_src.partition_spec) \
                if getattr(data_src, 'partition_spec', None) else None

            if chunk_size is None:
                chunk_bytes = df.extra_params.chunk_bytes or CHUNK_BYTES_LIMIT
                chunk_count = data_store_size // chunk_bytes + (data_store_size % chunk_bytes != 0)
                chunk_size = ceildiv(record_count, chunk_count)
                split_size = chunk_bytes
            else:
                chunk_count = ceildiv(record_count, chunk_size)
                split_size = data_store_size // chunk_count

            for i in range(chunk_count):
                start_index = chunk_size * i
                end_index = min(chunk_size * (i + 1), record_count)
                row_size = end_index - start_index
                chunk_op = DataFrameReadTableSplit(
                    table_name=op.table_name,
                    partition_spec=partition_spec,
                    start_index=start_index,
                    end_index=end_index,
                    nrows=op.nrows,
                    odps_params=op.odps_params,
                    columns=op.columns,
                    incremental_index=op.incremental_index,
                    dtypes=out_dtypes,
                    sparse=op.sparse,
                    split_size=split_size,
                    use_arrow_dtype=op.use_arrow_dtype,
                    estimate_rows=row_size,
                    append_partitions=op.append_partitions,
                    memory_scale=op.memory_scale,
                    retry_times=op.retry_times,
                    extra_params=op.extra_params)
                index_value = parse_index(pd.RangeIndex(start_index, end_index))
                columns_value = parse_index(out_dtypes.index, store_data=True)
                out_chunk = chunk_op.new_chunk(None, shape=(row_size, out_shape[1]),
                                               dtypes=out_dtypes,
                                               index_value=index_value,
                                               columns_value=columns_value,
                                               index=(index_start + i, 0))
                row_nsplits.append(row_size)
                out_chunks.append(out_chunk)

            index_start += chunk_count

    if op.incremental_index and _NEED_STANDARDIZE:
        out_chunks = standardize_range_index(out_chunks)

    new_op = op.copy()
    nsplits = (tuple(row_nsplits), (out_shape[1],))
    return new_op.new_dataframes(None, shape=out_shape, dtypes=op.dtypes,
                                 index_value=df.index_value,
                                 columns_value=out_columns_value,
                                 chunks=out_chunks, nsplits=nsplits)
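# In the tunnel path above, the number of chunks comes from the stored size
# and chunk_bytes, and the rows per chunk from the total record count. A
# worked example of that arithmetic with made-up sizes; ceildiv mirrors the
# ceiling-division helper the method calls.
def ceildiv(a, b):
    # ceiling division: smallest integer >= a / b
    return -(-a // b)

data_store_size = 10 * 1024 ** 3   # 10 GiB stored on disk (example value)
record_count = 200_000_000         # rows reported by the tunnel reader (example value)
chunk_bytes = 64 * 1024 ** 2       # configured chunk_bytes (example value)

# number of chunks so that each covers at most chunk_bytes of stored data
chunk_count = data_store_size // chunk_bytes + (data_store_size % chunk_bytes != 0)
# rows per chunk, rounded up so chunk_count chunks cover every record
chunk_size = ceildiv(record_count, chunk_count)

assert chunk_count == 160
assert chunk_size == 1_250_000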
def _tile_cupid(cls, op):
    import numpy as np
    import pandas as pd
    from mars.core.context import get_context

    df = op.outputs[0]
    split_size = df.extra_params.chunk_bytes or CHUNK_BYTES_LIMIT

    out_dtypes = df.dtypes
    out_shape = df.shape
    out_columns_value = df.columns_value
    if op.columns is not None:
        out_dtypes = out_dtypes[op.columns]
        out_shape = (df.shape[0], len(op.columns))
        out_columns_value = parse_index(out_dtypes.index, store_data=True)

    mars_context = get_context()
    if mars_context is not None:
        worker_count = len(mars_context.get_worker_addresses())
    else:
        worker_count = None

    cupid_client = CupidServiceClient()
    try:
        parts = cupid_client.enum_table_partitions(op.odps_params, op.table_name, op.partition)
        if parts is None:
            parts = [None]

        out_chunks = []
        chunk_idx = 0

        for partition_spec in parts:
            splits, split_size = cupid_client.create_table_download_session(
                op.odps_params, op.table_name, partition_spec, op.columns,
                worker_count, split_size, MAX_CHUNK_NUM, op.with_split_meta_on_tile)

            logger.debug('%s table splits have been created.', str(len(splits)))

            meta_chunk_rows = [split.meta_row_count for split in splits]
            if np.isnan(out_shape[0]):
                est_chunk_rows = meta_chunk_rows
            else:
                sp_file_sizes = np.array([sp.split_file_end - sp.split_file_start
                                          for sp in splits])
                total_size = sp_file_sizes.sum()
                ratio_chunk_rows = (sp_file_sizes * out_shape[0] // total_size).tolist()
                est_chunk_rows = [mr if mr is not None else rr
                                  for mr, rr in zip(meta_chunk_rows, ratio_chunk_rows)]

            logger.warning('Estimated chunk rows: %r', est_chunk_rows)

            if len(splits) == 0:
                logger.debug('Table %s has no data', op.table_name)
                chunk_op = DataFrameReadTableSplit()
                index_value = parse_index(pd.RangeIndex(0))
                columns_value = parse_index(out_dtypes.index, store_data=True)
                out_chunk = chunk_op.new_chunk(None, shape=(np.nan, out_shape[1]),
                                               dtypes=op.dtypes,
                                               index_value=index_value,
                                               columns_value=columns_value,
                                               index=(chunk_idx, 0))
                out_chunks.append(out_chunk)
                chunk_idx += 1
            else:
                for idx, split in enumerate(splits):
                    chunk_op = DataFrameReadTableSplit(
                        cupid_handle=to_str(split.handle),
                        split_index=split.split_index,
                        split_file_start=split.split_file_start,
                        split_file_end=split.split_file_end,
                        schema_file_start=split.schema_file_start,
                        schema_file_end=split.schema_file_end,
                        incremental_index=op.incremental_index,
                        dtypes=out_dtypes,
                        sparse=op.sparse,
                        split_size=split_size,
                        string_as_binary=op.string_as_binary,
                        use_arrow_dtype=op.use_arrow_dtype,
                        estimate_rows=est_chunk_rows[idx],
                        partition_spec=partition_spec,
                        append_partitions=op.append_partitions,
                        meta_raw_size=split.meta_raw_size,
                        nrows=meta_chunk_rows[idx] or op.nrows,
                        memory_scale=op.memory_scale,
                        extra_params=op.extra_params)
                    # the chunk shape is unknown
                    index_value = parse_index(pd.RangeIndex(0))
                    columns_value = parse_index(out_dtypes.index, store_data=True)
                    out_chunk = chunk_op.new_chunk(None, shape=(np.nan, out_shape[1]),
                                                   dtypes=out_dtypes,
                                                   index_value=index_value,
                                                   columns_value=columns_value,
                                                   index=(chunk_idx, 0))
                    chunk_idx += 1
                    out_chunks.append(out_chunk)
    finally:
        cupid_client.close()

    if op.incremental_index and _NEED_STANDARDIZE:
        out_chunks = standardize_range_index(out_chunks)

    new_op = op.copy()
    nsplits = ((np.nan,) * len(out_chunks), (out_shape[1],))
    return new_op.new_dataframes(None, shape=out_shape, dtypes=op.dtypes,
                                 index_value=df.index_value,
                                 columns_value=out_columns_value,
                                 chunks=out_chunks, nsplits=nsplits)
def tile(cls, op):
    import numpy as np
    import pandas as pd
    from odps import ODPS
    from odps.accounts import BearerTokenAccount
    from cupid import CupidSession, context

    bearer_token = context().get_bearer_token()
    account = BearerTokenAccount(bearer_token)
    o = ODPS(None, None, account=account, **op.odps_params)
    cupid_session = CupidSession(o)

    df = op.outputs[0]
    split_size = df.extra_params.chunk_store_limit or options.tensor.chunk_store_limit

    data_src = o.get_table(op.table_name)
    if op.partition is not None:
        data_src = data_src.get_partition(op.partition)

    logger.debug('Start creating download session from cupid.')
    download_session = cupid_session.create_download_session(data_src, split_size=split_size)
    logger.debug('%s table splits have been created.', str(len(download_session.splits)))

    out_chunks = []
    out_count_chunks = []
    for idx, split in enumerate(download_session.splits):
        chunk_op = DataFrameReadTableSplit(
            cupid_handle=to_str(split.handle),
            split_index=split.split_index,
            split_file_start=split.split_file_start,
            split_file_end=split.split_file_end,
            schema_file_start=split.schema_file_start,
            schema_file_end=split.schema_file_end,
            dtypes=op.dtypes,
            sparse=op.sparse)
        # the chunk shape is unknown
        index_value = parse_index(pd.RangeIndex(0))
        columns_value = parse_index(df.dtypes.index, store_data=True)
        out_chunk, out_count_chunk = chunk_op.new_chunks(
            None,
            kws=[
                {'shape': (np.nan, df.shape[1]), 'dtypes': op.dtypes,
                 'index_value': index_value, 'columns_value': columns_value,
                 'index': (idx,)},
                {'shape': (1,), 'index': (idx,)},
            ])
        out_chunks.append(out_chunk)
        out_count_chunks.append(out_count_chunk)

    if op.add_offset:
        output_chunks = []
        for i, chunk in enumerate(out_chunks):
            if i == 0:
                output_chunks.append(chunk)
                continue
            counts = out_count_chunks[:i]
            inputs = [chunk] + counts
            output_chunk = DataFrameReadTableWithOffset(dtypes=chunk.dtypes).new_chunk(
                inputs, shape=chunk.shape, index=chunk.index, dtypes=chunk.dtypes,
                index_value=chunk.index_value, columns_value=chunk.columns_value)
            output_chunks.append(output_chunk)
    else:
        output_chunks = out_chunks

    new_op = op.copy()
    nsplits = ((np.nan,) * len(output_chunks), (df.shape[1],))
    return new_op.new_dataframes(None, shape=df.shape, dtypes=op.dtypes,
                                 index_value=df.index_value,
                                 columns_value=df.columns_value,
                                 chunks=output_chunks, nsplits=nsplits)
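# In the add_offset branch above, chunk i receives the row-count chunks of all
# chunks before it, which suggests its index offset is the cumulative sum of
# those counts (the execute logic of DataFrameReadTableWithOffset is not shown
# here, so this is an inference from the chunk wiring). A tiny illustration
# with made-up counts:
counts = [3, 5, 2]            # rows produced by each split (example values)
offsets = [sum(counts[:i]) for i in range(len(counts))]
assert offsets == [0, 3, 8]   # chunk 0 starts at 0, chunk 1 at 3, chunk 2 at 8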
def _tile_tunnel(cls, op):
    from odps import ODPS

    project = os.environ.get('ODPS_PROJECT_NAME', None)
    odps_params = op.odps_params.copy()
    if project:
        odps_params['project'] = project
    endpoint = os.environ.get('ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
    o = ODPS(odps_params['access_id'], odps_params['secret_access_key'],
             project=odps_params['project'], endpoint=endpoint)

    table_obj = o.get_table(op.table_name)
    if not table_obj.schema.partitions:
        data_srcs = [table_obj]
    elif op.partition is not None and check_partition_exist(table_obj, op.partition):
        data_srcs = [table_obj.get_partition(op.partition)]
    else:
        data_srcs = list(table_obj.partitions)
        if op.partition is not None:
            data_srcs = filter_partitions(o, data_srcs, op.partition)

    out_chunks = []
    row_nsplits = []
    index_start = 0
    df = op.outputs[0]

    out_dtypes = df.dtypes
    out_shape = df.shape
    out_columns_value = df.columns_value
    if op.columns is not None:
        out_dtypes = out_dtypes[op.columns]
        out_shape = (df.shape[0], len(op.columns))
        out_columns_value = parse_index(out_dtypes.index, store_data=True)

    for data_src in data_srcs:
        data_store_size = data_src.size
        shape = out_shape
        chunk_size = df.extra_params.chunk_size

        partition_spec = str(data_src.partition_spec) \
            if getattr(data_src, 'partition_spec', None) else None

        if chunk_size is None:
            chunk_bytes = df.extra_params.chunk_bytes or READ_CHUNK_LIMIT
            chunk_count = data_store_size // chunk_bytes + (data_store_size % chunk_bytes != 0)
            chunk_size = ceildiv(shape[0], chunk_count)
            split_size = chunk_bytes
        else:
            chunk_count = ceildiv(shape[0], chunk_size)
            split_size = data_store_size // chunk_count

        for i in range(chunk_count):
            start_index = chunk_size * i
            end_index = min(chunk_size * (i + 1), shape[0])
            row_size = end_index - start_index
            chunk_op = DataFrameReadTableSplit(
                table_name=op.table_name,
                partition_spec=partition_spec,
                start_index=start_index,
                end_index=end_index,
                nrows=op.nrows,
                odps_params=op.odps_params,
                columns=op.columns,
                add_offset=op.add_offset,
                dtypes=out_dtypes,
                sparse=op.sparse,
                split_size=split_size,
                use_arrow_dtype=op.use_arrow_dtype,
                estimate_rows=row_size,
                append_partitions=op.append_partitions,
                memory_scale=op.memory_scale)
            index_value = parse_index(pd.RangeIndex(start_index, end_index))
            columns_value = parse_index(out_dtypes.index, store_data=True)
            out_chunk = chunk_op.new_chunk(None, shape=(row_size, out_shape[1]),
                                           dtypes=out_dtypes,
                                           index_value=index_value,
                                           columns_value=columns_value,
                                           index=(index_start + i, 0))
            row_nsplits.append(row_size)
            out_chunks.append(out_chunk)

        index_start += chunk_count

    if op.add_offset:
        out_chunks = standardize_range_index(out_chunks)

    new_op = op.copy()
    nsplits = (tuple(row_nsplits), (out_shape[1],))
    return new_op.new_dataframes(None, shape=out_shape, dtypes=op.dtypes,
                                 index_value=df.index_value,
                                 columns_value=out_columns_value,
                                 chunks=out_chunks, nsplits=nsplits)
def _tile_cupid(cls, op):
    from odps import ODPS
    from odps.accounts import BearerTokenAccount
    from odps.errors import ODPSError
    from cupid import CupidSession, context
    from cupid.errors import CupidError
    from mars.context import get_context

    cupid_ctx = context()

    bearer_token = cupid_ctx.get_bearer_token()
    account = BearerTokenAccount(bearer_token)
    project = os.environ.get('ODPS_PROJECT_NAME', None)
    odps_params = op.odps_params.copy()
    if project:
        odps_params['project'] = project
    endpoint = os.environ.get('ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
    o = ODPS(None, None, account=account,
             project=odps_params['project'], endpoint=endpoint)
    cupid_session = CupidSession(o)

    mars_context = get_context()

    df = op.outputs[0]
    split_size = df.extra_params.chunk_bytes or READ_CHUNK_LIMIT

    out_dtypes = df.dtypes
    out_shape = df.shape
    out_columns_value = df.columns_value
    if op.columns is not None:
        out_dtypes = out_dtypes[op.columns]
        out_shape = (df.shape[0], len(op.columns))
        out_columns_value = parse_index(out_dtypes.index, store_data=True)

    table_obj = o.get_table(op.table_name)
    if not table_obj.schema.partitions:
        data_srcs = [table_obj]
    elif op.partition is not None and check_partition_exist(table_obj, op.partition):
        data_srcs = [table_obj.get_partition(op.partition)]
    else:
        data_srcs = list(table_obj.partitions)
        if op.partition is not None:
            data_srcs = filter_partitions(o, data_srcs, op.partition)

    out_chunks = []
    chunk_idx = 0

    for data_src in data_srcs:
        try:
            data_store_size = data_src.size
        except ODPSError:
            # fail to get data size, just ignore
            pass
        else:
            if data_store_size < split_size and mars_context is not None:
                # get worker counts
                worker_count = max(len(mars_context.get_worker_addresses()), 1)
                # data is too small, split as many as number of cores
                split_size = data_store_size // worker_count
                # at least 1M
                split_size = max(split_size, 1 * 1024 ** 2)
                logger.debug('Input data size is too small, split_size is %s', split_size)

        logger.debug('Start creating download session of table %s from cupid, '
                     'columns: %s', op.table_name, op.columns)
        while True:
            try:
                download_session = cupid_session.create_download_session(
                    data_src, split_size=split_size, columns=op.columns,
                    with_split_meta=op.with_split_meta_on_tile)
                break
            except CupidError:
                logger.debug('The number of splits exceeds 100000, split_size is %s', split_size)
                if split_size >= MAX_CHUNK_SIZE:
                    raise
                else:
                    split_size *= 2

        logger.debug('%s table splits have been created.', str(len(download_session.splits)))

        meta_chunk_rows = [split.meta_row_count for split in download_session.splits]
        if np.isnan(out_shape[0]):
            est_chunk_rows = meta_chunk_rows
        else:
            sp_file_sizes = np.array([sp.split_file_end - sp.split_file_start
                                      for sp in download_session.splits])
            total_size = sp_file_sizes.sum()
            ratio_chunk_rows = (sp_file_sizes * out_shape[0] // total_size).tolist()
            est_chunk_rows = [mr if mr is not None else rr
                              for mr, rr in zip(meta_chunk_rows, ratio_chunk_rows)]

        partition_spec = str(data_src.partition_spec) \
            if getattr(data_src, 'partition_spec', None) else None

        logger.warning('Estimated chunk rows: %r', est_chunk_rows)

        if len(download_session.splits) == 0:
            logger.debug('Table %s has no data', op.table_name)
            chunk_op = DataFrameReadTableSplit()
            index_value = parse_index(pd.RangeIndex(0))
            columns_value = parse_index(out_dtypes.index, store_data=True)
            out_chunk = chunk_op.new_chunk(None, shape=(np.nan, out_shape[1]),
                                           dtypes=op.dtypes,
                                           index_value=index_value,
                                           columns_value=columns_value,
                                           index=(chunk_idx, 0))
            out_chunks.append(out_chunk)
            chunk_idx += 1
        else:
            for idx, split in enumerate(download_session.splits):
                chunk_op = DataFrameReadTableSplit(
                    cupid_handle=to_str(split.handle),
                    split_index=split.split_index,
                    split_file_start=split.split_file_start,
                    split_file_end=split.split_file_end,
                    schema_file_start=split.schema_file_start,
                    schema_file_end=split.schema_file_end,
                    add_offset=op.add_offset,
                    dtypes=out_dtypes,
                    sparse=op.sparse,
                    split_size=split_size,
                    string_as_binary=op.string_as_binary,
                    use_arrow_dtype=op.use_arrow_dtype,
                    estimate_rows=est_chunk_rows[idx],
                    partition_spec=partition_spec,
                    append_partitions=op.append_partitions,
                    meta_raw_size=split.meta_raw_size,
                    nrows=meta_chunk_rows[idx] or op.nrows,
                    memory_scale=op.memory_scale)
                # the chunk shape is unknown
                index_value = parse_index(pd.RangeIndex(0))
                columns_value = parse_index(out_dtypes.index, store_data=True)
                out_chunk = chunk_op.new_chunk(None, shape=(np.nan, out_shape[1]),
                                               dtypes=out_dtypes,
                                               index_value=index_value,
                                               columns_value=columns_value,
                                               index=(chunk_idx, 0))
                chunk_idx += 1
                out_chunks.append(out_chunk)

    if op.add_offset:
        out_chunks = standardize_range_index(out_chunks)

    new_op = op.copy()
    nsplits = ((np.nan,) * len(out_chunks), (out_shape[1],))
    return new_op.new_dataframes(None, shape=out_shape, dtypes=op.dtypes,
                                 index_value=df.index_value,
                                 columns_value=out_columns_value,
                                 chunks=out_chunks, nsplits=nsplits)
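# When the total row count is known, the cupid tiling paths above estimate
# per-split rows proportionally to the split file sizes, preferring split
# metadata when it is available. A tiny worked example of the proportional
# estimate; the file sizes and row total are made-up values.
import numpy as np

sp_file_sizes = np.array([40, 40, 20])   # bytes covered by each split (example)
total_rows = 1000                        # known shape[0] of the output frame (example)

total_size = sp_file_sizes.sum()
est_chunk_rows = sp_file_sizes * total_rows // total_size
assert est_chunk_rows.tolist() == [400, 400, 200]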