def execute(cls, ctx, op):
    import pyarrow as pa
    from cupid.io.table import TableSplit

    if op.cupid_handle is None:
        # No data handle: emit an empty DataFrame with the expected dtypes.
        empty_df = pd.DataFrame()
        for name, dtype in op.outputs[0].dtypes.items():
            empty_df[name] = pd.Series(dtype=dtype)
        ctx[op.outputs[0].key] = empty_df
        return

    tsp = TableSplit(
        _handle=op.cupid_handle,
        _split_index=op.split_index,
        _split_file_start=op.split_file_start,
        _split_file_end=op.split_file_end,
        _schema_file_start=op.schema_file_start,
        _schema_file_end=op.schema_file_end,
    )
    logger.debug('Read split table, split index: %s', op.split_index)
    reader = tsp.open_arrow_reader()
    if op.nrows is not None:
        # Read record batches until the requested row count is reached.
        nrows = 0
        batches = []
        while nrows < op.nrows:
            try:
                batch = reader.read_next_batch()
                nrows += batch.num_rows
                batches.append(batch)
            except StopIteration:
                break
        logger.debug('Read %s rows of this split.', op.nrows)
        data = arrow_table_to_pandas_dataframe(
            pa.Table.from_batches(batches),
            use_arrow_dtype=op.use_arrow_dtype)[:op.nrows]
    else:
        arrow_table = reader.read_all()
        data = arrow_table_to_pandas_dataframe(
            arrow_table, use_arrow_dtype=op.use_arrow_dtype)

    data_columns = data.dtypes.index
    expected_columns = op.outputs[0].dtypes.index
    if not data_columns.equals(expected_columns):
        logger.debug('Data columns differ from output columns, '
                     'data columns: %s, output columns: %s',
                     data_columns, expected_columns)
        data.columns = expected_columns

    logger.debug('Read split table finished, split index: %s', op.split_index)
    logger.debug('Split data shape is %s, size is %s',
                 data.shape, data.memory_usage(deep=True).sum())
    ctx[op.outputs[0].key] = data
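Later variants factor the inline empty-DataFrame construction above into a `_build_empty_df` helper that is not shown in this excerpt. A minimal sketch consistent with the inlined code; the classmethod form and exact signature are assumptions:

@classmethod
def _build_empty_df(cls, out):
    # Sketch: build an empty DataFrame whose columns carry the expected output dtypes,
    # mirroring the inline loop in the function above.
    empty_df = pd.DataFrame()
    for name, dtype in out.dtypes.items():
        empty_df[name] = pd.Series(dtype=dtype)
    return empty_df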
def _execute_in_cupid(cls, ctx, op):
    out = op.outputs[0]

    if op.cupid_handle is None:
        ctx[out.key] = cls._build_empty_df(out)
        return

    split_config = dict(
        _handle=op.cupid_handle,
        _split_index=op.split_index,
        _split_file_start=op.split_file_start,
        _split_file_end=op.split_file_end,
        _schema_file_start=op.schema_file_start,
        _schema_file_end=op.schema_file_end,
    )
    cupid_client = CupidServiceClient()
    try:
        pa_table = cupid_client.read_table_data(split_config, op.nrows)
    finally:
        cupid_client.close()
        cupid_client = None

    pa_table = cls._append_partition_values(pa_table, op)
    if op.string_as_binary:
        pa_table = cls._cast_string_to_binary(pa_table)

    data = arrow_table_to_pandas_dataframe(
        pa_table, use_arrow_dtype=op.use_arrow_dtype)[:op.nrows]
    data = cls._align_output_data(op, data)

    logger.debug('Read split table finished, split index: %s', op.split_index)
    logger.debug('Split data shape is %s', data.shape)
    ctx[out.key] = data
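`_align_output_data` is referenced here but not included in the excerpt. A hedged sketch based on the inline column-alignment logic in the first variant; the signature and classmethod form are assumptions:

@classmethod
def _align_output_data(cls, op, data):
    # Sketch: rename columns to the expected output index when they differ,
    # mirroring the inline check in the first variant above.
    expected_columns = op.outputs[0].dtypes.index
    if not data.dtypes.index.equals(expected_columns):
        logger.debug('Data columns differ from output columns, '
                     'data columns: %s, output columns: %s',
                     data.dtypes.index, expected_columns)
        data.columns = expected_columns
    return data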
def testToPandas(self):
    rs = np.random.RandomState(0)
    df = pd.DataFrame({'a': rs.rand(100),
                       'b': ['s' + str(i) for i in rs.randint(100, size=100)]})

    batch_size = 15
    n_batch = len(df) // batch_size + 1
    batches = [pa.RecordBatch.from_pandas(df[i * batch_size: (i + 1) * batch_size])
               for i in range(n_batch)]
    table = pa.Table.from_batches(batches)

    df2 = arrow_table_to_pandas_dataframe(table)
    self.assertEqual(df2.dtypes.iloc[1], ArrowStringDtype())
    self.assertLess(df2.memory_usage(deep=True).sum(),
                    df.memory_usage(deep=True).sum())

    # test serialize
    df3 = dataserializer.loads(dataserializer.dumps(df2))
    self.assertEqual(df3.dtypes.iloc[1], ArrowStringDtype())
    pd.testing.assert_frame_equal(df3, df2)

    # test df method
    df4 = df2.groupby('b').sum()
    expected = df.groupby('b').sum()
    pd.testing.assert_frame_equal(df4, expected)

    s = ('s' + df2['b']).astype('string')
    expected = ('s' + df['b']).astype('string')
    pd.testing.assert_series_equal(s, expected)

    s2 = df2['b'].str[:2]
    expected = df['b'].astype('string').str[:2]
    pd.testing.assert_series_equal(s2, expected)
def _execute_in_cupid(cls, ctx, op):
    import pyarrow as pa
    from cupid.io.table import TableSplit

    out = op.outputs[0]

    if op.cupid_handle is None:
        # No data handle: emit an empty DataFrame with the expected dtypes.
        empty_df = pd.DataFrame()
        for name, dtype in out.dtypes.items():
            empty_df[name] = pd.Series(dtype=dtype)
        ctx[out.key] = empty_df
        return

    tsp = TableSplit(
        _handle=op.cupid_handle,
        _split_index=op.split_index,
        _split_file_start=op.split_file_start,
        _split_file_end=op.split_file_end,
        _schema_file_start=op.schema_file_start,
        _schema_file_end=op.schema_file_end,
    )
    logger.debug('Read split table, split index: %s', op.split_index)
    reader = tsp.open_arrow_reader()
    if op.nrows is None:
        arrow_table = reader.read_all()
    else:
        # Read record batches until the requested row count is reached.
        nrows = 0
        batches = []
        while nrows < op.nrows:
            try:
                batch = reader.read_next_batch()
                nrows += batch.num_rows
                batches.append(batch)
            except StopIteration:
                break
        logger.debug('Read %s rows of this split.', op.nrows)
        arrow_table = pa.Table.from_batches(batches)

    arrow_table = cls._append_partition_values(arrow_table, op)
    if op.string_as_binary:
        arrow_table = cls._cast_string_to_binary(arrow_table)

    data = arrow_table_to_pandas_dataframe(
        arrow_table, use_arrow_dtype=op.use_arrow_dtype)
    if op.nrows is not None:
        data = data[:op.nrows]
    data = cls._align_columns(data, out.dtypes)

    logger.debug('Read split table finished, split index: %s', op.split_index)
    logger.debug('Split data shape is %s, size is %s',
                 data.shape, data.memory_usage(deep=True).sum())
    ctx[out.key] = data
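`_cast_string_to_binary` is also not shown in this excerpt; only its name and intent come from the code above. A minimal sketch of one way to implement it with the public pyarrow API, as an assumption rather than the actual implementation:

@classmethod
def _cast_string_to_binary(cls, arrow_table):
    import pyarrow as pa

    # Sketch: rebuild the schema with binary fields in place of string fields,
    # then cast the whole table to the new schema.
    new_fields = []
    for field in arrow_table.schema:
        if field.type == pa.string():
            new_fields.append(pa.field(field.name, pa.binary()))
        else:
            new_fields.append(field)
    return arrow_table.cast(pa.schema(new_fields))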
def test_to_pandas():
    rs = np.random.RandomState(0)
    df = pd.DataFrame({
        'a': rs.rand(100),
        'b': ['s' + str(i) for i in rs.randint(100, size=100)],
        'c': [['ss0' + str(i), 'ss1' + str(i)]
              for i in rs.randint(100, size=100)],
    })

    batch_size = 15
    n_batch = len(df) // batch_size + 1
    batches = [
        pa.RecordBatch.from_pandas(df[i * batch_size:(i + 1) * batch_size])
        for i in range(n_batch)
    ]
    table = pa.Table.from_batches(batches)

    df1 = arrow_table_to_pandas_dataframe(table, use_arrow_dtype=False)
    assert df1.dtypes.iloc[1] == np.dtype('O')
    assert df1.dtypes.iloc[2] == np.dtype('O')

    df2 = arrow_table_to_pandas_dataframe(table)
    assert df2.dtypes.iloc[1] == ArrowStringDtype()
    assert df2.dtypes.iloc[2] == ArrowListDtype(str)
    assert df2.memory_usage(deep=True).sum() < df.memory_usage(deep=True).sum()

    # test df method
    df4 = df2.groupby('b').sum()
    expected = df.groupby('b').sum()
    pd.testing.assert_frame_equal(df4, expected)

    s = ('s' + df2['b']).astype('string')
    expected = ('s' + df['b']).astype('string')
    pd.testing.assert_series_equal(s, expected)

    s2 = df2['b'].str[:2]
    expected = df['b'].astype('string').str[:2]
    pd.testing.assert_series_equal(s2, expected)
def _execute_arrow_tunnel(cls, ctx, op):
    from odps import ODPS
    from odps.tunnel import TableTunnel

    project = os.environ.get('ODPS_PROJECT_NAME', None)
    odps_params = op.odps_params.copy()
    if project:
        odps_params['project'] = project
    endpoint = os.environ.get(
        'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
    o = ODPS(odps_params['access_id'], odps_params['secret_access_key'],
             project=odps_params['project'], endpoint=endpoint)

    t = o.get_table(op.table_name)
    tunnel = TableTunnel(o, project=t.project)
    if op.partition_spec is not None:
        download_session = tunnel.create_download_session(
            t.name, partition_spec=op.partition_spec)
    else:
        download_session = tunnel.create_download_session(t.name)

    logger.debug('Start reading table %s(%s) split from %s to %s',
                 op.table_name, op.partition_spec, op.start_index, op.end_index)
    if op.nrows is None:
        count = op.end_index - op.start_index
    else:
        count = op.nrows

    with download_session.open_arrow_reader(
            op.start_index, count, columns=op.columns) as reader:
        table = reader.read()

    table = cls._append_partition_values(table, op)
    if op.string_as_binary:
        table = cls._cast_string_to_binary(table)
    data = arrow_table_to_pandas_dataframe(
        table, use_arrow_dtype=op.use_arrow_dtype)
    data = cls._align_columns(data, op.outputs[0].dtypes)

    logger.debug('Finish reading table %s(%s) split from %s to %s',
                 op.table_name, op.partition_spec, op.start_index, op.end_index)
    ctx[op.outputs[0].key] = data
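For reference, the tunnel read path above can be exercised standalone roughly as follows. The project, table name, and credential environment variables are placeholders, and `download_session.count` is assumed to hold the split's record count:

import os

from odps import ODPS
from odps.tunnel import TableTunnel

# Placeholder credentials and names; adjust for a real project.
o = ODPS(os.environ['ODPS_ACCESS_ID'], os.environ['ODPS_SECRET_ACCESS_KEY'],
         project='my_project', endpoint='http://service.example.com/api')
t = o.get_table('my_table')
tunnel = TableTunnel(o, project=t.project)
download_session = tunnel.create_download_session(t.name)

# Read the whole table as a single Arrow table via the arrow reader.
with download_session.open_arrow_reader(0, download_session.count) as reader:
    arrow_table = reader.read()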
def _execute_arrow_tunnel(cls, ctx, op):
    from odps import ODPS
    from odps.tunnel import TableTunnel

    out = op.outputs[0]

    if op.table_name is None:
        # Empty table: emit an empty DataFrame with the expected dtypes.
        ctx[out.key] = cls._build_empty_df(out)
        return

    project = os.environ.get('ODPS_PROJECT_NAME', None)
    odps_params = op.odps_params.copy()
    if project:
        odps_params['project'] = project
    endpoint = os.environ.get(
        'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
    o = ODPS(odps_params['access_id'], odps_params['secret_access_key'],
             project=odps_params['project'], endpoint=endpoint)

    t = o.get_table(op.table_name)
    tunnel = TableTunnel(o, project=t.project)

    retry_times = op.retry_times or options.retry_times
    retries = 0
    while True:
        try:
            if op.partition_spec is not None:
                download_session = tunnel.create_download_session(
                    t.name, partition_spec=op.partition_spec)
            else:
                download_session = tunnel.create_download_session(t.name)
            break
        except Exception:
            if retries >= retry_times:
                raise
            retries += 1
            time.sleep(1)

    logger.debug('Start reading table %s(%s) split from %s to %s',
                 op.table_name, op.partition_spec, op.start_index, op.end_index)
    if op.nrows is None:
        count = op.end_index - op.start_index
    else:
        count = op.nrows

    retries = 0
    while True:
        try:
            with download_session.open_arrow_reader(
                    op.start_index, count, columns=op.columns) as reader:
                table = reader.read()
            break
        except Exception:
            if retries >= retry_times:
                raise
            retries += 1
            time.sleep(1)

    table = cls._append_partition_values(table, op)
    if op.string_as_binary:
        table = cls._cast_string_to_binary(table)
    data = arrow_table_to_pandas_dataframe(
        table, use_arrow_dtype=op.use_arrow_dtype)
    data = cls._align_output_data(op, data)

    logger.debug('Finish reading table %s(%s) split from %s to %s',
                 op.table_name, op.partition_spec, op.start_index, op.end_index)
    ctx[out.key] = data
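The two retry loops in this variant share the same shape; one possible refactoring is a small helper like the hypothetical `_call_with_retry` below, which mirrors the inline logic:

import time

def _call_with_retry(func, retry_times, delay=1):
    # Hypothetical helper: call `func`, retrying up to `retry_times` extra
    # attempts and sleeping `delay` seconds between failures, exactly as the
    # inline loops above do.
    retries = 0
    while True:
        try:
            return func()
        except Exception:
            if retries >= retry_times:
                raise
            retries += 1
            time.sleep(delay)

With such a helper, the session creation above could read, for example,
download_session = _call_with_retry(lambda: tunnel.create_download_session(t.name), retry_times).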