def tile(cls, op):
    from cupid.runtime import RuntimeContext

    if RuntimeContext.is_context_ready():
        return cls._tile_cupid(op)
    else:
        return cls._tile_tunnel(op)

def execute(cls, ctx, op):
    from cupid.runtime import RuntimeContext

    if RuntimeContext.is_context_ready():
        cls._execute_in_cupid(ctx, op)
    else:
        cls._execute_arrow_tunnel(ctx, op)

def get_bearer_token():
    from cupid.runtime import context, RuntimeContext

    if not RuntimeContext.is_context_ready():
        return
    cupid_context = context()
    return cupid_context.get_bearer_token()

def _handle_terminate_instance(sock):
    from cupid.runtime import context, RuntimeContext
    from odps import ODPS
    from odps.accounts import BearerTokenAccount

    try:
        cmd_len, = struct.unpack('<I', sock.recv(4))
        # dict with instance_id
        cmd_body = pickle.loads(sock.recv(cmd_len))
        instance_id = cmd_body['instance_id']

        if not RuntimeContext.is_context_ready():
            logger.warning('Cupid context not ready')
        else:
            bearer_token = context().get_bearer_token()
            account = BearerTokenAccount(bearer_token)
            project = os.environ['ODPS_PROJECT_NAME']
            endpoint = os.environ['ODPS_RUNTIME_ENDPOINT']
            o = ODPS(None, None, account=account, project=project, endpoint=endpoint)
            o.stop_instance(instance_id)
    except:
        logger.exception('Failed to terminate instance')
        _write_request_result(sock, False, exc_info=sys.exc_info())

def _handle_commit_table_upload_session(sock):
    try:
        cmd_len, = struct.unpack('<I', sock.recv(4))
        # dict with odps_params, table_name, cupid_handle, blocks, overwrite
        commit_config = pickle.loads(sock.recv(cmd_len))

        from odps import ODPS
        from odps.accounts import BearerTokenAccount
        from cupid import CupidSession, context
        from cupid.runtime import RuntimeContext
        from cupid.io.table import CupidTableUploadSession

        if not RuntimeContext.is_context_ready():
            raise SystemError(
                'No Mars cluster found, please create via `o.create_mars_cluster`.')
        cupid_ctx = context()

        odps_params = commit_config['odps_params']
        bearer_token = cupid_ctx.get_bearer_token()
        account = BearerTokenAccount(bearer_token)
        project = os.environ.get('ODPS_PROJECT_NAME', None) or odps_params['project']
        endpoint = os.environ.get('ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
        o = ODPS(None, None, account=account, project=project, endpoint=endpoint)
        cupid_session = CupidSession(o)

        project_name, table_name = commit_config['table_name'].split('.')
        upload_session = CupidTableUploadSession(
            session=cupid_session, table_name=table_name, project_name=project_name,
            handle=commit_config['cupid_handle'], blocks=commit_config['blocks'])
        upload_session.commit(overwrite=commit_config['overwrite'])

        _write_request_result(sock)
    except:
        logger.exception('Failed to commit upload session')
        _write_request_result(sock, False, exc_info=sys.exc_info())

def _handle_put_kv(sock):
    try:
        cmd_len, = struct.unpack('<I', sock.recv(4))
        # dict with key and value
        cmd_body = pickle.loads(sock.recv(cmd_len))

        from cupid.runtime import RuntimeContext

        if not RuntimeContext.is_context_ready():
            logger.warning('Cupid context not ready')
        else:
            from cupid import context

            cupid_kv = context().kv_store()
            cupid_kv[cmd_body['key']] = cmd_body['value']

        _write_request_result(sock)
    except:
        logger.exception('Failed to put kv value')
        _write_request_result(sock, False, exc_info=sys.exc_info())

def _handle_create_table_upload_session(sock):
    try:
        cmd_len, = struct.unpack('<I', sock.recv(4))
        # dict with odps_params, table_name
        session_config = pickle.loads(sock.recv(cmd_len))

        from odps import ODPS
        from odps.accounts import BearerTokenAccount
        from cupid import CupidSession, context
        from cupid.runtime import RuntimeContext

        if not RuntimeContext.is_context_ready():
            raise SystemError(
                'No Mars cluster found, please create via `o.create_mars_cluster`.')
        cupid_ctx = context()

        odps_params = session_config['odps_params']
        bearer_token = cupid_ctx.get_bearer_token()
        account = BearerTokenAccount(bearer_token)
        project = os.environ.get('ODPS_PROJECT_NAME', None) or odps_params['project']
        endpoint = os.environ.get('ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
        o = ODPS(None, None, account=account, project=project, endpoint=endpoint)
        cupid_session = CupidSession(o)

        data_src = o.get_table(session_config['table_name'])

        logger.debug('Start creating upload session from cupid.')
        upload_session = cupid_session.create_upload_session(data_src)

        ret_data = {
            'handle': upload_session.handle,
        }
        _write_request_result(sock, result=ret_data)
    except:
        logger.exception('Failed to create upload session')
        _write_request_result(sock, False, exc_info=sys.exc_info())

def _extract_pod_name_ep(self, pod_data):
    from cupid.runtime import RuntimeContext

    pod_name = pod_data['metadata']['name']
    if not RuntimeContext.is_context_ready():
        logger.debug('Cupid context not ready, pod name: {}'.format(pod_name))
        return pod_name, None

    if pod_name in self._pod_to_port:
        pod_port = self._pod_to_port[pod_name]
    else:
        pod_kv_data = self.cupid_kv.get(pod_name)
        if pod_kv_data:
            pod_port = self._pod_to_port[pod_name] = \
                json.loads(pod_kv_data)['endpoint'].rsplit(':', 1)[-1]
            logger.debug('Get port from kvstore, name: {}, port: {}'.format(pod_name, pod_port))
        else:
            pod_port = None
            logger.debug('Cannot get port from kvstore, name: {}'.format(pod_name))

    pod_endpoint = '%s:%s' % (pod_data['status']['pod_ip'], pod_port)
    return pod_name, pod_endpoint if pod_port else None

def _handle_get_kv(sock):
    try:
        cmd_len, = struct.unpack('<I', sock.recv(4))
        # dict with key
        cmd_body = pickle.loads(sock.recv(cmd_len))

        from cupid.runtime import RuntimeContext

        if not RuntimeContext.is_context_ready():
            logger.warning('Cupid context not ready')
            value = None
        else:
            from cupid import context

            cupid_kv = context().kv_store()
            value = cupid_kv.get(cmd_body['key'])

        ret_data = {
            'value': value,
        }
        _write_request_result(sock, result=ret_data)
    except:
        logger.exception('Failed to get kv value')
        _write_request_result(sock, False, exc_info=sys.exc_info())

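# The `_handle_*` service handlers above all share one wire format: a 4-byte
# little-endian length prefix followed by a pickled dict. A client-side sketch
# of that framing is shown below (illustrative only; the socket path and the
# AF_UNIX transport are assumptions, and how commands are dispatched to a
# handler or how the reply written by `_write_request_result` is framed are
# outside this snippet):
#
#     import pickle
#     import socket
#     import struct
#
#     def _send_request(sock_path, cmd_body):
#         body = pickle.dumps(cmd_body)
#         sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)  # assumed transport
#         sock.connect(sock_path)
#         # length prefix first, then the pickled command dict
#         sock.sendall(struct.pack('<I', len(body)) + body)
#         return sock
#
#     # e.g. store an endpoint in the cupid kv store via `_handle_put_kv`
#     _send_request('/tmp/cupid_service.sock',
#                   {'key': 'scheduler-0', 'value': '{"endpoint": "1.2.3.4:1234"}'})
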
def _tile_cupid(cls, op):
    from odps import ODPS
    from odps.accounts import BearerTokenAccount
    from cupid import CupidSession, context
    from cupid.runtime import RuntimeContext

    if not RuntimeContext.is_context_ready():
        raise SystemError(
            'No Mars cluster found, please create via `o.create_mars_cluster`.')

    cupid_ctx = context()
    bearer_token = cupid_ctx.get_bearer_token()
    account = BearerTokenAccount(bearer_token)
    project = os.environ.get('ODPS_PROJECT_NAME', None)
    odps_params = op.odps_params.copy()
    if project:
        odps_params['project'] = project
    endpoint = os.environ.get('ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
    o = ODPS(None, None, account=account, project=odps_params['project'],
             endpoint=endpoint)
    cupid_session = CupidSession(o)

    data_src = o.get_table(op.table_name)
    logger.debug('Start creating upload session from cupid.')
    upload_session = cupid_session.create_upload_session(data_src)

    input_df = build_concatenated_rows_frame(op.inputs[0])
    out_df = op.outputs[0]
    out_chunks = []
    out_chunk_shape = (0,) * len(input_df.shape)
    blocks = {}
    for chunk in input_df.chunks:
        block_id = str(int(time.time())) + '_' + str(uuid.uuid4()).replace('-', '')
        chunk_op = DataFrameWriteTableSplit(
            dtypes=op.dtypes, table_name=op.table_name,
            unknown_as_string=op.unknown_as_string,
            partition_spec=op.partition_spec,
            cupid_handle=to_str(upload_session.handle),
            block_id=block_id, write_batch_size=op.write_batch_size)
        out_chunk = chunk_op.new_chunk([chunk], shape=out_chunk_shape,
                                       index=chunk.index,
                                       index_value=out_df.index_value,
                                       dtypes=chunk.dtypes)
        out_chunks.append(out_chunk)
        blocks[block_id] = op.partition_spec

    # build commit tree
    combine_size = 8
    chunks = out_chunks
    while len(chunks) >= combine_size:
        new_chunks = []
        for i in range(0, len(chunks), combine_size):
            chks = chunks[i:i + combine_size]
            if len(chks) == 1:
                chk = chks[0]
            else:
                chk_op = DataFrameWriteTableCommit(dtypes=op.dtypes, is_terminal=False)
                chk = chk_op.new_chunk(chks, shape=out_chunk_shape,
                                       index_value=out_df.index_value,
                                       dtypes=op.dtypes)
            new_chunks.append(chk)
        chunks = new_chunks

    assert len(chunks) < combine_size

    commit_table_op = DataFrameWriteTableCommit(
        dtypes=op.dtypes, table_name=op.table_name, blocks=blocks,
        cupid_handle=to_str(upload_session.handle), overwrite=op.overwrite,
        odps_params=op.odps_params, is_terminal=True)
    commit_table_chunk = commit_table_op.new_chunk(
        chunks, shape=out_chunk_shape, dtypes=op.dtypes,
        index_value=out_df.index_value)

    new_op = op.copy()
    return new_op.new_dataframes(op.inputs, shape=out_df.shape,
                                 index_value=out_df.index_value,
                                 dtypes=out_df.dtypes,
                                 columns_value=out_df.columns_value,
                                 chunks=[commit_table_chunk],
                                 nsplits=((0,),) * len(out_chunk_shape))

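# The commit tree built in `_tile_cupid` fans chunk writes in with
# `combine_size = 8` per intermediate commit until fewer than 8 chunks remain,
# and those survivors feed the single terminal commit. A standalone sketch of
# that grouping on plain lists (illustrative only, not part of this module):
#
#     def _combine_levels(items, combine_size=8):
#         levels = [list(items)]
#         while len(levels[-1]) >= combine_size:
#             prev, nxt = levels[-1], []
#             for i in range(0, len(prev), combine_size):
#                 group = prev[i:i + combine_size]
#                 # a lone leftover item passes through unchanged,
#                 # mirroring the `len(chks) == 1` branch above
#                 nxt.append(group[0] if len(group) == 1 else tuple(group))
#             levels.append(nxt)
#         return levels
#
#     # 20 chunk writes -> one level of 3 intermediate commits -> terminal commit
#     assert len(_combine_levels(range(20))[-1]) == 3
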
def to_mars_dataframe(odps, table_name, shape=None, partition=None, chunk_bytes=None,
                      sparse=False, columns=None, add_offset=False, calc_nrows=True,
                      use_arrow_dtype=False, string_as_binary=None,
                      cupid_internal_endpoint=None):
    """
    Read a table into a Mars DataFrame.

    :param table_name: table name
    :param shape: table shape. A tuple like (1000, 3) means the table has 1000 rows and 3 columns.
    :param partition: partition spec.
    :param chunk_bytes: bytes to read for each chunk. Default value is '16M'.
    :param sparse: whether to read as a sparse DataFrame.
    :param columns: selected columns.
    :param add_offset: whether to standardize the DataFrame's index to a RangeIndex. False by default.
    :param calc_nrows: whether to calculate nrows if shape is not specified.
    :param use_arrow_dtype: read to arrow dtype. Reduces memory in some cases.
    :param string_as_binary: read string columns as binary type.
    :return: Mars DataFrame.
    """
    from cupid.runtime import RuntimeContext
    from .dataframe import read_odps_table
    from ..utils import init_progress_ui

    odps_params = dict(project=odps.project,
                       endpoint=cupid_internal_endpoint or cupid_options.cupid.runtime.endpoint)
    data_src = odps.get_table(table_name)

    odps_schema = data_src.schema
    if len(odps_schema.partitions) != 0:
        if partition is None:
            raise TypeError('Partition should be specified.')
    for col in columns or []:
        if col not in odps_schema.names:
            raise TypeError("Specific column {} doesn't exist in table".format(col))

    # persist view table to a temp table
    if data_src.is_virtual_view:
        temp_table_name = table_name + '_temp_mars_table_' + str(uuid.uuid4()).replace('-', '_')
        odps.create_table(temp_table_name, schema=data_src.schema,
                          stored_as='aliorc', lifecycle=1)
        data_src.to_df().persist(temp_table_name)
        table_name = temp_table_name
        data_src = odps.get_table(table_name)

    # get dataframe's shape
    if shape is None:
        if calc_nrows and not RuntimeContext.is_context_ready():
            # obtain count
            if partition is None:
                odps_df = data_src.to_df()
            else:
                odps_df = data_src.get_partition(partition).to_df()
            nrows = odps_df.count().execute(use_tunnel=False, ui=init_progress_ui(mock=True))
        else:
            nrows = np.nan
        shape = (nrows, len(data_src.schema.simple_columns))

    return read_odps_table(odps.get_table(table_name), shape, partition=partition,
                           chunk_bytes=chunk_bytes, sparse=sparse, columns=columns,
                           odps_params=odps_params, add_offset=add_offset,
                           use_arrow_dtype=use_arrow_dtype,
                           string_as_binary=string_as_binary)

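# A minimal usage sketch of `to_mars_dataframe` (illustrative only; the
# credentials, endpoint, project and table names below are assumptions, not
# taken from this module):
#
#     from odps import ODPS
#
#     o = ODPS('<access_id>', '<secret_access_key>', project='my_project',
#              endpoint='<odps_endpoint>')
#     # read one partition of an assumed table `sales_record` into a Mars DataFrame
#     df = to_mars_dataframe(o, 'sales_record', partition='pt=20200801',
#                            chunk_bytes='16M', use_arrow_dtype=True)
#     print(df.head(5).execute())
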
def tile(cls, op):
    import numpy as np
    import pandas as pd
    from odps import ODPS
    from odps.accounts import BearerTokenAccount
    from cupid import CupidSession, context
    from cupid.runtime import RuntimeContext
    from mars.context import get_context

    if not RuntimeContext.is_context_ready():
        raise SystemError(
            'No Mars cluster found, please create via `o.create_mars_cluster`.')
    cupid_ctx = context()

    bearer_token = cupid_ctx.get_bearer_token()
    account = BearerTokenAccount(bearer_token)
    project = os.environ.get('ODPS_PROJECT_NAME', None)
    odps_params = op.odps_params.copy()
    if project:
        odps_params['project'] = project
    o = ODPS(None, None, account=account, **odps_params)
    cupid_session = CupidSession(o)

    mars_context = get_context()

    df = op.outputs[0]
    split_size = df.extra_params.chunk_bytes or READ_CHUNK_LIMIT

    data_src = o.get_table(op.table_name)
    if op.partition is not None:
        data_src = data_src.get_partition(op.partition)

    try:
        data_store_size = data_src.size
    except ODPSError:
        # fail to get data size, just ignore
        pass
    else:
        if data_store_size < split_size and mars_context is not None:
            # get worker counts
            worker_count = max(len(mars_context.get_worker_addresses()), 1)
            # data is too small, split as many as number of cores
            split_size = data_store_size // worker_count
            # at least 1M
            split_size = max(split_size, 1 * 1024 ** 2)
            logger.debug('Input data size is too small, split_size is {}'.format(split_size))

    logger.debug('Start creating download session of table {} from cupid.'.format(op.table_name))
    while True:
        try:
            download_session = cupid_session.create_download_session(
                data_src, split_size=split_size, columns=op.columns)
            break
        except CupidError:
            logger.debug('The number of splits exceeds 100000, split_size is {}'.format(split_size))
            if split_size >= MAX_CHUNK_SIZE:
                raise
            else:
                split_size *= 2

    logger.debug('%s table splits have been created.', str(len(download_session.splits)))

    if np.isnan(df.shape[0]):
        est_chunk_rows = [None] * len(download_session.splits)
    else:
        sp_file_sizes = np.array([sp.split_file_end - sp.split_file_start
                                  for sp in download_session.splits])
        total_size = sp_file_sizes.sum()
        est_chunk_rows = sp_file_sizes * df.shape[0] // total_size

    logger.warning('Estimated chunk rows: %r', est_chunk_rows)

    out_chunks = []
    if len(download_session.splits) == 0:
        logger.debug('Table {} has no data'.format(op.table_name))
        chunk_op = DataFrameReadTableSplit()
        index_value = parse_index(pd.RangeIndex(0))
        columns_value = parse_index(df.dtypes.index, store_data=True)
        out_chunk = chunk_op.new_chunk(None, shape=(np.nan, df.shape[1]),
                                       dtypes=op.dtypes, index_value=index_value,
                                       columns_value=columns_value, index=(0, 0))
        out_chunks = [out_chunk]
    else:
        for idx, split in enumerate(download_session.splits):
            chunk_op = DataFrameReadTableSplit(
                cupid_handle=to_str(split.handle), split_index=split.split_index,
                split_file_start=split.split_file_start,
                split_file_end=split.split_file_end,
                schema_file_start=split.schema_file_start,
                schema_file_end=split.schema_file_end,
                add_offset=op.add_offset, dtypes=op.dtypes, sparse=op.sparse,
                split_size=split_size, string_as_binary=op.string_as_binary,
                use_arrow_dtype=op.use_arrow_dtype,
                estimate_rows=est_chunk_rows[idx])
            # the chunk shape is unknown
            index_value = parse_index(pd.RangeIndex(0))
            columns_value = parse_index(df.dtypes.index, store_data=True)
            out_chunk = chunk_op.new_chunk(None, shape=(np.nan, df.shape[1]),
                                           dtypes=op.dtypes, index_value=index_value,
                                           columns_value=columns_value, index=(idx, 0))
            out_chunks.append(out_chunk)

    if op.add_offset:
        out_chunks = standardize_range_index(out_chunks)

    new_op = op.copy()
    nsplits = ((np.nan,) * len(out_chunks), (df.shape[1],))
    return new_op.new_dataframes(None, shape=df.shape, dtypes=op.dtypes,
                                 index_value=df.index_value,
                                 columns_value=df.columns_value,
                                 chunks=out_chunks, nsplits=nsplits)

def _handle_create_table_download_session(sock):
    try:
        cmd_len, = struct.unpack('<I', sock.recv(4))
        # dict with odps_params, table_name, partition, columns, worker_count,
        # split_size, max_chunk_num
        session_config = pickle.loads(sock.recv(cmd_len))

        from odps import ODPS
        from odps.errors import ODPSError
        from odps.accounts import BearerTokenAccount
        from cupid import CupidSession, context
        from cupid.errors import CupidError
        from cupid.runtime import RuntimeContext

        if not RuntimeContext.is_context_ready():
            raise SystemError(
                'No Mars cluster found, please create via `o.create_mars_cluster`.')
        cupid_ctx = context()

        odps_params = session_config['odps_params']
        bearer_token = cupid_ctx.get_bearer_token()
        account = BearerTokenAccount(bearer_token)
        project = os.environ.get('ODPS_PROJECT_NAME', None) or odps_params['project']
        endpoint = os.environ.get('ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
        o = ODPS(None, None, account=account, project=project, endpoint=endpoint)
        cupid_session = CupidSession(o)

        split_size = session_config['split_size']
        table_name = session_config['table_name']
        data_src = o.get_table(table_name)
        if session_config.get('partition') is not None:
            data_src = data_src.get_partition(session_config['partition'])

        try:
            data_store_size = data_src.size
        except ODPSError:
            # fail to get data size, just ignore
            pass
        else:
            worker_count = session_config['worker_count']
            if data_store_size < split_size and worker_count is not None:
                # data is too small, split as many as number of cores
                split_size = data_store_size // worker_count
                # at least 1M
                split_size = max(split_size, 1 * 1024 ** 2)
                logger.debug('Input data size is too small, split_size is {}'.format(split_size))

        max_chunk_num = session_config['max_chunk_num']
        columns = session_config['columns']
        with_split_meta = session_config.get('with_split_meta_on_tile')

        logger.debug('Start creating download session of table %s from cupid, columns %r',
                     table_name, columns)
        while True:
            try:
                download_session = cupid_session.create_download_session(
                    data_src, split_size=split_size, columns=columns,
                    with_split_meta=with_split_meta)
                break
            except CupidError:
                logger.debug('The number of splits exceeds 100000, split_size is {}'.format(split_size))
                if split_size >= max_chunk_num:
                    raise
                else:
                    split_size *= 2

        ret_data = {
            'splits': download_session.splits,
            'split_size': split_size,
        }
        _write_request_result(sock, result=ret_data)
    except:
        logger.exception('Failed to create download session')
        _write_request_result(sock, False, exc_info=sys.exc_info())

def persist_mars_dataframe(odps, df, table_name, overwrite=False, partition=None,
                           write_batch_size=None, unknown_as_string=None,
                           as_type=None, drop_table=False, create_table=True,
                           drop_partition=False, create_partition=None,
                           lifecycle=None, runtime_endpoint=None, **kw):
    """
    Write a Mars DataFrame to a table.

    :param df: Mars DataFrame.
    :param table_name: table to write.
    :param overwrite: if overwrite the data. False as default.
    :param write_batch_size: batch size of records to write. 1024 as default.
    :param partition: partition spec.
    :param unknown_as_string: set a column to string type if its type is object.
    :param as_type: specify column dtypes. {'a': 'string'} will set column `a` as string type.
    :param drop_table: drop the table if it exists, False as default.
    :param create_table: create the table first if it does not exist, True as default.
    :param drop_partition: drop the partition if it exists, False as default.
    :param create_partition: create the partition if it does not exist, None as default.
    :param lifecycle: table lifecycle. If absent, `options.lifecycle` will be used.
    :return: None
    """
    from .dataframe import write_odps_table
    from odps.tunnel import TableTunnel

    dtypes = df.dtypes
    odps_types = []
    names = []
    for name, t in zip(dtypes.keys(), list(dtypes.values)):
        names.append(name)
        if as_type and name in as_type:
            odps_types.append(as_type[name])
        else:
            odps_types.append(pd_type_to_odps_type(
                t, name, unknown_as_string=unknown_as_string))
    if partition:
        p = PartitionSpec(partition)
        schema = Schema.from_lists(names, odps_types, p.keys, ['string'] * len(p))
    else:
        schema = Schema.from_lists(names, odps_types)

    if drop_table:
        odps.delete_table(table_name, if_exists=True)

    if partition is None:
        # the non-partitioned table
        if drop_partition:
            raise ValueError('Cannot drop partition for non-partition table')
        if create_partition:
            raise ValueError('Cannot create partition for non-partition table')

        if create_table or (not odps.exist_table(table_name)):
            odps.create_table(table_name, schema, if_not_exists=True,
                              stored_as='aliorc', lifecycle=lifecycle)
    else:
        if odps.exist_table(table_name) or not create_table:
            t = odps.get_table(table_name)
            table_partition = t.get_partition(partition)
            if drop_partition:
                t.delete_partition(table_partition, if_exists=True)
            if create_partition:
                t.create_partition(table_partition, if_not_exists=True)
        else:
            odps.create_table(table_name, schema, stored_as='aliorc', lifecycle=lifecycle)

    table = odps.get_table(table_name)

    if len(table.schema.simple_columns) != len(schema.simple_columns):
        raise TypeError('Table column number is %s while input DataFrame has %s columns'
                        % (len(table.schema.simple_columns), len(schema.simple_columns)))

    for c_left, c_right in zip(table.schema.simple_columns, schema.simple_columns):
        if c_left.name.lower() != c_right.name.lower() or c_left.type != c_right.type:
            raise TypeError('Column type between provided DataFrame and target table'
                            ' does not agree with each other. DataFrame column %s type is %s,'
                            ' target table column %s type is %s' % (
                                c_right.name, c_right.type, c_left.name, c_left.type))

    if partition:
        table.create_partition(partition, if_not_exists=True)

    runtime_endpoint = (runtime_endpoint or kw.pop('cupid_internal_endpoint', None)
                        or cupid_options.cupid.runtime.endpoint)
    odps_params = dict(project=odps.project, endpoint=runtime_endpoint)
    if isinstance(odps.account, AliyunAccount):
        odps_params.update(dict(access_id=odps.account.access_id,
                                secret_access_key=odps.account.secret_access_key))

    if isinstance(df, pd.DataFrame):
        from cupid.runtime import RuntimeContext
        import pyarrow as pa

        if RuntimeContext.is_context_ready():
            _write_table_in_cupid(odps, df, table, partition=partition,
                                  overwrite=overwrite,
                                  unknown_as_string=unknown_as_string)
        else:
            t = odps.get_table(table_name)
            tunnel = TableTunnel(odps, project=t.project)
            if partition is not None:
                upload_session = tunnel.create_upload_session(t.name, partition_spec=partition)
            else:
                upload_session = tunnel.create_upload_session(t.name)
            writer = upload_session.open_arrow_writer(0)
            arrow_rb = pa.RecordBatch.from_pandas(df)
            writer.write(arrow_rb)
            writer.close()
            upload_session.commit([0])
    else:
        write_odps_table(df, table, partition=partition, overwrite=overwrite,
                         odps_params=odps_params,
                         unknown_as_string=unknown_as_string,
                         write_batch_size=write_batch_size).execute()

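# A minimal usage sketch of `persist_mars_dataframe` (illustrative only; the
# credentials, endpoint, DataFrame and table names below are assumptions):
#
#     from odps import ODPS
#
#     o = ODPS('<access_id>', '<secret_access_key>', project='my_project',
#              endpoint='<odps_endpoint>')
#     df = to_mars_dataframe(o, 'sales_record')
#     df2 = df.head(1000)
#     # write the result into an assumed partitioned table `sales_summary`
#     persist_mars_dataframe(o, df2, 'sales_summary', partition='pt=20200801',
#                            overwrite=True, unknown_as_string=True)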