def tile(cls, op):
        """Expand a table-write operand into split writers plus a commit tree.

        A cupid upload session is opened for ``op.table_name``.  Every chunk of
        the input DataFrame gets its own ``DataFrameWriteTableSplit`` operand
        with a unique block id, and the resulting chunks are funnelled through
        ``DataFrameWriteTableCommit`` operands until a single terminal commit
        chunk remains.

        :param op: the write-table operand being tiled
        :return: result of ``new_dataframes`` on a copy of ``op``; its only
            chunk is the terminal commit chunk
        """
        from odps import ODPS
        from odps.accounts import BearerTokenAccount
        from cupid import CupidSession, context

        bearer_token = context().get_bearer_token()
        account = BearerTokenAccount(bearer_token)
        o = ODPS(None, None, account=account, **op.odps_params)
        cupid_session = CupidSession(o)

        data_src = o.get_table(op.table_name)

        logger.debug('Start creating upload session from cupid.')
        upload_session = cupid_session.create_upload_session(data_src)

        input_df = op.inputs[0]

        out_chunks = []
        # Writer and commit chunks carry no data, so every dimension is 0.
        out_chunk_shape = (0,) * len(input_df.shape)
        # block id -> partition spec; handed to the terminal commit operand.
        blocks = {}
        for chunk in input_df.chunks:
            block_id = str(int(time.time())) + '_' + str(uuid.uuid4()).replace('-', '')
            chunk_op = DataFrameWriteTableSplit(dtypes=op.dtypes, table_name=op.table_name,
                                                partition_spec=op.partition_spec,
                                                cupid_handle=to_str(upload_session.handle),
                                                block_id=block_id, write_batch_size=op.write_batch_size)
            out_chunk = chunk_op.new_chunk([chunk], shape=out_chunk_shape, index=chunk.index, dtypes=chunk.dtypes)
            out_chunks.append(out_chunk)
            blocks[block_id] = op.partition_spec

        # build commit tree
        combine_size = 8
        chunks = out_chunks
        # ``>=`` (not ``>``): a level holding exactly ``combine_size`` chunks
        # must be merged once more, otherwise the assertion below fails.
        while len(chunks) >= combine_size:
            new_chunks = []
            for i in range(0, len(chunks), combine_size):
                chks = chunks[i: i + combine_size]
                if len(chks) == 1:
                    # A lone leftover chunk is passed up unchanged.
                    chk = chks[0]
                else:
                    chk_op = DataFrameWriteTableCommit(dtypes=op.dtypes, is_terminal=False)
                    chk = chk_op.new_chunk(chks, shape=out_chunk_shape, dtypes=op.dtypes)
                new_chunks.append(chk)
            chunks = new_chunks

        assert len(chunks) < combine_size

        commit_table_op = DataFrameWriteTableCommit(dtypes=op.dtypes, table_name=op.table_name, blocks=blocks,
                                                    cupid_handle=to_str(upload_session.handle),
                                                    overwrite=op.overwrite, odps_params=op.odps_params,
                                                    is_terminal=True)
        commit_table_chunk = commit_table_op.new_chunk(chunks, shape=out_chunk_shape, dtypes=op.dtypes)

        out_df = op.outputs[0]
        new_op = op.copy()
        return new_op.new_dataframes(op.inputs, shape=out_df.shape,
                                     dtypes=out_df.dtypes, chunks=[commit_table_chunk],
                                     nsplits=((0,),) * len(out_chunk_shape))
Пример #2
0
    def execute(cls, ctx, op):
        """Commit the cupid upload session when ``op`` is the terminal node.

        Non-terminal commit nodes perform no work.  In both cases an empty
        ``pandas.DataFrame`` is stored in ``ctx`` under the key of the first
        output.

        :param ctx: mapping that receives the result, keyed by
            ``op.outputs[0].key``
        :param op: commit operand; ``is_terminal``, ``odps_params``,
            ``table_name``, ``cupid_handle``, ``blocks`` and ``overwrite``
            are read
        """
        import pandas as pd
        from odps import ODPS
        from odps.accounts import BearerTokenAccount
        from cupid import CupidSession, context
        from cupid.io.table import CupidTableUploadSession

        if not op.is_terminal:
            ctx[op.outputs[0].key] = pd.DataFrame()
            return

        account = BearerTokenAccount(context().get_bearer_token())

        # Environment variables, when set, win over the values carried by the
        # operand's own parameters.
        env_project = os.environ.get('ODPS_PROJECT_NAME', None)
        params = op.odps_params.copy()
        if env_project:
            params['project'] = env_project
        endpoint = os.environ.get('ODPS_RUNTIME_ENDPOINT') or params['endpoint']

        odps_entry = ODPS(None, None, account=account,
                          project=params['project'], endpoint=endpoint)
        session = CupidSession(odps_entry)

        # The unpacking requires ``table_name`` to contain exactly one dot.
        project_name, table_name = op.table_name.split('.')
        CupidTableUploadSession(
            session=session,
            table_name=table_name,
            project_name=project_name,
            handle=op.cupid_handle,
            blocks=op.blocks,
        ).commit(overwrite=op.overwrite)

        ctx[op.outputs[0].key] = pd.DataFrame()
    def __init__(self, odps, inst=None, project=None):
        """Set up the client state; nothing is submitted here.

        :param odps: ODPS entry object, kept and used to build the cupid session
        :param inst: value stored as the kube instance (``None`` by default)
        :param project: forwarded to ``CupidSession`` as ``project``
        """
        self._odps = odps
        self._cupid_session = CupidSession(odps, project=project)

        # Kubernetes-related handles; only the instance can be given up front.
        self._kube_instance = inst
        self._kube_url = self._kube_client = self._kube_namespace = None

        # Service configuration slots, all unset initially.
        self._scheduler_key = self._scheduler_config = None
        self._worker_config = self._web_config = None

        # Endpoints and notebook flag.
        self._endpoint = None
        self._with_notebook = False
        self._notebook_endpoint = None

        # Session objects, unset until assigned elsewhere.
        self._mars_session = self._req_session = None
Пример #4
0
def _write_table_in_cupid(odps,
                          df,
                          table,
                          partition=None,
                          overwrite=True,
                          unknown_as_string=None):
    """Write a pandas DataFrame into a table as a single cupid block.

    The frame is serialised into one Arrow record-batch stream, 1024 rows
    per batch, written through a ``BlockWriter`` with block id ``'0'`` and
    then committed through the cupid upload session.

    :param odps: ODPS entry object used to build the ``CupidSession``
    :param df: pandas DataFrame to write
    :param table: table object; ``name``, ``project.name`` and ``schema``
        are read
    :param partition: partition spec passed to the block writer and recorded
        for block ``'0'``
    :param overwrite: forwarded to ``upload_session.commit``
    :param unknown_as_string: accepted for interface compatibility; not used
        by this function
    """
    import pyarrow as pa
    from mars.utils import to_str
    from cupid import CupidSession
    from cupid.io.table.core import BlockWriter

    batch_size = 1024
    block_id = '0'

    cupid_session = CupidSession(odps)
    logger.debug('Start creating upload session from cupid.')
    upload_session = cupid_session.create_upload_session(table)
    block_writer = BlockWriter(_table_name=table.name,
                               _project_name=table.project.name,
                               _table_schema=table.schema,
                               _partition_spec=partition,
                               _block_id=block_id,
                               _handle=to_str(upload_session.handle))
    logger.debug('Start writing table block, block id: 0')
    with block_writer.open_arrow_writer() as cupid_writer:
        sink = pa.BufferOutputStream()

        # Derive the stream schema from data that went through the same
        # conversion as the batches, so the schema describes what is written.
        # NOTE(review): assumes convert_pandas_object_to_string returns the
        # converted frame without touching ``df`` itself -- confirm.
        schema_sample = convert_pandas_object_to_string(df[:1])
        schema = pa.RecordBatch.from_pandas(schema_sample,
                                            preserve_index=False).schema
        arrow_writer = pa.RecordBatchStreamWriter(sink, schema)

        # Convert every batch, not only the first one: previously rows past
        # the first ``batch_size`` were written without the conversion.
        for start in range(0, len(df), batch_size):
            batch_data = convert_pandas_object_to_string(
                df[start:start + batch_size])
            batch = pa.RecordBatch.from_pandas(batch_data,
                                               preserve_index=False)
            arrow_writer.write_batch(batch)
        arrow_writer.close()
        cupid_writer.write(sink.getvalue())
    block_writer.commit()

    # The session was not told about the block written above, so register it
    # before committing.
    upload_session._blocks = {block_id: partition}
    upload_session.commit(overwrite=overwrite)
Пример #5
0
def _handle_create_table_upload_session(sock):
    """Serve one "create table upload session" request read from ``sock``.

    The request is a 4-byte little-endian unsigned length followed by that
    many bytes holding a pickled dict with the keys ``odps_params`` and
    ``table_name``.  On success a dict with the upload session ``handle`` is
    written back through ``_write_request_result``; on any failure the
    exception is logged and its ``exc_info`` is written back instead.

    :param sock: connected socket-like object providing ``recv``
    """

    def _recv_exact(size):
        # ``recv`` may return fewer bytes than requested, so keep reading
        # until the whole frame has arrived.
        buf = bytearray()
        while len(buf) < size:
            piece = sock.recv(size - len(buf))
            if not piece:
                raise EOFError('Connection closed after %d of %d bytes'
                               % (len(buf), size))
            buf.extend(piece)
        return bytes(buf)

    try:
        cmd_len, = struct.unpack('<I', _recv_exact(4))
        # dict with odps_params, table_name
        # NOTE(security): unpickling can execute arbitrary code; this is only
        # safe if the peer on this socket is trusted.
        session_config = pickle.loads(_recv_exact(cmd_len))

        from odps import ODPS
        from odps.accounts import BearerTokenAccount
        from cupid import CupidSession, context
        from cupid.runtime import RuntimeContext

        if not RuntimeContext.is_context_ready():
            raise SystemError(
                'No Mars cluster found, please create via `o.create_mars_cluster`.'
            )
        cupid_ctx = context()

        odps_params = session_config['odps_params']
        bearer_token = cupid_ctx.get_bearer_token()
        account = BearerTokenAccount(bearer_token)
        # Environment variables, when set, win over the request's parameters.
        project = os.environ.get('ODPS_PROJECT_NAME',
                                 None) or odps_params['project']
        endpoint = os.environ.get(
            'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
        o = ODPS(None,
                 None,
                 account=account,
                 project=project,
                 endpoint=endpoint)
        cupid_session = CupidSession(o)

        data_src = o.get_table(session_config['table_name'])

        logger.debug('Start creating upload session from cupid.')
        upload_session = cupid_session.create_upload_session(data_src)

        ret_data = {
            'handle': upload_session.handle,
        }
        _write_request_result(sock, result=ret_data)
    except:  # noqa: E722 - catch-all kept so the peer always gets a reply
        logger.exception('Failed to create upload session')
        _write_request_result(sock, False, exc_info=sys.exc_info())
Пример #6
0
    def __init__(self, odps, inst=None, project=None):
        """Set up the client state; nothing is submitted here.

        :param odps: ODPS entry object, kept and used to build the cupid session
        :param inst: value stored as the kube instance (``None`` by default)
        :param project: forwarded to ``CupidSession`` as ``project``
        """
        from cupid import CupidSession

        self._odps = odps
        self._cupid_session = CupidSession(odps, project=project)

        # Kubernetes-related handles; only the instance can be given up front.
        self._kube_instance = inst
        self._kube_url = self._kube_client = self._kube_namespace = None

        # Service configuration slots, all unset initially.
        self._supervisor_key = self._supervisor_config = None
        self._worker_config = None

        # Endpoints and the flags for the optional services.
        self._endpoint = None
        self._with_notebook = False
        self._notebook_endpoint = None
        self._with_graphscope = False
        self._graphscope_endpoint = None

        # Session objects, unset until assigned elsewhere.
        self._mars_session = self._req_session = None
Пример #7
0
def _handle_commit_table_upload_session(sock):
    """Serve one "commit table upload session" request read from ``sock``.

    The request is a 4-byte little-endian unsigned length followed by that
    many bytes holding a pickled dict with the keys ``odps_params``,
    ``table_name``, ``cupid_handle``, ``blocks`` and ``overwrite``.  The
    upload session is rebuilt from the handle and committed.  Success is
    reported through ``_write_request_result``; on any failure the exception
    is logged and its ``exc_info`` is written back instead.

    :param sock: connected socket-like object providing ``recv``
    """

    def _recv_exact(size):
        # ``recv`` may return fewer bytes than requested, so keep reading
        # until the whole frame has arrived.
        buf = bytearray()
        while len(buf) < size:
            piece = sock.recv(size - len(buf))
            if not piece:
                raise EOFError('Connection closed after %d of %d bytes'
                               % (len(buf), size))
            buf.extend(piece)
        return bytes(buf)

    try:
        cmd_len, = struct.unpack('<I', _recv_exact(4))
        # dict with odps_params, table_name, cupid_handle, blocks, overwrite
        # NOTE(security): unpickling can execute arbitrary code; this is only
        # safe if the peer on this socket is trusted.
        commit_config = pickle.loads(_recv_exact(cmd_len))

        from odps import ODPS
        from odps.accounts import BearerTokenAccount
        from cupid import CupidSession, context
        from cupid.runtime import RuntimeContext
        from cupid.io.table import CupidTableUploadSession

        if not RuntimeContext.is_context_ready():
            raise SystemError(
                'No Mars cluster found, please create via `o.create_mars_cluster`.'
            )
        cupid_ctx = context()

        odps_params = commit_config['odps_params']
        bearer_token = cupid_ctx.get_bearer_token()
        account = BearerTokenAccount(bearer_token)
        # Environment variables, when set, win over the request's parameters.
        project = os.environ.get('ODPS_PROJECT_NAME',
                                 None) or odps_params['project']
        endpoint = os.environ.get(
            'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
        o = ODPS(None,
                 None,
                 account=account,
                 project=project,
                 endpoint=endpoint)
        cupid_session = CupidSession(o)

        # The unpacking requires ``table_name`` to contain exactly one dot.
        project_name, table_name = commit_config['table_name'].split('.')
        upload_session = CupidTableUploadSession(
            session=cupid_session,
            table_name=table_name,
            project_name=project_name,
            handle=commit_config['cupid_handle'],
            blocks=commit_config['blocks'])
        upload_session.commit(overwrite=commit_config['overwrite'])

        _write_request_result(sock)
    except:  # noqa: E722 - catch-all kept so the peer always gets a reply
        logger.exception('Failed to commit upload session')
        _write_request_result(sock, False, exc_info=sys.exc_info())
    def execute(cls, ctx, op):
        """Commit the cupid upload session when ``op`` is the terminal node.

        Non-terminal commit nodes perform no work.  In both cases an empty
        ``pandas.DataFrame`` is stored in ``ctx`` under the key of the first
        output.

        :param ctx: mapping that receives the result, keyed by
            ``op.outputs[0].key``
        :param op: commit operand; ``is_terminal``, ``odps_params``,
            ``table_name``, ``cupid_handle``, ``blocks`` and ``overwrite``
            are read
        """
        import pandas as pd
        from odps import ODPS
        from odps.accounts import BearerTokenAccount
        from cupid import CupidSession, context
        from cupid.io.table import CupidTableUploadSession

        if not op.is_terminal:
            ctx[op.outputs[0].key] = pd.DataFrame()
            return

        account = BearerTokenAccount(context().get_bearer_token())
        odps_entry = ODPS(None, None, account=account, **op.odps_params)
        session = CupidSession(odps_entry)

        # The unpacking requires ``table_name`` to contain exactly one dot.
        project_name, table_name = op.table_name.split('.')
        CupidTableUploadSession(
            session=session,
            table_name=table_name,
            project_name=project_name,
            handle=op.cupid_handle,
            blocks=op.blocks,
        ).commit(overwrite=op.overwrite)

        ctx[op.outputs[0].key] = pd.DataFrame()
Пример #9
0
    def _tile_cupid(cls, op):
        """Tile a table-write operand for execution through cupid.

        Opens a cupid upload session for ``op.table_name``, creates one
        ``DataFrameWriteTableSplit`` chunk per chunk of the row-concatenated
        input, and reduces those chunks through ``DataFrameWriteTableCommit``
        operands (at most 8 inputs each) down to one terminal commit chunk.

        :param op: the write-table operand being tiled
        :return: result of ``new_dataframes`` on a copy of ``op``; its only
            chunk is the terminal commit chunk
        :raises SystemError: when the cupid runtime context is not ready
        """
        from odps import ODPS
        from odps.accounts import BearerTokenAccount
        from cupid import CupidSession, context
        from cupid.runtime import RuntimeContext

        if not RuntimeContext.is_context_ready():
            raise SystemError(
                'No Mars cluster found, please create via `o.create_mars_cluster`.'
            )

        account = BearerTokenAccount(context().get_bearer_token())

        # Environment variables, when set, win over the operand's parameters.
        env_project = os.environ.get('ODPS_PROJECT_NAME', None)
        params = op.odps_params.copy()
        if env_project:
            params['project'] = env_project
        endpoint = os.environ.get('ODPS_RUNTIME_ENDPOINT') or params['endpoint']

        odps_entry = ODPS(None, None, account=account,
                          project=params['project'], endpoint=endpoint)
        session = CupidSession(odps_entry)

        target_table = odps_entry.get_table(op.table_name)

        logger.debug('Start creating upload session from cupid.')
        upload_session = session.create_upload_session(target_table)
        handle = to_str(upload_session.handle)

        source_df = build_concatenated_rows_frame(op.inputs[0])
        out_df = op.outputs[0]

        # Writer and commit chunks carry no data, so every dimension is 0.
        empty_shape = (0,) * len(source_df.shape)

        writer_chunks = []
        # block id -> partition spec; handed to the terminal commit operand.
        blocks = {}
        for src_chunk in source_df.chunks:
            block_id = '%d_%s' % (int(time.time()), uuid.uuid4().hex)
            writer_op = DataFrameWriteTableSplit(
                dtypes=op.dtypes,
                table_name=op.table_name,
                unknown_as_string=op.unknown_as_string,
                partition_spec=op.partition_spec,
                cupid_handle=handle,
                block_id=block_id,
                write_batch_size=op.write_batch_size)
            writer_chunks.append(
                writer_op.new_chunk([src_chunk],
                                    shape=empty_shape,
                                    index=src_chunk.index,
                                    index_value=out_df.index_value,
                                    dtypes=src_chunk.dtypes))
            blocks[block_id] = op.partition_spec

        # build commit tree
        combine_size = 8
        level = writer_chunks
        while len(level) >= combine_size:
            merged = []
            for start in range(0, len(level), combine_size):
                group = level[start:start + combine_size]
                if len(group) > 1:
                    merge_op = DataFrameWriteTableCommit(dtypes=op.dtypes,
                                                         is_terminal=False)
                    merged.append(
                        merge_op.new_chunk(group,
                                           shape=empty_shape,
                                           index_value=out_df.index_value,
                                           dtypes=op.dtypes))
                else:
                    # A lone leftover chunk is passed up unchanged.
                    merged.append(group[0])
            level = merged

        assert len(level) < combine_size

        terminal_op = DataFrameWriteTableCommit(dtypes=op.dtypes,
                                                table_name=op.table_name,
                                                blocks=blocks,
                                                cupid_handle=handle,
                                                overwrite=op.overwrite,
                                                odps_params=op.odps_params,
                                                is_terminal=True)
        terminal_chunk = terminal_op.new_chunk(level,
                                               shape=empty_shape,
                                               dtypes=op.dtypes,
                                               index_value=out_df.index_value)

        return op.copy().new_dataframes(
            op.inputs,
            shape=out_df.shape,
            index_value=out_df.index_value,
            dtypes=out_df.dtypes,
            columns_value=out_df.columns_value,
            chunks=[terminal_chunk],
            nsplits=((0,),) * len(empty_shape))
class MarsCupidClient(object):
    """Client-side handle for a Mars cluster started through a cupid session.

    The client starts (or attaches to) a Kubernetes-backed instance via
    ``CupidSession.start_kubernetes``, waits until the Mars web API answers
    and enough workers are registered, and optionally opens a Mars session
    against the proxied endpoint.
    """

    def __init__(self, odps, inst=None, project=None):
        """Set up the client state; nothing is submitted here.

        :param odps: ODPS entry object, kept and used to build the cupid session
        :param inst: value stored as the kube instance; when it is not
            ``None``, ``submit`` does not start a new instance
        :param project: forwarded to ``CupidSession`` as ``project``
        """
        self._odps = odps
        self._cupid_session = CupidSession(odps, project=project)
        self._kube_instance = inst
        self._kube_url = None
        self._kube_client = None
        self._kube_namespace = None

        self._scheduler_key = None
        self._scheduler_config = None
        self._worker_config = None
        self._web_config = None
        self._endpoint = None
        self._with_notebook = False
        self._notebook_endpoint = None

        self._mars_session = None
        self._req_session = None

    @property
    def endpoint(self):
        """Mars endpoint URL, or ``None`` until ``wait_for_success`` resolves it."""
        return self._endpoint

    @property
    def notebook_endpoint(self):
        """Notebook endpoint URL, or ``None`` until it has been resolved."""
        return self._notebook_endpoint

    @property
    def session(self):
        """The Mars session created by this client, or ``None``."""
        return self._mars_session

    @property
    def instance_id(self):
        """Id of the underlying kube instance."""
        return self._kube_instance.id

    def submit(self, image=None, scheduler_num=1, scheduler_cpu=8, scheduler_mem=32 * 1024 ** 3,
               worker_num=1, worker_cpu=8, worker_mem=32 * 1024 ** 3, worker_cache_mem=None,
               min_worker_num=None, worker_disk_num=1, worker_disk_size=100 * 1024 ** 3,
               web_num=1, web_cpu=1, web_mem=1024 ** 3, with_notebook=False, notebook_cpu=1,
               notebook_mem=2 * 1024 ** 3, timeout=None, extra_env=None, extra_modules=None,
               resources=None, create_session=True, priority=None, running_cluster=None,
               task_name=None, **kw):
        """Start the cluster instance if needed and optionally wait for it.

        The sizing arguments (``scheduler_*``, ``worker_*``, ``web_*``,
        ``notebook_*``, ``timeout``, ``extra_env``) are serialised as JSON,
        base64-encoded and passed on the command line of the application
        started in the instance.

        :param image: image name; falls back to the legacy ``mars_image``
            keyword, then to ``build_image_name('mars')``
        :param worker_cache_mem: defaults to 48% of ``worker_mem``
        :param min_worker_num: workers to wait for; defaults to ``worker_num``
        :param with_notebook: when ``None``, ``options.mars.launch_notebook``
            decides whether notebook endpoints are resolved by this client
        :param extra_modules: one module or a list/tuple of modules;
            ``'odps.mars_extension'`` is always appended
        :param resources: one resource or a list/tuple of resources; the
            default resources are always appended
        :param create_session: forwarded to ``wait_for_success``
        :param priority: forwarded to ``start_kubernetes``
        :param running_cluster: forwarded to ``start_kubernetes``
        :param task_name: forwarded to ``start_kubernetes``
        :param kw: ``async_``, ``mars_image``, ``default_resources`` and
            ``instance_idle_timeout`` are consumed here; the rest is
            forwarded to ``start_kubernetes``
        :return: this client.  With a truthy ``async_`` it returns right
            after submission; on ``KeyboardInterrupt`` the server is stopped
            first.
        """
        try:
            async_ = kw.pop('async_', None)

            # compatible with early version
            mars_image = kw.pop('mars_image', None)
            default_resources = kw.pop('default_resources', None) or DEFAULT_RESOURCES
            instance_idle_timeout = kw.pop('instance_idle_timeout', None)
            if with_notebook is not None:
                self._with_notebook = bool(with_notebook)
            else:
                self._with_notebook = options.mars.launch_notebook
            if self._kube_instance is None:
                image = image or mars_image or build_image_name('mars')

                extra_modules = extra_modules or []
                if isinstance(extra_modules, (tuple, list)):
                    extra_modules = list(extra_modules) + ['odps.mars_extension']
                else:
                    extra_modules = [extra_modules, 'odps.mars_extension']

                if resources is not None:
                    if isinstance(resources, (tuple, list)):
                        # Copy first so the caller's list is not modified.
                        resources = list(resources)
                        resources.extend(default_resources)
                    else:
                        resources = [resources] + list(default_resources)
                else:
                    resources = default_resources

                if worker_cache_mem is None:
                    worker_cache_mem = int(worker_mem * 0.48)

                # NOTE(review): the raw ``with_notebook`` argument is sent to
                # the cluster, not the resolved ``self._with_notebook``; they
                # differ when ``with_notebook`` is None -- confirm intended.
                cluster_args = dict(
                    image=image, scheduler_num=scheduler_num, scheduler_cpu=scheduler_cpu,
                    scheduler_mem=scheduler_mem, worker_num=worker_num, worker_cpu=worker_cpu,
                    worker_mem=worker_mem, worker_cache_mem=worker_cache_mem,
                    min_worker_num=min_worker_num, worker_disk_num=worker_disk_num,
                    worker_disk_size=worker_disk_size, web_num=web_num, web_cpu=web_cpu,
                    web_mem=web_mem, with_notebook=with_notebook, notebook_cpu=notebook_cpu,
                    notebook_mem=notebook_mem, extra_env=extra_env, extra_modules=extra_modules,
                    instance_idle_timeout=instance_idle_timeout, timeout=timeout)

                command = '/srv/entrypoint.sh %s %s' % (
                    __name__.rsplit('.', 1)[0] + '.app',
                    base64.b64encode(json.dumps(cluster_args).encode()).decode()
                )

                self._kube_instance = self._cupid_session.start_kubernetes(
                    async_=True, running_cluster=running_cluster, priority=priority,
                    app_image=build_image_name('mars'), app_command=command, resources=resources,
                    task_name=task_name, **kw)
                write_log(self._kube_instance.get_logview_address())
            if async_:
                return self
            else:
                self.wait_for_success(create_session=create_session, min_worker_num=min_worker_num or worker_num)
                return self

        except KeyboardInterrupt:
            self.stop_server()
            return self

    def check_service_ready(self, timeout=1):
        """Return True when ``<endpoint>/api`` answers with a status below 400.

        :param timeout: request timeout passed to the HTTP session
        """
        try:
            resp = self._req_session.get(self._endpoint + '/api', timeout=timeout)
        except (requests.ConnectionError, requests.Timeout):
            return False
        if resp.status_code >= 400:
            return False
        return True

    def count_workers(self):
        """Return the JSON-decoded body of the worker-count API response."""
        resp = self._req_session.get(self._endpoint + '/api/worker?action=count', timeout=1)
        return json.loads(resp.text)

    def get_logview_address(self):
        """Return the logview address of the kube instance."""
        return self._kube_instance.get_logview_address()

    def get_mars_endpoint(self):
        """Return the proxied URL of the Mars application in the instance."""
        return self._cupid_session.get_proxied_url(self._kube_instance.id, CUPID_APP_NAME)

    def get_notebook_endpoint(self):
        """Return the proxied URL of the notebook application in the instance."""
        return self._cupid_session.get_proxied_url(self._kube_instance.id, NOTEBOOK_NAME)

    def get_req_session(self):
        """Build the HTTP session used to talk to the Mars endpoint.

        A ``RestClient`` is returned when ``options.mars.use_common_proxy`` is
        set, a plain ``requests.Session`` otherwise.
        """
        from ...rest import RestClient

        if options.mars.use_common_proxy:
            return RestClient(self._odps.account, self._endpoint, self._odps.project)
        else:
            return requests.Session()

    def check_instance_status(self):
        """Raise if the instance has terminated with an unsuccessful task.

        A failed task raises the error parsed from its task result; any other
        non-success status raises ``errors.ODPSError``.  The raised exception
        carries the instance id in ``instance_id``.
        """
        if self._kube_instance.is_terminated():
            for task_name, task in (self._kube_instance.get_task_statuses()).items():
                exc = None
                if task.status == Instance.Task.TaskStatus.FAILED:
                    exc = errors.parse_instance_error(self._kube_instance.get_task_result(task_name))
                elif task.status != Instance.Task.TaskStatus.SUCCESS:
                    exc = errors.ODPSError('%s, status=%s' % (task_name, task.status.value))
                if exc:
                    exc.instance_id = self._kube_instance.id
                    raise exc

    def wait_for_success(self, min_worker_num=0, create_session=True):
        """Block until the service is reachable and enough workers are up.

        :param min_worker_num: the wait ends once ``count_workers()`` is at
            least this value
        :param create_session: when true, a Mars session is created against
            the endpoint and made the default session
        :raises KeyboardInterrupt: propagated from any stage of the wait
        """
        while True:
            self.check_instance_status()
            try:
                if self._endpoint is None:
                    self._endpoint = self.get_mars_endpoint()
                    write_log('Mars UI: ' + self._endpoint)
                    self._req_session = self.get_req_session()

                    self._req_session.post(self._endpoint.rstrip('/') + '/api/logger', data=dict(
                        content='Mars UI from client: ' + self._endpoint
                    ))
                if self._with_notebook and self._notebook_endpoint is None:
                    self._notebook_endpoint = self.get_notebook_endpoint()
                    write_log('Notebook UI: ' + self._notebook_endpoint)

                    self._req_session.post(self._endpoint.rstrip('/') + '/api/logger', data=dict(
                        content='Notebook UI from client: ' + self._notebook_endpoint
                    ))
            except KeyboardInterrupt:
                raise
            except Exception:
                # Endpoints may not be resolvable yet; retry shortly.
                time.sleep(1)
                continue

            if not self.check_service_ready():
                # Pause so a refused connection does not turn this into a
                # tight polling loop.
                time.sleep(1)
                continue
            try:
                if self.count_workers() >= min_worker_num:
                    break
                else:
                    time.sleep(1)
            except KeyboardInterrupt:
                # Must not be swallowed, otherwise Ctrl-C cannot abort the wait.
                raise
            except Exception:
                time.sleep(1)
                continue

        if create_session:
            try:
                self._mars_session = new_session(self._endpoint, req_session=self._req_session).as_default()
            except KeyboardInterrupt:
                raise
            except:  # noqa: E722 - cleanup only, the error is re-raised
                if self._kube_instance and self._kube_instance.status == self._kube_instance.Status.RUNNING:
                    self._kube_instance.stop()
                raise

    def restart_session(self):
        """Close the current Mars session, if any, and open a new default one."""
        if self._mars_session is not None:
            self._mars_session.close()
        self._mars_session = new_session(self._endpoint, req_session=self._req_session).as_default()

    def stop_server(self):
        """Stop the kube instance, if one is held, and forget it."""
        if self._kube_instance:
            self._kube_instance.stop()
            self._kube_instance = None
Пример #11
0
    def tile(cls, op):
        """Tile a table-read operand into one chunk per cupid table split.

        A cupid download session is created for the table (or the partition
        named by ``op.partition``).  Whenever creating it raises
        ``CupidError`` the split size is doubled and the call retried, until
        the split size has reached ``MAX_CHUNK_SIZE``, at which point the
        error propagates.

        :param op: the read-table operand being tiled
        :return: result of ``new_dataframes`` on a copy of ``op``, with one
            chunk of unknown row count per split
        """
        import numpy as np
        import pandas as pd
        from odps import ODPS
        from odps.accounts import BearerTokenAccount
        from cupid import CupidSession, context

        account = BearerTokenAccount(context().get_bearer_token())

        # The project from the environment, when set, wins over the operand's.
        env_project = os.environ.get('ODPS_PROJECT_NAME', None)
        params = op.odps_params.copy()
        if env_project:
            params['project'] = env_project
        odps_entry = ODPS(None, None, account=account, **params)
        session = CupidSession(odps_entry)

        df = op.outputs[0]
        split_size = df.extra_params.chunk_bytes or CHUNK_LIMIT

        source = odps_entry.get_table(op.table_name)
        if op.partition is not None:
            source = source.get_partition(op.partition)

        logger.debug('Start creating download session from cupid.')
        while True:
            try:
                download_session = session.create_download_session(
                    source, split_size=split_size, columns=op.columns)
            except CupidError:
                logger.debug(
                    'The number of splits exceeds 100000, split_size is {}'.
                    format(split_size))
                if split_size >= MAX_CHUNK_SIZE:
                    raise
                split_size *= 2
            else:
                break

        splits = download_session.splits
        logger.debug('%s table splits have been created.', str(len(splits)))

        # Ignore add_offset at this time.
        op._add_offset = False

        column_count = df.shape[1]
        out_chunks = []
        for row_idx, split in enumerate(splits):
            reader_op = DataFrameReadTableSplit(
                cupid_handle=to_str(split.handle),
                split_index=split.split_index,
                split_file_start=split.split_file_start,
                split_file_end=split.split_file_end,
                schema_file_start=split.schema_file_start,
                schema_file_end=split.schema_file_end,
                add_offset=op.add_offset,
                dtypes=op.dtypes,
                sparse=op.sparse)
            # the chunk shape is unknown
            out_chunks.append(
                reader_op.new_chunk(
                    None,
                    shape=(np.nan, column_count),
                    dtypes=op.dtypes,
                    index_value=parse_index(pd.RangeIndex(0)),
                    columns_value=parse_index(df.dtypes.index,
                                              store_data=True),
                    index=(row_idx, 0)))

        if op.add_offset:
            out_chunks = standardize_range_index(out_chunks)

        row_splits = (np.nan,) * len(out_chunks)
        return op.copy().new_dataframes(
            None,
            shape=df.shape,
            dtypes=op.dtypes,
            index_value=df.index_value,
            columns_value=df.columns_value,
            chunks=out_chunks,
            nsplits=(row_splits, (column_count,)))
    def tile(cls, op):
        """Tile a table-read operand into one chunk per cupid table split.

        The split size starts from the output's ``chunk_bytes`` (or
        ``READ_CHUNK_LIMIT``).  When the stored data is smaller than that and
        a Mars context is available, the split size becomes the data size
        divided by the worker count, with a floor of 1 MiB.  Whenever
        creating the download session raises ``CupidError`` the split size is
        doubled and the call retried, until it has reached
        ``MAX_CHUNK_SIZE``, at which point the error propagates.  A table
        without splits still yields a single chunk.

        :param op: the read-table operand being tiled
        :return: result of ``new_dataframes`` on a copy of ``op``
        :raises SystemError: when no cupid context is available
        """
        import numpy as np
        import pandas as pd
        from odps import ODPS
        from odps.accounts import BearerTokenAccount
        from cupid import CupidSession, context
        from mars.context import get_context

        cupid_ctx = context()
        if cupid_ctx is None:
            raise SystemError(
                'No Mars cluster found, please create via `o.create_mars_cluster`.'
            )

        account = BearerTokenAccount(cupid_ctx.get_bearer_token())

        # The project from the environment, when set, wins over the operand's.
        env_project = os.environ.get('ODPS_PROJECT_NAME', None)
        params = op.odps_params.copy()
        if env_project:
            params['project'] = env_project
        odps_entry = ODPS(None, None, account=account, **params)
        session = CupidSession(odps_entry)

        mars_context = get_context()

        df = op.outputs[0]
        split_size = df.extra_params.chunk_bytes or READ_CHUNK_LIMIT

        source = odps_entry.get_table(op.table_name)
        if op.partition is not None:
            source = source.get_partition(op.partition)

        try:
            stored_bytes = source.size
        except ODPSError:
            # fail to get data size, just ignore
            pass
        else:
            if mars_context is not None and stored_bytes < split_size:
                # get worker counts
                n_workers = max(len(mars_context.get_worker_addresses()), 1)
                # data is too small, split as many as number of cores
                # at least 1M
                split_size = max(stored_bytes // n_workers, 1 * 1024**2)
                logger.debug(
                    'Input data size is too small, split_size is {}'.format(
                        split_size))

        logger.debug(
            'Start creating download session of table {} from cupid.'.format(
                op.table_name))
        while True:
            try:
                download_session = session.create_download_session(
                    source, split_size=split_size, columns=op.columns)
            except CupidError:
                logger.debug(
                    'The number of splits exceeds 100000, split_size is {}'.
                    format(split_size))
                if split_size >= MAX_CHUNK_SIZE:
                    raise
                split_size *= 2
            else:
                break

        splits = download_session.splits
        logger.debug('%s table splits have been created.', str(len(splits)))

        if np.isnan(df.shape[0]):
            est_chunk_rows = [None] * len(splits)
        else:
            # Spread the known row count over the splits in proportion to the
            # byte range each split covers.
            sp_file_sizes = np.array(
                [sp.split_file_end - sp.split_file_start for sp in splits])
            est_chunk_rows = sp_file_sizes * df.shape[0] // sp_file_sizes.sum()

        logger.warning('Estimated chunk rows: %r', est_chunk_rows)

        # Ignore add_offset at this time.
        op._add_offset = False

        column_count = df.shape[1]

        def _new_split_chunk(reader_op, row_idx):
            # the chunk shape is unknown
            return reader_op.new_chunk(
                None,
                shape=(np.nan, column_count),
                dtypes=op.dtypes,
                index_value=parse_index(pd.RangeIndex(0)),
                columns_value=parse_index(df.dtypes.index, store_data=True),
                index=(row_idx, 0))

        if len(splits) == 0:
            logger.debug('Table {} has no data'.format(op.table_name))
            out_chunks = [_new_split_chunk(DataFrameReadTableSplit(), 0)]
        else:
            out_chunks = []
            for row_idx, split in enumerate(splits):
                reader_op = DataFrameReadTableSplit(
                    cupid_handle=to_str(split.handle),
                    split_index=split.split_index,
                    split_file_start=split.split_file_start,
                    split_file_end=split.split_file_end,
                    schema_file_start=split.schema_file_start,
                    schema_file_end=split.schema_file_end,
                    add_offset=op.add_offset,
                    dtypes=op.dtypes,
                    sparse=op.sparse,
                    split_size=split_size,
                    use_arrow_dtype=op.use_arrow_dtype,
                    estimate_rows=est_chunk_rows[row_idx])
                out_chunks.append(_new_split_chunk(reader_op, row_idx))

        if op.add_offset:
            out_chunks = standardize_range_index(out_chunks)

        row_splits = (np.nan,) * len(out_chunks)
        return op.copy().new_dataframes(
            None,
            shape=df.shape,
            dtypes=op.dtypes,
            index_value=df.index_value,
            columns_value=df.columns_value,
            chunks=out_chunks,
            nsplits=(row_splits, (column_count,)))
Пример #13
0
    def tile(cls, op):
        """Split a table-read operand into one read chunk per cupid split.

        A cupid download session is opened on the table (or on the single
        partition named by ``op.partition``).  Every split of that session
        yields two chunks from one ``DataFrameReadTableSplit`` operand: the
        data chunk, whose row count is unknown at tile time, and a
        one-element chunk.  When ``op.add_offset`` is set, every data chunk
        but the first is wrapped in a ``DataFrameReadTableWithOffset`` operand
        that also takes the one-element chunks of all preceding splits as
        inputs.

        :param op: the operand being tiled
        :return: result of ``new_dataframes`` on a copy of ``op``
        """
        import numpy as np
        import pandas as pd
        from odps import ODPS
        from odps.accounts import BearerTokenAccount
        from cupid import CupidSession, context

        # authenticate with the bearer token supplied by the cupid context
        account = BearerTokenAccount(context().get_bearer_token())
        odps_entry = ODPS(None, None, account=account, **op.odps_params)
        session = CupidSession(odps_entry)

        out_df = op.outputs[0]
        split_size = (out_df.extra_params.chunk_store_limit
                      or options.tensor.chunk_store_limit)

        source = odps_entry.get_table(op.table_name)
        if op.partition is not None:
            source = source.get_partition(op.partition)

        logger.debug('Start creating download session from cupid.')
        download_session = session.create_download_session(
            source, split_size=split_size)
        logger.debug('%s table splits have been created.',
                     str(len(download_session.splits)))

        data_chunks = []
        count_chunks = []
        for pos, split in enumerate(download_session.splits):
            split_op = DataFrameReadTableSplit(
                cupid_handle=to_str(split.handle),
                split_index=split.split_index,
                split_file_start=split.split_file_start,
                split_file_end=split.split_file_end,
                schema_file_start=split.schema_file_start,
                schema_file_end=split.schema_file_end,
                dtypes=op.dtypes,
                sparse=op.sparse)
            # row count of a split is not known until it is actually read
            data_kw = {
                'shape': (np.nan, out_df.shape[1]),
                'dtypes': op.dtypes,
                'index_value': parse_index(pd.RangeIndex(0)),
                'columns_value': parse_index(out_df.dtypes.index,
                                             store_data=True),
                'index': (pos, ),
            }
            count_kw = {'shape': (1, ), 'index': (pos, )}
            data_chunk, count_chunk = split_op.new_chunks(
                None, kws=[data_kw, count_kw])
            data_chunks.append(data_chunk)
            count_chunks.append(count_chunk)

        if op.add_offset:
            # the first chunk has no predecessors and is passed through as is
            final_chunks = data_chunks[:1]
            for pos, data_chunk in enumerate(data_chunks[1:], 1):
                offset_op = DataFrameReadTableWithOffset(
                    dtypes=data_chunk.dtypes)
                final_chunks.append(offset_op.new_chunk(
                    [data_chunk] + count_chunks[:pos],
                    shape=data_chunk.shape,
                    index=data_chunk.index,
                    dtypes=data_chunk.dtypes,
                    index_value=data_chunk.index_value,
                    columns_value=data_chunk.columns_value))
        else:
            final_chunks = data_chunks

        tiled_op = op.copy()
        row_splits = (np.nan, ) * len(final_chunks)
        return tiled_op.new_dataframes(None,
                                       shape=out_df.shape,
                                       dtypes=op.dtypes,
                                       index_value=out_df.index_value,
                                       columns_value=out_df.columns_value,
                                       chunks=final_chunks,
                                       nsplits=(row_splits,
                                                (out_df.shape[1], )))
Пример #14
0
class MarsCupidClient(object):
    """Client that starts a Mars application through cupid and tracks it.

    The client wraps a ``CupidSession`` built from the given ODPS entry.
    ``submit`` starts the kubernetes-backed ``mars`` application (unless an
    instance was supplied to the constructor) and, unless asked to return
    immediately, blocks in ``wait_for_success`` until the service answers and
    enough workers have registered.
    """

    def __init__(self, odps, inst=None):
        """
        :param odps: ODPS entry used to create the cupid session
        :param inst: optional already-started instance to attach to instead
            of launching a new one in ``submit``
        """
        self._odps = odps
        self._cupid_session = CupidSession(odps)
        self._kube_instance = inst
        self._kube_url = None
        self._kube_client = None
        self._kube_namespace = None

        self._scheduler_key = None
        self._scheduler_config = None
        self._worker_config = None
        self._web_config = None
        self._endpoint = None
        self._has_notebook = False
        self._notebook_endpoint = None

        self._mars_session = None
        self._req_session = None

    @property
    def endpoint(self):
        """Proxied URL of the Mars service, or None until it is resolved."""
        return self._endpoint

    @property
    def notebook_endpoint(self):
        """Proxied URL of the notebook, or None until it is resolved."""
        return self._notebook_endpoint

    @property
    def session(self):
        """Mars session created by ``wait_for_success``, or None."""
        return self._mars_session

    @property
    def instance_id(self):
        """Id of the underlying instance (requires an instance to exist)."""
        return self._kube_instance.id

    def submit(self,
               worker_num=1,
               worker_cpu=8,
               worker_mem=32,
               disk_num=1,
               min_worker_num=None,
               cache_mem=None,
               resources=None,
               module_path=None,
               create_session=True,
               priority=None,
               running_cluster=None,
               scheduler_num=1,
               notebook=None,
               **kw):
        """Start the Mars application (if needed) and optionally wait for it.

        Recognised extra keywords: ``async_``, ``default_resources``,
        ``mars_app_image``, ``mars_image``, ``proxy_endpoint``,
        ``major_task_version``, ``scheduler_mem`` (default 32) and
        ``scheduler_cpu`` (default 8).  Any remaining keyword is forwarded to
        ``CupidSession.start_kubernetes``.

        :param min_worker_num: workers that must be registered before
            returning; defaults to ``worker_num``
        :param cache_mem: cache size, suffixed with ``'G'``; defaults to
            ``worker_mem * 0.48``
        :param resources: one resource or a list/tuple of resources, placed
            before the default resources
        :param module_path: one module path or a list/tuple of them;
            ``'odps.mars_extension'`` is always appended
        :param notebook: whether to launch a notebook; when None the value of
            ``options.mars.launch_notebook`` is used
        :return: this client.  On ``KeyboardInterrupt`` the server is stopped
            and the client is still returned.
        """
        try:
            async_ = kw.pop('async_', None)
            default_resources = kw.pop('default_resources',
                                       None) or DEFAULT_RESOURCES
            if notebook is not None:
                self._has_notebook = bool(notebook)
            else:
                self._has_notebook = options.mars.launch_notebook

            if self._kube_instance is None:
                if module_path is not None:
                    if isinstance(module_path, (tuple, list)):
                        module_path = list(module_path) + [
                            'odps.mars_extension'
                        ]
                    else:
                        module_path = [module_path, 'odps.mars_extension']

                if resources is None:
                    resources = default_resources
                elif isinstance(resources, (tuple, list)):
                    resources = list(resources) + list(default_resources)
                else:
                    # list() keeps this working when the defaults are a tuple
                    resources = [resources] + list(default_resources)

                if cache_mem is None:
                    cache_mem = str(worker_mem * 0.48) + 'G'
                else:
                    cache_mem = str(cache_mem) + 'G'

                mars_config = {
                    'scheduler_num': scheduler_num,
                    'worker_num': worker_num,
                    'worker_cpu': worker_cpu,
                    'worker_mem': worker_mem,
                    'cache_mem': cache_mem,
                    'disk_num': disk_num,
                    'resources': resources,
                    'module_path': module_path or ['odps.mars_extension'],
                }
                # these keys are forwarded only when the caller supplied them
                for key in ('mars_app_image', 'mars_image', 'proxy_endpoint',
                            'major_task_version'):
                    if key in kw:
                        mars_config[key] = kw.pop(key)
                mars_config['scheduler_mem'] = kw.pop('scheduler_mem', 32)
                mars_config['scheduler_cpu'] = kw.pop('scheduler_cpu', 8)

                if self._has_notebook:
                    mars_config['notebook'] = True
                # the application is always started asynchronously; blocking
                # (when requested) is done by wait_for_success below
                self._kube_instance = self._cupid_session.start_kubernetes(
                    async_=True,
                    running_cluster=running_cluster,
                    priority=priority,
                    app_name='mars',
                    app_config=mars_config,
                    **kw)
                write_log(self._kube_instance.get_logview_address())

            if not async_:
                self.wait_for_success(create_session=create_session,
                                      min_worker_num=min_worker_num
                                      or worker_num)
            return self

        except KeyboardInterrupt:
            self.stop_server()
            return self

    def check_service_ready(self, timeout=1):
        """Return True when ``<endpoint>/api`` answers with a status < 400.

        Connection errors and timeouts are reported as "not ready".
        """
        try:
            resp = self._req_session.get(self._endpoint + '/api',
                                         timeout=timeout)
        except (requests.ConnectionError, requests.Timeout):
            return False
        return resp.status_code < 400

    def count_workers(self):
        """Return the decoded JSON body of the worker-count endpoint."""
        resp = self._req_session.get(self._endpoint +
                                     '/api/worker?action=count',
                                     timeout=1)
        return json.loads(resp.text)

    def get_logview_address(self):
        """Return the logview address of the underlying instance."""
        return self._kube_instance.get_logview_address()

    def get_mars_endpoint(self):
        """Return the proxied URL of the Mars application."""
        return self._cupid_session.get_proxied_url(self._kube_instance.id,
                                                   CUPID_APP_NAME)

    def get_notebook_endpoint(self):
        """Return the proxied URL of the notebook application."""
        return self._cupid_session.get_proxied_url(self._kube_instance.id,
                                                   NOTEBOOK_NAME)

    def get_req_session(self):
        """Build the HTTP session used to talk to the Mars service.

        A ``RestClient`` bound to the ODPS account is used when
        ``options.mars.use_common_proxy`` is set, a plain
        ``requests.Session`` otherwise.
        """
        from ...rest import RestClient

        if options.mars.use_common_proxy:
            return RestClient(self._odps.account, self._endpoint,
                              self._odps.project)
        else:
            return requests.Session()

    def check_instance_status(self):
        """Raise if the instance has terminated with a non-successful task.

        Failed tasks raise the error parsed from their task result; any other
        non-success status raises ``ODPSError``.  The raised exception carries
        the instance id in ``instance_id``.
        """
        if not self._kube_instance.is_terminated():
            return
        statuses = self._kube_instance.get_task_statuses()
        for task_name, task in statuses.items():
            exc = None
            if task.status == Instance.Task.TaskStatus.FAILED:
                exc = errors.parse_instance_error(
                    self._kube_instance.get_task_result(task_name))
            elif task.status != Instance.Task.TaskStatus.SUCCESS:
                exc = errors.ODPSError('%s, status=%s' %
                                       (task_name, task.status.value))
            if exc:
                exc.instance_id = self._kube_instance.id
                raise exc

    def wait_for_success(self, min_worker_num=0, create_session=True):
        """Block until the service is reachable and enough workers are up.

        Each round first checks the instance status (which raises when the
        instance terminated abnormally), then resolves the endpoints, probes
        the service and compares the worker count with ``min_worker_num``.
        Ordinary errors while polling are retried after one second;
        ``KeyboardInterrupt`` and ``SystemExit`` always propagate so that the
        caller can stop the server.

        :param min_worker_num: number of workers to wait for
        :param create_session: when true, create a default Mars session bound
            to the endpoint; if that fails for a reason other than
            ``KeyboardInterrupt`` and the instance is still running, the
            instance is stopped before the error is re-raised
        """
        while True:
            self.check_instance_status()
            try:
                if self._endpoint is None:
                    self._endpoint = self.get_mars_endpoint()
                    write_log('Mars UI: ' + self._endpoint)
                if self._req_session is None:
                    # checked separately so that a failure here is retried
                    # instead of leaving the client without a session
                    self._req_session = self.get_req_session()
                if self._has_notebook and self._notebook_endpoint is None:
                    self._notebook_endpoint = self.get_notebook_endpoint()
                    write_log('Notebook UI: ' + self._notebook_endpoint)
            except Exception:
                time.sleep(1)
                continue

            if not self.check_service_ready():
                # back off instead of spinning when the probe fails fast
                time.sleep(1)
                continue
            try:
                enough_workers = self.count_workers() >= min_worker_num
            except Exception:
                # only ordinary errors are retried; interrupts must escape
                time.sleep(1)
                continue
            if enough_workers:
                break
            time.sleep(1)

        if create_session:
            try:
                self._mars_session = new_session(
                    self._endpoint,
                    req_session=self._req_session).as_default()
            except KeyboardInterrupt:
                raise
            except BaseException:
                inst = self._kube_instance
                if inst and inst.status == inst.Status.RUNNING:
                    inst.stop()
                raise

    def stop_server(self):
        """Stop the underlying instance, if any, and forget it."""
        if self._kube_instance:
            self._kube_instance.stop()
            self._kube_instance = None
Пример #15
0
    def _tile_cupid(cls, op):
        """Tile a table-read operand into per-split read chunks via cupid.

        For every data source (the table itself, one partition, or a filtered
        list of partitions) a cupid download session is created and each of
        its splits becomes one ``DataFrameReadTableSplit`` chunk.  Chunk row
        counts are unknown at tile time, so every chunk has ``np.nan`` rows.

        :param op: the operand being tiled
        :return: result of ``new_dataframes`` on a copy of ``op``
        :raises CupidError: re-raised when creating a download session fails
            while ``split_size`` has already reached ``MAX_CHUNK_SIZE``
        """
        from odps import ODPS
        from odps.accounts import BearerTokenAccount
        from cupid import CupidSession, context
        from cupid.errors import CupidError
        from mars.context import get_context

        cupid_ctx = context()

        bearer_token = cupid_ctx.get_bearer_token()
        account = BearerTokenAccount(bearer_token)
        # environment variables, when set, take precedence over the project
        # and endpoint stored in the operand's odps_params
        project = os.environ.get('ODPS_PROJECT_NAME', None)
        odps_params = op.odps_params.copy()
        if project:
            odps_params['project'] = project
        endpoint = os.environ.get(
            'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
        o = ODPS(None,
                 None,
                 account=account,
                 project=odps_params['project'],
                 endpoint=endpoint)
        cupid_session = CupidSession(o)

        # may be None; it is checked before being used below
        mars_context = get_context()

        df = op.outputs[0]
        split_size = df.extra_params.chunk_bytes or READ_CHUNK_LIMIT

        out_dtypes = df.dtypes
        out_shape = df.shape
        out_columns_value = df.columns_value
        # narrow the output metadata when only some columns are requested
        if op.columns is not None:
            out_dtypes = out_dtypes[op.columns]
            out_shape = (df.shape[0], len(op.columns))
            out_columns_value = parse_index(out_dtypes.index, store_data=True)

        # choose what to read: the whole table when it has no partition
        # columns, the single partition when op.partition names an existing
        # one, otherwise all partitions (filtered when op.partition is given)
        table_obj = o.get_table(op.table_name)
        if not table_obj.schema.partitions:
            data_srcs = [table_obj]
        elif op.partition is not None and check_partition_exist(
                table_obj, op.partition):
            data_srcs = [table_obj.get_partition(op.partition)]
        else:
            data_srcs = list(table_obj.partitions)
            if op.partition is not None:
                data_srcs = filter_partitions(o, data_srcs, op.partition)

        out_chunks = []
        # running chunk position across all data sources
        chunk_idx = 0

        # NOTE(review): split_size is shared by all iterations, so a value
        # shrunk or doubled for one data source is carried over to the next
        # one -- confirm this is intended.
        for data_src in data_srcs:
            try:
                data_store_size = data_src.size
            except ODPSError:
                # fail to get data size, just ignore
                pass
            else:
                if data_store_size < split_size and mars_context is not None:
                    # get worker counts
                    worker_count = max(
                        len(mars_context.get_worker_addresses()), 1)
                    # data is too small, split as many as number of cores
                    split_size = data_store_size // worker_count
                    # at least 1M
                    split_size = max(split_size, 1 * 1024**2)
                    logger.debug(
                        'Input data size is too small, split_size is %s',
                        split_size)

            logger.debug(
                'Start creating download session of table %s from cupid, '
                'columns: %s', op.table_name, op.columns)
            # retry with a doubled split size whenever cupid raises
            # CupidError; give up once split_size has reached MAX_CHUNK_SIZE
            while True:
                try:
                    download_session = cupid_session.create_download_session(
                        data_src,
                        split_size=split_size,
                        columns=op.columns,
                        with_split_meta=op.with_split_meta_on_tile)
                    break
                except CupidError:
                    logger.debug(
                        'The number of splits exceeds 100000, split_size is %s',
                        split_size)
                    if split_size >= MAX_CHUNK_SIZE:
                        raise
                    else:
                        split_size *= 2

            logger.debug('%s table splits have been created.',
                         str(len(download_session.splits)))

            # row counts taken from the split metadata; entries may be None,
            # in which case the size-based estimate below is used instead
            meta_chunk_rows = [
                split.meta_row_count for split in download_session.splits
            ]
            if np.isnan(out_shape[0]):
                est_chunk_rows = meta_chunk_rows
            else:
                # total row count is known: apportion it to the splits in
                # proportion to the byte range each split covers
                sp_file_sizes = np.array([
                    sp.split_file_end - sp.split_file_start
                    for sp in download_session.splits
                ])
                total_size = sp_file_sizes.sum()
                ratio_chunk_rows = (sp_file_sizes * out_shape[0] //
                                    total_size).tolist()
                est_chunk_rows = [
                    mr if mr is not None else rr
                    for mr, rr in zip(meta_chunk_rows, ratio_chunk_rows)
                ]

            # only sources exposing a truthy partition_spec get one recorded
            partition_spec = str(data_src.partition_spec) \
                if getattr(data_src, 'partition_spec', None) else None

            # NOTE(review): logged at warning level although the surrounding
            # messages use debug -- confirm the level is intended.
            logger.warning('Estimated chunk rows: %r', est_chunk_rows)

            if len(download_session.splits) == 0:
                # no splits: emit one placeholder chunk from an operand
                # created without arguments
                # NOTE(review): this branch passes op.dtypes while regular
                # chunks below pass out_dtypes -- confirm when op.columns is
                # set.
                logger.debug('Table %s has no data', op.table_name)
                chunk_op = DataFrameReadTableSplit()
                index_value = parse_index(pd.RangeIndex(0))
                columns_value = parse_index(out_dtypes.index, store_data=True)
                out_chunk = chunk_op.new_chunk(None,
                                               shape=(np.nan, out_shape[1]),
                                               dtypes=op.dtypes,
                                               index_value=index_value,
                                               columns_value=columns_value,
                                               index=(chunk_idx, 0))
                out_chunks.append(out_chunk)
                chunk_idx += 1
            else:
                for idx, split in enumerate(download_session.splits):
                    # nrows falls back to op.nrows when the metadata row
                    # count is None or 0
                    chunk_op = DataFrameReadTableSplit(
                        cupid_handle=to_str(split.handle),
                        split_index=split.split_index,
                        split_file_start=split.split_file_start,
                        split_file_end=split.split_file_end,
                        schema_file_start=split.schema_file_start,
                        schema_file_end=split.schema_file_end,
                        add_offset=op.add_offset,
                        dtypes=out_dtypes,
                        sparse=op.sparse,
                        split_size=split_size,
                        string_as_binary=op.string_as_binary,
                        use_arrow_dtype=op.use_arrow_dtype,
                        estimate_rows=est_chunk_rows[idx],
                        partition_spec=partition_spec,
                        append_partitions=op.append_partitions,
                        meta_raw_size=split.meta_raw_size,
                        nrows=meta_chunk_rows[idx] or op.nrows,
                        memory_scale=op.memory_scale)
                    # the chunk shape is unknown
                    index_value = parse_index(pd.RangeIndex(0))
                    columns_value = parse_index(out_dtypes.index,
                                                store_data=True)
                    out_chunk = chunk_op.new_chunk(None,
                                                   shape=(np.nan,
                                                          out_shape[1]),
                                                   dtypes=out_dtypes,
                                                   index_value=index_value,
                                                   columns_value=columns_value,
                                                   index=(chunk_idx, 0))
                    chunk_idx += 1
                    out_chunks.append(out_chunk)

        if op.add_offset:
            out_chunks = standardize_range_index(out_chunks)

        new_op = op.copy()
        # every chunk has an unknown number of rows and all selected columns
        nsplits = ((np.nan, ) * len(out_chunks), (out_shape[1], ))
        return new_op.new_dataframes(None,
                                     shape=out_shape,
                                     dtypes=op.dtypes,
                                     index_value=df.index_value,
                                     columns_value=out_columns_value,
                                     chunks=out_chunks,
                                     nsplits=nsplits)
Пример #16
0
def _recv_exact(sock, size):
    """Read exactly ``size`` bytes from ``sock``.

    ``recv`` may legitimately return fewer bytes than requested, so the call
    is repeated until the whole message has arrived.

    :raises EOFError: if the peer closes the connection early
    """
    chunks = []
    remaining = size
    while remaining > 0:
        chunk = sock.recv(remaining)
        if not chunk:
            raise EOFError(
                'Socket closed with %d of %d bytes still unread'
                % (remaining, size))
        chunks.append(chunk)
        remaining -= len(chunk)
    return b''.join(chunks)


def _handle_create_table_download_session(sock):
    """Serve one "create table download session" request read from ``sock``.

    The request is a 4-byte little-endian length followed by a pickled dict
    with the keys ``odps_params``, ``table_name``, ``columns``,
    ``worker_count``, ``split_size`` and ``max_chunk_num``, and optionally
    ``partition`` and ``with_split_meta_on_tile``.  On success the splits of
    the created download session and the split size finally used are written
    back through ``_write_request_result``; on any failure the exception
    info is written back instead.

    :param sock: connected socket-like object providing ``recv``
    """
    try:
        cmd_len, = struct.unpack('<I', _recv_exact(sock, 4))
        # dict with odps_params, table_name, partition, columns, worker_count, split_size, max_chunk_num
        # NOTE(security): unpickling runs arbitrary code from the payload;
        # this is only acceptable if the peer on this socket is trusted.
        session_config = pickle.loads(_recv_exact(sock, cmd_len))

        from odps import ODPS
        from odps.errors import ODPSError
        from odps.accounts import BearerTokenAccount
        from cupid import CupidSession, context
        from cupid.errors import CupidError
        from cupid.runtime import RuntimeContext

        if not RuntimeContext.is_context_ready():
            raise SystemError(
                'No Mars cluster found, please create via `o.create_mars_cluster`.'
            )

        cupid_ctx = context()

        odps_params = session_config['odps_params']
        bearer_token = cupid_ctx.get_bearer_token()
        account = BearerTokenAccount(bearer_token)
        # environment variables, when set, win over the request's settings
        project = os.environ.get('ODPS_PROJECT_NAME',
                                 None) or odps_params['project']
        endpoint = os.environ.get(
            'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
        o = ODPS(None,
                 None,
                 account=account,
                 project=project,
                 endpoint=endpoint)
        cupid_session = CupidSession(o)

        split_size = session_config['split_size']
        table_name = session_config['table_name']
        data_src = o.get_table(table_name)
        if session_config.get('partition') is not None:
            data_src = data_src.get_partition(session_config['partition'])

        try:
            data_store_size = data_src.size
        except ODPSError:
            # fail to get data size, just ignore
            pass
        else:
            worker_count = session_config['worker_count']
            if data_store_size < split_size and worker_count is not None:
                # data is too small, split as many as number of cores;
                # a reported count of 0 must not cause a division by zero
                split_size = data_store_size // max(worker_count, 1)
                # at least 1M
                split_size = max(split_size, 1 * 1024**2)
                logger.debug(
                    'Input data size is too small, split_size is %s',
                    split_size)

        max_chunk_num = session_config['max_chunk_num']
        columns = session_config['columns']
        with_split_meta = session_config.get('with_split_meta_on_tile')

        logger.debug(
            'Start creating download session of table %s from cupid, columns %r',
            table_name, columns)
        # retry with a doubled split size whenever cupid raises CupidError
        while True:
            try:
                download_session = cupid_session.create_download_session(
                    data_src,
                    split_size=split_size,
                    columns=columns,
                    with_split_meta=with_split_meta)
                break
            except CupidError:
                logger.debug(
                    'The number of splits exceeds 100000, split_size is %s',
                    split_size)
                # NOTE(review): split_size looks like a byte size while
                # max_chunk_num is named like a count -- confirm that this
                # is the intended upper bound for the retry.
                if split_size >= max_chunk_num:
                    raise
                else:
                    split_size *= 2

        ret_data = {
            'splits': download_session.splits,
            'split_size': split_size,
        }
        _write_request_result(sock, result=ret_data)
    except BaseException:
        # request boundary: every failure is logged and reported to the peer
        logger.exception('Failed to create download session')
        _write_request_result(sock, False, exc_info=sys.exc_info())