def _register_reader(self):
    """Register this split as a table reader with the sandbox subprocess service.

    Sends a ``RegisterTableReaderRequest`` built from ``self._handle`` and
    ``self.split_proto`` and decodes the JSON schema descriptions carried in
    the response.

    Returns:
        A ``(read_iterator, schema)`` tuple, where ``read_iterator`` is the
        ``readIterator`` field of the response and ``schema`` is built via
        ``Schema.from_lists`` from the column names/types (and partition
        column names/types, when the response carries them).

    Raises:
        CupidError: if the RPC controller reports a failure.
    """
    channel = SandboxRpcChannel()
    stub = subprocess_pb.CupidSubProcessService_Stub(channel)
    req = subprocess_pb.RegisterTableReaderRequest(
        inputTableHandle=self._handle,
        inputSplit=self.split_proto,
    )

    controller = CupidRpcController()
    resp = stub.RegisterTableReader(controller, req, None)
    if controller.Failed():
        raise CupidError(controller.ErrorText())
    logger.info("RegisterTableReader response: %s", resp)
    logger.info("RegisterTableReaderResponse protobuf field size = %d",
                len(resp.ListFields()))

    schema_json = json.loads(resp.schema)
    schema_names = [d['name'] for d in schema_json]
    schema_types = [d['type'] for d in schema_json]

    # partitionSchema is optional in the response. Previously a missing field
    # left the decoded value as None and the comprehensions below raised
    # TypeError; treat "absent" (or a JSON null) as "no partition columns".
    partition_schema_json = None
    if resp.HasField('partitionSchema'):
        partition_schema_json = json.loads(resp.partitionSchema)
    if partition_schema_json is None:
        partition_schema_json = []
    pt_schema_names = [d['name'] for d in partition_schema_json]
    pt_schema_types = [d['type'] for d in partition_schema_json]

    schema = Schema.from_lists(schema_names, schema_types,
                               pt_schema_names, pt_schema_types)
    return resp.readIterator, schema
def create_upload_session(session, table):
    """Open a table upload session through the Cupid task service.

    Issues a ``WriteTable`` RPC for ``table`` using the lookup name of
    ``session`` and wraps the returned output table handle.

    Args:
        session: object providing ``lookup_name``; also used to build the
            task-service RPC channel.
        table: object providing ``name`` and ``project.name``.

    Returns:
        A ``CupidTableUploadSession`` bound to the handle from the response.

    Raises:
        CupidError: if the RPC controller reports a failure.
    """
    rpc_controller = CupidRpcController()
    service = task_service_pb.CupidTaskService_Stub(
        CupidTaskServiceRpcChannel(session))

    write_request = task_service_pb.WriteTableRequest(
        lookupName=session.lookup_name,
        tableName=table.name,
        projectName=table.project.name,
    )
    write_response = service.WriteTable(rpc_controller, write_request, None)

    if rpc_controller.Failed():
        raise CupidError(rpc_controller.ErrorText())

    logger.info(
        "[CupidTask] writeTable call, CurrentInstanceId: %s, "
        "request: %s, response: %s",
        session.lookup_name, write_request, write_response,
    )

    session_kwargs = dict(
        session=session,
        table_name=table.name,
        project_name=table.project.name,
        handle=write_response.outputTableHandle,
    )
    return CupidTableUploadSession(**session_kwargs)
def query_table_meta(session, table):
    """Fetch and decode table metadata through the Cupid task service.

    Issues a ``GetTableMeta`` RPC with ``needContent=True`` and an empty
    ``uploadFile``, then parses the ``getTableMetaContent`` field of the
    response as JSON.

    Args:
        session: object providing ``lookup_name``; also used to build the
            task-service RPC channel.
        table: object providing ``name`` and ``project.name``.

    Returns:
        The value decoded by ``json.loads`` from the response content.

    Raises:
        CupidError: if the RPC controller reports a failure.
    """
    rpc_controller = CupidRpcController()
    service = task_service_pb.CupidTaskService_Stub(
        CupidTaskServiceRpcChannel(session))

    target = task_service_pb.TableInfo(
        projectName=table.project.name,
        tableName=table.name,
    )
    meta_request = task_service_pb.GetTableMetaRequest(
        lookupName=session.lookup_name,
        tableInfo=target,
        needContent=True,
        uploadFile='',
    )
    meta_response = service.GetTableMeta(rpc_controller, meta_request, None)

    if rpc_controller.Failed():
        raise CupidError(rpc_controller.ErrorText())

    logger.info(
        "[CupidTask] getTableMeta call, CurrentInstanceId: %s, "
        "request: %s, response: %s",
        session.lookup_name, meta_request, meta_response,
    )
    content = meta_response.getTableMetaContent
    return json.loads(content)
def commit(self):
    """Commit this writer's block file through the sandbox subprocess service.

    Sends a ``CommitTableFilesRequest`` containing a single commit action
    that maps the attempt file (``ATTEMPT_FILE_PREFIX + self._block_id``) to
    the commit file ``self._block_id`` under ``self._partition_spec``. The
    call is attempted up to ``options.retry_times`` times (at least once),
    with a 0.1 second pause between attempts.

    Raises:
        CupidError: if every attempt fails; carries the error text of the
            last failed attempt.
    """
    channel = SandboxRpcChannel()
    stub = subprocess_pb.CupidSubProcessService_Stub(channel)

    commit_actions = [
        subprocess_pb.CommitActionInfo(
            commitFileName=self._block_id,
            attemptFileName=ATTEMPT_FILE_PREFIX + self._block_id,
            partSpec=self._partition_spec,
        )
    ]
    req = subprocess_pb.CommitTableFilesRequest(
        outputTableHandle=self._handle,
        projectName=self._project_name,
        tableName=self._table_name,
        commitActionInfos=commit_actions,
    )

    # Always try at least once: a non-positive retry_times previously skipped
    # the RPC entirely and returned as if the commit had succeeded.
    attempts = max(1, options.retry_times)
    controller = CupidRpcController()
    for attempt in range(attempts):
        if attempt:
            # Only replace the controller when another attempt follows, so
            # the controller of the final failed attempt survives the loop.
            # (Previously it was replaced after every failure, so the check
            # below inspected an unused controller and never raised.)
            time.sleep(0.1)
            controller = CupidRpcController()
        stub.CommitTableFiles(controller, req, None)
        if not controller.Failed():
            break

    if controller.Failed():
        raise CupidError(controller.ErrorText())
def commit(self, overwrite=False):
    """Commit the uploaded blocks of this session through the task service.

    Collects the distinct non-``None`` partition values recorded in
    ``self._blocks`` (falling back to a single empty partition spec when
    there are none), strips single quotes from each, and sends them in a
    ``CommitTableRequest``. The call is attempted up to
    ``options.retry_times`` times (at least once), with a 0.1 second pause
    between attempts.

    Args:
        overwrite: forwarded as ``isOverWrite`` in the request.

    Raises:
        CupidError: if every attempt fails; carries the error text of the
            last failed attempt.
    """
    partitions = list(set(p for p in self._blocks.values() if p is not None))
    if not partitions:
        partitions = ['']

    channel = CupidTaskServiceRpcChannel(self._session)
    stub = task_service_pb.CupidTaskService_Stub(channel)

    part_specs = [pt.replace("'", '') for pt in partitions]
    req = task_service_pb.CommitTableRequest(
        outputTableHandle=self._handle,
        projectName=self._project_name,
        tableName=self._table_name,
        isOverWrite=overwrite,
        lookupName=self._session.lookup_name,
        partSpecs=part_specs,
    )

    # Always try at least once: a non-positive retry_times previously skipped
    # the RPC entirely and logged a "response" of None as if it had succeeded.
    attempts = max(1, options.retry_times)
    controller = CupidRpcController()
    resp = None
    for attempt in range(attempts):
        if attempt:
            # Only replace the controller when another attempt follows, so
            # the controller of the final failed attempt survives the loop.
            # (Previously it was replaced after every failure, so the check
            # below inspected an unused controller and never raised.)
            time.sleep(0.1)
            controller = CupidRpcController()
        resp = stub.CommitTable(controller, req, None)
        if not controller.Failed():
            break

    if controller.Failed():
        raise CupidError(controller.ErrorText())

    logger.info(
        "[CupidTask] commitTable call, CurrentInstanceId: %s, "
        "request: %s, response: %s",
        self._session.lookup_name, req, resp,
    )
def _register_writer(self, partition=None):
    """Register a table writer with the sandbox subprocess service.

    Builds a ``RegisterTableWriterRequest`` for this block's attempt file
    (``ATTEMPT_FILE_PREFIX + self._block_id``). The schema is sent as the
    string forms of the simple column types of ``self._table_schema``,
    each preceded by ``'|'``.

    Args:
        partition: a ``TablePartition`` (its ``spec`` is converted with
            ``str``), a partition spec string, or a falsy value for no
            partition. Single quotes are stripped from the spec.

    Returns:
        The ``subprocessWriteTableLabel`` field of the response.

    Raises:
        CupidError: if the RPC controller reports a failure.
    """
    if isinstance(partition, TablePartition):
        partition = str(partition.spec)

    controller = CupidRpcController()
    channel = SandboxRpcChannel()
    stub = subprocess_pb.CupidSubProcessService_Stub(channel)

    table_schema = self._table_schema
    schema_str = '|' + '|'.join(str(col.type) for col in table_schema.simple_columns)
    req = subprocess_pb.RegisterTableWriterRequest(
        outputTableHandle=self._handle,
        projectName=self._project_name,
        tableName=self._table_name,
        attemptFileName=ATTEMPT_FILE_PREFIX + self._block_id,
        partSpec=partition.replace("'", '') if partition else None,
        schema=schema_str,
    )

    resp = stub.RegisterTableWriter(controller, req, None)
    # Check the controller like every other RPC call in this file; without
    # this, a failed registration returned a label read from an invalid
    # response instead of surfacing the error.
    if controller.Failed():
        raise CupidError(controller.ErrorText())
    return resp.subprocessWriteTableLabel
def create_download_session(session, table_or_parts, split_size=None, split_count=None,
                            columns=None, with_split_meta=False):
    """Split tables/partitions for reading and build a download session.

    Issues ``SplitTables`` on the task service, then queries the sandbox
    subprocess service for the resulting splits (and, optionally, their
    metadata) and wraps each in a ``TableSplit``.

    Args:
        session: object providing ``lookup_name``; also used to build the
            task-service RPC channel.
        table_or_parts: a single ``Table`` / ``TablePartition``, or a list,
            tuple, set or generator of them.
        split_size: requested split size; integer-divided by ``1024**2``
            before being sent (presumably bytes to MiB -- confirm with the
            service contract). Defaults to ``1024**2`` when falsy.
        split_count: requested number of splits. When both ``split_size``
            and ``split_count`` are ``None`` it defaults to 1; otherwise a
            falsy value is sent as 0.
        columns: column names to read. When falsy, the schema names of the
            first table/partition encountered are used.
        with_split_meta: when true, also request split metadata via
            ``GetSplitsMeta``.

    Returns:
        A ``CupidTableDownloadSession`` holding the input table handle and
        the list of ``TableSplit`` objects.

    Raises:
        NotImplementedError: if an entry is neither a ``Table`` nor a
            ``TablePartition``.
        CupidError: if the ``SplitTables`` or ``GetSplits`` RPC fails.
    """
    channel = CupidTaskServiceRpcChannel(session)
    stub = task_service_pb.CupidTaskService_Stub(channel)

    if not isinstance(table_or_parts, (list, tuple, set, GeneratorType)):
        table_or_parts = [table_or_parts]

    if split_size is None and split_count is None:
        split_count = 1
    split_count = split_count or 0
    split_size = (split_size or 1024**2) // 1024**2

    table_pbs = []
    for t in table_or_parts:
        # NOTE: once `columns` is filled from the first entry it is reused
        # for every later entry, and is also what each TableSplit receives.
        if isinstance(t, Table):
            if not columns:
                columns = t.schema.names
            table_kw = dict(
                projectName=t.project.name,
                tableName=t.name,
                columns=','.join(columns),
            )
        elif isinstance(t, TablePartition):
            if not columns:
                columns = t.table.schema.names
            table_kw = dict(
                projectName=t.table.project.name,
                tableName=t.table.name,
                columns=','.join(columns),
                partSpec=str(t.partition_spec).replace("'", '').strip(),
            )
        else:
            raise NotImplementedError
        table_pbs.append(task_service_pb.TableInputInfo(**table_kw))

    request = task_service_pb.SplitTablesRequest(
        lookupName=session.lookup_name,
        splitSize=split_size,
        splitCount=split_count,
        tableInputInfos=table_pbs,
        allowNoColumns=True,
        requireSplitMeta=with_split_meta,
    )
    controller = CupidRpcController()
    resp = stub.SplitTables(controller, request, None)
    if controller.Failed():
        raise CupidError(controller.ErrorText())
    logger.info(
        "[CupidTask] splitTables call, CurrentInstanceId: %s, "
        "request: %s, response: %s",
        session.lookup_name, request, resp,
    )
    handle = resp.inputTableHandle

    # The remaining calls go to the sandbox subprocess service.
    channel = SandboxRpcChannel()
    stub = subprocess_pb.CupidSubProcessService_Stub(channel)

    if not with_split_meta:
        # An endless stream of None pairs up with every split in zip() below.
        split_meta = itertools.repeat(None)
    else:
        req = subprocess_pb.GetSplitsMetaRequest(inputTableHandle=handle)
        controller = CupidRpcController()
        resp = stub.GetSplitsMeta(controller, req, None)
        # Log the GetSplitsMeta request itself; this previously logged the
        # earlier SplitTables request by mistake.
        logger.info(
            "[CupidTask] getSplitsMeta call, CurrentInstanceId: %s, "
            "request: %s, response: %s",
            session.lookup_name, req, resp,
        )
        if controller.Failed():
            # Best effort: fall back to no metadata rather than failing.
            split_meta = itertools.repeat(None)
            logger.warning('Failed to get results of getSplitsMeta, '
                           'may running on an old service')
        else:
            split_meta = resp.inputSplitsMeta

    req = subprocess_pb.GetSplitsRequest(inputTableHandle=handle)
    controller = CupidRpcController()
    resp = stub.GetSplits(controller, req, None)
    if controller.Failed():
        raise CupidError(controller.ErrorText())

    input_splits = [
        TableSplit(split_proto=info, meta_proto=meta, handle=handle, columns=columns)
        for info, meta in zip(resp.inputSplits, split_meta)
    ]
    logger.info(
        "[SubProcess] getSplits call, CurrentInstanceId: %s, "
        "request: %s, response: %s",
        session.lookup_name, req, resp,
    )
    return CupidTableDownloadSession(session=session, handle=handle, splits=input_splits)