def downloadJsonFile(self):
    # the itemKey context is opposite, to check if the applied category
    # exists in registered consumerCategories
    jpacket = {'eventKey': f'REPO|{self.jobId}', 'itemKey': 'jsonToCsv'}
    repo = self.query(Note(jpacket))

    apiBase = self._leveldb['apiBase']
    sysPath = f'{apiBase}/{repo.sysPath}'
    if not os.path.exists(sysPath):
        errmsg = f'output repo path does not exist : {sysPath}'
        raise TaskError(errmsg)

    catPath = self.jmeta.category
    if catPath not in repo.consumerCategories:
        errmsg = 'consumer category branch %s does not exist in %s' \
            % (catPath, str(repo.consumerCategories))
        raise TaskError(errmsg)

    repoPath = f'{sysPath}/{catPath}'
    logger.info('output json gzipfile repo path : ' + repoPath)

    jsonZipfile = f'{self.jobId}.{self.jmeta.fileExt}'
    logger.info('output json gzipfile : ' + jsonZipfile)

    dbKey = f'{self.jobId}|datastream|workspace'
    self._leveldb[dbKey] = repoPath
    dbKey = f'{self.jobId}|datastream|outfile'
    self._leveldb[dbKey] = jsonZipfile
async def readFile(self, jobId, taskNum):
    try:
        dbkey = f'{jobId}|datastream|workspace'
        workspace = self._leveldb[dbkey]
        dbkey = f'{jobId}|datastream|outfile'
        outfileName = self._leveldb[dbkey]
    except KeyError as ex:
        errmsg = f'{jobId} workspace or filename info not found'
        logger.error(errmsg)
        raise TaskError(errmsg)

    if not os.path.exists(workspace):
        errmsg = f'workspace {workspace} does not exist'
        raise TaskError(errmsg)

    outfilePath = f'{workspace}/{outfileName}'
    connector = await Microservice.connector(taskNum, self.name)
    status, response = await connector.prepare(jobId, taskNum)
    if status not in (200, 201):
        raise TaskError(
            f'{self.name}, datastream preparation failed : {response}')

    try:
        logger.info(
            f'{self.name}, about to read {outfileName} by datastream ...')
        with open(outfilePath, 'wb') as fhwb:
            async for chunk in connector.read():
                fhwb.write(chunk)
        self.uncompressFile(workspace, outfileName)
    except Exception as ex:
        errmsg = f'failed writing outfile {outfilePath}'
        logger.error(errmsg)
        raise TaskError(errmsg)
def prepareDownload(self):
    # the itemKey context is opposite, to check if the applied category
    # exists in registered consumerCategories
    jpacket = {'eventKey': f'REPO|{self.jobId}', 'itemKey': 'csvToJson'}
    repo = self.query(Note(jpacket))

    apiBase = self._leveldb['apiBase']
    sysPath = f'{apiBase}/{repo.sysPath}'
    if not os.path.exists(sysPath):
        errmsg = f'output repo path does not exist : {sysPath}'
        raise TaskError(errmsg)

    catPath = self.jmeta.category
    if catPath not in repo.consumerCategories:
        errmsg = 'consumer category branch %s does not exist in %s' \
            % (catPath, str(repo.consumerCategories))
        raise TaskError(errmsg)

    repoPath = f'{sysPath}/{catPath}/{self.jobId}'
    logger.info('output json file repo path : ' + repoPath)
    try:
        self.sysCmd(['mkdir', '-p', repoPath])
    except TaskError as ex:
        logger.error('output repo path creation failed')
        raise

    csvGZipfile = f'{self.jobId}.{self.jmeta.fileExt}'
    logger.info('output csv tar gzipfile : ' + csvGZipfile)

    dbKey = f'{self.jobId}|datastream|workspace'
    self._leveldb[dbKey] = repoPath
    dbKey = f'{self.jobId}|datastream|outfile'
    self._leveldb[dbKey] = csvGZipfile
def resolve(self, packet):
    if not self._signalFrom:
        logger.info('ResumeHandler, state transition does not have a resume condition')
        raise TaskComplete
    logger.info('ResumeHandler, resolving resume status ...')
    if packet.fromKey not in self._signalFrom:
        raise TaskError(
            f'resume resolution failed, {packet.fromKey} is not registered for this service')
    if packet.signal != 201:
        errMsg = f'resume resolution failed, {packet.fromKey} actor {packet.caller} failed'
        raise TaskError(f'{errMsg}\nresume packet : {packet.copy()}')
    self._signalFrom.remove(packet.fromKey)
    if not self._signalFrom:
        raise TaskComplete
async def connector(self, taskNum, owner):
    logger.info(f'### get connector {taskNum} called by {owner}')
    taskId = f'task{taskNum}'
    connector = self.context.get(taskId)
    if connector is not None:
        logger.info(
            f'### {self.name}, connector {connector.cid} is cached')
        if taskNum in self.started:
            return connector
        logger.info(
            f'{self.name}, requesting {owner} provider service restart, taskId : {taskId} ...')
        status, response = await self.submit(taskNum, 'restart', owner)
        if status not in (200, 201):
            raise TaskError(
                f'{self.name}, {owner} provider service restart failed : {response}')
        self.started.append(taskNum)
        logger.info(
            f'{self.name}, {owner} provider service {taskId} restart response : {response}')
        return connector

    # provider service creation
    logger.info(
        f'{self.name}, requesting {owner} provider subscription, taskId : {taskId} ...')
    # SubscriptionA promote protocol requires the ms-handler to return the service bind address
    # so that the microservice client context can connect to that address in connector creation
    status, response = await self.submit(taskNum, 'promote', owner)
    if status not in (200, 201):
        raise TaskError(
            f'{self.name}, {owner} provider subscription failed : {response}')
    self.started.append(taskNum)
    logger.info(
        f'{self.name}, {owner} provider service {taskId} startup response : {response}')

    # set connware sockAddr for client connector creation
    # this sets up an exclusive microservice data channel
    sockAddr = response['sockAddr']
    connware = Connware(sock=[zmq.DEALER, sockAddr], sockopt={zmq.IDENTITY: taskId})
    connector = self.context.addConn(taskId, connware)
    logger.info(
        f'#### {self.name}, connector {connector.cid} added to cache')
    return connector
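# --- illustrative sketch, not part of the task code ---
# The promote handshake above returns a provider bind address; Connware then wires a
# DEALER socket with a per-task IDENTITY into the client context, giving each task an
# exclusive data channel. A minimal pyzmq sketch of that DEALER/ROUTER pairing follows;
# the address, identity and payloads are assumptions for illustration only.
import zmq

def dealerRouterSketch(sockAddr='tcp://127.0.0.1:5560'):
    ctx = zmq.Context.instance()

    # provider side : a ROUTER socket receives [identity, frames...] and can
    # address each task channel independently in its replies
    router = ctx.socket(zmq.ROUTER)
    router.bind(sockAddr)

    # client side : one DEALER per task, identified so the provider can route replies
    dealer = ctx.socket(zmq.DEALER)
    dealer.setsockopt(zmq.IDENTITY, b'task1')
    dealer.connect(sockAddr)

    dealer.send_multipart([b'prepare', b'{"jobId": "J0001"}'])
    identity, method, payload = router.recv_multipart()
    router.send_multipart([identity, b'200', b'{"status": "ready"}'])
    return dealer.recv_multipart()          # [b'200', b'{"status": "ready"}']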
def uncompressFile(self, workspace, outfileName):
    logger.info(f'{self.name}, extract by gunzip, {outfileName} ...')
    try:
        self.sysCmd(['gunzip', outfileName], cwd=workspace)
    except Exception as ex:
        errmsg = f'{self.name}, extract by gunzip failed, {outfileName}'
        logger.error(errmsg)
        raise TaskError(errmsg)
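# --- illustrative sketch, not part of the task code ---
# The task shells out to gunzip via sysCmd. For reference, an in-process equivalent
# using the standard gzip module; the helper name and the .gz suffix assumption are
# illustrative only.
import gzip
import os
import shutil

def gunzipInPlace(workspace, outfileName):
    srcPath = os.path.join(workspace, outfileName)
    dstPath, ext = os.path.splitext(srcPath)
    if ext != '.gz':
        raise ValueError(f'{outfileName} is not a .gz archive')
    with gzip.open(srcPath, 'rb') as fhr, open(dstPath, 'wb') as fhw:
        shutil.copyfileobj(fhr, fhw)
    os.remove(srcPath)      # mimic gunzip default : replace the archive with the extracted file
    return dstPath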
async def request(self, method, packet, connector=None):
    if not connector:
        connector = self._request
    if method == 'terminate':
        logger.info(
            f'{self.name}, terminate called, connector : {connector.name}')
    await connector.send([method, packet], self.name)
    if packet['synchronous']:
        status, response = await connector.recv()
        if status not in (200, 201):
            raise TaskError(f'{self.name}, {method} failed : {response}')
        return response
def headerCheck(self, header):
    logger.debug(f'{self.nodeName}, header check ...')
    columnsDs = header
    if columnsDs != self.columns:
        if set(columnsDs) ^ set(self.columns):
            specOnly = set(self.columns) - set(columnsDs)
            if specOnly:
                errTxt = 'some required columns do NOT exist in the actual dataset'
                errMsg = f'{self.nodeName}, {errTxt}\nMissing required columns : {specOnly}'
                raise TaskError(errMsg)
        else:
            warnMsg = 'dataset column order does not match the column definition'
            logger.warning(f'{self.nodeName}, {warnMsg}')
    self._headerCheck = True
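# --- worked example, not part of the task code ---
# How the header check above behaves for two hypothetical column lists : the same
# members in a different order only warns, a missing required column raises TaskError.
specColumns = ['id', 'name', 'price']                 # self.columns equivalent
datasetHeader = ['name', 'id', 'price']               # columnsDs equivalent

print(set(datasetHeader) ^ set(specColumns))          # set()  -> order differs, warn only
print(set(specColumns) - set(datasetHeader))          # set()  -> no required column missing

datasetHeader = ['id', 'name']                        # 'price' column is absent
print(set(specColumns) - set(datasetHeader))          # {'price'} -> TaskError path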
def runActor(self, jobId, taskNum, keyHigh, **kwargs):
    try:
        logger.info(f'### {self.name} is called ... ###')
        dbKey = f'{jobId}|workspace'
        workspace = self._hh[dbKey]

        self.nodeTree = TreeProvider.get()
        tableName, nodeList = self.nodeTree.tableMap[taskNum]
        csvPath = f'{workspace}/{tableName}.csv'
        writer = CsvWriter.make(taskNum, tableName, keyHigh)
        writer.writeAll(csvPath, nodeList)
    except Exception as ex:
        logger.error(f'actor {self.actorId} error', exc_info=True)
        raise TaskError(ex)
def runActor(self, jobId, taskNum):
    try:
        logger.info(f'### {self.name} is called ... ###')
        dbKey = f'{jobId}|workspace'
        workspace = self._hh[dbKey]
        dbKey = f'{jobId}|output|jsonFile'
        jsonFile = self._hh[dbKey]
        jsonPath = f'{workspace}/{jsonFile}'
        logger.info(f'### task workspace : {workspace}')
        logger.info(f'### output json file : {jsonFile}')

        self.arrange(jobId, taskNum)
        self.run(jsonPath)
    except Exception as ex:
        logger.error(f'{self.name}, actor {self.actorId} errored', exc_info=True)
        raise TaskError(ex)
def runActor(self, jobId, taskNum, *args, **kwargs):
    try:
        logger.info(f'### CsvNormaliser {taskNum} is called ... ###')
        dbKey = f'{jobId}|workspace'
        workspace = self._hh[dbKey]
        logger.info(f'### normalise workspace : {workspace}')

        self.arrange(taskNum)
        logger.info(
            f'### task|{taskNum:02} input csv file : {self.tableName}.csv')
        csvPath = f'{workspace}/{self.tableName}.csv'
        if not os.path.exists(csvPath):
            errmsg = f'{self.tableName}.csv does not exist in workspace'
            raise Exception(errmsg)

        self.run(csvPath)
    except Exception as ex:
        logger.error(f'{self.name}, actor {self.actorId} errored', exc_info=True)
        raise TaskError(ex)
def runActor(self, jobId, taskNum, *args, **kwargs):
    try:
        logger.info(f'### XmlNormaliser {taskNum} is called ... ###')
        dbKey = f'{jobId}|workspace'
        workspace = self._hh[dbKey]
        dbKey = f'{jobId}|XFORM|input|{taskNum}|xmlFile'
        xmlFile = self._hh[dbKey]
        logger.info(f'### normalise workspace : {workspace}')
        logger.info(f'### task|{taskNum:02} input xml file : {xmlFile}')
        xmlPath = f'{workspace}/{xmlFile}'
        if not os.path.exists(xmlPath):
            errmsg = f'{xmlFile} does not exist in workspace'
            raise Exception(errmsg)

        self.arrange(taskNum)
        self.run(xmlPath, xmlFile)
    except Exception as ex:
        logger.error(f'{self.name}, actor {self.actorId} errored', exc_info=True)
        raise TaskError(ex)
async def _PREPARE(self, jobId, taskId):
    self.jobId = jobId
    logger.info(f'{self.name}, job {jobId}, preparing {taskId} data stream ...')
    hardhash = HardhashContext.connector(jobId)
    try:
        dbKey = f'{jobId}|workspace'
        workspace = hardhash[dbKey]
        dbKey = f'{jobId}|datastream|infile'
        self.infileName = hardhash[dbKey]
    except KeyError as ex:
        errmsg = f'{jobId}, failed to get job article from datastorage'
        await self._conn.sendReply([500, {'error': errmsg}])
        raise TaskError(errmsg)

    logger.info(f'{self.name}, datastream workspace, infile : {workspace}, {self.infileName}')
    self.infilePath = f'{workspace}/{self.infileName}'
    if not os.path.exists(self.infilePath):
        errmsg = f'source file {self.infileName} does not exist in workspace'
        await self._conn.sendReply([500, {'error': errmsg}])
    else:
        await self._conn.sendReply([200, {'status': 'ready', 'infile': f'{self.infileName}'}])
def evalSysStatus(self):
    jpacket = {'eventKey': f'REPO|{self.jobId}', 'itemKey': 'csvToJson'}
    repo = self.query(Note(jpacket))

    apiBase = self._leveldb['apiBase']
    sysPath = f'{apiBase}/{repo.sysPath}'
    if not os.path.exists(sysPath):
        errmsg = f'xform input path does not exist : {sysPath}'
        raise TaskError(errmsg)

    catPath = self.jmeta.category
    if catPath not in repo.consumerCategories:
        errmsg = 'consumer category branch %s does not exist under %s' \
            % (catPath, str(repo.consumerCategories))
        raise TaskError(errmsg)

    repoPath = f'{sysPath}/{catPath}'
    logger.info('input zipfile repo path : ' + repoPath)

    inputZipFile = f'{self.jobId}.{self.jmeta.fileExt}'
    logger.info('input zipfile : ' + inputZipFile)

    zipFilePath = f'{repoPath}/{inputZipFile}'
    if not os.path.exists(zipFilePath):
        errmsg = 'xform input zipfile does not exist in source repo'
        raise TaskError(errmsg)

    workbase = f'{apiBase}/{self.jmeta.workspace}'
    if not os.path.exists(workbase):
        errmsg = f'xform workspace path does not exist : {workbase}'
        raise TaskError(errmsg)

    tsXref = datetime.now().strftime('%y%m%d%H%M%S')
    workspace = f'{workbase}/{tsXref}'
    logger.info('session workspace : ' + workspace)
    logger.info('creating session workspace ... ')
    try:
        cmdArgs = ['mkdir', '-p', workspace]
        self.sysCmd(cmdArgs)
    except TaskError as ex:
        logger.error(f'{self.jobId}, workspace creation failed')
        raise

    try:
        self.sysCmd(['cp', zipFilePath, workspace])
    except TaskError as ex:
        logger.error(f'zipfile copy to workspace failed : {zipFilePath}')
        raise

    try:
        cmdArgs = ['tar', '-xzf', inputZipFile]
        self.sysCmd(cmdArgs, cwd=workspace)
    except TaskError as ex:
        logger.error(f'{inputZipFile}, gunzip tar extract command failed')
        raise

    # put workspace path in storage for micro-service access
    dbKey = f'{self.jobId}|workspace'
    self._leveldb[dbKey] = workspace
    self.workspace = workspace
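# --- illustrative sketch, not part of the task code ---
# The workspace preparation above copies the zipfile then runs 'tar -xzf' via sysCmd.
# An equivalent in-process step with the standard tarfile module would look like this;
# the helper name and path handling are assumptions for illustration.
import tarfile

def extractTarGz(workspace, inputZipFile):
    with tarfile.open(f'{workspace}/{inputZipFile}', 'r:gz') as archive:
        archive.extractall(path=workspace)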
def evalSysStatus(self):
    jpacket = {'eventKey': f'REPO|{self.jobId}', 'itemKey': 'xmlToCsv'}
    repo = self.query(Note(jpacket))

    apiBase = self._leveldb['apiBase']
    sysPath = f'{apiBase}/{repo.sysPath}'
    if not os.path.exists(sysPath):
        errmsg = f'xform input path does not exist : {sysPath}'
        raise TaskError(errmsg)

    catPath = self.jmeta.category
    if catPath not in repo.consumerCategories:
        errmsg = 'consumer category branch %s does not exist in %s' \
            % (catPath, str(repo.consumerCategories))
        raise TaskError(errmsg)

    repoPath = f'{sysPath}/{catPath}'
    logger.info('xml input file repo path : ' + repoPath)

    inputXmlFile = f'{self.jobId}.{self.jmeta.fileExt}'
    logger.info('xml input file : ' + inputXmlFile)

    xmlFilePath = f'{repoPath}/{inputXmlFile}'
    if not os.path.exists(xmlFilePath):
        errmsg = 'xform xml input file does not exist in source repo'
        raise TaskError(errmsg)

    workbase = f'{apiBase}/{self.jmeta.workspace}'
    if not os.path.exists(workbase):
        errmsg = f'xform workspace path does not exist : {workbase}'
        raise TaskError(errmsg)

    tsXref = datetime.now().strftime('%y%m%d%H%M%S')
    workspace = f'{workbase}/{tsXref}'
    logger.info('session workspace : ' + workspace)
    logger.info('creating session workspace ... ')
    try:
        self.sysCmd(['mkdir', '-p', workspace])
    except TaskError as ex:
        logger.error(f'{self.jobId}, workspace creation failed')
        raise

    try:
        cmdArgs = ['cp', xmlFilePath, workspace]
        self.sysCmd(cmdArgs)
    except TaskError as ex:
        logger.error(f'copy to workspace failed : {inputXmlFile}')
        raise

    xmlFilePath = f'{workspace}/{inputXmlFile}'
    lineCount = getLineCount(xmlFilePath)
    if lineCount <= 2000:
        logMsg = f'file split not required, line count : {lineCount} < 2000'
        logger.info(f'{self.jobId}, {logMsg}')
        self.jobRange = 1
        dbKey = f'{self.jobId}|XFORM|input|1|xmlFile'
        self._leveldb[dbKey] = inputXmlFile
    else:
        self.jobRange = 2
        splitSize = int(math.ceil(lineCount / self.jobRange))
        # round up to the nearest 50
        #splitSize = int(math.ceil(splitSize / 50.0)) * 50
        logger.info(
            f'{self.jobId}, line count, split size : {lineCount}, {splitSize}')
        try:
            splitFileName = self.jobId
            cmdArgs = ['split', '-l', str(splitSize), inputXmlFile, splitFileName]
            self.sysCmd(cmdArgs, cwd=workspace)
        except TaskError as ex:
            logger.error(f'{inputXmlFile}, split command failed')
            raise
        for i in range(1, self.jobRange + 1):
            self.putSplitFilename(i)

    # put workspace path in storage for micro-service access
    dbKey = f'{self.jobId}|workspace'
    self._leveldb[dbKey] = workspace
    self.workspace = workspace
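# --- worked example, not part of the task code ---
# The split-size arithmetic above with hypothetical numbers : 5001 lines across a
# jobRange of 2 gives 2501 lines per split file; the rounding step kept commented out
# in the task would lift that to the nearest 50.
import math

lineCount, jobRange = 5001, 2
splitSize = int(math.ceil(lineCount / jobRange))        # 2501
roundedSize = int(math.ceil(splitSize / 50.0)) * 50     # 2550
# shell equivalent : split -l 2501 <inputXmlFile> <jobId>
# GNU split default suffixes produce <jobId>aa, <jobId>ab, ...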
def make(cls, contextId=None):
    if not contextId:
        raise TaskError(
            f'{cls.__name__}, cannot make context datasource without a contextId')
    datasource = LeveldbDatasource.make(contextId)
    return cls(contextId, datasource)