Пример #1
0
    def downloadJsonFile(self):
        """Validate the output repo for this job and record the download target.

        Queries the REPO metadata for the job, checks that the repo path
        exists on disk and that the job category is listed in the repo's
        consumerCategories, then stores the target directory and file name in
        leveldb under '<jobId>|datastream|workspace' and
        '<jobId>|datastream|outfile'.

        Raises:
            TaskError: if the repo path does not exist, or the job category is
                not listed in the repo's consumerCategories.
        """
        # the itemKey context is opposite, to check if the applied category
        # exists in registered consumerCategories
        repoMeta = self.query(
            Note({'eventKey': f'REPO|{self.jobId}', 'itemKey': 'jsonToCsv'}))

        repoRoot = f"{self._leveldb['apiBase']}/{repoMeta.sysPath}"
        if not os.path.exists(repoRoot):
            raise TaskError(f'output repo path does not exist : {repoRoot}')

        category = self.jmeta.category
        if category not in repoMeta.consumerCategories:
            raise TaskError(
                f'consumer category branch {category!s} does not exist in '
                f'{repoMeta.consumerCategories!s}')

        targetDir = f'{repoRoot}/{category}'
        logger.info(f'output json gzipfile repo path : {targetDir}')

        targetFile = f'{self.jobId}.{self.jmeta.fileExt}'
        logger.info(f'output json gzipfile : {targetFile}')

        # publish both values for the datastream reader, workspace first
        for suffix, value in (('workspace', targetDir),
                              ('outfile', targetFile)):
            self._leveldb[f'{self.jobId}|datastream|{suffix}'] = value
Пример #2
0
    async def readFile(self, jobId, taskNum):
        """Stream the job output file from the provider service, then gunzip it.

        Reads the target directory and file name from leveldb
        ('<jobId>|datastream|workspace' and '<jobId>|datastream|outfile'),
        asks the microservice connector to prepare the stream, writes the
        received chunks to '<workspace>/<outfile>' and finally calls
        uncompressFile on the result.

        Raises:
            TaskError: if the leveldb entries are missing, the workspace does
                not exist, stream preparation returns a status other than
                200/201, or writing/uncompressing the file fails. Where an
                underlying exception exists, it is attached as the cause.
        """
        try:
            workspace = self._leveldb[f'{jobId}|datastream|workspace']
            outfileName = self._leveldb[f'{jobId}|datastream|outfile']
        except KeyError as ex:
            errmsg = f'{jobId} workspace or filename info not found'
            logger.error(errmsg)
            raise TaskError(errmsg) from ex

        if not os.path.exists(workspace):
            errmsg = f'workspace {workspace} does not exist'
            raise TaskError(errmsg)

        outfilePath = f'{workspace}/{outfileName}'
        connector = await Microservice.connector(taskNum, self.name)
        status, response = await connector.prepare(jobId, taskNum)
        if status not in (200, 201):
            raise TaskError(
                f'{self.name}, datastream preparation failed : {response}')
        try:
            logger.info(
                f'{self.name}, about to read {outfileName} by datastream ...')
            with open(outfilePath, 'wb') as fhwb:
                async for chunk in connector.read():
                    fhwb.write(chunk)
            self.uncompressFile(workspace, outfileName)
        except Exception as ex:
            # This handler covers the stream read, the file write and the
            # gunzip step, so the generic message alone cannot tell them
            # apart; log the traceback and chain the cause to keep the real
            # failure reason visible.
            errmsg = f'failed writing outfile {outfilePath}'
            logger.error(errmsg, exc_info=True)
            raise TaskError(errmsg) from ex
Пример #3
0
    def prepareDownload(self):
        """Validate the output repo, create the job folder and record the target.

        Queries the REPO metadata for the job, checks that the repo path
        exists and that the job category is listed in the repo's
        consumerCategories, creates '<repo>/<category>/<jobId>' with
        'mkdir -p', then stores that directory and the output file name in
        leveldb under '<jobId>|datastream|workspace' and
        '<jobId>|datastream|outfile'.

        Raises:
            TaskError: if the repo path does not exist, the category is not
                registered, or the directory creation command fails.
        """
        # the itemKey context is opposite, to check if the applied category
        # exists in registered consumerCategories
        repoMeta = self.query(
            Note({'eventKey': f'REPO|{self.jobId}', 'itemKey': 'csvToJson'}))

        repoRoot = f"{self._leveldb['apiBase']}/{repoMeta.sysPath}"
        if not os.path.exists(repoRoot):
            raise TaskError(f'output repo path does not exist : {repoRoot}')

        category = self.jmeta.category
        if category not in repoMeta.consumerCategories:
            raise TaskError(
                f'consumer category branch {category!s} does not exist in '
                f'{repoMeta.consumerCategories!s}')

        targetDir = f'{repoRoot}/{category}/{self.jobId}'
        logger.info(f'output json file repo path : {targetDir}')

        try:
            self.sysCmd(['mkdir', '-p', targetDir])
        except TaskError:
            logger.error('output repo path creation failed')
            raise

        targetFile = f'{self.jobId}.{self.jmeta.fileExt}'
        logger.info(f'output csv tar gzipfile : {targetFile}')

        # publish both values for the datastream reader, workspace first
        for suffix, value in (('workspace', targetDir),
                              ('outfile', targetFile)):
            self._leveldb[f'{self.jobId}|datastream|{suffix}'] = value
Пример #4
0
 def resolve(self, packet):
   """Consume one resume signal and raise TaskComplete when none remain.

   The packet's fromKey must be one of the keys still held in
   self._signalFrom and its signal must be 201; the key is then removed
   from self._signalFrom.

   Raises:
     TaskComplete: if there was nothing to wait for, or this packet was
       the last outstanding signal.
     TaskError: if the sender is not registered or reported a signal
       other than 201.
   """
   pending = self._signalFrom
   if not pending:
     logger.info('ResumeHandler, state transition does not have a resume condition')
     raise TaskComplete

   logger.info('ResumeHandler, resolving resume status ...')
   sender = packet.fromKey
   if sender not in pending:
     raise TaskError(f'resume resolution failed, {sender} is not registered for this service')
   if packet.signal != 201:
     raise TaskError(
       f'resume resolution failed, {sender} actor {packet.caller} failed'
       f'\nresume packet : {packet.copy()}')

   # signal accepted, stop waiting for this sender
   pending.remove(sender)
   if not pending:
     raise TaskComplete
Пример #5
0
    async def connector(self, taskNum, owner):
        """Return the connector for a task, creating or restarting its service.

        Looks up 'task<taskNum>' in self.context. If a connector is cached it
        is returned, after first submitting a 'restart' request when taskNum
        is not yet in self.started. Otherwise a 'promote' request is
        submitted and a new DEALER connector is created against the
        'sockAddr' returned in the response and added to self.context.

        Args:
            taskNum: task number; also used to build the 'task<taskNum>' id.
            owner: name of the caller, passed through to self.submit and
                used in log messages.

        Returns:
            The cached or newly created connector.

        Raises:
            TaskError: if the restart or promote request returns a status
                other than 200/201.
        """
        logger.info(f'### get connector {taskNum} called by {owner}')
        taskId = f'task{taskNum}'
        connector = self.context.get(taskId)
        # identity test: a cached connector must never be judged by its own
        # __eq__/__ne__ implementation
        if connector is not None:
            logger.info(
                f'### {self.name}, connector {connector.cid} is cached')
            if taskNum in self.started:
                return connector
            logger.info(
                f'{self.name}, requesting {owner} provider service restart, taskId : {taskId} ...'
            )
            status, response = await self.submit(taskNum, 'restart', owner)
            if status not in (200, 201):
                raise TaskError(
                    f'{self.name}, {owner} provider service restart failed : {response}'
                )
            self.started.append(taskNum)
            logger.info(
                f'{self.name}, {owner} provider service {taskId} restart response : {response}'
            )
            return connector

        # provider service creation
        logger.info(
            f'{self.name}, requesting {owner} provider subscription, taskId : {taskId} ...'
        )
        # SubscriptionA promote protocol requires the ms-handler to return the service bind address
        # so that the microservice client context can connect to that address in connector creation
        status, response = await self.submit(taskNum, 'promote', owner)
        if status not in (200, 201):
            raise TaskError(
                f'{self.name}, {owner} provider subscription failed : {response}'
            )
        self.started.append(taskNum)
        logger.info(
            f'{self.name}, {owner} provider service {taskId} startup response : {response}'
        )

        # set connware sockAddr for client connector creation
        # this sets up an exclusive microservice data channel
        sockAddr = response['sockAddr']
        connware = Connware(sock=[zmq.DEALER, sockAddr],
                            sockopt={zmq.IDENTITY: taskId})
        connector = self.context.addConn(taskId, connware)
        logger.info(
            f'#### {self.name}, connector {connector.cid} added to cache')
        return connector
Пример #6
0
 def uncompressFile(self, workspace, outfileName):
     """Run 'gunzip <outfileName>' via self.sysCmd with workspace as cwd.

     Raises:
         TaskError: if the gunzip command raises any exception.
     """
     logger.info(f'{self.name}, extract by gunzip, {outfileName} ...')
     gunzipCmd = ['gunzip', outfileName]
     try:
         self.sysCmd(gunzipCmd, cwd=workspace)
     except Exception:
         failure = f'{self.name}, extract by gunzip failed, {outfileName}'
         logger.error(failure)
         raise TaskError(failure)
Пример #7
0
 async def request(self, method, packet, connector=None):
     """Send a [method, packet] request and, if synchronous, await the reply.

     Args:
         method: request method name sent to the peer.
         packet: request payload; its 'synchronous' entry decides whether a
             reply is awaited.
         connector: channel to use; falls back to self._request when falsy.

     Returns:
         The response for a synchronous request, otherwise None.

     Raises:
         TaskError: if a synchronous reply has a status other than 200/201.
     """
     channel = connector or self._request
     if method == 'terminate':
         logger.info(
             f'{self.name}, terminate called, connector : {channel.name}')
     await channel.send([method, packet], self.name)

     # fire-and-forget requests do not wait for a reply
     if not packet['synchronous']:
         return None

     status, response = await channel.recv()
     if status in (200, 201):
         return response
     raise TaskError(f'{self.name}, {method} failed : {response}')
Пример #8
0
 def headerCheck(self, header):
     """Compare the dataset header with self.columns and mark the check done.

     Outcomes when the header differs from self.columns:
       * columns required by self.columns are missing -> TaskError
       * the header only has extra columns -> accepted without a message
       * same column set but not equal as sequences -> warning is logged

     Args:
         header: the column names read from the actual dataset.

     Raises:
         TaskError: if any column in self.columns is absent from header.
     """
     logger.debug(f'{self.nodeName}, header check ...')
     if header != self.columns:
         specCols = set(self.columns)
         actualCols = set(header)
         if specCols ^ actualCols:
             missing = specCols - actualCols
             if missing:
                 errTxt = 'some required columns do NOT exist in the actual dataset'
                 errMsg = f'{self.nodeName}, {errTxt}\nMissing required columns : {missing}'
                 raise TaskError(errMsg)
         else:
             warnMsg = 'dataset column order does not match the column definition'
             # Logger.warn is a deprecated alias; warning() is the supported call
             logger.warning(f'{self.nodeName}, {warnMsg}')
     self._headerCheck = True
Пример #9
0
    def runActor(self, jobId, taskNum, keyHigh, **kwargs):
        """Write the csv file for the table mapped to taskNum.

        Reads the job workspace from self._hh, takes the (tableName, nodeList)
        pair for taskNum from the node tree's tableMap and has a CsvWriter
        write '<workspace>/<tableName>.csv'.

        Raises:
            TaskError: wrapping any exception raised along the way.
        """
        try:
            logger.info(f'### {self.name} is called ... ###')
            workDir = self._hh[f'{jobId}|workspace']
            self.nodeTree = TreeProvider.get()
            table, nodes = self.nodeTree.tableMap[taskNum]

            target = f'{workDir}/{table}.csv'
            CsvWriter.make(taskNum, table, keyHigh).writeAll(target, nodes)
        except Exception as failure:
            logger.error(f'actor {self.actorId} error', exc_info=True)
            raise TaskError(failure)
Пример #10
0
    def runActor(self, jobId, taskNum):
        """Arrange the actor for the task and run it against the output json file.

        Reads the workspace and json file name from self._hh
        ('<jobId>|workspace' and '<jobId>|output|jsonFile'), then calls
        self.arrange(jobId, taskNum) followed by
        self.run('<workspace>/<jsonFile>').

        Raises:
            TaskError: wrapping any exception raised along the way.
        """
        try:
            logger.info(f'{self.name} is called ... ###')
            workDir = self._hh[f'{jobId}|workspace']
            outName = self._hh[f'{jobId}|output|jsonFile']

            logger.info(f'### task workspace : {workDir}')
            logger.info(f'### output json file : {outName}')
            self.arrange(jobId, taskNum)
            self.run(f'{workDir}/{outName}')
        except Exception as failure:
            logger.error(f'{self.name}, actor {self.actorId} errored',
                         exc_info=True)
            raise TaskError(failure)
Пример #11
0
    def runActor(self, jobId, taskNum, *args, **kwargs):
        """Arrange the normaliser for taskNum and run it on the task's csv file.

        Reads the workspace from self._hh, calls self.arrange(taskNum), then
        runs self.run on '<workspace>/<self.tableName>.csv' after checking
        that the file exists.

        Raises:
            TaskError: wrapping any exception raised along the way, including
                the missing-input-file case; the original exception is
                attached as the cause.
        """
        try:
            logger.info(f'### CsvNormaliser {taskNum} is called ... ###')
            dbKey = f'{jobId}|workspace'
            workspace = self._hh[dbKey]
            logger.info(f'### normalise workspace : {workspace}')

            # presumably arrange() sets self.tableName for this task; verify
            self.arrange(taskNum)
            csvFile = f'{self.tableName}.csv'
            logger.info(
                f'### task|{taskNum:02} input csv file : {csvFile}')

            csvPath = f'{workspace}/{csvFile}'
            if not os.path.exists(csvPath):
                # report the file that was actually checked, not the
                # unrelated self.csvFile attribute
                errmsg = f'{csvFile} does not exist in workspace'
                raise Exception(errmsg)
            self.run(csvPath)
        except Exception as ex:
            logger.error(f'{self.name}, actor {self.actorId} errored',
                         exc_info=True)
            raise TaskError(ex) from ex
Пример #12
0
    def runActor(self, jobId, taskNum, *args, **kwargs):
        """Arrange the normaliser for taskNum and run it on the task's xml file.

        Reads the workspace and the task's xml file name from self._hh
        ('<jobId>|workspace' and '<jobId>|XFORM|input|<taskNum>|xmlFile'),
        checks that the file exists in the workspace, then calls
        self.arrange(taskNum) and self.run(path, fileName).

        Raises:
            TaskError: wrapping any exception raised along the way, including
                the missing-input-file case.
        """
        try:
            logger.info(f'### XmlNormaliser {taskNum} is called ... ###')
            workDir = self._hh[f'{jobId}|workspace']
            srcName = self._hh[f'{jobId}|XFORM|input|{taskNum}|xmlFile']

            logger.info(f'### normalise workspace : {workDir}')
            logger.info(f'### task|{taskNum:02} input xml file : {srcName}')

            srcPath = f'{workDir}/{srcName}'
            if os.path.exists(srcPath):
                self.arrange(taskNum)
                self.run(srcPath, srcName)
            else:
                raise Exception(f'{srcName} does not exist in workspace')
        except Exception as failure:
            logger.error(f'{self.name}, actor {self.actorId} errored',
                         exc_info=True)
            raise TaskError(failure)
Пример #13
0
  async def _PREPARE(self, jobId, taskId):
    """Locate the job's input file and reply with its readiness status.

    Reads the workspace and input file name from the job's hardhash store
    ('<jobId>|workspace' and '<jobId>|datastream|infile'), records
    self.jobId, self.infileName and self.infilePath, and replies over
    self._conn with [200, ...] when the file exists or [500, ...] when it
    does not.

    Raises:
      TaskError: if either hardhash entry is missing (a 500 reply is sent
        first).
    """
    self.jobId = jobId
    logger.info(f'{self.name}, job {jobId}, preparing {taskId} data stream ...')
    store = HardhashContext.connector(jobId)
    try:
      workDir = store[f'{jobId}|workspace']
      self.infileName = store[f'{jobId}|datastream|infile']
    except KeyError:
      failure = f'{jobId}, failed to get job article from datastorage'
      await self._conn.sendReply([500, {'error': failure}])
      raise TaskError(failure)

    logger.info(f'{self.name}, datastream workspace, infile : {workDir}, {self.infileName}')
    self.infilePath = f'{workDir}/{self.infileName}'
    if os.path.exists(self.infilePath):
      reply = [200, {'status':'ready','infile':f'{self.infileName}'}]
    else:
      reply = [500, {'error': f'source file {self.infileName} does not exist in workspace'}]
    await self._conn.sendReply(reply)
Пример #14
0
    def evalSysStatus(self):
        """Validate the job inputs and stage the input archive in a new workspace.

        Checks that the source repo path, the job category, the input zipfile
        and the workspace base all exist, creates a timestamp-named session
        workspace, copies the zipfile into it and extracts it with
        'tar -xzf'. On success the workspace path is stored in leveldb under
        '<jobId>|workspace' and in self.workspace.

        Raises:
            TaskError: if any path or category check fails, or if one of the
                mkdir / cp / tar commands fails.
        """
        repoMeta = self.query(
            Note({'eventKey': f'REPO|{self.jobId}', 'itemKey': 'csvToJson'}))

        apiRoot = self._leveldb['apiBase']
        repoRoot = f'{apiRoot}/{repoMeta.sysPath}'
        if not os.path.exists(repoRoot):
            raise TaskError(f'xform input path does not exist : {repoRoot}')

        category = self.jmeta.category
        if category not in repoMeta.consumerCategories:
            raise TaskError(
                f'consumer category branch {category!s} does not exist under '
                f'{repoMeta.consumerCategories!s}')

        sourceDir = f'{repoRoot}/{category}'
        logger.info(f'input zipfile repo path : {sourceDir}')

        archiveName = f'{self.jobId}.{self.jmeta.fileExt}'
        logger.info(f'input zipfile : {archiveName}')

        archivePath = f'{sourceDir}/{archiveName}'
        if not os.path.exists(archivePath):
            raise TaskError('xform input zipfile does not exist in source repo')

        workRoot = f'{apiRoot}/{self.jmeta.workspace}'
        if not os.path.exists(workRoot):
            raise TaskError(
                f'xform workspace path does not exist : {workRoot}')

        # session workspace is named after the current local timestamp
        sessionDir = f"{workRoot}/{datetime.now().strftime('%y%m%d%H%M%S')}"
        logger.info(f'session workspace : {sessionDir}')
        logger.info('creating session workspace ... ')

        # each step: (command, extra sysCmd options, message logged on failure)
        steps = (
            (['mkdir', '-p', sessionDir], {},
             f'{self.jobId}, workspace creation failed'),
            (['cp', archivePath, sessionDir], {},
             f'zipfile copy to workspace failed : {archivePath}'),
            (['tar', '-xzf', archiveName], {'cwd': sessionDir},
             f'{archiveName}, gunzip tar extract command failed'),
        )
        for command, options, failure in steps:
            try:
                self.sysCmd(command, **options)
            except TaskError:
                logger.error(failure)
                raise

        # put workspace path in storage for micro-service access
        self._leveldb[f'{self.jobId}|workspace'] = sessionDir
        self.workspace = sessionDir
Пример #15
0
    def evalSysStatus(self):
        """Validate the job inputs and stage the xml input in a new workspace.

        Checks that the source repo path, the job category, the xml input
        file and the workspace base all exist, creates a timestamp-named
        session workspace and copies the xml file into it. Files of up to
        2000 lines are processed as a single task; larger files are split
        with 'split -l' into self.jobRange parts whose names are registered
        via self.putSplitFilename. On success the workspace path is stored in
        leveldb under '<jobId>|workspace' and in self.workspace.

        Raises:
            TaskError: if any path or category check fails, or if one of the
                mkdir / cp / split commands fails.
        """
        jpacket = {'eventKey': f'REPO|{self.jobId}', 'itemKey': 'xmlToCsv'}
        repo = self.query(Note(jpacket))

        apiBase = self._leveldb['apiBase']
        sysPath = f'{apiBase}/{repo.sysPath}'
        if not os.path.exists(sysPath):
            errmsg = f'xform input path does not exist : {sysPath}'
            raise TaskError(errmsg)

        catPath = self.jmeta.category
        if catPath not in repo.consumerCategories:
            errmsg = 'consumer category branch %s does not exist in %s' \
                                      % (catPath, str(repo.consumerCategories))
            raise TaskError(errmsg)

        repoPath = f'{sysPath}/{catPath}'
        logger.info('xml input file repo path : ' + repoPath)

        inputXmlFile = f'{self.jobId}.{self.jmeta.fileExt}'
        logger.info('xml input file : ' + inputXmlFile)

        xmlFilePath = f'{repoPath}/{inputXmlFile}'
        if not os.path.exists(xmlFilePath):
            errmsg = 'xform xml input file does not exist in source repo'
            raise TaskError(errmsg)

        workbase = f'{apiBase}/{self.jmeta.workspace}'
        if not os.path.exists(workbase):
            errmsg = f'xform workspace path does not exist : {workbase}'
            raise TaskError(errmsg)

        # session workspace is named after the current local timestamp
        tsXref = datetime.now().strftime('%y%m%d%H%M%S')

        workspace = f'{workbase}/{tsXref}'
        logger.info('session workspace : ' + workspace)
        logger.info('creating session workspace ... ')

        try:
            self.sysCmd(['mkdir', '-p', workspace])
        except TaskError:
            logger.error(f'{self.jobId}, workspace creation failed')
            raise

        try:
            cmdArgs = ['cp', xmlFilePath, workspace]
            self.sysCmd(cmdArgs)
        except TaskError:
            logger.error(f'copy to workspace failed : {inputXmlFile}')
            raise

        # from here on work with the copy inside the session workspace
        xmlFilePath = f'{workspace}/{inputXmlFile}'
        lineCount = getLineCount(xmlFilePath)
        # files with at most this many lines are processed as a single task
        splitThreshold = 2000
        if lineCount <= splitThreshold:
            # the comparison is inclusive, so the message must say "<="
            logMsg = (f'file split not required, line count : '
                      f'{lineCount} <= {splitThreshold}')
            logger.info(f'{self.jobId}, {logMsg}')
            self.jobRange = 1
            dbKey = f'{self.jobId}|XFORM|input|1|xmlFile'
            self._leveldb[dbKey] = inputXmlFile
        else:
            self.jobRange = 2
            splitSize = int(math.ceil(lineCount / self.jobRange))
            # round up to the nearest 50
            #splitSize = int(math.ceil(splitSize / 50.0)) * 50
            logger.info(
                f'{self.jobId}, line count, split size : {lineCount}, {splitSize}'
            )
            try:
                # the job id is passed to split as the output file name prefix
                splitFileName = self.jobId
                cmdArgs = [
                    'split', '-l',
                    str(splitSize), inputXmlFile, splitFileName
                ]
                self.sysCmd(cmdArgs, cwd=workspace)
            except TaskError:
                logger.error(f'{inputXmlFile}, split command failed')
                raise

            for i in range(1, self.jobRange + 1):
                self.putSplitFilename(i)

        # put workspace path in storage for micro-service access
        dbKey = f'{self.jobId}|workspace'
        self._leveldb[dbKey] = workspace
        self.workspace = workspace
Пример #16
0
 def make(cls, contextId=None):
     """Build an instance backed by a LeveldbDatasource for contextId.

     Args:
         contextId: identifier passed to LeveldbDatasource.make and to the
             class constructor; must be truthy.

     Returns:
         A new cls(contextId, datasource) instance.

     Raises:
         TaskError: if contextId is missing or empty.
     """
     if not contextId:
         # there is no instance here, so name the class itself; the message
         # must be an f-string for the placeholder to be filled in
         raise TaskError(
             f'{cls.__name__}, cannot make context datasource without an id')
     datasource = LeveldbDatasource.make(contextId)
     return cls(contextId, datasource)