def calculate(self, ctx, input):
    # ctx is the context object
    # return variables are: output
    #BEGIN calculate
    ''' Compute reaction probabilities from a probabilistic annotation.

        The input dictionary must contain the following keys:
        probanno: Name of ProbAnno object to input
        probanno_workspace: Workspace from which to grab the ProbAnno object
        rxnprobs: Name of RxnProbs object
        rxnprobs_workspace: Workspace to which to save the RxnProbs object

        The following keys are optional:
        verbose: Print lots of messages on the progress of the algorithm
        template_model: Name of TemplateModel object
        template_workspace: Workspace from which to grab TemplateModel object

        @param ctx Current context object
        @param input Dictionary with input parameters for function
        @return Object info for RxnProbs object
        @raise WrongVersionError when ProbAnno object version number is invalid
        @raise ValueError when template_workspace input argument is not specified
    '''

    # Sanity check on input arguments
    input = self._checkInputArguments(ctx, input,
                                      ["probanno", "probanno_workspace", "rxnprobs", "rxnprobs_workspace"],
                                      { "verbose" : False,
                                        "template_model" : None,
                                        "template_workspace" : None
                                      })

    # Make sure the static database files are ready.
    self._checkDatabaseFiles(ctx)

    # Set log level to INFO when verbose parameter is enabled.
    if input['verbose']:
        ctx.set_log_level(log.DEBUG)

    # Create a workspace client.
    wsClient = Workspace(self.config["workspace_url"], token=ctx['token'])

    # Get the ProbAnno object from the specified workspace and confirm it has
    # the expected type before using its data.
    probannoObjectId = make_object_identity(input["probanno_workspace"], input["probanno"])
    objectList = wsClient.get_objects( [ probannoObjectId ] )
    probannoObject = objectList[0]
    if probannoObject['info'][2] != ProbAnnoType:
        message = "ProbAnno object type %s is not %s for object %s" %(probannoObject['info'][2], ProbAnnoType, probannoObject['info'][1])
        ctx.log_err(message)
        raise WrongVersionError(message)
    genome = probannoObject["data"]["genome"]

    # Create a temporary directory for storing intermediate files when debug is turned on.
    if ctx.get_log_level() >= log.DEBUG2:
        workFolder = tempfile.mkdtemp("", "calculate-%s-" %(genome), self.config["work_folder_path"])
        ctx.log_debug('Intermediate files saved in '+workFolder)
    else:
        workFolder = None

    # When a template model is specified, use it to build dictionaries for roles,
    # complexes, and reactions instead of retrieving from static database files.
    complexesToRoles = None
    reactionsToComplexes = None
    if input["template_model"] is not None or input["template_workspace"] is not None:
        # Both the template model name and its workspace are required together.
        if not(input["template_model"] is not None and input["template_workspace"] is not None):
            message = "Template model workspace is required if template model ID is provided"
            ctx.log_err(message)
            raise ValueError(message)

        # Create a dictionary to map a complex to a list of roles and a dictionary
        # to map a reaction to a list of complexes.  The dictionaries are specific to
        # the specified template model instead of covering everything in the central
        # data model.
        complexesToRoles = dict()
        reactionsToComplexes = dict()

        # Get the list of RoleComplexReactions for the template model from the
        # fba modeling service.  The RoleComplexReactions structure has a list
        # of ComplexReactions structures for the given role.  And each ComplexReactions
        # structure has a list of reactions for the given complex.
        fbaClient = fbaModelServices(self.config['fbamodeling_url'], token=ctx['token'])
        roleComplexReactionsList = fbaClient.role_to_reactions( { 'templateModel': input['template_model'], 'workspace': input['template_workspace'] } )

        # Build the two dictionaries from the returned list.
        for rcr in roleComplexReactionsList:
            for complex in rcr['complexes']:
                complexId = re.sub(r'cpx0*(\d+)', r'kb|cpx.\1', complex['name']) # Convert ModelSEED format to KBase format
                if complexId in complexesToRoles:
                    complexesToRoles[complexId].append(rcr['name'])
                else:
                    complexesToRoles[complexId] = [ rcr['name'] ]
                for reaction in complex['reactions']:
                    reactionId = reaction['reaction']
                    if reactionId in reactionsToComplexes:
                        reactionsToComplexes[reactionId].append(complexId)
                    else:
                        reactionsToComplexes[reactionId] = [ complexId ]

    # Calculate per-gene role probabilities.
    roleProbs = self._rolesetProbabilitiesToRoleProbabilities(ctx, input, genome, probannoObject["data"]["roleset_probabilities"], workFolder)

    # Calculate whole cell role probabilities.
    # Note - eventually workFolder will be replaced with a rolesToReactions call
    totalRoleProbs = self._totalRoleProbabilities(ctx, input, genome, roleProbs, workFolder)

    # Calculate complex probabilities.
    complexProbs = self._complexProbabilities(ctx, input, genome, totalRoleProbs, workFolder, complexesToRequiredRoles = complexesToRoles)

    # Calculate reaction probabilities.
    reactionProbs = self._reactionProbabilities(ctx, input, genome, complexProbs, workFolder, rxnsToComplexes = reactionsToComplexes)

    # If the reaction probabilities were not calculated using the data from the fba modeling service
    # via the template model, we need to convert from the KBase ID format to the ModelSEED format.
    if input["template_model"] is None:
        reactionList = list()
        for index in range(len(reactionProbs)):
            reactionList.append(reactionProbs[index][0])
        EntityAPI = CDMI_EntityAPI(self.config["cdmi_url"])

        # Retry the lookup a few times since the CDMI service can fail transiently.
        # BUG FIX: the original discarded the HTTPError and left reactionData
        # undefined when every attempt failed, causing a NameError below.  Now the
        # last error is re-raised when no data was ever retrieved.
        reactionData = None
        lastError = None
        numAttempts = 4
        while numAttempts > 0:
            numAttempts -= 1
            try:
                reactionData = EntityAPI.get_entity_Reaction( reactionList, [ "source_id" ] )
                if len(reactionList) == len(reactionData):
                    break
            except HTTPError as e:
                lastError = e
        if reactionData is None:
            raise lastError

        # Replace each KBase reaction ID with its ModelSEED source ID.
        for index in range(len(reactionProbs)):
            rxnId = reactionProbs[index][0]
            reactionProbs[index][0] = reactionData[rxnId]['source_id']

    # Create a reaction probability object
    objectData = dict()
    objectData["genome"] = probannoObject["data"]["genome"]
    objectData['genome_workspace'] = probannoObject['data']['genome_workspace']
    if input["template_model"] is None:
        objectData['template_model'] = 'None'
    else:
        objectData["template_model"] = input["template_model"]
    if input["template_workspace"] is None:
        objectData['template_workspace'] = 'None'
    else:
        objectData["template_workspace"] = input["template_workspace"]
    objectData["probanno"] = input['probanno']
    objectData['probanno_workspace'] = input['probanno_workspace']
    objectData["id"] = input["rxnprobs"]
    objectData["reaction_probabilities"] = reactionProbs

    objectMetaData = { "num_reaction_probs": len(objectData["reaction_probabilities"]) }

    # Build provenance so the saved object records how it was produced.
    objectProvData = dict()
    objectProvData['time'] = timestamp(0)
    objectProvData['service'] = os.environ['KB_SERVICE_NAME']
    objectProvData['service_ver'] = ServiceVersion
    objectProvData['method'] = 'calculate'
    objectProvData['method_params'] = input.items()
    objectProvData['input_ws_objects'] = [ '%s/%s/%d' %(probannoObject['info'][7], probannoObject['info'][1], probannoObject['info'][4]) ]

    objectSaveData = dict()
    objectSaveData['type'] = RxnProbsType
    objectSaveData['name'] = input["rxnprobs"]
    objectSaveData['data'] = objectData
    objectSaveData['meta'] = objectMetaData
    objectSaveData['provenance'] = [ objectProvData ]
    objectInfo = wsClient.save_objects( { 'workspace': input["rxnprobs_workspace"], 'objects': [ objectSaveData ] } )
    output = objectInfo[0]

    #END calculate

    # At some point might do deeper type checking...
    if not isinstance(output, list):
        raise ValueError('Method calculate return value ' +
                         'output is not type list as required.')
    # return the results
    return [output]
def runAnnotate(self, job):
    ''' Run an annotate job to create a ProbAnno typed object.

        A ProbAnno typed object is created in four steps: (1) extract amino acid
        sequences from a Genome typed object to a fasta file, (2) run a BLAST search
        using the amino acid sequences against the subsystem BLAST database,
        (3) calculate annotation likelihood scores for each roleset implied by the
        functions of proteins in subsystems, and (4) save the likelihood scores
        to a ProbAnno typed object.

        The Job dictionary contains three main sections: (1) input parameters to
        the annotate() function, (2) context of server instance running the
        annotate() function, and (3) config variables of server.

        @param job Job dictionary created by server's annotate() function
        @return Nothing (although job is marked as complete)
    '''

    # The input parameters and user context for annotate() were stored in the job data for the job.
    input = job["input"]
    if input['verbose']:
        self.logger.set_log_level(log.DEBUG)
    self.ctx = job["context"]
    self.config = job['config']

    # Create a DataParser object for working with the static database files.
    self.dataParser = DataParser(self.config)

    # BUG FIX: pre-initialize so the completion and cleanup code after the try
    # block cannot raise a NameError (masking the real failure) when an error
    # occurs before these variables are assigned.
    ujsClient = None
    workFolder = None
    status = None

    try:
        # Make sure the database files are available.
        self.dataParser.checkIfDatabaseFilesExist()

        # Make sure the job directory exists.
        workFolder = make_job_directory(self.config['work_folder_path'], job['id'])

        # Create a user and job state client and authenticate as the user.
        ujsClient = UserAndJobState(self.config['userandjobstate_url'], token=self.ctx['token'])

        # Get the Genome object from the specified workspace.
        # Progress updates are best-effort: a failure to report progress must not
        # fail the job itself.
        try:
            ujsClient.update_job_progress(job['id'], self.ctx['token'], 'getting genome object', 1, timestamp(3600))
        except Exception:
            pass
        wsClient = Workspace(self.config["workspace_url"], token=self.ctx['token'])
        genomeObjectId = make_object_identity(input["genome_workspace"], input["genome"])
        objectList = wsClient.get_objects( [ genomeObjectId ] )
        genomeObject = objectList[0]

        # Convert Genome object to fasta file.
        try:
            ujsClient.update_job_progress(job['id'], self.ctx['token'], 'converting Genome object to fasta file', 1, timestamp(3600))
        except Exception:
            pass
        fastaFile = self._genomeToFasta(input, genomeObject, workFolder)

        # Run blast using the fasta file.
        try:
            ujsClient.update_job_progress(job['id'], self.ctx['token'], 'running blast', 1, timestamp(3600))
        except Exception:
            pass
        blastResultFile = self._runBlast(input, fastaFile, workFolder)

        # Calculate roleset probabilities.
        try:
            ujsClient.update_job_progress(job['id'], self.ctx['token'], 'calculating roleset probabilities', 1, timestamp(300))
        except Exception:
            pass
        rolestringTuples = self._rolesetProbabilitiesMarble(input, blastResultFile, workFolder)

        # Build ProbAnno object and store in the specified workspace.
        try:
            ujsClient.update_job_progress(job['id'], self.ctx['token'], 'building ProbAnno object', 1, timestamp(120))
        except Exception:
            pass
        output = self._buildProbAnnoObject(input, genomeObject, blastResultFile, rolestringTuples, workFolder, wsClient)

        # Mark the job as done.
        status = "done"
        tb = None
        self._log(log.INFO, 'Job '+job['id']+' finished for genome '+input['genome']+' to probanno '+input['probanno'])

    except Exception:
        # Record the failure; the traceback is forwarded to the job state service.
        tb = traceback.format_exc()
        sys.stderr.write('\n'+tb)
        status = "failed"
        self._log(log.ERR, 'Job '+job['id']+' failed for genome '+input['genome']+' to probanno '+input['probanno'])

    # Mark the job as complete with the given status.  If the failure happened
    # before the job state client was created there is nothing to notify.
    if ujsClient is not None:
        ujsClient.complete_job(job['id'], self.ctx['token'], status, tb, { })

    # Remove the temporary work directory.
    if workFolder is not None and self.logger.get_log_level() < log.DEBUG2 and status == 'done':
        try:
            shutil.rmtree(workFolder)
        except OSError:
            # For some reason deleting the directory was failing in production. Rather than have all jobs look like they failed
            # I catch and log the exception here (since the user still gets the same result if the directory remains intact)
            msg = 'Unable to delete temporary directory %s\n' %(workFolder)
            sys.stderr.write('WARNING: '+msg)
            self._log(log.WARNING, msg)

    return
def annotate(self, ctx, input):
    # ctx is the context object
    # return variables are: jobid
    #BEGIN annotate
    ''' Compute probabilistic annotations from the specified genome object.

        The input dictionary must contain the following keys:
        genome: Name of genome object
        genome_workspace: Workspace from which to grab the Genome object
        probanno: Name of probanno object to output
        probanno_workspace: Workspace to which to save the ProbAnno object

        The following keys are optional:
        verbose: Print lots of messages on the progress of the algorithm

        @param ctx Current context object
        @param input Dictionary with input parameters for function
        @return Job ID of job started to compute annotation likelihoods
    '''

    input = self._checkInputArguments(ctx, input,
                                      [ "genome", "genome_workspace", "probanno", "probanno_workspace"],
                                      { "verbose" : False }
                                      )

    # Make sure the static database files are ready.
    self._checkDatabaseFiles(ctx)

    # Set log level to INFO when verbose parameter is enabled.
    if input['verbose']:
        ctx.set_log_level(log.DEBUG)

    # Make sure the Genome object is available (raises if it is not).
    wsClient = Workspace(self.config["workspace_url"], token=ctx['token'])
    genomeIdentity = make_object_identity(input['genome_workspace'], input['genome'])
    wsClient.get_object_info( [ genomeIdentity ], 0 )

    # Create a user and job state client and authenticate as the user.
    ujsClient = UserAndJobState(self.config['userandjobstate_url'], token=ctx['token'])

    # Create a job to track running probabilistic annotation.
    description = 'pa-annotate for genome %s to probanno %s for user %s' %(input['genome'], input['probanno'], ctx['user_id'])
    progress = { 'ptype': 'task', 'max': 5 }
    jobid = ujsClient.create_and_start_job(ctx['token'], 'initializing', description, progress, timestamp(3600))
    ctx.log_info('Job '+jobid+' started for genome '+input['genome']+' to probanno '+input['probanno'])

    # Run the job on the local machine.
    if self.config["job_queue"] == "local":
        # Create working directory for job and build file names.
        jobDirectory = make_job_directory(self.config['work_folder_path'], jobid)
        jobDataFilename = os.path.join(jobDirectory, 'jobdata.json')
        outputFilename = os.path.join(jobDirectory, 'stdout.log')
        errorFilename = os.path.join(jobDirectory, 'stderr.log')

        # Save data required for running the job.
        # BUG FIX: the original passed an unclosed open() handle to json.dump;
        # the context manager guarantees the file is flushed and closed.
        jobData = { 'id': jobid, 'input': input, 'context': ctx, 'config': self.config }
        with open(jobDataFilename, "w") as jobDataFile:
            json.dump(jobData, jobDataFile, indent=4)

        # Start worker to run the job.
        jobScript = os.path.join(os.environ['KB_TOP'], 'bin/pa-runjob')
        cmdline = "nohup %s %s >%s 2>%s &" %(jobScript, jobDirectory, outputFilename, errorFilename)
        status = os.system(cmdline)
        ctx.log_info('Job %s is running on local host, status %d' %(jobid, status))

    #END annotate

    # At some point might do deeper type checking...
    if not isinstance(jobid, basestring):
        raise ValueError('Method annotate return value ' +
                         'jobid is not type basestring as required.')
    # return the results
    return [jobid]
def _buildProbAnnoObject(self, input, genomeObject, blastResultFile, queryToRolesetProbs, workFolder, wsClient):
    ''' Create a ProbAnno typed object and save it to a workspace.

        The queryToRolesetProbs dictionary has this format:
        querygene -> [ (roleset, likelihood), ... ]

        The probabilistic annotation object adds fields for the probability of
        each role being linked to each gene.

        @param input Dictionary of input parameters to annotate() function
        @param genomeObject Genome typed object from workspace
        @param blastResultFile Path to output file from BLAST in tab-delimited format
        @param queryToRolesetProbs Dictionary keyed by query protein of list of
            tuples with roleset and likelihood
        @param workFolder Path to directory in which to store temporary files
        @param wsClient Workspace client object
        @return metadata
        @raise NoGeneIdsError when a feature in the Genome object has no gene ID
    '''

    sys.stderr.write("Building ProbAnno object %s/%s for genome %s..." %(input["probanno_workspace"], input["probanno"], input["genome"]))

    # Read in the target roles (this function returns the roles as lists!)
    targetToRoles, rolesToTargets = self.dataParser.readFilteredOtuRoles()
    targetToRoleSet = dict()
    for target in targetToRoles:
        stri = self.config["separator"].join(sorted(targetToRoles[target]))
        targetToRoleSet[target] = stri

    # This is a dictionary from query ID to (target, -log E-value) pairs.
    # We just use it to identify whether or not we actually hit anything in the db
    # when searching for the query gene.
    queryToTargetEvals = self.dataParser.parseBlastOutput(blastResultFile)

    # Assemble the ProbAnno object data, recording which features had to be
    # skipped because no BLAST hits or roleset probabilities exist for them.
    objectData = dict()
    objectData["id"] = input["probanno"]
    objectData["genome"] = input["genome"]
    objectData["genome_workspace"] = input["genome_workspace"]
    objectData["roleset_probabilities"] = queryToRolesetProbs
    objectData["skipped_features"] = []

    for feature in genomeObject["data"]["features"]:
        # BUG FIX: the original tested genomeObject["data"] for the "id" key,
        # a loop-invariant check that never guarded the feature["id"] access
        # below.  Test the feature itself, which is what the error describes.
        if "id" not in feature:
            raise NoGeneIdsError("No gene IDs found in input Genome object %s/%s (this should never happen)" %(input["genome_workspace"], input["genome"]))
        queryid = feature["id"]

        # This can happen if I couldn't find hits from that gene to anything in the database. In this case, I'll just skip it.
        # TODO Or should I make an empty object? I should ask Chris.
        if queryid not in queryToRolesetProbs or queryid not in queryToTargetEvals:
            objectData["skipped_features"].append(queryid)

    # Store the ProbAnno object in the specified workspace.
    objectMetaData = dict()
    objectMetaData['num_rolesets'] = len(objectData["roleset_probabilities"])
    objectMetaData['num_skipped_features'] = len(objectData["skipped_features"])

    # Build provenance so the saved object records how it was produced.
    objectProvData = dict()
    objectProvData['time'] = timestamp(0)
    objectProvData['service'] = os.environ['KB_SERVICE_NAME']
    objectProvData['service_ver'] = ServiceVersion
    objectProvData['method'] = 'annotate'
    objectProvData['method_params'] = input.items()
    objectProvData['input_ws_objects'] = [ '%s/%s/%d' %(genomeObject['info'][7], genomeObject['info'][1], genomeObject['info'][4]) ]

    objectSaveData = dict()
    objectSaveData['type'] = ProbAnnoType
    objectSaveData['name'] = input["probanno"]
    objectSaveData['data'] = objectData
    objectSaveData['meta'] = objectMetaData
    objectSaveData['provenance'] = [ objectProvData ]

    # Retry the save a few times since the workspace service can fail transiently.
    # BUG FIX: keep an explicit reference to the last HTTPError instead of relying
    # on the except-clause variable leaking out of its block after the loop.
    lastError = None
    retryCount = 3
    while retryCount > 0:
        try:
            objectInfo = wsClient.save_objects( { 'workspace': input["probanno_workspace"], 'objects': [ objectSaveData ] } )
            sys.stderr.write("done\n")
            return objectInfo[0]
        except HTTPError as e:
            # Hopefully this is just a temporary glitch, try again in a few seconds since we worked so hard to build the object.
            retryCount -= 1
            lastError = e
            self._log(log.WARNING, 'HTTP error %s when saving %s to workspace %s' %(e.reason, input['probanno'], input['probanno_workspace']))
            time.sleep(15)

    # Saving the object failed so raise the last exception that was caught.
    raise lastError