def test_shock_copy_node(self):
    test_phrase = "Hi there!"
    path_to_temp_file = "/kb/module/work/tmp/temp_copy_" + str(time.time()) + ".fq"
    self.textToFile(test_phrase, path_to_temp_file)
    dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'], token=self.ctx['token'])
    attributes = {'foo': 'bar'}
    shock_id = dfu.file_to_shock({
        'file_path': path_to_temp_file,
        'attributes': attributes
    })['shock_id']
    # check what's saved
    os.remove(path_to_temp_file)
    node_info = dfu.shock_to_file({
        'shock_id': shock_id,
        'file_path': path_to_temp_file
    })
    self.assertEqual(test_phrase, self.fileToText(path_to_temp_file))
    self.assertEqual(node_info.get('attributes'), attributes,
                     "Unexpected attributes in node info: " + str(node_info))
    # copy the shock node
    shock_id2 = dfu.copy_shock_node({'shock_id': shock_id})['shock_id']
    path_to_temp_file2 = "/kb/module/work/tmp/temp_copy2_" + str(time.time()) + ".fq"
    node_info2 = dfu.shock_to_file({
        'shock_id': shock_id2,
        'file_path': path_to_temp_file2
    })
    self.assertEqual(test_phrase, self.fileToText(path_to_temp_file2))
    self.assertEqual(node_info2.get('attributes'), attributes,
                     "Unexpected attributes in node info: " + str(node_info2))
def download_genome_to_json_files(token, genome_ref, target_dir):
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    file_name_to_data_map = {}
    dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'], token=token, service_ver='dev')
    genome_data = dfu.get_objects({'object_refs': [genome_ref]})['data'][0]
    genome_obj = genome_data['data']
    genome_meta = genome_data['info'][10]
    file_name_to_data_map["genome.json"] = genome_obj
    file_name_to_data_map["genome.meta.json"] = genome_meta
    if 'genbank_handle_ref' in genome_obj:
        gbk_file_name = "genome.gbk"
        dfu.shock_to_file({
            'handle_id': genome_obj['genbank_handle_ref'],
            'file_path': os.path.join(target_dir, gbk_file_name)
        })
        genome_obj['genbank_handle_ref'] = gbk_file_name
    if 'contigset_ref' in genome_obj:
        contigset_data = dfu.get_objects(
            {'object_refs': [genome_obj['contigset_ref']]})['data'][0]
        contigset_obj = contigset_data['data']
        contigset_meta = contigset_data['info'][10]
        file_name_to_data_map["contigset.json"] = contigset_obj
        file_name_to_data_map["contigset.meta.json"] = contigset_meta
        genome_obj['contigset_ref'] = "contigset.json"
    elif 'assembly_ref' in genome_obj:
        assembly_data = dfu.get_objects(
            {'object_refs': [genome_obj['assembly_ref']]})['data'][0]
        assembly_obj = assembly_data['data']
        assembly_meta = assembly_data['info'][10]
        file_name_to_data_map["assembly.json"] = assembly_obj
        file_name_to_data_map["assembly.meta.json"] = assembly_meta
        genome_obj['assembly_ref'] = "assembly.json"
        fasta_handle_ref = assembly_obj['fasta_handle_ref']
        fasta_file_name = "assembly.fa"
        dfu.shock_to_file({
            'handle_id': fasta_handle_ref,
            'file_path': os.path.join(target_dir, fasta_file_name)
        })
        assembly_obj['fasta_handle_ref'] = fasta_file_name
        assembly_obj['external_source_id'] = fasta_file_name
        if 'taxon_ref' in assembly_obj:
            taxon_obj = dfu.get_objects(
                {'object_refs': [assembly_obj['taxon_ref']]})['data'][0]['data']
            file_name_to_data_map["taxon.json"] = taxon_obj
            assembly_obj['taxon_ref'] = "taxon.json"
            if 'taxon_ref' in genome_obj:
                genome_obj['taxon_ref'] = "taxon.json"
            taxon_obj['parent_taxon_ref'] = ""
    for target_file_name in file_name_to_data_map:
        with open(os.path.join(target_dir, target_file_name), 'w') as f:
            json.dump(file_name_to_data_map[target_file_name], f,
                      sort_keys=True, indent=4)
def test_shock_handle_ws(self):
    test_phrase = "Hi there!"
    path_to_temp_file = "/kb/module/work/tmp/temp_" + str(time.time()) + ".fq"
    self.textToFile(test_phrase, path_to_temp_file)
    dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'], token=self.ctx['token'])
    uploaded = dfu.file_to_shock({
        'file_path': path_to_temp_file,
        'make_handle': 1
    })
    fhandle = uploaded['handle']
    self.assertTrue('hid' in fhandle, "Handle: " + str(fhandle))
    data = {'hid': fhandle['hid']}
    obj_name = 'TestObject.1'
    info = self.getWsClient().save_objects({
        'workspace': self.getWsName(),
        'objects': [{'type': 'Empty.AHandle', 'data': data, 'name': obj_name}]
    })[0]
    self.assertEqual(info[1], obj_name)
    ref = self.getWsName() + '/' + obj_name
    handle_data = self.getWsClient().get_objects([{'ref': ref}])[0]['data']
    self.assertTrue('hid' in handle_data, "Data: " + str(handle_data))
    hid = handle_data['hid']
    path_to_temp_file2 = "/kb/module/work/tmp/temp2_" + str(time.time()) + ".fq"
    dfu.shock_to_file({'handle_id': hid, 'file_path': path_to_temp_file2})
    self.assertEqual(test_phrase, self.fileToText(path_to_temp_file2))
def _get_cached_index(self, assembly_info, validated_params):
    try:
        # note: list_referencing_objects does not yet support reference paths, so we need
        # to call it with the direct reference. So we won't get a cache hit unless you have
        # direct access to the assembly object right now (although you can still always
        # build the index from the assembly object). Once this call supports paths, this
        # should be changed to set ref = assembly_info['ref']
        info = assembly_info['info']
        ref = str(info[6]) + '/' + str(info[0]) + '/' + str(info[4])
        objs = self.ws.list_referencing_objects([{'ref': ref}])[0]

        # iterate through each of the objects that reference the assembly
        bowtie2_indexes = []
        for o in objs:
            if o[2].startswith('KBaseRNASeq.Bowtie2IndexV2'):
                bowtie2_indexes.append(o)

        # nothing references this assembly, so cache miss
        if len(bowtie2_indexes) == 0:
            return False

        # if there is more than one hit, get the most recent one
        # (obj_info[3] is the save_date timestamp, e.g. 2017-05-30T22:56:49+0000,
        # so we can sort on that)
        bowtie2_indexes.sort(key=lambda x: x[3])
        bowtie2_index_info = bowtie2_indexes[-1]
        index_ref = (str(bowtie2_index_info[6]) + '/' + str(bowtie2_index_info[0]) +
                     '/' + str(bowtie2_index_info[4]))

        # get the object data
        index_obj_data = self.ws.get_objects2(
            {'objects': [{'ref': index_ref}]})['data'][0]['data']

        # download the handle object
        os.makedirs(validated_params['output_dir'])
        dfu = DataFileUtil(self.callback_url)
        dfu.shock_to_file({'file_path': os.path.join(validated_params['output_dir'],
                                                     'bt2_index.tar.gz'),
                           'handle_id': index_obj_data['handle']['hid'],
                           'unpack': 'unpack'})
        print('Cache hit: ')
        pprint(index_obj_data)
        return {'output_dir': validated_params['output_dir'],
                'index_files_basename': index_obj_data['index_files_basename']}
    except Exception:
        # if the cache lookup fails for any reason, don't worry - just rebuild the index
        print('WARNING: exception encountered when trying to lookup in cache:')
        print(traceback.format_exc())
        print('END WARNING: exception encountered when trying to lookup in cache.')
        return None
def get_genbank_handle(self, data):
    if 'genbank_handle_ref' not in data:
        return None
    if data['genbank_handle_ref'] is None:
        return None

    print('pulling cached genbank file from Shock: ' + str(data['genbank_handle_ref']))
    dfu = DataFileUtil(self.cfg.callbackURL)
    file_ret = dfu.shock_to_file({
        'handle_id': data['genbank_handle_ref'],
        'file_path': self.cfg.sharedFolder,
        'unpack': 'unpack'
    })
    return {'genbank_file': {'file_path': file_ret['file_path']}}
def get_gff_handle(self, data, output_dir):
    if 'gff_handle_ref' not in data:
        return None
    if data['gff_handle_ref'] is None:
        return None

    print('pulling cached GFF file from Shock: ' + str(data['gff_handle_ref']))
    dfu = DataFileUtil(self.cfg.callbackURL)
    file_ret = dfu.shock_to_file({
        'handle_id': data['gff_handle_ref'],
        'file_path': output_dir,
        'unpack': 'unpack'
    })
    return {'file_path': file_ret['file_path']}
def download_file_from_shock(logger, shock_service_url=None, shock_id=None,
                             filename=None, directory=None, filesize=None,
                             token=None):
    """
    Given a SHOCK instance URL and a SHOCK node id, download the contents of
    that node to a file on disk.
    """
    # default to the shock node id so shockFileName is always defined
    shockFileName = filename if filename is not None else str(shock_id)

    if directory is not None:
        filePath = os.path.join(directory, shockFileName)
    else:
        filePath = shockFileName

    # shock_service_url is from config
    dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'], token=token)
    return dfu.shock_to_file({"shock_id": shock_id,
                              "file_path": filePath,
                              "unpack": None})
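# Usage sketch (illustrative, not from the original module): calling
# download_file_from_shock inside a KBase SDK job, where SDK_CALLBACK_URL and
# KB_AUTH_TOKEN are assumed to be set in the job environment. The shock node
# id below is a hypothetical placeholder.
if __name__ == '__main__':
    import logging
    result = download_file_from_shock(
        logging.getLogger(__name__),
        shock_id='0f09a52e-0000-0000-0000-d96a8f4b9f41',  # hypothetical node id
        filename='reads.fq',
        directory='/kb/module/work/tmp',
        token=os.environ.get('KB_AUTH_TOKEN'))
    # shock_to_file reports where the file landed
    print(result['file_path'])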
def stage_input(self, params):
    ''' Setup the input_directory by fetching the files and uncompressing if needed. '''

    # construct the input directory where we stage files
    input_directory = os.path.join(self.cfg.sharedFolder,
                                   'genome-upload-staging-' + str(uuid.uuid4()))
    os.makedirs(input_directory)

    # at this point, the 'file' input is validated, so we don't have to catch any special
    # cases - we expect one and only one of path, shock_id, or ftp_url

    # determine how to get the file: if it is from shock, download it. If it is just
    # sitting there, then use it. Move the file to the staging input directory
    file = params['file']
    genbank_file_path = None

    if 'path' in file and file['path'] is not None:
        # copy the local file to the input staging directory
        # (NOTE: could just move it, but then this method would have the side effect of
        # moving your file, which another SDK module might have an open handle on)
        local_file_path = file['path']
        genbank_file_path = os.path.join(input_directory,
                                         os.path.basename(local_file_path))
        shutil.copy2(local_file_path, genbank_file_path)

    if 'shock_id' in file and file['shock_id'] is not None:
        # handle shock file
        print('Downloading file from SHOCK node: ' + str(self.cfg.shockURL) +
              ' - ' + str(file['shock_id']))
        sys.stdout.flush()
        dfUtil = DataFileUtil(self.cfg.callbackURL)
        file_name = dfUtil.shock_to_file({
            'file_path': input_directory,
            'shock_id': file['shock_id']
        })['node_file_name']
        genbank_file_path = os.path.join(input_directory, file_name)

    if 'ftp_url' in file and file['ftp_url'] is not None:
        # Note that the Transform service originally had a script_utils.download_from_urls
        # method that, if the url is a folder, pulls all subfiles. That code recently broke
        # when fetching from NCBI (not clear if it is our issue or NCBI's), but for now
        # just support the most common case - an FTP url to a single file.
        print('Downloading file from: ' + str(file['ftp_url']))
        sys.stdout.flush()

        url = urlparse(file['ftp_url'])
        if url.scheme != 'ftp' and url.scheme != 'http':
            raise ValueError('Only FTP/HTTP servers are supported')
        file_name = 'genome.gbk'
        if url.path != '':
            file_name = url.path.split('/')[-1]

        req = urllib2.Request(file['ftp_url'])
        response = urllib2.urlopen(req)
        file_data = response.read()

        genbank_file_path = os.path.join(input_directory, file_name)
        with open(genbank_file_path, "w") as genbank_file:
            genbank_file.write(file_data)

    # extract the file if it is compressed
    if genbank_file_path is not None:
        print("staged input file = " + genbank_file_path)
        sys.stdout.flush()
        dfUtil = DataFileUtil(self.cfg.callbackURL)
        dfUtil.unpack_file({'file_path': genbank_file_path})
    else:
        raise ValueError('No valid files could be extracted based on the input')

    return input_directory
def genbank_to_genome_annotation(self, ctx, params):
    """
    :param params: instance of type "GenbankToGenomeAnnotationParams"
       (file_path or shock_id -- Local path or shock_id of the uploaded file
       with genome sequence in GenBank format or zip-file with GenBank files.
       genome_name -- The name you would like to use to reference this
       GenomeAnnotation. If not supplied, will use the Taxon Id and the data
       source to determine the name. taxon_wsname - name of the workspace
       containing the Taxonomy data, defaults to 'ReferenceTaxons') ->
       structure: parameter "file_path" of String, parameter "shock_id" of
       String, parameter "ftp_url" of String, parameter "genome_name" of
       String, parameter "workspace_name" of String, parameter "source" of
       String, parameter "taxon_wsname" of String, parameter
       "convert_to_legacy" of type "boolean" (A boolean - 0 for false, 1 for
       true. @range (0, 1))
    :returns: instance of type "GenomeAnnotationDetails" -> structure:
       parameter "genome_annotation_ref" of String
    """
    # ctx is the context object
    # return variables are: details
    #BEGIN genbank_to_genome_annotation
    print('genbank_to_genome_annotation -- parameters = ')
    pprint(params)

    # validate input and set defaults. Note that because we don't call the uploader
    # method as a stand alone script, we do the validation here.
    if 'workspace_name' not in params:
        raise ValueError('workspace_name field was not defined')
    workspace_name = params['workspace_name']

    if 'genome_name' not in params:
        raise ValueError('genome_name field was not defined')
    genome_name = params['genome_name']

    source = 'Genbank'
    if 'source' in params:
        source = params['source']

    taxon_wsname = 'ReferenceTaxons'
    if 'taxon_wsname' in params:
        taxon_wsname = params['taxon_wsname']

    # other options to handle
    # release
    # taxon_reference
    # exclude_feature_types
    # type

    # construct the input directory where we stage files
    input_directory = os.path.join(self.sharedFolder,
                                   'genome-upload-staging-' + str(uuid.uuid4()))
    os.makedirs(input_directory)

    # determine how to get the file: if it is from shock, download it. If it
    # is just sitting there, then use it. Move the file to the staging input directory
    genbank_file_path = None
    if 'file_path' not in params:
        if 'shock_id' not in params:
            if 'ftp_url' not in params:
                raise ValueError('No input file (either file_path, shock_id, or '
                                 'ftp_url) provided')
            else:
                # TODO handle ftp - this creates a directory for us, so update the
                # input directory
                print('calling Transform download utility: script_utils.download')
                print('URL provided = ' + params['ftp_url'])
                script_utils.download_from_urls(
                    working_directory=input_directory,
                    token=ctx['token'],  # not sure why this requires a token to download from a url...
                    urls={'ftpfiles': params['ftp_url']})
                input_directory = os.path.join(input_directory, 'ftpfiles')

                # unpack everything in input directory
                dir_contents = os.listdir(input_directory)
                print('downloaded directory listing:')
                pprint(dir_contents)
                dir_files = []
                for f in dir_contents:
                    if os.path.isfile(os.path.join(input_directory, f)):
                        dir_files.append(f)

                print('processing files in directory...')
                for f in dir_files:
                    # unpack if needed using the standard transform utility
                    print('unpacking ' + f)
                    script_utils.extract_data(filePath=os.path.join(input_directory, f))
        else:
            # handle shock file
            dfUtil = DataFileUtil(self.callback_url, token=ctx['token'])
            file_name = dfUtil.shock_to_file({
                'file_path': input_directory,
                'shock_id': params['shock_id']
            })['node_file_name']
            genbank_file_path = os.path.join(input_directory, file_name)
    else:
        # copy the local file to the input staging directory
        # (NOTE: could just move it, but then this method would have the side effect of
        # moving your file, which another SDK module might have an open handle on)
        local_file_path = params['file_path']
        genbank_file_path = os.path.join(input_directory,
                                         os.path.basename(local_file_path))
        shutil.copy2(local_file_path, genbank_file_path)

    if genbank_file_path is not None:
        print("input genbank file = " + genbank_file_path)
        # unpack if needed using the standard transform utility
        script_utils.extract_data(filePath=genbank_file_path)

    # do the upload (doesn't seem to return any information)
    uploader.upload_genome(
        logger=None,
        shock_service_url=self.shockURL,
        handle_service_url=self.handleURL,
        workspace_service_url=self.workspaceURL,
        input_directory=input_directory,
        workspace_name=workspace_name,
        core_genome_name=genome_name,
        source=source,
        taxon_wsname=taxon_wsname)

    #### Code to convert to legacy type if requested
    if 'convert_to_legacy' in params and params['convert_to_legacy'] == 1:
        from doekbase.data_api.converters import genome as cvt
        print('Converting to legacy type, object={}'.format(genome_name))
        cvt.convert_genome(shock_url=self.shockURL,
                           handle_url=self.handleURL,
                           ws_url=self.workspaceURL,
                           obj_name=genome_name,
                           ws_name=workspace_name)

    # clear the temp directory
    shutil.rmtree(input_directory)

    # get WS metadata to return the reference to the object
    # (could be returned by the uploader method...)
    ws = Workspace(url=self.workspaceURL)
    info = ws.get_object_info_new({'objects': [{'ref': workspace_name + '/' + genome_name}],
                                   'includeMetadata': 0,
                                   'ignoreErrors': 0})[0]

    details = {
        'genome_annotation_ref': str(info[6]) + '/' + str(info[0]) + '/' + str(info[4])
    }
    #END genbank_to_genome_annotation

    # At some point might do deeper type checking...
    if not isinstance(details, dict):
        raise ValueError('Method genbank_to_genome_annotation return value ' +
                         'details is not type dict as required.')
    # return the results
    return [details]
class AssemblyToFasta:

    def __init__(self, callback_url, scratch):
        self.scratch = scratch
        self.dfu = DataFileUtil(callback_url)

    def export_as_fasta(self, ctx, params):
        ''' Used almost exclusively for download only. '''
        # validate parameters
        if 'input_ref' not in params:
            raise ValueError('Cannot export Assembly - no input_ref field defined.')

        # export to a file
        file = self.assembly_as_fasta(ctx, {'ref': params['input_ref']})

        # create the output directory and move the file there
        export_package_dir = os.path.join(self.scratch, file['assembly_name'])
        os.makedirs(export_package_dir)
        shutil.move(file['path'],
                    os.path.join(export_package_dir, os.path.basename(file['path'])))

        # package it up and be done
        package_details = self.dfu.package_for_download({
            'file_path': export_package_dir,
            'ws_refs': [params['input_ref']]
        })

        return {'shock_id': package_details['shock_id']}

    def assembly_as_fasta(self, ctx, params):
        ''' Main function that accepts a ref to an object and writes a file. '''
        self.validate_params(params)

        print('downloading ws object data (' + params['ref'] + ')')
        assembly_object = self.dfu.get_objects(
            {'object_refs': [params['ref']]})['data'][0]
        ws_type = assembly_object['info'][2]
        obj_name = assembly_object['info'][1]

        if 'filename' in params:
            output_filename = params['filename']
        else:
            output_filename = obj_name + '.fa'

        output_fasta_file_path = os.path.join(self.scratch, output_filename)

        if 'KBaseGenomes.ContigSet' in ws_type:
            self.process_legacy_contigset(output_fasta_file_path,
                                          assembly_object['data'])
        elif 'KBaseGenomeAnnotations.Assembly' in ws_type:
            self.process_assembly(output_fasta_file_path, assembly_object['data'])
        else:
            raise ValueError('Cannot write data to fasta; invalid WS type (' + ws_type +
                             '). Supported types are KBaseGenomes.ContigSet and ' +
                             'KBaseGenomeAnnotations.Assembly')

        return {'path': output_fasta_file_path, 'assembly_name': obj_name}

    def fasta_rows_generator_from_contigset(self, contig_list):
        ''' Generates a SeqRecord iterator for writing from a legacy contigset object. '''
        for contig in contig_list:
            description = ''
            if 'description' in contig and contig['description']:
                description = contig['description']
            yield SeqRecord(Seq(contig['sequence'], SingleLetterAlphabet()),
                            id=contig['id'],
                            description=description)

    def process_legacy_contigset(self, output_fasta_path, data):
        ''' Write the contigs of a legacy contigset object to fasta. '''
        SeqIO.write(self.fasta_rows_generator_from_contigset(data['contigs']),
                    output_fasta_path, "fasta")

    def process_assembly(self, output_fasta_path, data):
        ''' Download the assembly's fasta file from its handle. '''
        self.dfu.shock_to_file({
            'handle_id': data['fasta_handle_ref'],
            'file_path': output_fasta_path,
            'unpack': 'uncompress'
        })

    def validate_params(self, params):
        for key in ['ref']:
            if key not in params:
                raise ValueError('required "' + key + '" field was not defined')
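# Usage sketch (illustrative, not from the original module): writing an
# assembly's fasta inside a KBase SDK job. The workspace reference and scratch
# path are hypothetical placeholders; SDK_CALLBACK_URL is assumed to be set,
# and ctx is unused by assembly_as_fasta so None suffices here.
if __name__ == '__main__':
    atf = AssemblyToFasta(os.environ['SDK_CALLBACK_URL'], '/kb/module/work/tmp')
    fasta = atf.assembly_as_fasta(None, {'ref': '12345/6/7'})  # hypothetical ref
    print('wrote ' + fasta['assembly_name'] + ' to ' + fasta['path'])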
class RNASeqDownloaderUtils:

    def __init__(self, config):
        log('--->\nInitializing RNASeqDownloaderUtils instance:\n config: %s' % config)
        self.scratch = config['scratch']
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url, token=self.token)
        self.rau = ReadsAlignmentUtils(self.callback_url, token=self.token)

    def download_RNASeq(self, params):
        """
        download_RNASeq: download RNASeq Alignment/Expression/DifferentialExpression
                         zip file

        params:
        input_ref: RNASeq object reference ID
        rna_seq_type: one of ['RNASeqAlignment', 'RNASeqExpression',
                              'RNASeqDifferentialExpression']

        return:
        shock_id: Shock ID of stored zip file
        """
        log('--->\nrunning RNASeqDownloaderUtils.download_RNASeq:\nparams: %s' % params)

        # validate params
        self.validate_download_rna_seq_alignment_parameters(params)

        # download the RNASeq zip file; RNASeq Alignment, Expression and
        # DifferentialExpression all share the same object_data/handle_data structure
        returnVal = self._download_rna_seq_zip(params.get('input_ref'))

        return returnVal

    def download_RNASeq_Alignment(self, params):
        """
        download_RNASeq_Alignment: download an RNASeq Alignment zip file

        params:
        input_ref: RNASeq object reference ID
        rna_seq_type: 'RNASeqAlignment'
        download_file_type: one of 'bam', 'sam' or 'bai'

        return:
        shock_id: Shock ID of stored zip file
        """
        log('--->\nrunning RNASeqDownloaderUtils.download_RNASeq_Alignment:\nparams: %s'
            % params)

        # validate params
        self.validate_download_rna_seq_alignment_parameters(params)

        input_ref = params.get('input_ref')
        returnVal = dict()

        download_file_type = params.get('download_file_type')
        if download_file_type == 'bam':
            destination_dir = self.rau.download_alignment({
                'source_ref': input_ref,
                'downloadBAI': True
            })['destination_dir']
            shock_id = self._upload_dir_to_shock(destination_dir)
        elif download_file_type == 'sam':
            destination_dir = self.rau.download_alignment({
                'source_ref': input_ref,
                'downloadSAM': True,
                'downloadBAI': True
            })['destination_dir']
            files = os.listdir(destination_dir)
            bam_files = [x for x in files if re.match(r'.*\.bam', x)]
            for bam_file in bam_files:
                log('removing file: {}'.format(bam_file))
                os.remove(os.path.join(destination_dir, bam_file))
            shock_id = self._upload_dir_to_shock(destination_dir)

        returnVal['shock_id'] = shock_id

        return returnVal

    def validate_download_rna_seq_alignment_parameters(self, params):
        """
        validate_download_rna_seq_alignment_parameters:
            validates params passed to the download_rna_seq_alignment method
        """
        # check required parameters
        for p in ['input_ref', 'rna_seq_type']:
            if p not in params:
                raise ValueError('"' + p + '" parameter is required, but missing')

        # check supported RNASeq types
        valid_rnaseq_types = ['RNASeqAlignment',
                              'RNASeqExpression',
                              'RNASeqDifferentialExpression']
        if params['rna_seq_type'] not in valid_rnaseq_types:
            raise ValueError('Unexpected RNASeq type: %s' % params['rna_seq_type'])

    def _download_rna_seq_zip(self, input_ref):
        """
        _download_rna_seq_zip: download RNASeq's archive zip file

        returns:
        shock_id: Shock ID of stored zip file
        """
        # get object data
        object_data = self._get_object_data(input_ref)
        log('---> getting object data\n object_data: %s'
            % json.dumps(object_data, indent=1))

        # get handle data
        handle = self._get_handle_data(object_data)
        log('---> getting handle data\n handle data: %s' % json.dumps(handle, indent=1))

        # make tmp directory for downloading
        dstdir = os.path.join(self.scratch, 'tmp')
        if not os.path.exists(dstdir):
            os.makedirs(dstdir)

        # download the original zip file and save it to the tmp directory
        handle_id = handle.get('hid')
        original_zip_file_path = self._download_original_zip_file(handle_id, dstdir)

        log('---> loading %s to shock' % original_zip_file_path)
        shock_id = self._upload_to_shock(original_zip_file_path)

        log('---> removing folder: %s' % dstdir)
        shutil.rmtree(dstdir)

        returnVal = {"shock_id": shock_id}

        return returnVal

    def _get_object_data(self, input_ref):
        """
        _get_object_data: get object_data using DataFileUtil
        """
        get_objects_params = {
            'object_refs': [input_ref],
            'ignore_errors': False
        }
        object_data = self.dfu.get_objects(get_objects_params)
        return object_data

    def _get_handle_data(self, object_data):
        """
        _get_handle_data: get Handle from object_data
        """
        try:
            handle = object_data.get('data')[0].get('data').get('file')
        except Exception:
            error_msg = "Unexpected object format. Refer to DataFileUtil.get_objects definition\n"
            error_msg += "object_data:\n%s" % json.dumps(object_data, indent=1)
            raise ValueError(error_msg)

        if handle is None:
            error_msg = "object_data does NOT have a Handle ('file' key)\n"
            error_msg += "object_data:\n%s" % json.dumps(object_data, indent=1)
            raise ValueError(error_msg)
        elif handle.get('hid') is None:
            error_msg = "Handle does NOT have a HandleId ('hid' key)\n"
            error_msg += "handle_data:\n%s" % json.dumps(handle, indent=1)
            raise ValueError(error_msg)
        else:
            return handle

    def _download_original_zip_file(self, handle_id, dstdir):
        """
        _download_original_zip_file: download the original archive .zip file
                                     using DataFileUtil
        """
        shock_to_file_params = {'handle_id': handle_id, 'file_path': dstdir}
        original_zip_file = self.dfu.shock_to_file(shock_to_file_params)
        original_zip_file_path = original_zip_file.get('file_path')
        return original_zip_file_path

    def _upload_to_shock(self, file_path):
        """
        _upload_to_shock: upload target file to shock using DataFileUtil
        """
        file_to_shock_params = {'file_path': file_path}
        shock_file = self.dfu.file_to_shock(file_to_shock_params)
        shock_id = shock_file.get('shock_id')
        return shock_id

    def _upload_dir_to_shock(self, directory):
        """
        _upload_dir_to_shock: zip and upload target directory to shock using DataFileUtil
        """
        file_to_shock_params = {'file_path': directory, 'pack': 'zip'}
        shock_file = self.dfu.file_to_shock(file_to_shock_params)
        shock_id = shock_file.get('shock_id')
        return shock_id
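# Usage sketch (illustrative, not from the original module): downloading an
# RNASeq alignment zip via RNASeqDownloaderUtils inside a KBase SDK job. The
# object reference is a hypothetical placeholder; the config keys mirror the
# constructor above and are assumed to be set in the job environment.
if __name__ == '__main__':
    downloader = RNASeqDownloaderUtils({
        'scratch': '/kb/module/work/tmp',
        'SDK_CALLBACK_URL': os.environ['SDK_CALLBACK_URL'],
        'KB_AUTH_TOKEN': os.environ['KB_AUTH_TOKEN']
    })
    result = downloader.download_RNASeq({
        'input_ref': '12345/6/7',        # hypothetical object ref
        'rna_seq_type': 'RNASeqAlignment'
    })
    print('zip stored in shock node: ' + result['shock_id'])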
class DifferentialExpressionUtils:
    '''
    Module Name:
    DifferentialExpressionUtils

    Module Description:
    A KBase module: DifferentialExpressionUtils
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.1"
    GIT_URL = "https://github.com/kbaseapps/DifferentialExpressionUtils.git"
    GIT_COMMIT_HASH = "76ae39b906473558b32b54acd66385e7474b0115"

    #BEGIN_CLASS_HEADER
    PARAM_IN_SRC_DIR = 'source_dir'
    PARAM_IN_SRC_REF = 'source_ref'
    PARAM_IN_DST_REF = 'destination_ref'
    PARAM_IN_TOOL_USED = 'tool_used'
    PARAM_IN_TOOL_VER = 'tool_version'
    PARAM_IN_EXPR_SET_REF = 'expressionset_ref'
    PARAM_IN_GENOME_REF = 'genome_ref'
    PARAM_IN_DIFFEXP_FILEPATH = 'diffexpr_filepath'

    def log(self, message, prefix_newline=False):
        print(('\n' if prefix_newline else '') + str(time.time()) + ': ' + message)

    def _check_required_param(self, in_params, param_list):
        """
        Check if each of the params in the list are in the input params
        """
        for param in param_list:
            if (param not in in_params or not in_params[param]):
                raise ValueError('{} parameter is required'.format(param))

    def _proc_ws_obj_params(self, ctx, params):
        """
        Check the validity of workspace and object params and return them
        """
        dst_ref = params.get(self.PARAM_IN_DST_REF)

        ws_name, obj_name_id = os.path.split(dst_ref)

        if not bool(ws_name.strip()) or ws_name == '/':
            raise ValueError("Workspace name or id is required in " +
                             self.PARAM_IN_DST_REF)

        if not bool(obj_name_id.strip()):
            raise ValueError("Object name or id is required in " +
                             self.PARAM_IN_DST_REF)

        dfu = DataFileUtil(self.callback_url)

        if not isinstance(ws_name, int):
            try:
                ws_id = dfu.ws_name_to_id(ws_name)
            except DFUError as se:
                prefix = se.message.split('.')[0]
                raise ValueError(prefix)

        self.log('Obtained workspace name/id ' + str(ws_id))

        return ws_name, ws_id, obj_name_id

    def _proc_upload_diffexpr_params(self, ctx, params):
        """
        Check the presence and validity of upload expression params
        """
        self._check_required_param(params, [self.PARAM_IN_DST_REF,
                                            self.PARAM_IN_GENOME_REF,
                                            self.PARAM_IN_TOOL_USED,
                                            self.PARAM_IN_TOOL_VER,
                                            self.PARAM_IN_DIFFEXP_FILEPATH])

        ws_name, ws_id, obj_name_id = self._proc_ws_obj_params(ctx, params)

        diffexpr_filepath = params.get(self.PARAM_IN_DIFFEXP_FILEPATH)

        if not (os.path.isfile(diffexpr_filepath)):
            raise ValueError('File {} does not exist: '.format(diffexpr_filepath))

        return ws_name, ws_id, obj_name_id

    def _get_ws_info(self, obj_ref):
        ws = Workspace(self.ws_url)
        try:
            info = ws.get_object_info_new({'objects': [{'ref': obj_ref}]})[0]
        except WorkspaceError as wse:
            self.log('Logging workspace exception')
            self.log(str(wse))
            raise
        return info

    def _get_diffexpr_data(self, expressionset_ref):
        """
        Get data from the expressionset object required to create the
        differential expression object
        """
        expression_set = self.ws_client.get_objects2(
            {'objects': [{'ref': expressionset_ref}]})['data'][0]

        if not expression_set.get('info')[2].startswith(
                'KBaseRNASeq.RNASeqExpressionSet'):
            raise TypeError('"{}" should be of type KBaseRNASeq.RNASeqExpressionSet'
                            .format(self.PARAM_IN_EXPR_SET_REF))

        expression_set_data = expression_set['data']

        diffexpr_data = {}
        diffexpr_data['expressionSet_id'] = expressionset_ref
        diffexpr_data['alignmentSet_id'] = expression_set_data.get('alignmentSet_id')
        diffexpr_data['sampleset_id'] = expression_set_data.get('sampleset_id')
        diffexpr_data['genome_id'] = expression_set_data.get('genome_id')

        condition = []
        mapped_expr_ids = expression_set_data.get('mapped_expression_ids')

        for i in mapped_expr_ids:
            for alignment_id, expression_id in i.items():
                expression_data = self.ws_client.get_objects2(
                    {'objects': [{'ref': expression_id}]})['data'][0]['data']
                expression_condition = expression_data.get('condition')
                if expression_condition not in condition:
                    condition.append(expression_condition)

        diffexpr_data.update({'condition': condition})
        return diffexpr_data
    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.config = config
        self.scratch = config['scratch']
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.ws_url = config['workspace-url']
        self.ws_client = Workspace(self.ws_url)
        self.dfu = DataFileUtil(self.callback_url)
        self.demu = GenDiffExprMatrix(config)
        #END_CONSTRUCTOR
        pass

    def upload_differentialExpression(self, ctx, params):
        """
        Uploads the differential expression  *
        :param params: instance of type "UploadDifferentialExpressionParams"
           (* Required input parameters for uploading Differential expression data
           string destination_ref - object reference of Differential expression data.
           The object ref is 'ws_name_or_id/obj_name_or_id' where ws_name_or_id is
           the workspace name or id and obj_name_or_id is the object name or id
           string source_dir - directory with the files to be uploaded
           string expressionset_ref - expressionset object reference
           string tool_used - cufflinks, ballgown or deseq
           string tool_version - version of the tool used
           string diffexpr_filename - name of the differential expression data file
           in source_dir, created by cuffdiff, deseq or ballgown *) -> structure:
           parameter "destination_ref" of String, parameter "source_dir" of String,
           parameter "expressionset_ref" of String, parameter "tool_used" of String,
           parameter "tool_version" of String, parameter "diffexpr_filename" of
           String, parameter "tool_opts" of mapping from String to String,
           parameter "comments" of String
        :returns: instance of type "UploadDifferentialExpressionOutput" (*
           Output from upload differential expression *) -> structure:
           parameter "obj_ref" of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN upload_differentialExpression

        self.log('Starting upload differential expression, parsing parameters ')
        pprint(params)

        ws_name, ws_id, obj_name_id = self._proc_upload_diffexpr_params(ctx, params)

        # add more to params to pass on to create diff expr matrix
        params['ws_name'] = ws_name
        params['ws_id'] = ws_id
        params['obj_name'] = obj_name_id

        demset_ref = self.demu.gen_diffexpr_matrices(params)
        self.log('Differential Expression Matrix set ref: ')
        pprint(demset_ref)

        returnVal = {'diffExprMatrixSet_ref': demset_ref}

        print('Uploaded object: ')
        print(returnVal)
        #END upload_differentialExpression

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method upload_differentialExpression return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def download_differentialExpression(self, ctx, params):
        """
        Downloads expression *
        :param params: instance of type "DownloadDifferentialExpressionParams" (*
           Required input parameters for downloading Differential expression
           string source_ref - object reference of expression source. The
           object ref is 'ws_name_or_id/obj_name_or_id' where ws_name_or_id
           is the workspace name or id and obj_name_or_id is the object name
           or id *) -> structure: parameter "source_ref" of String
        :returns: instance of type "DownloadDifferentialExpressionOutput" (*
           The output of the download method. *) -> structure: parameter
           "destination_dir" of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN download_differentialExpression

        self.log('Running download_differentialExpression with params:\n' +
                 pformat(params))

        inref = params.get(self.PARAM_IN_SRC_REF)
        if not inref:
            raise ValueError('{} parameter is required'.format(self.PARAM_IN_SRC_REF))

        try:
            expression = self.dfu.get_objects({'object_refs': [inref]})['data']
        except DFUError as e:
            self.log('Logging stacktrace from workspace exception:\n' + e.data)
            raise

        # set the output dir
        timestamp = int((datetime.utcnow() -
                         datetime.utcfromtimestamp(0)).total_seconds() * 1000)
        output_dir = os.path.join(self.scratch, 'download_' + str(timestamp))
        os.mkdir(output_dir)

        file_ret = self.dfu.shock_to_file({
            'shock_id': expression[0]['data']['file']['id'],
            'file_path': output_dir,
            'unpack': 'unpack'
        })

        if not os.listdir(output_dir):
            raise ValueError('No files were downloaded: ' + output_dir)

        for f in glob.glob(output_dir + '/*.zip'):
            os.remove(f)

        returnVal = {'destination_dir': output_dir}

        #END download_differentialExpression

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method download_differentialExpression return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def export_differentialExpression(self, ctx, params):
        """
        Wrapper function for use by in-narrative downloaders to download
        expressions from shock *
        :param params: instance of type "ExportParams" (* Required input
           parameters for exporting expression string source_ref - object
           reference of Differential expression. The object ref is
           'ws_name_or_id/obj_name_or_id' where ws_name_or_id is the
           workspace name or id and obj_name_or_id is the object name or id
           *) -> structure: parameter "source_ref" of String
        :returns: instance of type "ExportOutput" -> structure: parameter
           "shock_id" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN export_differentialExpression

        inref = params.get(self.PARAM_IN_SRC_REF)
        if not inref:
            raise ValueError(self.PARAM_IN_SRC_REF + ' parameter is required')

        try:
            expression = self.dfu.get_objects({'object_refs': [inref]})['data']
        except DFUError as e:
            self.log('Logging stacktrace from workspace exception:\n' + e.data)
            raise

        output = {'shock_id': expression[0]['data']['file']['id']}

        #END export_differentialExpression

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method export_differentialExpression return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def status(self, ctx):
        #BEGIN_STATUS
        returnVal = {'state': "OK",
                     'message': "",
                     'version': self.VERSION,
                     'git_url': self.GIT_URL,
                     'git_commit_hash': self.GIT_COMMIT_HASH}
        #END_STATUS
        return [returnVal]
class FastaToAssembly:

    def __init__(self, callback_url, scratch):
        self.scratch = scratch
        self.dfu = DataFileUtil(callback_url)

        # Note added X due to kb|g.1886.fasta
        self.valid_chars = "-ACGTUWSMKRYBDHVNX"
        self.amino_acid_specific_characters = "PLIFQE"

    def import_fasta(self, ctx, params):
        print('validating parameters')
        self.validate_params(params)

        print('staging input files')
        fasta_file_path = self.stage_input(params)

        if 'min_contig_length' in params:
            min_contig_length = int(params['min_contig_length'])
            print('filtering fasta file by contig length (min len=' +
                  str(min_contig_length) + 'bp)')
            fasta_file_path = self.filter_contigs_by_length(fasta_file_path,
                                                            min_contig_length)

        print('parsing FASTA file: ' + str(fasta_file_path))
        assembly_data = self.parse_fasta(fasta_file_path, params)
        print(' - parsed ' + str(assembly_data['num_contigs']) + ' contigs, ' +
              str(assembly_data['dna_size']) + 'bp')

        print('saving assembly to KBase')

        # save file to shock and build handle
        fasta_file_handle_info = self.save_fasta_file_to_shock(fasta_file_path)
        # construct the output object
        assembly_object_to_save = self.build_assembly_object(assembly_data,
                                                             fasta_file_handle_info,
                                                             params)

        # save to WS and return
        if 'workspace_id' in params:
            workspace_id = int(params['workspace_id'])
        else:
            workspace_id = self.dfu.ws_name_to_id(params['workspace_name'])
        assembly_info = self.save_assembly_object(workspace_id,
                                                  params['assembly_name'],
                                                  assembly_object_to_save)

        return assembly_info

    def build_assembly_object(self, assembly_data, fasta_file_handle_info, params):
        ''' Construct the WS object data to save based on the parsed info and params. '''
        assembly_data['assembly_id'] = params['assembly_name']
        assembly_data['fasta_handle_ref'] = fasta_file_handle_info['handle']['hid']
        assembly_data['fasta_handle_info'] = fasta_file_handle_info

        assembly_data['type'] = 'Unknown'
        if 'type' in params:
            assembly_data['type'] = params['type']

        if 'taxon_ref' in params:
            assembly_data['taxon_ref'] = params['taxon_ref']

        if 'external_source' in params:
            assembly_data['external_source'] = params['external_source']

        if 'external_source_id' in params:
            assembly_data['external_source_id'] = params['external_source_id']

        if 'external_source_origination_date' in params:
            assembly_data['external_source_origination_date'] = \
                params['external_source_origination_date']

        return assembly_data

    def parse_fasta(self, fasta_file_path, params):
        ''' Do the actual work of inspecting each contig. '''

        # variables to store running counts of things
        total_length = 0
        base_counts = {'A': 0, 'G': 0, 'C': 0, 'T': 0}
        md5_list = []

        # map from contig_id to contig_info
        all_contig_data = {}
        extra_contig_info = {}
        if 'contig_info' in params:
            extra_contig_info = params['contig_info']

        for record in SeqIO.parse(fasta_file_path, "fasta"):
            # SeqRecord(seq=Seq('TTAT...', SingleLetterAlphabet()),
            #           id='gi|113968346|ref|NC_008321.1|',
            #           name='gi|113968346|ref|NC_008321.1|',
            #           description='gi|113968346|ref|NC_008321.1| Shewanella sp. MR-4 chromosome, complete genome',
            #           dbxrefs=[])

            sequence = str(record.seq).upper()

            contig_info = {
                'contig_id': record.id,
                'name': record.id,
                'description': record.description[len(record.id):].strip(),
                'length': len(record.seq)
            }

            # 1) compute sequence character statistics running total
            total_length += contig_info['length']
            sequence_count_table = dict(Counter(sequence))
            for character in sequence_count_table:
                if character in base_counts:
                    base_counts[character] = base_counts[character] + \
                        sequence_count_table[character]
                else:
                    base_counts[character] = sequence_count_table[character]
                if character not in self.valid_chars:
                    if character in self.amino_acid_specific_characters:
                        raise ValueError('This fasta file may have amino acids in it '
                                         'instead of the required nucleotides.')
                    raise ValueError("This FASTA file has non nucleic acid characters: "
                                     "{0}".format(character))

            # 2) record number of 'N' characters (only set if there are some)
            Ncount = 0
            if 'N' in sequence_count_table:
                Ncount = sequence_count_table['N']
            contig_info['Ncount'] = Ncount

            # 2b) record if the contig is circular
            if record.id in extra_contig_info:
                if 'is_circ' in extra_contig_info[record.id]:
                    contig_info['is_circ'] = int(extra_contig_info[record.id]['is_circ'])
                if 'description' in extra_contig_info[record.id]:
                    contig_info['description'] = \
                        str(extra_contig_info[record.id]['description'])

            # 3) record md5 checksum
            contig_md5 = md5(sequence).hexdigest()
            contig_info['md5'] = contig_md5
            md5_list.append(contig_md5)

            # 4) record the all important GC to ~3 significant digits
            GC_count = 0
            for base in ['G', 'C']:
                if base in sequence_count_table:
                    GC_count += sequence_count_table[base]
            contig_info['gc_content'] = round(float(GC_count) /
                                              float(contig_info['length']), 5)

            # 5) add to contig list
            if contig_info['contig_id'] in all_contig_data:
                raise ValueError('The fasta header key ' + contig_info['contig_id'] +
                                 ' appears more than once in the file')
            all_contig_data[contig_info['contig_id']] = contig_info

        # aggregate stats for the data
        total_gc_content = None
        if total_length > 0:
            total_gc_content = round(float(base_counts['G'] + base_counts['C']) /
                                     float(total_length), 5)
        assembly_data = {
            'md5': md5(",".join(sorted(md5_list))).hexdigest(),
            'base_counts': base_counts,
            'dna_size': total_length,
            'gc_content': total_gc_content,
            'contigs': all_contig_data,
            'num_contigs': len(all_contig_data)
        }
        return assembly_data

    def fasta_filter_contigs_generator(self, fasta_record_iter, min_contig_length):
        ''' Generates a SeqRecord iterator of only the contigs that pass the
            length filter. '''
        rows = 0
        rows_added = 0
        for record in fasta_record_iter:
            rows += 1
            if len(record.seq) >= min_contig_length:
                rows_added += 1
                yield record
        print(' - filtered out ' + str(rows - rows_added) + ' of ' + str(rows) +
              ' contigs that were shorter than ' + str(min_contig_length) + 'bp.')

    def filter_contigs_by_length(self, fasta_file_path, min_contig_length):
        ''' Removes all contigs shorter than the min_contig_length provided. '''
        filtered_fasta_file_path = fasta_file_path + '.filtered.fa'

        fasta_record_iter = SeqIO.parse(fasta_file_path, 'fasta')
        SeqIO.write(self.fasta_filter_contigs_generator(fasta_record_iter,
                                                        min_contig_length),
                    filtered_fasta_file_path, 'fasta')

        return filtered_fasta_file_path

    def save_assembly_object(self, workspace_id, assembly_name, obj_data):
        print('Saving Assembly to Workspace')
        sys.stdout.flush()
        obj_info = self.dfu.save_objects(
            {'id': workspace_id,
             'objects': [{'type': 'KBaseGenomeAnnotations.Assembly',
                          'data': obj_data,
                          'name': assembly_name}]
             })[0]
        return obj_info

    def save_fasta_file_to_shock(self, fasta_file_path):
        ''' Given the path to the file, upload to shock and return Handle information.
            returns:
                typedef structure {
                    string shock_id;
                    Handle handle;
                    string node_file_name;
                    string size;
                } FileToShockOutput;
        '''
        print('Uploading fasta file (' + str(fasta_file_path) + ') to SHOCK')
        sys.stdout.flush()
        return self.dfu.file_to_shock({'file_path': fasta_file_path, 'make_handle': 1})

    def stage_input(self, params):
        ''' Setup the input_directory by fetching the files and returning the
            path to the file. '''
        file_path = None
        if 'file' in params:
            file_path = os.path.abspath(params['file']['path'])
        elif 'shock_id' in params:
            print('Downloading file from SHOCK node: ' + str(params['shock_id']))
            sys.stdout.flush()
            input_directory = os.path.join(self.scratch,
                                           'assembly-upload-staging-' + str(uuid.uuid4()))
            os.makedirs(input_directory)
            file_name = self.dfu.shock_to_file({'file_path': input_directory,
                                                'shock_id': params['shock_id']
                                                })['node_file_name']
            file_path = os.path.join(input_directory, file_name)
        elif 'ftp_url' in params:
            print('Downloading file from: ' + str(params['ftp_url']))
            sys.stdout.flush()
            file_path = self.dfu.download_web_file({'file_url': params['ftp_url'],
                                                    'download_type': 'FTP'
                                                    })['copy_file_path']

        # extract the file if it is compressed
        if file_path is not None:
            unpacked_file = self.dfu.unpack_file({'file_path': file_path})
            return unpacked_file['file_path']

        raise ValueError('No valid fasta could be extracted based on the input parameters')

    def validate_params(self, params):
        for key in ('workspace_name', 'assembly_name'):
            if key not in params:
                raise ValueError('required "' + key + '" field was not defined')

        # one and only one of 'file', 'shock_id', or 'ftp_url' is required
        input_count = 0
        for key in ('file', 'shock_id', 'ftp_url'):
            if key in params and params[key] is not None:
                input_count = input_count + 1
                if key == 'file':
                    if not isinstance(params[key], dict) or 'path' not in params[key]:
                        raise ValueError('when specifying a fasta file input, "path" '
                                         'field was not defined in "file"')

        if input_count == 0:
            raise ValueError('required fasta file as input, set as either "file", '
                             '"shock_id", or "ftp_url"')
        if input_count > 1:
            raise ValueError('required exactly one fasta file as input source, you set '
                             'more than one of these fields: "file", "shock_id", or '
                             '"ftp_url"')
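# Usage sketch (illustrative, not from the original module): importing a local
# fasta file as an Assembly inside a KBase SDK job. The workspace name and file
# path are hypothetical placeholders; SDK_CALLBACK_URL is assumed to be set,
# and ctx is unused by import_fasta so None suffices here.
if __name__ == '__main__':
    fta = FastaToAssembly(os.environ['SDK_CALLBACK_URL'], '/kb/module/work/tmp')
    info = fta.import_fasta(None, {
        'workspace_name': 'my_workspace',                     # hypothetical workspace
        'assembly_name': 'my_assembly',
        'file': {'path': '/kb/module/work/tmp/contigs.fa'},   # hypothetical file
        'min_contig_length': 500
    })
    print(info)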
class CufflinksUtils:
    CUFFLINKS_TOOLKIT_PATH = '/opt/cufflinks/'
    GFFREAD_TOOLKIT_PATH = '/opt/cufflinks/'

    def __init__(self, config):
        """
        :param config:
        :param logger:
        :param directory: Working directory
        :param urls: Service urls
        """
        # BEGIN_CONSTRUCTOR
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.srv_wiz_url = config['srv-wiz-url']
        self.token = config['KB_AUTH_TOKEN']
        self.shock_url = config['shock-url']
        self.dfu = DataFileUtil(self.callback_url)
        self.gfu = GenomeFileUtil(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)
        self.rau = ReadsAlignmentUtils(self.callback_url)
        self.set_api = SetAPI(self.srv_wiz_url, service_ver='dev')
        self.eu = ExpressionUtils(self.callback_url)
        self.ws = Workspace(self.ws_url, token=self.token)

        self.scratch = os.path.join(config['scratch'], str(uuid.uuid4()))
        self._mkdir_p(self.scratch)

        self.tool_used = "Cufflinks"
        self.tool_version = os.environ['VERSION']
        # END_CONSTRUCTOR
        pass

    def parse_FPKMtracking_calc_TPM(self, filename):
        """
        Generates TPM from FPKM
        :return:
        """
        fpkm_dict = {}
        tpm_dict = {}
        gene_col = 0
        fpkm_col = 9
        sum_fpkm = 0.0
        with open(filename) as f:
            next(f)
            for line in f:
                larr = line.split("\t")
                gene_id = larr[gene_col]
                if gene_id != "":
                    fpkm = float(larr[fpkm_col])
                    sum_fpkm = sum_fpkm + fpkm
                    fpkm_dict[gene_id] = math.log(fpkm + 1, 2)
                    tpm_dict[gene_id] = fpkm

        if sum_fpkm == 0.0:
            log("Warning: Unable to calculate TPM values as sum of FPKM values is 0")
        else:
            for g in tpm_dict:
                tpm_dict[g] = math.log((tpm_dict[g] / sum_fpkm) * 1e6 + 1, 2)

        return fpkm_dict, tpm_dict

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _validate_run_cufflinks_params(self, params):
        """
        _validate_run_cufflinks_params: Raises an exception if params are invalid
        """
        log('Start validating run_cufflinks params')

        # check for required parameters
        for p in ['alignment_object_ref', 'workspace_name', 'genome_ref']:
            if p not in params:
                raise ValueError('"{}" parameter is required, but missing'.format(p))

    def _run_command(self, command):
        """
        _run_command: run command and print result
        """
        log('Start executing command:\n{}'.format(command))
        pipe = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
        output = pipe.communicate()[0]
        exitCode = pipe.returncode

        if exitCode == 0:
            log('Executed command:\n{}\n'.format(command) +
                'Exit Code: {}\nOutput:\n{}'.format(exitCode, output))
        else:
            error_msg = 'Error running command:\n{}\n'.format(command)
            error_msg += 'Exit Code: {}\nOutput:\n{}'.format(exitCode, output)
            raise ValueError(error_msg)

    def _run_gffread(self, gff_path, gtf_path):
        """
        _run_gffread: run the gffread script

        ref: http://cole-trapnell-lab.github.io/cufflinks/file_formats/#the-gffread-utility
        """
        log('converting gff to gtf')
        command = self.GFFREAD_TOOLKIT_PATH + '/gffread '
        command += "-E {0} -T -o {1}".format(gff_path, gtf_path)

        self._run_command(command)

    def _create_gtf_annotation_from_genome(self, genome_ref):
        """
        Create a reference annotation file from a genome
        """
        ref = self.ws.get_object_subset([{
            'ref': genome_ref,
            'included': ['contigset_ref', 'assembly_ref']
        }])
        contig_id = None
        if 'contigset_ref' in ref[0]['data']:
            contig_id = ref[0]['data']['contigset_ref']
        elif 'assembly_ref' in ref[0]['data']:
            contig_id = ref[0]['data']['assembly_ref']
        if contig_id is None:
            raise ValueError("Genome at {0} does not have reference to the assembly "
                             "object".format(genome_ref))
        print(contig_id)
        log("Generating GFF file from Genome")
        try:
            ret = self.au.get_assembly_as_fasta({'ref': contig_id})
            output_file = ret['path']
            mapping_filename = c_mapping.create_sanitized_contig_ids(output_file)
            os.remove(output_file)
            # get the GFF
            ret = self.gfu.genome_to_gff({'genome_ref': genome_ref})
            genome_gff_file = ret['file_path']
            c_mapping.replace_gff_contig_ids(genome_gff_file, mapping_filename,
                                             to_modified=True)
            gtf_ext = ".gtf"

            if not genome_gff_file.endswith(gtf_ext):
                gtf_path = os.path.splitext(genome_gff_file)[0] + '.gtf'
                self._run_gffread(genome_gff_file, gtf_path)
            else:
                gtf_path = genome_gff_file

            log("gtf file : " + gtf_path)
        except Exception:
            raise ValueError("Generating GTF file from Genome Annotation object "
                             "failed: {}".format("".join(traceback.format_exc())))
        return gtf_path

    def _get_gtf_file(self, alignment_ref):
        """
        _get_gtf_file: get the reference annotation file (in GTF or GFF3 format)
        """
        result_directory = self.scratch
        alignment_data = self.ws.get_objects2(
            {'objects': [{'ref': alignment_ref}]})['data'][0]['data']

        genome_ref = alignment_data.get('genome_id')
        # genome_name = self.ws.get_object_info([{"ref": genome_ref}],
        #                                       includeMetadata=None)[0][1]
        # ws_gtf = genome_name + "_GTF_Annotation"

        genome_data = self.ws.get_objects2(
            {'objects': [{'ref': genome_ref}]})['data'][0]['data']

        gff_handle_ref = genome_data.get('gff_handle_ref')

        if gff_handle_ref:
            log('getting reference annotation file from genome')
            annotation_file = self.dfu.shock_to_file({
                'handle_id': gff_handle_ref,
                'file_path': result_directory,
                'unpack': 'unpack'
            })['file_path']
        else:
            annotation_file = self._create_gtf_annotation_from_genome(genome_ref)

        return annotation_file

    def _get_gtf_file_from_genome_ref(self, genome_ref):
        """
        _get_gtf_file_from_genome_ref: get the reference annotation file
        (in GTF or GFF3 format)
        """
        result_directory = self.scratch
        genome_data = self.ws.get_objects2(
            {'objects': [{'ref': genome_ref}]})['data'][0]['data']

        gff_handle_ref = genome_data.get('gff_handle_ref')

        if gff_handle_ref:
            log('getting reference annotation file from genome')
            annotation_file = self.dfu.shock_to_file({
                'handle_id': gff_handle_ref,
                'file_path': result_directory,
                'unpack': 'unpack'
            })['file_path']
        else:
            annotation_file = self._create_gtf_annotation_from_genome(genome_ref)

        return annotation_file

    def _get_input_file(self, alignment_ref):
        """
        _get_input_file: get the input BAM file from an Alignment object
        """
        bam_file_dir = self.rau.download_alignment(
            {'source_ref': alignment_ref})['destination_dir']

        files = os.listdir(bam_file_dir)
        bam_file_list = [file for file in files if re.match(r'.*\_sorted\.bam', file)]
        if not bam_file_list:
            bam_file_list = [file for file in files
                             if re.match(r'.*(?<!sorted)\.bam', file)]

        if not bam_file_list:
            raise ValueError('Cannot find .bam file from alignment {}'.format(
                alignment_ref))

        bam_file_name = bam_file_list[0]
        bam_file = os.path.join(bam_file_dir, bam_file_name)

        return bam_file

    def _generate_command(self, params):
        """
        _generate_command: generate the cufflinks command
        """
        cufflinks_command = '/opt/cufflinks/cufflinks'
        cufflinks_command += (' -q --no-update-check -p ' +
                              str(params.get('num_threads', 1)))
        if 'max_intron_length' in params and params['max_intron_length'] is not None:
            cufflinks_command += (' --max-intron-length ' +
                                  str(params['max_intron_length']))
        if 'min_intron_length' in params and params['min_intron_length'] is not None:
            cufflinks_command += (' --min-intron-length ' +
                                  str(params['min_intron_length']))
        if 'overhang_tolerance' in params and params['overhang_tolerance'] is not None:
            cufflinks_command += (' --overhang-tolerance ' +
                                  str(params['overhang_tolerance']))

        cufflinks_command += " -o {0} -G {1} {2}".format(params['result_directory'],
                                                         params['gtf_file'],
                                                         params['input_file'])

        log('Generated cufflinks command: {}'.format(cufflinks_command))

        return cufflinks_command

    def _process_rnaseq_alignment_object(self, params):
        """
        _process_rnaseq_alignment_object: process a KBaseRNASeq.RNASeqAlignment
        type input object
        """
        log('start processing RNASeqAlignment object\nparams:\n{}'.format(
            json.dumps(params, indent=1)))
        alignment_ref = params.get('alignment_ref')

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)
        params['result_directory'] = str(result_directory)

        # input files
        params['input_file'] = self._get_input_file(alignment_ref)
        if not params.get('gtf_file'):
            params['gtf_file'] = self._get_gtf_file(alignment_ref)

        if '/' not in params['genome_ref']:
            params['genome_ref'] = params['workspace_name'] + '/' + params['genome_ref']

        command = self._generate_command(params)
        self._run_command(command)

        expression_obj_ref = self._save_rnaseq_expression(result_directory,
                                                          alignment_ref,
                                                          params.get('workspace_name'),
                                                          params.get('genome_ref'),
                                                          params['gtf_file'],
                                                          params['expression_suffix'])

        returnVal = {
            'result_directory': result_directory,
            'expression_obj_ref': expression_obj_ref,
            'alignment_ref': alignment_ref
        }

        expression_name = self.ws.get_object_info([{"ref": expression_obj_ref}],
                                                  includeMetadata=None)[0][1]

        widget_params = {
            "output": expression_name,
            "workspace": params.get('workspace_name')
        }
        returnVal.update(widget_params)

        return returnVal

    def _process_kbasesets_alignment_object(self, params):
        """
        _process_kbasesets_alignment_object: process a KBaseSets alignment
        type input object
        """
        log('start processing KBaseSets object\nparams:\n{}'.format(
            json.dumps(params, indent=1)))
        alignment_ref = params.get('alignment_ref')

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)
        params['result_directory'] = str(result_directory)

        # input files
        params['input_file'] = self._get_input_file(alignment_ref)
        if not params.get('gtf_file'):
            params['gtf_file'] = self._get_gtf_file(alignment_ref)

        command = self._generate_command(params)
        self._run_command(command)

        expression_obj_ref = self._save_kbasesets_expression(
            result_directory,
            alignment_ref,
            params.get('workspace_name'),
            params.get('genome_ref'),
            params.get('gtf_file'),
            params.get('expression_suffix'))

        returnVal = {
            'result_directory': result_directory,
            'expression_obj_ref': expression_obj_ref,
            'alignment_ref': alignment_ref
        }

        expression_name = self.ws.get_object_info([{"ref": expression_obj_ref}],
                                                  includeMetadata=None)[0][1]

        widget_params = {
            "output": expression_name,
            "workspace": params.get('workspace_name')
        }
        returnVal.update(widget_params)

        return returnVal

    def _generate_html_report(self, result_directory, obj_ref):
        """
        _generate_html_report: generate the html summary report
        """
        log('Start generating html report')
        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, 'report.html')

        expression_object = self.ws.get_objects2(
            {'objects': [{'ref': obj_ref}]})['data'][0]
        expression_object_type = expression_object.get('info')[2]

        Overview_Content = ''
        if re.match(r'KBaseRNASeq.RNASeqExpression-\d.\d', expression_object_type):
            Overview_Content += '<p>Generated Expression Object:</p><p>{}</p>'.format(
                expression_object.get('info')[1])
        elif re.match(r'KBaseRNASeq.RNASeqExpressionSet-\d.\d', expression_object_type):
            Overview_Content += '<p>Generated Expression Set Object:</p><p>{}</p>'.format(
                expression_object.get('info')[1])
            Overview_Content += '<br><p>Generated Expression Object:</p>'
            for expression_ref in expression_object['data']['sample_expression_ids']:
                expression_name = self.ws.get_object_info([{"ref": expression_ref}],
                                                          includeMetadata=None)[0][1]
                Overview_Content += '<p>{}</p>'.format(expression_name)
        elif re.match(r'KBaseSets.ExpressionSet-\d.\d', expression_object_type):
            pprint(expression_object)
            Overview_Content += '<p>Generated Expression Set Object:</p><p>{}</p>'.format(
                expression_object.get('info')[1])
            Overview_Content += '<br><p>Generated Expression Object:</p>'
            for expression_ref in expression_object['data']['items']:
                expression_name = self.ws.get_object_info(
                    [{"ref": expression_ref['ref']}], includeMetadata=None)[0][1]
                condition = expression_ref['label']
                Overview_Content += '<p>condition:{0}; expression_name: {1}</p>'.format(
                    condition, expression_name)
        with open(result_file_path, 'w') as result_file:
            with open(os.path.join(os.path.dirname(__file__), 'report_template.html'),
                      'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace('<p>Overview_Content</p>',
                                                          Overview_Content)
                result_file.write(report_template)

        html_report.append({
            'path': result_file_path,
            'name': os.path.basename(result_file_path),
            'label': os.path.basename(result_file_path),
            'description': 'HTML summary report for Cufflinks App'
        })
        return html_report

    def _save_rnaseq_expression(self, result_directory, alignment_ref, workspace_name,
                                genome_ref, gtf_file, expression_suffix):
        """
        _save_rnaseq_expression: save Expression object to workspace
        """
        log('start saving Expression object')
        alignment_object_name = self.ws.get_object_info([{"ref": alignment_ref}],
                                                        includeMetadata=None)[0][1]

        # set expression name
        if re.match('.*_[Aa]lignment$', alignment_object_name):
            expression_name = re.sub('_[Aa]lignment$', expression_suffix,
                                     alignment_object_name)
        else:  # assume the user specified the suffix
            expression_name = alignment_object_name + expression_suffix

        expression_ref = self.eu.upload_expression({
            'destination_ref': workspace_name + '/' + expression_name,
            'source_dir': result_directory,
            'alignment_ref': alignment_ref,
            'tool_used': self.tool_used,
            'tool_version': self.tool_version
        })['obj_ref']

        return expression_ref

    def _save_kbasesets_expression(self, result_directory, alignment_ref,
                                   workspace_name, genome_ref, gtf_file,
                                   expression_suffix):
        """
        _save_kbasesets_expression: save Expression object to workspace using
        ExpressionUtils and SetAPI
        """
        log('start saving Expression object')
        alignment_info = self.ws.get_object_info3(
            {'objects': [{"ref": alignment_ref}]})
        alignment_object_name = alignment_info['infos'][0][1]

        # set expression name
        if re.match('.*_[Aa]lignment$', alignment_object_name):
            expression_name = re.sub('_[Aa]lignment$', expression_suffix,
                                     alignment_object_name)
        else:  # assume the user specified the suffix
            expression_name = alignment_object_name + expression_suffix

        expression_ref = self.eu.upload_expression({
            'destination_ref': workspace_name + '/' + expression_name,
            'source_dir': result_directory,
            'alignment_ref': alignment_ref,
            'tool_used': self.tool_used,
            'tool_version': self.tool_version
        })['obj_ref']

        return expression_ref

    def _save_rnaseq_expression_set(self, alignment_expression_map, alignment_set_ref,
                                    workspace_name, expression_set_name):
        """
        _save_rnaseq_expression_set: save ExpressionSet object to workspace
        """
        log('start saving ExpressionSet object')
        if isinstance(workspace_name, int) or workspace_name.isdigit():
            workspace_id = workspace_name
        else:
            workspace_id = self.dfu.ws_name_to_id(workspace_name)

        expression_set_data = self._generate_expression_set_data(
            alignment_expression_map, alignment_set_ref, expression_set_name)

        object_type = 'KBaseRNASeq.RNASeqExpressionSet'
        save_object_params = {
            'id': workspace_id,
            'objects': [{
                'type': object_type,
                'data': expression_set_data,
                'name': expression_set_name
            }]
        }

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        expression_set_ref = str(dfu_oi[6]) + '/' + str(dfu_oi[0]) + '/' + str(dfu_oi[4])

        return expression_set_ref

    def _save_kbasesets_expression_set(self, alignment_expression_map,
                                       alignment_set_ref, workspace_name,
                                       expression_set_name):
        """
        _save_kbasesets_expression_set: save ExpressionSet object to workspace
        """
        log('start saving ExpressionSet object')
        if isinstance(workspace_name, int) or workspace_name.isdigit():
            workspace_id = workspace_name
        else:
            workspace_id = self.dfu.ws_name_to_id(workspace_name)

        expression_set_data = self._generate_expression_set_data(
            alignment_expression_map, alignment_set_ref, expression_set_name)

        object_type = 'KBaseRNASeq.RNASeqExpressionSet'
        save_object_params = {
            'id': workspace_id,
            'objects': [{
                'type': object_type,
                'data': expression_set_data,
                'name': expression_set_name
            }]
        }

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        expression_set_ref = str(dfu_oi[6]) + '/' + str(dfu_oi[0]) + '/' + str(dfu_oi[4])

        return expression_set_ref

    def _generate_report(self, obj_ref, workspace_name, result_directory,
                         exprMatrix_FPKM_ref=None, exprMatrix_TPM_ref=None):
        """
        _generate_report: generate the summary report
        """
        log('creating report')

        output_files = self._generate_output_file_list(result_directory)
        output_html_files = self._generate_html_report(result_directory, obj_ref)

        expression_object = self.ws.get_objects2(
            {'objects': [{'ref': obj_ref}]})['data'][0]
        expression_info = expression_object['info']
        expression_data = expression_object['data']

        expression_object_type = expression_info[2]
        if re.match(r'KBaseRNASeq.RNASeqExpression-\d+.\d+', expression_object_type):
            objects_created = [{
                'ref': obj_ref,
                'description': 'Expression generated by Cufflinks'
            }]
        elif re.match(r'KBaseRNASeq.RNASeqExpressionSet-\d+.\d+',
                      expression_object_type):
            objects_created = [{
                'ref': obj_ref,
                'description': 'Expression generated by Cufflinks'
            }]
        elif re.match(r'KBaseSets.ExpressionSet-\d+.\d+', expression_object_type):
            objects_created = [{
                'ref': obj_ref,
                'description': 'ExpressionSet generated by Cufflinks'
            }]
            items = expression_data['items']
            for item in items:
                objects_created.append({
                    'ref': item['ref'],
                    'description': 'Expression generated by Cufflinks'
                })
            objects_created.append({
                'ref': exprMatrix_FPKM_ref,
                'description': 'FPKM ExpressionMatrix generated by Cufflinks'
            })
            objects_created.append({
                'ref': exprMatrix_TPM_ref,
                'description': 'TPM ExpressionMatrix generated by Cufflinks'
            })

        report_params = {
            'message': '',
            'workspace_name': workspace_name,
            'file_links': output_files,
            'objects_created': objects_created,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 366,
            'report_object_name': 'kb_cufflinks_report_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'], 'report_ref': output['ref']}

        return report_output
output['ref']
        }
        return report_output

    def _parse_FPKMtracking(self, filename, metric):
        """
        _parse_FPKMtracking: parse a Cufflinks *_tracking file and return a
        {gene_id: log2(value + 1)} mapping for the requested metric
        """
        result = {}
        pos1 = 0
        if metric == 'FPKM':
            pos2 = 7
        elif metric == 'TPM':
            pos2 = 8
        else:
            raise ValueError('Unsupported metric: {}'.format(metric))
        with open(filename) as f:
            next(f)
            for line in f:
                larr = line.split("\t")
                if larr[pos1] != "":
                    try:
                        result[larr[pos1]] = math.log(float(larr[pos2]) + 1, 2)
                    except ValueError:
                        # non-numeric value (e.g. 'NA'): record log2(1) == 0
                        result[larr[pos1]] = math.log(1, 2)
        return result

    def _generate_output_file_list(self, result_directory):
        """
        _generate_output_file_list: zip result files and generate file_links
        for report
        """
        log('Start packing result files')
        output_files = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file = os.path.join(output_directory, 'cufflinks_result.zip')

        with zipfile.ZipFile(result_file, 'w', zipfile.ZIP_DEFLATED,
                             allowZip64=True) as zip_file:
            for root, dirs, files in os.walk(result_directory):
                for file in files:
                    if not (file.endswith('.DS_Store')):
                        zip_file.write(
                            os.path.join(root, file),
                            os.path.join(os.path.basename(root), file))

        output_files.append({
            'path': result_file,
            'name': os.path.basename(result_file),
            'label': os.path.basename(result_file),
            'description': 'File(s) generated by Cufflinks App'
        })

        return output_files

    def _generate_expression_data(self, result_directory, alignment_ref,
                                  gtf_file, workspace_name,
                                  expression_suffix):
        """
        _generate_expression_data: generate Expression object with cufflinks
        output files
        """
        alignment_data_object = self.ws.get_objects2(
            {'objects': [{'ref': alignment_ref}]})['data'][0]

        # set expression name
        alignment_object_name = alignment_data_object['info'][1]
        if re.match('.*_[Aa]lignment$', alignment_object_name):
            expression_name = re.sub('_[Aa]lignment$', expression_suffix,
                                     alignment_object_name)
        else:  # assume user specified suffix
            expression_name = alignment_object_name + expression_suffix

        expression_data = {
            'id': expression_name,
            'type': 'RNA-Seq',
            'numerical_interpretation': 'FPKM',
            'processing_comments': 'log2 Normalized',
            'tool_used': self.tool_used,
            'tool_version': self.tool_version
        }

        alignment_data = alignment_data_object['data']

        condition = alignment_data.get('condition')
        expression_data.update({'condition': condition})

        genome_id = alignment_data.get('genome_id')
        expression_data.update({'genome_id': genome_id})

        read_sample_id = alignment_data.get('read_sample_id')
        expression_data.update(
            {'mapped_rnaseq_alignment': {read_sample_id: alignment_ref}})

        exp_dict, tpm_exp_dict = self.parse_FPKMtracking_calc_TPM(
            os.path.join(result_directory, 'genes.fpkm_tracking'))

        expression_data.update({'expression_levels': exp_dict})
        expression_data.update({'tpm_expression_levels': tpm_exp_dict})

        handle = self.dfu.file_to_shock({
            'file_path': result_directory,
            'pack': 'zip',
            'make_handle': True
        })['handle']
        expression_data.update({'file': handle})

        return expression_data

    def _generate_expression_set_data(self, alignment_expression_map,
                                      alignment_set_ref, expression_set_name):
        """
        _generate_expression_set_data: generate ExpressionSet object with
        cufflinks output files
        """
        alignment_set_data_object = self.ws.get_objects2(
            {'objects': [{'ref': alignment_set_ref}]})['data'][0]

        alignment_set_data = alignment_set_data_object['data']

        expression_set_data = {
            'tool_used': self.tool_used,
            'tool_version': self.tool_version,
            'id': expression_set_name,
            'alignmentSet_id': alignment_set_ref,
            'genome_id': alignment_set_data.get('genome_id'),
            'sampleset_id': alignment_set_data.get('sampleset_id')
        }

        sample_expression_ids = []
        mapped_expression_objects = []
        mapped_expression_ids
= [] for alignment_expression in alignment_expression_map: alignment_ref = alignment_expression.get('alignment_ref') expression_ref = alignment_expression.get('expression_obj_ref') sample_expression_ids.append(expression_ref) mapped_expression_ids.append({alignment_ref: expression_ref}) alignment_name = self.ws.get_object_info( [{ "ref": alignment_ref }], includeMetadata=None)[0][1] expression_name = self.ws.get_object_info( [{ "ref": expression_ref }], includeMetadata=None)[0][1] mapped_expression_objects.append({alignment_name: expression_name}) expression_set_data['sample_expression_ids'] = sample_expression_ids expression_set_data[ 'mapped_expression_objects'] = mapped_expression_objects expression_set_data['mapped_expression_ids'] = mapped_expression_ids return expression_set_data def _process_alignment_set_object(self, params, alignment_object_type): """ _process_alignment_set_object: process KBaseRNASeq.RNASeqAlignmentSet type input object and KBaseSets.ReadsAlignmentSet type object """ log('start processing KBaseRNASeq.RNASeqAlignmentSet object or KBaseSets.ReadsAlignmentSet object' '\nparams:\n{}'.format(json.dumps(params, indent=1))) alignment_set_ref = params.get('alignment_set_ref') if re.match('^KBaseRNASeq.RNASeqAlignmentSet-\d*', alignment_object_type): params['gtf_file'] = self._get_gtf_file(alignment_set_ref) else: if not '/' in params['genome_ref']: params['genome_ref'] = params['workspace_name'] + '/' + params[ 'genome_ref'] params['gtf_file'] = self._get_gtf_file_from_genome_ref( params['genome_ref']) alignment_set = self.set_api.get_reads_alignment_set_v1({ 'ref': alignment_set_ref, 'include_item_info': 0, 'include_set_item_ref_paths': 1 }) mul_processor_params = [] for alignment in alignment_set["data"]["items"]: alignment_ref = alignment['ref_path'] alignment_upload_params = params.copy() alignment_upload_params['alignment_ref'] = alignment_ref mul_processor_params.append(alignment_upload_params) # use the following when you want to run the cmd sequentially # self._process_kbasesets_alignment_object(mul_processor_params[0]) cpus = min(params.get('num_threads'), multiprocessing.cpu_count()) pool = Pool(ncpus=cpus) log('running _process_alignment_object with {} cpus'.format(cpus)) alignment_expression_map = pool.map( self._process_kbasesets_alignment_object, mul_processor_params) result_directory = os.path.join(self.scratch, str(uuid.uuid4())) self._mkdir_p(result_directory) expression_items = list() for proc_alignment_return in alignment_expression_map: expression_obj_ref = proc_alignment_return.get( 'expression_obj_ref') alignment_ref = proc_alignment_return.get('alignment_ref') alignment_info = self.ws.get_object_info3({ 'objects': [{ "ref": alignment_ref }], 'includeMetadata': 1 }) condition = alignment_info['infos'][0][10]['condition'] expression_items.append({ "ref": expression_obj_ref, "label": condition, }) expression_name = self.ws.get_object_info( [{ "ref": expression_obj_ref }], includeMetadata=None)[0][1] self._run_command('cp -R {} {}'.format( proc_alignment_return.get('result_directory'), os.path.join(result_directory, expression_name))) expression_set = { "description": "generated by kb_cufflinks", "items": expression_items } expression_set_info = self.set_api.save_expression_set_v1({ "workspace": params['workspace_name'], "output_object_name": params['expression_set_name'], "data": expression_set }) returnVal = { 'result_directory': result_directory, 'expression_obj_ref': expression_set_info['set_ref'] } widget_params = { "output": 
params.get('expression_set_name'), "workspace": params.get('workspace_name') } returnVal.update(widget_params) return returnVal def _generate_output_object_name(self, params, alignment_object_type, alignment_object_name): """ Generates the output object name based on input object type and name and stores it in params with key equal to 'expression' or 'expression_set' based on whether the input object is an alignment or alignment_set. :param params: module input params :param alignment_object_type: input alignment object type :param alignment_object_name: input alignment object name :param alignment_object_data: input alignment object data """ expression_set_suffix = params['expression_set_suffix'] expression_suffix = params['expression_suffix'] if re.match('^KBaseRNASeq.RNASeqAlignment-\d*', alignment_object_type): if re.match('.*_[Aa]lignment$', alignment_object_name): params['expression_name'] = re.sub('_[Aa]lignment$', expression_suffix, alignment_object_name) else: # assume user specified suffix params[ 'expression_name'] = alignment_object_name + expression_suffix if re.match('^KBaseRNASeq.RNASeqAlignmentSet-\d*', alignment_object_type): if re.match('.*_[Aa]lignment_[Ss]et$', alignment_object_name): # set expression set name params['expression_set_name'] = re.sub('_[Aa]lignment_[Ss]et$', expression_set_suffix, alignment_object_name) else: # assume user specified suffix params[ 'expression_set_name'] = alignment_object_name + expression_set_suffix if re.match('^KBaseSets.ReadsAlignmentSet-\d*', alignment_object_type): if re.match('.*_[Aa]lignment_[Ss]et$', alignment_object_name): # set expression set name params['expression_set_name'] = re.sub('_[Aa]lignment_[Ss]et$', expression_set_suffix, alignment_object_name) else: # assume user specified suffix params[ 'expression_set_name'] = alignment_object_name + expression_set_suffix def _save_expression_matrix(self, expressionset_ref, workspace_name): """ _save_expression_matrix: save FPKM and TPM ExpressionMatrix """ log('start saving ExpressionMatrix object') expression_set_name = self.ws.get_object_info( [{ "ref": expressionset_ref }], includeMetadata=None)[0][1] output_obj_name_prefix = re.sub('_*[Ee]xpression_*[Ss]et', '', expression_set_name) upload_expression_matrix_params = { 'expressionset_ref': expressionset_ref, 'output_obj_name': output_obj_name_prefix, 'workspace_name': workspace_name } expression_matrix_refs = self.eu.get_expressionMatrix( upload_expression_matrix_params) return expression_matrix_refs def run_cufflinks_app(self, params): log('--->\nrunning CufflinksUtil.run_cufflinks_app\n' + 'params:\n{}'.format(json.dumps(params, indent=1))) self._validate_run_cufflinks_params(params) alignment_object_ref = params.get('alignment_object_ref') alignment_object_info = self.ws.get_object_info3( {"objects": [{ "ref": alignment_object_ref }]})['infos'][0] alignment_object_type = alignment_object_info[2] alignment_object_name = alignment_object_info[1] # get output object name self._generate_output_object_name(params, alignment_object_type, alignment_object_name) log('--->\nalignment object type: \n' + '{}'.format(alignment_object_type)) if re.match('^KBaseRNASeq.RNASeqAlignment-\d*', alignment_object_type): params.update({'alignment_ref': alignment_object_ref}) returnVal = self._process_rnaseq_alignment_object(params) report_output = self._generate_report( returnVal.get('expression_obj_ref'), params.get('workspace_name'), returnVal.get('result_directory')) returnVal.update(report_output) elif 
re.match('^KBaseRNASeq.RNASeqAlignmentSet-\d*', alignment_object_type) or \
                re.match('^KBaseSets.ReadsAlignmentSet-\d*', alignment_object_type):
            params.update({'alignment_set_ref': alignment_object_ref})
            returnVal = self._process_alignment_set_object(
                params, alignment_object_type)
            expression_matrix_refs = self._save_expression_matrix(
                returnVal['expression_obj_ref'], params.get('workspace_name'))
            returnVal.update(expression_matrix_refs)
            report_output = self._generate_report(
                returnVal['expression_obj_ref'], params.get('workspace_name'),
                returnVal['result_directory'],
                expression_matrix_refs['exprMatrix_FPKM_ref'],
                expression_matrix_refs['exprMatrix_TPM_ref'])
            returnVal.update(report_output)
        else:
            raise ValueError(
                'Unsupported alignment object type\nObject info:\n{}'.format(
                    alignment_object_info))

        return returnVal
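# ---------------------------------------------------------------------------
# Illustration (not part of the original module): a minimal standalone sketch
# of the output-naming rule used by _generate_output_object_name and the
# _save_*_expression helpers above. Alignments named "*_alignment" (or
# "*_Alignment") have that suffix swapped for the expression suffix; any
# other name simply has the suffix appended. The sample names below are
# made up for demonstration.
import re


def expression_name_for(alignment_object_name, expression_suffix):
    # swap a trailing "_alignment"/"_Alignment" for the expression suffix
    if re.match('.*_[Aa]lignment$', alignment_object_name):
        return re.sub('_[Aa]lignment$', expression_suffix,
                      alignment_object_name)
    # otherwise assume the user specified a plain suffix to append
    return alignment_object_name + expression_suffix


if __name__ == '__main__':
    assert expression_name_for('sample1_alignment',
                               '_expression') == 'sample1_expression'
    assert expression_name_for('sample1',
                               '_expression') == 'sample1_expression'
# ---------------------------------------------------------------------------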
class kb_virsorterTest(unittest.TestCase): @classmethod def setUpClass(cls): token = environ.get('KB_AUTH_TOKEN', None) user_id = requests.post( 'https://kbase.us/services/authorization/Sessions/Login', data='token={}&fields=user_id'.format(token)).json()['user_id'] # WARNING: don't call any logging methods on the context object, # it'll result in a NoneType error cls.ctx = MethodContext(None) cls.ctx.update({'token': token, 'user_id': user_id, 'provenance': [ {'service': 'kb_virsorter', 'method': 'please_never_use_it_in_production', 'method_params': [] }], 'authenticated': 1}) config_file = environ.get('KB_DEPLOYMENT_CONFIG', None) cls.cfg = {} config = ConfigParser() config.read(config_file) for nameval in config.items('kb_virsorter'): cls.cfg[nameval[0]] = nameval[1] cls.wsURL = cls.cfg['workspace-url'] cls.wsClient = workspaceService(cls.wsURL, token=token) cls.serviceImpl = kb_virsorter(cls.cfg) cls.testobjref = [] #cls.testobjdata = [] cls.testwsname = [] @classmethod def tearDownClass(cls): if hasattr(cls, 'wsName'): cls.wsClient.delete_workspace({'workspace': cls.wsName}) print('Test workspace was deleted') if hasattr(cls, 'testwsname') and len(cls.testwsname) > 0: try: print('Deleting workspace 2 ' + cls.testwsname[0]) cls.wsClient.delete_workspace({'workspace': cls.testwsname[0]}) print('Test workspace 2 was deleted ' + cls.testwsname[0]) except Exception as e: print e #if hasattr(cls, 'testobjdata'): # try: # print('Deleting shock data ' + str(len(cls.testobjdata))) # print('Deleting shock data ' + str(len(cls.testobjdata[0]['data'][0]))) # print('Deleting shock data ' + str(cls.testobjdata[0])) # node = cls.testobjdata[0]['data'][0]['lib']['file']['id'] # cls.delete_shock_node(node) # print('Test shock data was deleted') # except Exception as e: # print e def getWsClient(self): return self.__class__.wsClient def getWsName(self): if hasattr(self.__class__, 'wsName'): return self.__class__.wsName suffix = int(time.time() * 1000) wsName = "test_kb_virsorter_" + str(suffix) ret = self.getWsClient().create_workspace({'workspace': wsName}) self.__class__.wsName = wsName return wsName def getImpl(self): return self.__class__.serviceImpl def getContext(self): return self.__class__.ctx def write_file(self, filename, content): tmp_dir = self.cfg['scratch'] file_path = os.path.join(tmp_dir, filename) with open(file_path, 'w') as fh1: fh1.write(content) return file_path def delete_shock_node(self, node_id): header = {'Authorization': 'Oauth {0}'.format(cls.token)} requests.delete(cls.shockURL + '/node/' + node_id, headers=header, allow_redirects=True) def ztest_aaa_upload_to_shock(self): print "upload ref data to shock staging" self.dfUtil = DataFileUtil(os.environ['SDK_CALLBACK_URL']) #file_path = self.write_file('Phage_gene_catalog.tar.gz', 'Test') input_file_name = 'Phage_gene_catalog_plus_viromes.tar.gz'#'Phage_gene_catalog.tar.gz'#''PFAM_27.tar.gz' source_file_path = "/kb/module/work/"+input_file_name# os.path.join(tmp_dir, input_file_name) tmp_dir = self.cfg['scratch'] target_file_path = os.path.join(tmp_dir, input_file_name) print "file_path " + source_file_path+"\t"+target_file_path orig_size = os.path.getsize(source_file_path) shutil.copy(source_file_path, target_file_path) print "Testing "+target_file_path print(os.path.isfile(target_file_path)) ret1 = self.dfUtil.file_to_shock( {'file_path': target_file_path}) print str(ret1) shock_id = ret1['shock_id'] print "shock_id "+shock_id file_path2 = os.path.join("/kb/module/work/", 'test.tar.gz') #ret2 = self.dfUtil.shock_to_file( # 
{'shock_id': shock_id, 'file_path': file_path2})[0] ret2 = self.dfUtil.shock_to_file( {'shock_id': shock_id, 'file_path': file_path2}) print(ret2) file_name = ret2['node_file_name'] attribs = ret2['attributes'] self.assertEqual(file_name, 'Phage_gene_catalog_plus_viromes.tar.gz') self.assertEqual(ret2['file_path'], file_path2) self.assertEqual(ret2['size'], orig_size) self.assertIsNone(attribs) #self.delete_shock_node(shock_id) def create_random_string(self): N = 20 return ''.join( random.SystemRandom().choice(string.ascii_uppercase + string.digits) for _ in range(N)) def test_virsorter_ok(self): self.upload_assembly() if not self.testwsname: self.testwsname.append(self.create_random_string()) print "upload_reads self.testwsname[0] " + self.testwsname[0] #try: # ret = self.wsClient.create_workspace({'workspace': self.testwsname[0]}) # test_ws_name #except Exception as e: # # print "ERROR" # # print(type(e)) # # print(e.args) # print(e) # pass print "self.testwsname "+ str(self.testwsname) params = {} params['assembly_ref'] = str(self.testobjref[0])#str(self.testwsname[0])+"/"+ #"16589/2/1"#""#'16589/2/1'#self.testobjref params['ws_name'] = self.testwsname[0] result = self.getImpl().run_virsorter(self.getContext(), params) print('RESULT run_virsorter:') pprint(result) #testresult = [ # {'blah': 'blah', 'bleh': 'bleh'}] testresult = [{'report_ref': result[0]['report_ref'], 'report_name': result[0]['report_name']}] self.assertEqual(sorted(result), sorted(testresult)) def upload_assembly(self): if not self.testobjref: print "upload_assembly start" indata = 'U00096.2.fa'#_first1000. ftarget = os.path.join(self.cfg['scratch'], indata)#self.scratch, indata) print "ftarget " + ftarget ret = shutil.copy('../test_data/' + indata, ftarget) #self.readsUtilClient = ReadsUtils(os.environ['SDK_CALLBACK_URL']) self.assemblyUtilClient = AssemblyUtil(os.environ['SDK_CALLBACK_URL']) if not self.testwsname: self.testwsname.append(self.create_random_string()) print "upload_assembly self.testwsname[0] " + self.testwsname[0] try: ret = self.wsClient.create_workspace({'workspace': self.testwsname[0]}) #test_ws_name except Exception as e: #print "ERROR" #print(type(e)) #print(e.args) print(e) pass try: print "attempt upload" print "ftarget " + ftarget ref = self.assemblyUtilClient.save_assembly_from_fasta( { 'workspace_name': self.testwsname[0], 'assembly_name': 'Ecolik12MG1655', 'file': {'path': ftarget}}) print "upload_assembly" print ref #self.testobjref = [] self.testobjref.append(self.testwsname[0] + '/Ecolik12MG1655/1') #self.testobjdata = [] #self.testobjdata.append(self.dfu.get_objects( # {'object_refs': [self.testobjref[0]]})) ##print self.testobjdata[0] except Exception as e: print e pass print "self.testobjref[0]" print self.testobjref print self.testobjref[0]
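# ---------------------------------------------------------------------------
# Illustration (not part of the original tests): the test class above creates
# a uniquely named workspace lazily in getWsName() and removes it again in
# tearDownClass(). A minimal standalone sketch of that lifecycle, using a
# stand-in client so the pattern runs without any KBase services; all names
# here are hypothetical.
import time
import unittest


class FakeWsClient(object):
    def __init__(self):
        self.workspaces = set()

    def create_workspace(self, params):
        self.workspaces.add(params['workspace'])

    def delete_workspace(self, params):
        self.workspaces.discard(params['workspace'])


class WorkspaceLifecycleSketch(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.wsClient = FakeWsClient()
        cls.wsName = None

    @classmethod
    def tearDownClass(cls):
        if cls.wsName is not None:
            cls.wsClient.delete_workspace({'workspace': cls.wsName})

    def getWsName(self):
        # create the workspace on first use, reuse it afterwards
        if self.__class__.wsName is None:
            wsName = "test_sketch_" + str(int(time.time() * 1000))
            self.wsClient.create_workspace({'workspace': wsName})
            self.__class__.wsName = wsName
        return self.__class__.wsName

    def test_workspace_created_once(self):
        name = self.getWsName()
        self.assertEqual(name, self.getWsName())
        self.assertIn(name, self.wsClient.workspaces)
# ---------------------------------------------------------------------------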
class BallgownUtil: def __init__(self, config): self.ws_url = config["workspace-url"] self.callback_url = config['SDK_CALLBACK_URL'] self.token = config['KB_AUTH_TOKEN'] self.shock_url = config['shock-url'] self.dfu = DataFileUtil(self.callback_url) self.rau = ReadsAlignmentUtils(self.callback_url) self.fv = KBaseFeatureValues(self.callback_url) self.deu = DifferentialExpressionUtils(self.callback_url, service_ver='dev') self.ws = Workspace(self.ws_url, token=self.token) self.scratch = config['scratch'] self.config = config def _xor(self, a, b): return bool(a) != bool(b) def _validate_run_ballgown_app_params(self, params): """ _validate_run_ballgown_app_params: validates params passed to run_ballgown_app method """ log('start validating run_ballgown_app params') # check for required parameters for p in ['expressionset_ref', 'diff_expression_matrix_set_suffix', 'workspace_name']: if p not in params: raise ValueError('"{}" parameter is required, but missing'.format(p)) run_all_combinations = params.get('run_all_combinations') condition_pair_subset = params.get('condition_pair_subset') if not self._xor(run_all_combinations, condition_pair_subset): error_msg = "Invalid input:\nselect 'Run All Paired Condition Combinations' " error_msg += "or provide subset of condition pairs. Don't provide both, or neither." raise ValueError(error_msg) def _mkdir_p(self, path): """ _mkdir_p: make directory for given path """ if not path: return try: os.makedirs(path) except OSError as exc: if exc.errno == errno.EEXIST and os.path.isdir(path): pass else: raise def _generate_html_report(self, result_directory, params, diff_expression_matrix_set_ref): """ _generate_html_report: generate html summary report """ log('start generating html report') html_report = list() output_directory = os.path.join(self.scratch, str(uuid.uuid4())) self._mkdir_p(output_directory) result_file_path = os.path.join(output_directory, 'report.html') for file in glob.glob(os.path.join(result_directory, '*.tsv')): shutil.copy(file, output_directory) # volcano_plot exists only if there are two condition groups for file in glob.glob(os.path.join(result_directory, '*.png')): shutil.copy(file, output_directory) diff_expr_set = self.ws.get_objects2({'objects': [{'ref': diff_expression_matrix_set_ref[ 'diffExprMatrixSet_ref']}]})['data'][0] diff_expr_set_data = diff_expr_set['data'] diff_expr_set_info = diff_expr_set['info'] diff_expr_set_name = diff_expr_set_info[1] overview_content = '' overview_content += '<br/><table><tr><th>Generated DifferentialExpressionMatrixSet' overview_content += ' Object</th></tr>' overview_content += '<tr><td>{} ({})'.format(diff_expr_set_name, diff_expression_matrix_set_ref[ 'diffExprMatrixSet_ref']) overview_content += '</td></tr></table>' overview_content += '<p><br/></p>' overview_content += '<br/><table><tr><th>Generated DifferentialExpressionMatrix' overview_content += ' Object</th><th></th><th></th><th></th></tr>' overview_content += '<tr><th>Differential Expression Matrix Name</th>' overview_content += '<th>Condition 1</th>' overview_content += '<th>Condition 2</th>' overview_content += '</tr>' for item in diff_expr_set_data['items']: item_diffexprmatrix_object = self.ws.get_objects2({'objects': [{'ref': item['ref']}]})[ 'data'][0] item_diffexprmatrix_info = item_diffexprmatrix_object['info'] item_diffexprmatrix_data = item_diffexprmatrix_object['data'] diffexprmatrix_name = item_diffexprmatrix_info[1] overview_content += '<tr><td>{} ({})</td>'.format(diffexprmatrix_name, item['ref']) overview_content += 
'<td>{}</td>'.format(item_diffexprmatrix_data. get('condition_mapping').keys()[0]) overview_content += '<td>{}</td>'.format(item_diffexprmatrix_data. get('condition_mapping').values()[0]) overview_content += '</tr>' overview_content += '</table>' # visualization image_content = '' for image in glob.glob(output_directory + "/*.png"): image = image.replace(output_directory + '/', '') caption = image.replace(output_directory + '/', '').replace('.png', '') image_content += '<p style="text-align:center"><img align="center" src="{}" ' \ 'width="600" height="400"></a><a target="_blank"><br>' \ '<p align="center">{}</p></p>'.format( image, caption) with open(result_file_path, 'w') as result_file: with open(os.path.join(os.path.dirname(__file__), 'report_template.html'), 'r') as report_template_file: report_template = report_template_file.read() report_template = report_template.replace('<p>Overview_Content</p>', overview_content) report_template = report_template.replace('<p>Image Gallery</p>', image_content) result_file.write(report_template) report_shock_id = self.dfu.file_to_shock({'file_path': output_directory, 'pack': 'zip'})['shock_id'] html_report.append({'shock_id': report_shock_id, 'name': os.path.basename(result_file_path), 'label': os.path.basename(result_file_path), 'description': 'HTML summary report for Ballgown App'}) return html_report def _generate_output_file_list(self, result_directory): """ _generate_output_file_list: zip result files and generate file_links for report """ log('Start packing result files') output_files = list() output_directory = os.path.join(self.scratch, str(uuid.uuid4())) self._mkdir_p(output_directory) result_file = os.path.join(output_directory, 'ballgown_result.zip') with zipfile.ZipFile(result_file, 'w', zipfile.ZIP_DEFLATED, allowZip64=True) as zip_file: for root, dirs, files in os.walk(result_directory): for file in files: if not (file.endswith('.zip') or file.endswith('.png') or file.endswith('.DS_Store')): zip_file.write(os.path.join(root, file), file) output_files.append({'path': result_file, 'name': os.path.basename(result_file), 'label': os.path.basename(result_file), 'description': 'File(s) generated by Ballgown App'}) return output_files def _generate_report(self, params, result_directory, diff_expression_matrix_set_ref): """ _generate_report: generate summary report """ log('creating report') output_files = self._generate_output_file_list(result_directory) output_html_files = self._generate_html_report( result_directory, params, diff_expression_matrix_set_ref) report_params = { 'message': '', 'workspace_name': params.get('workspace_name'), 'file_links': output_files, 'html_links': output_html_files, 'direct_html_link_index': 0, 'html_window_height': 333, 'report_object_name': 'kb_ballgown_report_' + str(uuid.uuid4())} kbase_report_client = KBaseReport(self.callback_url) output = kbase_report_client.create_extended_report(report_params) report_output = {'report_name': output['name'], 'report_ref': output['ref']} return report_output def get_sample_dir_group_file(self, mapped_expression_ids, condition_labels): ngroups = 0 group_name_indices = {} group_counts = {} for group in condition_labels: if not group in group_name_indices: group_name_indices[group] = ngroups ngroups = ngroups + 1 if not group in group_counts: group_counts[group] = 1 else: group_counts[group] = group_counts[group] + 1 # checks for proper ballgown execution: if ngroups < 2: raise Exception("At least two condition groups are needed for this analysis. 
") for group in condition_labels: if group_counts[group] < 2: raise Exception( "Condition group {0} has less than 2 members; ballgown will not run. " "At least two condition groups are needed for this analysis. ".format(group)) group_file_dir = os.path.join(self.scratch, str(uuid.uuid4())) self._mkdir_p(group_file_dir) try: condition_labels_uniqued = list(set(condition_labels)) sgf_name = os.path.join(group_file_dir, 'sample_dir_group_file_' + condition_labels_uniqued[0] + '_' + condition_labels_uniqued[1]) sgf = open(sgf_name, "w") except Exception: raise Exception( "Can't open file {0} for writing {1}".format( sgf_name, traceback.format_exc())) index = 0 # condition label index for ii in mapped_expression_ids: for alignment_id, expression_id in ii.items(): expression_object = self.ws.get_objects2( {'objects': [{'ref': expression_id}]})['data'][0] handle_id = expression_object['data']['file']['hid'] expression_name = expression_object['info'][1] expression_dir = os.path.join(group_file_dir, expression_name) self._mkdir_p(expression_dir) print('expression_name: ' + str(expression_dir) + ' ' + str(group_name_indices[condition_labels[index]])) sgf.write("{0} {1}\n".format(expression_dir, group_name_indices[condition_labels[index]])) self.dfu.shock_to_file({'handle_id': handle_id, 'file_path': expression_dir, 'unpack': 'unpack'}) required_files = [ 'e2t.ctab', 'e_data.ctab', 'i2t.ctab', 'i_data.ctab', 't_data.ctab'] for file in glob.glob(expression_dir + '/*'): if not os.path.basename(file) in required_files: os.remove(file) index += 1 return sgf_name def _cleanup(self, directory=None): """ Clean up after the job. At the moment this just means removing the working directory, but later could mean other things. """ try: # it would not delete if fold is not empty shutil.rmtree(directory, ignore_errors=True) # need to iterate each entry except IOError as e: log("Unable to remove working directory {0}".format(directory)) raise def _setupWorkingDir(self, directory=None): """ Clean up an existing workingdir and create a new one """ try: if os.path.exists(directory): self._cleanup(directory) os.mkdir(directory) except IOError: log("Unable to setup working dir {0}".format(directory)) raise def _check_intron_measurements(self, sample_dir_group_table_file): """ Check if intron measurements files are non-empty :param sample_dir_group_table_file: :return: """ log('checking for intron level measurements... ') file = open(sample_dir_group_table_file, 'r') textFileLines = file.readlines() for line in textFileLines: expr_dir = line.split()[0] log(expr_dir) i2t_file = open(os.path.join(expr_dir, 'i2t.ctab'), 'r') if len(i2t_file.readlines()) <= 1: # only header line exists raise Exception("No intron measurements found! Input expressions are possibly " "from a prokaryote. Ballgown functions only on eukaryotic data." " Consider using DeSeq2 or CuffDiff instead of BallGown.") idata_file = open(os.path.join(expr_dir, 'i_data.ctab'), 'r') if len(idata_file.readlines()) <= 1: # only header line exists raise Exception("No intron measurements found! Input expressions are possibly " "from a prokaryote. Ballgown functions only on eukaryotic data." 
" Consider using DeSeq2 or CuffDiff instead of BallGown") def run_ballgown_diff_exp(self, rscripts_dir, sample_dir_group_table_file, ballgown_output_dir, output_csv, volcano_plot_file ): """ Make R call to execute the system :param rscripts_dir: :param sample_dir_group_table_file: :param ballgown_output_dir: sample_group_table is a listing of output Stringtie subdirectories, (full path specification) paired with group label (0 or 1), ie /path/WT_rep1_stringtie 0 /path/WT_rep2_stringtie 0 /path/EXP_rep1_stringtie 1 /path/EXP_rep2_stringtie 1 (order doesn't matter, but the directory-group correspondance does) :param output_csv: :param volcano_plot_file: :return: """ # check if intron-level expression measurements are present self._check_intron_measurements(sample_dir_group_table_file) rcmd_list = ['Rscript', os.path.join(rscripts_dir, 'ballgown_fpkmgenematrix.R'), '--sample_dir_group_table', sample_dir_group_table_file, '--output_dir', ballgown_output_dir, '--output_csvfile', output_csv, '--volcano_plot_file', volcano_plot_file ] rcmd_str = " ".join(str(x) for x in rcmd_list) log("rcmd_string is {0}".format(rcmd_str)) openedprocess = subprocess.Popen(rcmd_str, shell=True) openedprocess.wait() # Make sure the openedprocess.returncode is zero (0) if openedprocess.returncode != 0: log("R script did not return normally, return code - " + str(openedprocess.returncode)) raise Exception("Rscript failure") def load_diff_expr_matrix(self, ballgown_output_dir, output_csv): """ Reads csv diff expr matrix file from Ballgown and returns as a dictionary of rows with the gene as key. Each key gives a row of length three corresponding to fold_change, pval and qval in string form - can include 'NA' :param ballgown_output_dir :param output_csv: :return: """ diff_matrix_file = os.path.join(ballgown_output_dir, output_csv) if not os.path.isfile(diff_matrix_file): raise Exception("differential expression matrix csvfile {0} doesn't exist!".format( diff_matrix_file)) n = 0 dm = {} with open(diff_matrix_file, "r") as csv_file: csv_rows = csv.reader(csv_file, delimiter="\t", quotechar='"') for row in csv_rows: n = n + 1 if (n == 1): if (row != ['id', 'fc', 'pval', 'qval']): raise Exception( "did not get expected column heading from {0}".format( diff_matrix_file)) else: if (len(row) != 4): raise Exception( "did not get 4 elements in row {0} of csv file {1} ".format( n, diff_matrix_file)) key = row[0] # put in checks for NA or numeric for row[1] through 4 if (key in dm): raise Exception( "duplicate key {0} in row {1} of csv file {2} ".format( key, n, diff_matrix_file)) dm[key] = row[1:5] return dm def _transform_expression_set_data(self, expression_set_data): """ The stitch to connect KBaseSets.ExpressionSet-2.0 type data to the older KBaseRNASeq.RNASeqExpressionSet-3.0 that the implementation depends on. 
This is done by doing a dive into the nested alignment object ref and getting the required data :param expression_set_data: :return: transformed expression_set_data """ transform = dict() # get genome id expression_ref = expression_set_data['items'][0]['ref'] wsid, objid, ver = expression_ref.split('/') expression_obj = self.ws.get_objects([{'objid': objid, 'wsid': wsid}]) transform['genome_id'] = expression_obj[0]['data']['genome_id'] # get sampleset_id #alignment_ref = expression_obj[0]['data']['mapped_rnaseq_alignment'].values()[0] #wsid, objid, ver = alignment_ref.split('/') #alignment_obj = self.ws.get_objects([{'objid': objid, 'wsid': wsid}]) #transform['sampleset_id'] = alignment_obj[0]['data']['sampleset_id'] # build mapped_expression_ids mapped_expression_ids = list() for item in expression_set_data['items']: expression_ref = item['ref'] wsid, objid, ver = expression_ref.split('/') expression_obj = self.ws.get_objects([{'objid': objid, 'wsid': wsid}]) alignment_ref = expression_obj[0]['data']['mapped_rnaseq_alignment'].values()[0] mapped_expression_ids.append({alignment_ref: expression_ref}) transform['mapped_expression_ids'] = mapped_expression_ids return transform def _build_condition_label_list(self, mapped_expression_ids): """ Extracts the condition labels from each expression in the specified expression set data and builds a list of condition labels :param expression_set_data: expression set data :return: list of condition labels whose order resembles the expression order in the expression data """ condition_labels = list() for ii in mapped_expression_ids: for alignment_id, expression_id in ii.items(): expression_object = self.ws.get_objects2( {'objects': [{'ref': expression_id}]})['data'][0] condition_labels.append(expression_object['data']['condition']) return condition_labels def _update_output_file_header(self, output_file): """ Modify header of output file (required by DifferentialExpressionUtils) :param output_file: :return: """ f = open(output_file, 'r') filedata = f.read() f.close() modified_output = filedata.replace( '"id"\t"fc"\t"pval"\t"qval"', 'gene_id\tlog2_fold_change\tp_value\tq_value') f = open(output_file, 'w') f.write(modified_output) f.close() def _check_input_labels(self, condition_pair_subset, available_condition_labels): """ _check_input_labels: check input condition pairs """ checked = True # example struct: [{u'condition': u'hy5'}, {u'condition': u'WT'}] condition_values = set() for condition in condition_pair_subset: condition_values.add(condition['condition']) if len(condition_values) < 2: error_msg = 'At least two unique conditions must be specified. ' raise ValueError(error_msg) for condition in condition_pair_subset: label = condition['condition'].strip() if label not in available_condition_labels: error_msg = 'Condition label "{}" is not a valid condition. 
'.format(label) error_msg += 'Must be one of "{}"'.format(available_condition_labels) raise ValueError(error_msg) return checked def run_ballgown_app(self, params): """ run_ballgown_app: run Ballgown app (https://www.bioconductor.org/packages/release/bioc/html/ballgown.html) required params: expressionset_ref: ExpressionSet object reference diff_expression_matrix_set_suffix: suffix to KBaseSets.DifferetialExpressionMatrixSet name condition_labels: conditions for expression set object alpha_cutoff: q value cutoff fold_change_cutoff: fold change cutoff workspace_name: the name of the workspace it gets saved to optional params: fold_scale_type: one of ["linear", "log2+1", "log10+1"] return: result_directory: folder path that holds all files generated by run_deseq2_app diff_expression_matrix_set_ref: generated KBaseSets.DifferetialExpressionMatrixSet object reference report_name: report name generated by KBaseReport report_ref: report reference generated by KBaseReport """ log('--->\nrunning BallgownUtil.run_ballgown_app\n' + 'params:\n{}'.format(json.dumps(params, indent=1))) self._validate_run_ballgown_app_params(params) expressionset_ref = params.get('expressionset_ref') expression_set_info = self.ws.get_object_info3({ "objects": [{"ref": expressionset_ref}]})['infos'][0] expression_object_type = expression_set_info[2] # set output object name differential_expression_suffix = params['diff_expression_matrix_set_suffix'] expression_name = expression_set_info[1] if re.match('.*_[Ee]xpression$', expression_name): params['diff_expression_matrix_set_name'] = re.sub( '_[Ee]xpression$', differential_expression_suffix, expression_name) if re.match('.*_[Ee]xpression_[Ss]et$', expression_name): params['diff_expression_matrix_set_name'] = re.sub( '_[Ee]xpression_[Ss]et$', differential_expression_suffix, expression_name) else: params['diff_expression_matrix_set_name'] = expression_name + \ differential_expression_suffix log('--->\nexpression object type: \n' + '{}'.format(expression_object_type)) if re.match('KBaseRNASeq.RNASeqExpressionSet-\d.\d', expression_object_type): expression_set_data = self.ws.get_objects2( {'objects': [{'ref': expressionset_ref}]})['data'][0]['data'] elif re.match('KBaseSets.ExpressionSet-\d.\d', expression_object_type): expression_set_data = self.ws.get_objects2( {'objects': [{'ref': expressionset_ref}]})['data'][0]['data'] expression_set_data = self._transform_expression_set_data(expression_set_data) mgroup = MultiGroup(self.ws) pairwise_mapped_expression_ids = mgroup.build_pairwise_groups( expression_set_data['mapped_expression_ids']) ballgown_output_dir = os.path.join(self.scratch, "ballgown_out") log("ballgown output dir is {0}".format(ballgown_output_dir)) self._setupWorkingDir(ballgown_output_dir) # get set of all condition labels available_condition_labels = \ self._build_condition_label_list(expression_set_data['mapped_expression_ids']) if params.get('run_all_combinations'): requested_condition_labels = available_condition_labels else: # get set of user specified condition labels condition_pair_subset = params.get('condition_pair_subset') if self._check_input_labels(condition_pair_subset, available_condition_labels): requested_condition_labels = list() # example: [{u'condition': u'hy5'}, {u'condition': u'WT'}] for condition in condition_pair_subset: if condition.get('condition').strip() not in requested_condition_labels: requested_condition_labels.append(condition.get('condition').strip()) log("User requested pairwise combinations from condition label list : " + 
str(requested_condition_labels)) diff_expr_files = list() for mapped_expression_ids in pairwise_mapped_expression_ids: print('processing pairwise combination: ') pprint(mapped_expression_ids) print('with condtion labels: ') condition_labels = self._build_condition_label_list(mapped_expression_ids) pprint(condition_labels) # skip if condition labels in this pairwise combination don't exist in # set of user requested condition labels skip = False for condition in condition_labels: if condition not in requested_condition_labels: log("skipping " + str(condition_labels)) skip = True if skip: continue sample_dir_group_file = self.get_sample_dir_group_file(mapped_expression_ids, condition_labels) log("about to run_ballgown_diff_exp") rscripts_dir = '/kb/module/rscripts' condition_labels_uniqued = list() for condition in condition_labels: if condition not in condition_labels_uniqued: condition_labels_uniqued.append(condition) output_csv = 'ballgown_diffexp_' + \ condition_labels_uniqued[0] + '_' + condition_labels_uniqued[1] + '.tsv' volcano_plot_file = 'volcano_plot_' + \ condition_labels_uniqued[0] + '_' + condition_labels_uniqued[1] + '.png' self.run_ballgown_diff_exp(rscripts_dir, sample_dir_group_file, ballgown_output_dir, output_csv, volcano_plot_file) log("back from run_ballgown_diff_exp, about to load diff exp matrix file") # diff_expr_matrix = self.load_diff_expr_matrix(ballgown_output_dir, # output_csv) # read file before its zipped self._update_output_file_header(os.path.join(ballgown_output_dir, output_csv)) diff_expr_file = dict() diff_expr_file.update({'condition_mapping': {condition_labels_uniqued[0]: condition_labels_uniqued[1]}}) diff_expr_file.update( {'diffexpr_filepath': os.path.join(ballgown_output_dir, output_csv)}) diff_expr_files.append(diff_expr_file) deu_param = { 'destination_ref': params['workspace_name'] + '/' + params['diff_expression_matrix_set_name'], 'diffexpr_data': diff_expr_files, 'tool_used': TOOL_NAME, 'tool_version': TOOL_VERSION, 'genome_ref': expression_set_data.get('genome_id'), } diff_expression_matrix_set_ref = self.deu.save_differential_expression_matrix_set( deu_param) returnVal = {'result_directory': ballgown_output_dir, 'diff_expression_matrix_set_ref': diff_expression_matrix_set_ref['diffExprMatrixSet_ref']} report_output = self._generate_report(params, ballgown_output_dir, diff_expression_matrix_set_ref) returnVal.update(report_output) return returnVal
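# ---------------------------------------------------------------------------
# Illustration (not part of the original module): the sample_dir_group table
# consumed by run_ballgown_diff_exp pairs one expression/Stringtie output
# directory per line with a numeric group index, e.g.
#     /path/WT_rep1_stringtie 0
#     /path/EXP_rep1_stringtie 1
# A standalone sketch of building those lines from (directory, condition)
# pairs, mirroring the group_name_indices bookkeeping in
# get_sample_dir_group_file. The paths below are made up.
def build_sample_dir_group_lines(dir_condition_pairs):
    group_name_indices = {}
    lines = []
    for directory, condition in dir_condition_pairs:
        # assign each new condition label the next free group index (0, 1, ...)
        if condition not in group_name_indices:
            group_name_indices[condition] = len(group_name_indices)
        lines.append("{0} {1}".format(directory,
                                      group_name_indices[condition]))
    return lines


if __name__ == '__main__':
    assert build_sample_dir_group_lines([
        ('/path/WT_rep1_stringtie', 'WT'),
        ('/path/WT_rep2_stringtie', 'WT'),
        ('/path/EXP_rep1_stringtie', 'EXP'),
        ('/path/EXP_rep2_stringtie', 'EXP'),
    ]) == ['/path/WT_rep1_stringtie 0', '/path/WT_rep2_stringtie 0',
           '/path/EXP_rep1_stringtie 1', '/path/EXP_rep2_stringtie 1']
# ---------------------------------------------------------------------------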
class FastaGFFToGenome: def __init__(self, config): self.cfg = config self.dfu = DataFileUtil(self.cfg.callbackURL) def import_file(self, params): # 1) validate parameters self._validate_import_file_params(params) # 2) construct the input directory staging area input_directory = os.path.join(self.cfg.sharedFolder, 'fast_gff_upload_' + str(uuid.uuid4())) os.makedirs(input_directory) file_paths = self._stage_input(params, input_directory) # 3) extract out the parameters params = self._set_parsed_params(params) # 4) do the upload result = self.upload_genome( shock_service_url=self.cfg.shockURL, handle_service_url=self.cfg.handleURL, workspace_service_url=self.cfg.workspaceURL, callback_url=self.cfg.callbackURL, input_fasta_file=file_paths["fasta_file"], input_gff_file=file_paths["gff_file"], workspace_name=params['workspace_name'], core_genome_name=params['genome_name'], scientific_name=params['scientific_name'], taxon_wsname=params['taxon_wsname'], taxon_reference=params['taxon_reference'], source=params['source'], genome_type=params['type'], release=params['release']) # 5) generate report output_data_ref = params['workspace_name'] + "/" + params['genome_name'] reportObj = { 'objects_created': [{ 'ref': output_data_ref, 'description': 'KBase Genome object' }], 'text_message': result['report_string'] } reportClient = KBaseReport(os.environ['SDK_CALLBACK_URL']) report_info = reportClient.create({ 'report': reportObj, 'workspace_name': params['workspace_name'] }) # 6) clear the temp directory shutil.rmtree(input_directory) # 7) return the result info = result['genome_info'] details = { 'genome_ref': str(info[6]) + '/' + str(info[0]) + '/' + str(info[4]), 'genome_info': info, 'report_name': report_info['name'], 'report_ref': report_info['ref'] } return details def upload_genome(self, shock_service_url=None, handle_service_url=None, workspace_service_url=None, callback_url=None, input_gff_file=None, input_fasta_file=None, workspace_name=None, core_genome_name=None, scientific_name="unknown_taxon", taxon_wsname='ReferenceTaxons', taxon_reference=None, source=None, release=None, genome_type=None): # retrieve taxon taxonomy, taxon_reference = self._retrieve_taxon( taxon_reference, taxon_wsname, scientific_name) # reading in Fasta file assembly = self._retrieve_fasta_file(input_fasta_file, core_genome_name, scientific_name, source) if taxon_reference is not None: assembly["taxon_ref"] = taxon_reference # reading in GFF file feature_list = self._retrieve_gff_file(input_gff_file) # compile links between features feature_hierarchy = self._generate_feature_hierarchy(feature_list) # retrieve genome feature list (genome_features_list, genome_mrnas_list, genome_cdss_list) = self._retrieve_genome_feature_list( feature_list, feature_hierarchy, assembly) # remove sequences before loading for contig in assembly["contigs"]: del assembly["contigs"][contig]["sequence"] aUtil = AssemblyUtil(callback_url) assembly_ref = aUtil.save_assembly_from_fasta({ 'file': { 'path': input_fasta_file, 'assembly_name': assembly['assembly_id'] }, 'workspace_name': workspace_name, 'assembly_name': assembly['assembly_id'] }) # generate genome info genome = self._gen_genome_info(core_genome_name, scientific_name, assembly_ref, genome_features_list, genome_cdss_list, genome_mrnas_list, source, assembly, taxon_reference, taxonomy, input_gff_file) workspace_id = self.dfu.ws_name_to_id(workspace_name) genome_info = self.dfu.save_objects({ "id": workspace_id, "objects": [{ "name": core_genome_name, "type": "KBaseGenomes.Genome", "data": 
genome }] })[0] report_string = '' return {'genome_info': genome_info, 'report_string': report_string} def _validate_import_file_params(self, params): """ validate_import_file_params: validates params passed to FastaGFFToGenome.import_file method """ # check for required parameters for p in ['workspace_name', 'genome_name', 'fasta_file', 'gff_file']: if p not in params: raise ValueError( '"{}" parameter is required, but missing'.format(p)) # one and only one of 'path', or 'shock_id' is required for key in ('fasta_file', 'gff_file'): file = params[key] if not isinstance(file, dict): raise ValueError( 'Required "{}" field must be a map/dict'.format(key)) n_valid_fields = 0 if 'path' in file and file['path'] is not None: n_valid_fields += 1 if 'shock_id' in file and file['shock_id'] is not None: n_valid_fields += 1 if 'ftp_url' in file and file['ftp_url'] is not None: n_valid_fields += 1 raise ValueError( 'FTP link is currently not supported for FastaGFFToGenome') if n_valid_fields < 1: error_msg = 'Required "{}" field must include one source: '.format( key) error_msg += 'path | shock_id' raise ValueError(error_msg) if n_valid_fields > 1: error_msg = 'Required "{}" field has too many sources specified: '.format( key) error_msg += str(file.keys()) raise ValueError(error_msg) # check for valid type param valid_types = ['Reference', 'User upload', 'Representative'] if params.get('type') and params['type'] not in valid_types: error_msg = 'Entered value for type is not one of the valid entries of ' error_msg += '[' + ''.join('"' + str(e) + '", ' for e in valid_types)[0:-2] + ']' raise ValueError(error_msg) def _set_parsed_params(self, params): log('Setting params') # default params default_params = { 'taxon_wsname': self.cfg.raw['taxon-workspace-name'], 'scientific_name': 'unknown_taxon', 'taxon_reference': None, 'source': 'User', 'release': None, 'type': 'User upload', 'metadata': {} } for field in default_params: if field not in params: params[field] = default_params[field] log(json.dumps(params, indent=1)) return params def _stage_input(self, params, input_directory): """ stage_input: Setup the input_directory by fetching the files and uncompressing if needed """ file_paths = dict() for key in ('fasta_file', 'gff_file'): file = params[key] file_path = None if 'path' in file and file['path'] is not None: local_file_path = file['path'] file_path = os.path.join(input_directory, os.path.basename(local_file_path)) log('Moving file from {} to {}'.format(local_file_path, file_path)) shutil.copy2(local_file_path, file_path) if 'shock_id' in file and file['shock_id'] is not None: # handle shock file log('Downloading file from SHOCK node: {}-{}'.format( self.cfg.sharedFolder, file['shock_id'])) sys.stdout.flush() file_name = self.dfu.shock_to_file({ 'file_path': input_directory, 'shock_id': file['shock_id'] })['node_file_name'] file_path = os.path.join(input_directory, file_name) # extract the file if it is compressed if file_path is not None: print("staged input file =" + file_path) sys.stdout.flush() dfUtil_result = self.dfu.unpack_file({'file_path': file_path}) file_paths[key] = dfUtil_result['file_path'] else: raise ValueError( 'No valid files could be extracted based on the input') return file_paths def _retrieve_taxon(self, taxon_reference, taxon_wsname, scientific_name): """ _retrieve_taxon: retrieve taxonomy and taxon_reference """ taxon_id = -1 taxon_object_name = "unknown_taxon" # retrieve lookup object if scientific name provided if (taxon_reference is None and scientific_name is not 
"unknown_taxon"): # retrieve taxon lookup object then find taxon id taxon_lookup = self.dfu.get_objects({ 'object_refs': [taxon_wsname + "/taxon_lookup"], 'ignore_errors': 0 })['data'][0]['data']['taxon_lookup'] if (scientific_name[0:3] in taxon_lookup and scientific_name in taxon_lookup[scientific_name[0:3]]): taxon_id = taxon_lookup[scientific_name[0:3]][scientific_name] taxon_object_name = "{}_taxon".format(str(taxon_id)) # retrieve Taxon object taxon_info = {} if (taxon_reference is None): taxon_info = self.dfu.get_objects({ 'object_refs': [taxon_wsname + "/" + taxon_object_name], 'ignore_errors': 0 })['data'][0] taxon_reference = "{}/{}/{}".format(taxon_info['info'][6], taxon_info['info'][0], taxon_info['info'][4]) else: taxon_info = self.dfu.get_objects({ "object_refs": [taxon_reference], 'ignore_errors': 0 })['data'][0] taxonomy = taxon_info['data']['scientific_lineage'] return taxonomy, taxon_reference def _retrieve_fasta_file(self, input_fasta_file, core_genome_name, scientific_name, source): """ _retrieve_fasta_file: retrieve info from fasta_file https://www.biostars.org/p/710/ """ log("Reading FASTA file") assembly = { "contigs": {}, "dna_size": 0, "gc_content": 0, "md5": [], "base_counts": {} } contig_seq_start = 0 input_file_handle = open(input_fasta_file, 'rb') # alternate header and sequence faiter = (x[1] for x in itertools.groupby(input_file_handle, lambda line: line[0] == ">")) for header in faiter: # drop the ">" header = header.next()[1:].strip() # join all sequence lines to one. seq = "".join(s.strip() for s in faiter.next()) try: fasta_header, fasta_description = header.split(' ', 1) except: fasta_header = header fasta_description = None # Handle record seq = seq.upper() # Build contig objects for Assembly seq_count = dict(collections.Counter(seq)) # to delete at end, but required for now contig_dict = {"sequence": seq} Ncount = 0 if "N" in seq_count: Ncount = seq_count["N"] contig_dict["Ncount"] = Ncount for character in seq_count: if character in assembly["base_counts"]: assembly["base_counts"][character] += seq_count[character] else: assembly["base_counts"][character] = seq_count[character] contig_seq_length = len(seq) assembly["dna_size"] += contig_seq_length contig_gc_length = seq.count("G") contig_gc_length += seq.count("C") contig_dict["gc_content"] = float("{0:.2f}".format( float(contig_gc_length) / float(contig_seq_length))) assembly["gc_content"] += contig_gc_length contig_dict["contig_id"] = fasta_header contig_dict["name"] = fasta_header contig_dict["length"] = contig_seq_length contig_dict["md5"] = hashlib.md5(seq).hexdigest() assembly["md5"].append(contig_dict["md5"]) if fasta_description is not None: contig_dict["description"] = fasta_description contig_dict["is_circular"] = "Unknown" contig_dict["start_position"] = contig_seq_start contig_dict["num_bytes"] = sys.getsizeof(contig_dict["sequence"]) assembly["contigs"][fasta_header] = contig_dict # used for start of next sequence and total gc_content contig_seq_start += contig_seq_length assembly["gc_content"] = float("{0:.2f}".format( float(assembly["gc_content"]) / float(contig_seq_start))) assembly["md5"] = hashlib.md5(",".join(assembly["md5"])).hexdigest() assembly["assembly_id"] = core_genome_name + "_assembly" assembly["name"] = scientific_name assembly["external_source"] = source assembly["external_source_id"] = os.path.basename(input_fasta_file) assembly["external_source_origination_date"] = str( os.stat(input_fasta_file).st_ctime) assembly["num_contigs"] = len(assembly["contigs"].keys()) 
assembly["type"] = "Unknown" assembly[ "notes"] = "Note MD5s are generated from uppercasing the sequences" return assembly def _retrieve_gff_file(self, input_gff_file): """ _retrieve_gff_file: retrieve info from gff_file """ log("Reading GFF file") feature_list = dict() is_phytozome = 0 is_patric = 0 gff_file_handle = open(input_gff_file, 'rb') current_line = gff_file_handle.readline() line_count = 0 while (current_line != ''): current_line = current_line.strip() if (current_line.isspace() or current_line == "" or current_line.startswith("#")): pass else: #Split line (contig_id, source_id, feature_type, start, end, score, strand, phase, attributes) = current_line.split('\t') #Checking to see if Phytozome if ("phytozome" in source_id or "Phytozome" in source_id): is_phytozome = 1 #Checking to see if Phytozome if ("PATRIC" in source_id): is_patric = 1 #PATRIC prepends their contig ids with some gibberish if (is_patric and "|" in contig_id): contig_id = contig_id.split("|", 1)[1] #Features grouped by contigs first if (contig_id not in feature_list): feature_list[contig_id] = list() #Populating basic feature object ftr = { 'contig': contig_id, 'source': source_id, 'type': feature_type, 'start': int(start), 'end': int(end), 'score': score, 'strand': strand, 'phase': phase, 'attributes': attributes } #Populating with attribute key-value pair #This is where the feature id is from for attribute in attributes.split(";"): attribute = attribute.strip() #Sometimes empty string if (attribute == ""): continue #Use of 1 to limit split as '=' character can also be made available later #Sometimes lack of "=", assume spaces instead if ("=" in attribute): key, value = attribute.split("=", 1) elif (" " in attribute): key, value = attribute.split(" ", 1) else: log("Warning: attribute " + attribute + " cannot be separated into key,value pair") ftr[key] = value feature_list[contig_id].append(ftr) current_line = gff_file_handle.readline() gff_file_handle.close() #Some GFF/GTF files don't use "ID" so we go through the possibilities feature_list = self._add_missing_identifiers(feature_list) #Most bacterial files have only CDSs #In order to work with prokaryotic and eukaryotic gene structure synonymously #Here we add feature dictionaries representing the parent gene and mRNAs feature_list = self._add_missing_parents(feature_list) #Phytozome has the annoying habit of editing their identifiers so we fix them if (is_phytozome): self._update_phytozome_features(feature_list) #All identifiers need to be checked so that they follow the same general rules #Rules are listed within the function itself feature_list = self._update_identifiers(feature_list) #If phytozome, the edited files need to be re-printed as GFF so that it works better with RNA-Seq pipeline if (is_phytozome): self._print_phytozome_gff(input_gff_file, feature_list) return feature_list def _add_missing_identifiers(self, feature_list): #General rule is to iterate through a range of possibilities if "ID" is missing for contig in feature_list.keys(): for i in range(len(feature_list[contig])): if ("ID" not in feature_list[contig][i]): for key in ("transcriptId", "proteinId", "PACid", "pacid", "Parent"): if (key in feature_list[contig][i]): feature_list[contig][i]['ID'] = feature_list[ contig][i][key] break #If the process fails, throw an error for ftr_type in ("gene", "mRNA", "CDS"): if (ftr_type not in feature_list[contig][i]): continue if ("ID" not in feature_list[contig][i]): log("Error: Cannot find unique ID to utilize in GFF attributes: "+ \ 
feature_list[contig][i]['contig']+"."+ \ feature_list[contig][i]['source']+"."+ \ feature_list[contig][i]['type']+": "+ \ feature_list[contig][i]['attributes']) return feature_list def _generate_feature_hierarchy(self, feature_list): feature_hierarchy = {contig: {} for contig in feature_list} #Need to remember mRNA/gene links for CDSs mRNA_gene_dict = {} exon_list_position_dict = {} for contig in feature_list: for i in range(len(feature_list[contig])): ftr = feature_list[contig][i] if ("gene" in ftr["type"]): feature_hierarchy[contig][ftr["ID"]] = { "utrs": [], "mrnas": [], "cdss": [], "index": i } if ("UTR" in ftr["type"]): feature_hierarchy[contig][mRNA_gene_dict[ ftr["Parent"]]]["utrs"].append({ "id": ftr["ID"], "index": i }) if ("RNA" in ftr["type"]): feature_hierarchy[contig][ftr["Parent"]]["mrnas"].append({ "id": ftr["ID"], "index": i, "cdss": [] }) mRNA_gene_dict[ftr["ID"]] = ftr["Parent"] exon_list_position_dict[ftr["ID"]] = len( feature_hierarchy[contig][ftr["Parent"]]["mrnas"]) - 1 if ("CDS" in ftr["type"]): feature_hierarchy[contig][mRNA_gene_dict[ftr["Parent"]]]["mrnas"]\ [exon_list_position_dict[ftr["Parent"]]]["cdss"].append( { "id": ftr["ID"], "index" : i } ) return feature_hierarchy def _add_missing_parents(self, feature_list): #General rules is if CDS or RNA missing parent, add them for contig in feature_list.keys(): ftrs = feature_list[contig] new_ftrs = [] for i in range(len(ftrs)): if ("Parent" not in ftrs[i]): #Assuming parent doesn't exist at all, so create de novo instead of trying to find it if ("RNA" in ftrs[i]["type"] or "CDS" in ftrs[i]["type"]): new_gene_ftr = copy.deepcopy(ftrs[i]) new_gene_ftr["type"] = "gene" ftrs[i]["Parent"] = new_gene_ftr["ID"] new_ftrs.append(new_gene_ftr) if ("CDS" in ftrs[i]["type"]): new_rna_ftr = copy.deepcopy(ftrs[i]) new_rna_ftr["type"] = "mRNA" new_ftrs.append(new_rna_ftr) ftrs[i]["Parent"] = new_rna_ftr["ID"] new_ftrs.append(ftrs[i]) feature_list[contig] = new_ftrs return feature_list def _update_phytozome_features(self, feature_list): #General rule is to use the "Name" field where possible #And update parent attribute correspondingly for contig in feature_list.keys(): feature_position_dict = {} for i in range(len(feature_list[contig])): #Maintain old_id for reference #Sometimes ID isn't available, so use PACid old_id = None for key in ("ID", "PACid", "pacid"): if (key in feature_list[contig][i]): old_id = feature_list[contig][i][key] break if (old_id is None): #This should be an error print ("Cannot find unique ID, PACid, or pacid in GFF attributes: ",\ feature_list[contig][i][contig],feature_list[contig][i][source],feature_list[contig][i][attributes]) continue #Retain old_id feature_position_dict[old_id] = i #In Phytozome, gene and mRNA have "Name" field, CDS do not if ("Name" in feature_list[contig][i]): feature_list[contig][i]["ID"] = feature_list[contig][i][ "Name"] if ("Parent" in feature_list[contig][i]): #Update Parent to match new ID of parent ftr feature_list[contig][i]["Parent"] = feature_list[contig][ feature_position_dict[feature_list[contig][i] ["Parent"]]]["ID"] return feature_list def _update_identifiers(self, feature_list): #General rules: #1) Genes keep identifier #2) RNAs keep identifier only if its different from gene, otherwise append ".mRNA" #3) CDS always uses RNA identifier with ".CDS" appended #4) CDS appended with an incremented digit CDS_count_dict = dict() mRNA_parent_dict = dict() for contig in feature_list.keys(): for ftr in feature_list[contig]: if ("Parent" in ftr): #Retain old_id of parents old_id 
= ftr["ID"] if (ftr["ID"] == ftr["Parent"] or "CDS" in ftr["type"]): ftr["ID"] = ftr["Parent"] + "." + ftr["type"] #link old to new ids for mRNA to use with CDS if ("RNA" in ftr["type"]): mRNA_parent_dict[old_id] = ftr["ID"] if ("CDS" in ftr["type"]): #Increment CDS identifier if (ftr["ID"] not in CDS_count_dict): CDS_count_dict[ftr["ID"]] = 1 else: CDS_count_dict[ftr["ID"]] += 1 ftr["ID"] = ftr["ID"] + "." + str( CDS_count_dict[ftr["ID"]]) #Recall new mRNA id for parent ftr["Parent"] = mRNA_parent_dict[ftr["Parent"]] return feature_list def _print_phytozome_gff(self, input_gff_file, feature_list): #Write modified feature ids to new file input_gff_file = input_gff_file.replace("gene", "edited_gene") + ".gz" try: print "Printing to new file: " + input_gff_file gff_file_handle = gzip.open(input_gff_file, 'wb') except: print "Failed to open" for contig in sorted(feature_list.iterkeys()): for ftr in feature_list[contig]: #Re-build attributes attributes_dict = {} for attribute in ftr["attributes"].split(";"): attribute = attribute.strip() #Sometimes empty string if (attribute == ""): continue #Use of 1 to limit split as '=' character can also be made available later #Sometimes lack of "=", assume spaces instead if ("=" in attribute): key, value = attribute.split("=", 1) elif (" " in attribute): key, value = attribute.split(" ", 1) else: log("Warning: attribute " + attribute + " cannot be separated into key,value pair") if (ftr[key] != value): value = ftr[key] attributes_dict[key] = value ftr["attributes"] = ";".join(key + "=" + attributes_dict[key] for key in attributes_dict.keys()) new_line = "\t".join( str(ftr[key]) for key in [ 'contig', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes' ]) gff_file_handle.write(new_line) gff_file_handle.close() return def _retrieve_genome_feature_list(self, feature_list, feature_hierarchy, assembly): genome_features_list = list() genome_mrnas_list = list() genome_cdss_list = list() genome_translation_issues = list() for contig in feature_hierarchy: for gene in feature_hierarchy[contig]: #We only iterate through the gene objects #And then for each gene object, retrieve the necessary mRNA and CDS objects indirectly ftr = feature_list[contig][feature_hierarchy[contig][gene] ["index"]] contig_sequence = assembly["contigs"][ ftr["contig"]]["sequence"] gene_ftr = self._convert_ftr_object( ftr, contig_sequence ) #reverse-complementation for negative strands done here #Add non-optional terms gene_ftr["mrnas"] = list() gene_ftr["cdss"] = list() gene_ftr["ontology_terms"] = dict() #Retaining longest sequences for gene feature longest_protein_length = 0 longest_protein_sequence = "" for mRNA in feature_hierarchy[contig][gene]["mrnas"]: ######################################################## # Construct mRNA Ftr ######################################################## ftr = feature_list[contig][mRNA["index"]] contig_sequence = assembly["contigs"][ ftr["contig"]]["sequence"] mRNA_ftr = self._convert_ftr_object( ftr, contig_sequence ) #reverse-complementation for negative strands done here #Modify mrna object for use in mrna array #Objects will be un-used until further notice mRNA_ftr['parent_gene'] = gene_ftr['id'] #If there are CDS, then New CDS ID without incrementation as they were aggregated if (len(mRNA['cdss']) > 0): mRNA_ftr['cds'] = mRNA_ftr['id'] + ".CDS" else: mRNA_ftr['cds'] = "" #Add to mrnas array genome_mrnas_list.append(mRNA_ftr) #Add ids to gene_ftr arrays gene_ftr["mrnas"].append(mRNA_ftr["id"]) 
######################################################## # Construct transcript, protein sequence, UTR, CDS locations ######################################################## #At time of writing, all of this aggregation should probably be done in a single function cds_exons_locations_array = list() cds_cdna_sequence = str() protein_sequence = str() if (len(mRNA["cdss"]) > 0): (cds_exons_locations_array, cds_cdna_sequence, protein_sequence) = \ self._cds_aggregation_translation(mRNA["cdss"],feature_list[contig],assembly,genome_translation_issues) UTRs = list() if ("utrs" in feature_hierarchy[contig][gene] and len(feature_hierarchy[contig][gene]["utrs"]) > 0): for UTR in feature_hierarchy[contig][gene]["utrs"]: ftr = feature_list[contig][UTR["index"]] if ("Parent" in ftr and ftr["Parent"] == mRNA_ftr["id"]): UTRs.append(ftr) mrna_exons_locations_array = copy.deepcopy( cds_exons_locations_array) mrna_transcript_sequence = str(cds_cdna_sequence) if (len(UTRs) > 0): (mrna_exons_locations_array, mrna_transcript_sequence) = \ self._utr_aggregation(UTRs,assembly,mrna_exons_locations_array,cds_cdna_sequence) #Update sequence and locations mRNA_ftr["dna_sequence"] = mrna_transcript_sequence mRNA_ftr["dna_sequence_length"] = len( mrna_transcript_sequence) mRNA_ftr["location"] = mrna_exons_locations_array mRNA_ftr["md5"] = hashlib.md5( mRNA_ftr["dna_sequence"]).hexdigest() #Remove DNA del mRNA_ftr["dna_sequence"] del mRNA_ftr["dna_sequence_length"] #Skip CDS if not present if (len(mRNA["cdss"]) == 0): continue #Remove asterix representing stop codon if present if (len(protein_sequence) > 0 and protein_sequence[-1] == '*'): protein_sequence = protein_sequence[:-1] #Save longest sequence if (len(protein_sequence) > longest_protein_length): longest_protein_length = len(protein_sequence) longest_protein_sequence = protein_sequence ######################################################## # Construct CDS Ftr ######################################################## CDS_ftr = dict() CDS_ftr['type'] = 'CDS' #New CDS ID without incrementation as they were aggregated CDS_ftr['id'] = mRNA_ftr['id'] + '.CDS' #Add gene/mrna links CDS_ftr['parent_gene'] = gene_ftr['id'] CDS_ftr['parent_mrna'] = mRNA_ftr['id'] #Update sequence and locations CDS_ftr["dna_sequence"] = cds_cdna_sequence CDS_ftr["dna_sequence_length"] = len(cds_cdna_sequence) CDS_ftr["location"] = cds_exons_locations_array CDS_ftr["md5"] = hashlib.md5( CDS_ftr["dna_sequence"]).hexdigest() #Add protein CDS_ftr["protein_translation"] = str( protein_sequence).upper() CDS_ftr["protein_translation_length"] = len( CDS_ftr["protein_translation"]) #Only generate md5 for dna sequences #CDS_ftr["md5"] = hashlib.md5(CDS_ftr["protein_translation"]).hexdigest() #Add empty non-optional fields for populating in future CDS_ftr["ontology_terms"] = dict() if ("aliases" not in CDS_ftr): CDS_ftr["aliases"] = list() if ("function" not in CDS_ftr): CDS_ftr["function"] = "" #Add to cdss array genome_cdss_list.append(CDS_ftr) #Add ids to gene_ftr arrays gene_ftr["cdss"].append(CDS_ftr["id"]) gene_ftr["protein_translation"] = longest_protein_sequence gene_ftr["protein_translation_length"] = longest_protein_length genome_features_list.append(gene_ftr) msg = "Genome features processed: {} genes, {} RNAs, and {} CDSs\n".format( len(genome_features_list), len(genome_mrnas_list), len(genome_cdss_list)) msg += "{} mRNA(s) had errors during translation".format( len(genome_translation_issues)) log(msg) return genome_features_list, genome_mrnas_list, genome_cdss_list def 
_gen_genome_info(self, core_genome_name, scientific_name, assembly_ref, genome_features_list, genome_cdss_list, genome_mrnas_list, source, assembly, taxon_reference, taxonomy, input_gff_file): """ _gen_genome_info: generate genome info """ genome = dict() genome["id"] = core_genome_name genome["scientific_name"] = scientific_name genome["assembly_ref"] = assembly_ref genome["features"] = genome_features_list genome["cdss"] = genome_cdss_list genome["mrnas"] = genome_mrnas_list genome["source"] = source genome["domain"] = "Eukaryota" genome["genetic_code"] = 1 genome["gc_content"] = assembly["gc_content"] genome["dna_size"] = assembly["dna_size"] if taxon_reference is not None: genome["taxon_ref"] = taxon_reference genome["taxonomy"] = taxonomy gff_file_to_shock = self.dfu.file_to_shock({ 'file_path': input_gff_file, 'make_handle': 1, 'pack': "gzip" }) gff_handle_ref = gff_file_to_shock['handle']['hid'] genome['gff_handle_ref'] = gff_handle_ref return genome def _convert_ftr_object(self, old_ftr, contig): new_ftr = dict() new_ftr["id"] = old_ftr["ID"] dna_sequence = Seq(contig[old_ftr["start"] - 1:old_ftr["end"]], IUPAC.ambiguous_dna) # reverse complement if (old_ftr["strand"] == "-"): dna_sequence = dna_sequence.reverse_complement() old_start = old_ftr["start"] old_ftr["start"] = old_ftr["end"] old_ftr["end"] = old_start new_ftr["dna_sequence"] = str(dna_sequence).upper() new_ftr["dna_sequence_length"] = len(dna_sequence) new_ftr["md5"] = hashlib.md5(str(dna_sequence)).hexdigest() new_ftr["location"] = [[ old_ftr["contig"], old_ftr["start"], old_ftr["strand"], len(dna_sequence) ]] new_ftr["type"] = old_ftr["type"] new_ftr["aliases"] = list() for key in ("transcriptId", "proteinId", "PACid", "pacid"): if (key in old_ftr.keys()): new_ftr["aliases"].append(key + ":" + old_ftr[key]) return new_ftr def _utr_aggregation(self, utr_list, assembly, exons, exon_sequence): #create copies of locations and transcript utrs_exons = list(exons) utr_exon_sequence = exon_sequence five_prime_dna_sequence = "" three_prime_dna_sequence = "" five_prime_locations = list() three_prime_locations = list() for UTR in (utr_list): contig_sequence = assembly["contigs"][UTR["contig"]]["sequence"] UTR_ftr = self._convert_ftr_object( UTR, contig_sequence ) #reverse-complementation for negative strands done here #aggregate sequences and locations if ("five_prime" in UTR_ftr["id"]): five_prime_dna_sequence += UTR_ftr["dna_sequence"] five_prime_locations.append(UTR_ftr["location"][0]) if ("three_prime" in UTR_ftr["id"]): three_prime_dna_sequence += UTR_ftr["dna_sequence"] three_prime_locations.append(UTR_ftr["location"][0]) #Handle five_prime UTRs if (len(five_prime_locations) > 0): #Sort UTRs by "start" (reverse-complement UTRs in Phytozome appear to be incorrectly ordered in the GFF file) five_prime_locations = sorted(five_prime_locations, key=lambda x: x[1]) #Merge last UTR with CDS if "next" to each other if( ( utrs_exons[0][2] == "+" and five_prime_locations[-1][1]+five_prime_locations[-1][3] == utrs_exons[0][1] ) or \ ( utrs_exons[0][2] == "-" and five_prime_locations[-1][1]-five_prime_locations[-1][3] == utrs_exons[0][1] ) ): #Remove last UTR last_five_prime_location = five_prime_locations[-1] five_prime_locations = five_prime_locations[:-1] #"Add" last UTR to first exon utrs_exons[0][1] = last_five_prime_location[1] utrs_exons[0][3] += last_five_prime_location[3] #Prepend other UTRs if available if (len(five_prime_locations) > 0): utrs_exons = five_prime_locations + utrs_exons utr_exon_sequence = 
five_prime_dna_sequence + utr_exon_sequence

        # Handle three_prime UTRs
        if (len(three_prime_locations) > 0):
            # Sort UTRs by "start" (reverse-complement UTRs in Phytozome
            # appear to be incorrectly ordered in the GFF file)
            three_prime_locations = sorted(three_prime_locations,
                                           key=lambda x: x[1])

            # Merge first UTR with CDS if "next" to each other
            if ((utrs_exons[0][2] == "+" and
                 utrs_exons[-1][1] + utrs_exons[-1][3] ==
                 three_prime_locations[0][1]) or
                    (utrs_exons[0][2] == "-" and
                     utrs_exons[-1][1] - utrs_exons[-1][3] ==
                     three_prime_locations[0][1])):

                # Remove first UTR
                first_three_prime_location = three_prime_locations[0]
                three_prime_locations = three_prime_locations[1:]

                # "Add" first UTR to last exon
                utrs_exons[-1][3] += first_three_prime_location[3]

            # Append other UTRs if available
            if (len(three_prime_locations) > 0):
                utrs_exons = utrs_exons + three_prime_locations
                utr_exon_sequence += three_prime_dna_sequence

        return (utrs_exons, utr_exon_sequence)

    def _cds_aggregation_translation(self, cds_list, feature_list, assembly,
                                     issues):
        dna_sequence = ""
        locations = list()

        # Collect phases and lengths of exons
        # Right now, this is only for the purpose of error reporting
        phases = list()
        exons = list()

        # Saving parent mRNA identifier
        Parent_mRNA = cds_list[0]["id"]
        for CDS in (cds_list):
            ftr = feature_list[CDS["index"]]
            phases.append(ftr["phase"])
            Parent_mRNA = ftr["Parent"]

            contig_sequence = assembly["contigs"][ftr["contig"]]["sequence"]
            # reverse-complementation for negative strands done here
            CDS_ftr = self._convert_ftr_object(ftr, contig_sequence)
            exons.append(len(CDS_ftr["dna_sequence"]))

            # Remove base(s) according to phase, but only for the first CDS
            if (CDS == cds_list[0] and int(ftr["phase"]) != 0):
                log("Adjusting phase for first CDS: " + CDS["id"])
                CDS_ftr["dna_sequence"] = \
                    CDS_ftr["dna_sequence"][int(ftr["phase"]):]

            # Aggregate sequences and locations
            dna_sequence += CDS_ftr["dna_sequence"]
            locations.append(CDS_ftr["location"][0])

        # Translate sequence
        dna_sequence_obj = Seq(dna_sequence, IUPAC.ambiguous_dna)
        rna_sequence = dna_sequence_obj.transcribe()

        # Incomplete gene model with no start codon
        if str(rna_sequence.upper())[:3] not in codon_table.start_codons:
            msg = "Missing start codon for {}. Possibly incomplete gene model.".format(
                Parent_mRNA)
            log(msg)

        # You should never have this problem; it needs to be reported
        # rather than "fixed"
        codon_count = len(str(rna_sequence)) % 3
        if codon_count != 0:
            msg = "Number of bases for RNA sequence for {} ".format(
                Parent_mRNA)
            msg += "is not divisible by 3. "
            msg += "The resulting protein may well be mis-translated."
            log(msg)
            issues.append(Parent_mRNA)

        protein_sequence = Seq("")
        try:
            protein_sequence = rna_sequence.translate()
        except CodonTable.TranslationError as te:
            # Bug fix: 'feature_object' was undefined here; report the
            # parent mRNA instead
            log("TranslationError for: " + Parent_mRNA, phases, exons,
                " : " + str(te))

        return (locations, dna_sequence.upper(), str(protein_sequence).upper())
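# Illustrative usage sketch for the FASTA/GFF helpers above. This is an
# assumption-laden example, not part of the module: the methods are assumed
# to live on an importer class (called "FastaGFFToGenome" here purely as a
# hypothetical name) constructed with a service config, and all paths and
# names below are placeholders.
def _example_fasta_gff_import(config):
    importer = FastaGFFToGenome(config)  # hypothetical class name

    # Parse the assembly FASTA into contig records with per-contig GC
    # content, MD5s and base counts
    assembly = importer._retrieve_fasta_file(
        "/kb/module/work/tmp/genome.fa",  # placeholder path
        "my_genome", "Arabidopsis thaliana", "Phytozome")

    # Parse the GFF into per-contig feature dictionaries, then group genes,
    # mRNAs, UTRs and CDSs into a hierarchy
    feature_list = importer._retrieve_gff_file(
        "/kb/module/work/tmp/genome.gff3")  # placeholder path
    hierarchy = importer._generate_feature_hierarchy(feature_list)

    # Aggregate CDS/UTR locations and translate a protein per mRNA
    genes, mrnas, cdss = importer._retrieve_genome_feature_list(
        feature_list, hierarchy, assembly)
    return genes, mrnas, cdss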
class ExpressionUtils: ''' Module Name: ExpressionUtils Module Description: A KBase module: ExpressionUtils This module is intended for use by Assemblers to upload RNASeq Expression files (gtf, fpkm and ctab). This module generates the ctab files and tpm data if they are absent. The expression files are uploaded as a single compressed file.This module also generates expression levels and tpm expression levels from the input files and saves them in the workspace object. Once uploaded, the expression files can be downloaded onto an output directory. ''' ######## WARNING FOR GEVENT USERS ####### noqa # Since asynchronous IO can lead to methods - even the same method - # interrupting each other, you must be *very* careful when using global # state. A method could easily clobber the state set by another while # the latter method is running. ######################################### noqa VERSION = "0.1.1" GIT_URL = "https://github.com/JamesJeffryes/ExpressionUtils.git" GIT_COMMIT_HASH = "62ce653aa5c5b39a597486613bc140b173a35c99" #BEGIN_CLASS_HEADER PARAM_IN_SRC_DIR = 'source_dir' PARAM_IN_SRC_REF = 'source_ref' PARAM_IN_DST_REF = 'destination_ref' PARAM_IN_ALIGNMENT_REF = 'alignment_ref' PARAM_IN_GENOME_REF = 'genome_ref' PARAM_IN_ANNOTATION_ID = 'annotation_id' PARAM_IN_BAM_FILE_PATH = 'bam_file_path' PARAM_IN_DESCRIPTION = 'description' PARAM_IN_DATA_QUAL_LEVEL = 'data_quality_level' PARAM_IN_PROC_COMMENTS = 'processing_comments' PARAM_IN_PLATFORM = 'platform' PARAM_IN_MAPPED_SAMPLE_ID = 'mapped_sample_id' PARAM_IN_ORIG_MEDIAN = 'original_median' PARAM_IN_EXT_SRC_DATE = 'external_source_date' PARAM_IN_TRANSCRIPTS = 'transcripts' PARAM_IN_SRC = 'source' def _check_required_param(self, in_params, param_list): """ Check if each of the params in the list are in the input params """ for param in param_list: if (param not in in_params or not in_params[param]): raise ValueError('{} parameter is required'.format(param)) def _proc_ws_obj_params(self, ctx, params): """ Check the validity of workspace and object params and return them """ dst_ref = params.get(self.PARAM_IN_DST_REF) ws_name_id, obj_name_id = os.path.split(dst_ref) if not bool(ws_name_id.strip()) or ws_name_id == '/': raise ValueError("Workspace name or id is required in " + self.PARAM_IN_DST_REF) if not bool(obj_name_id.strip()): raise ValueError("Object name or id is required in " + self.PARAM_IN_DST_REF) dfu = DataFileUtil(self.callback_url) if not isinstance(ws_name_id, int): try: ws_name_id = dfu.ws_name_to_id(ws_name_id) except DFUError as se: prefix = se.message.split('.')[0] raise ValueError(prefix) self.__LOGGER.info('Obtained workspace name/id ' + str(ws_name_id)) return ws_name_id, obj_name_id def _proc_upload_expression_params(self, ctx, params): """ Check the presence and validity of upload expression params """ self._check_required_param(params, [ self.PARAM_IN_DST_REF, self.PARAM_IN_SRC_DIR, self.PARAM_IN_ALIGNMENT_REF ]) ws_name_id, obj_name_id = self._proc_ws_obj_params(ctx, params) source_dir = params.get(self.PARAM_IN_SRC_DIR) if not (os.path.isdir(source_dir)): raise ValueError('Source directory does not exist: ' + source_dir) if not os.listdir(source_dir): raise ValueError('Source directory is empty: ' + source_dir) return ws_name_id, obj_name_id, source_dir def _get_ws_info(self, obj_ref): ws = Workspace(self.ws_url) try: info = ws.get_object_info_new({'objects': [{'ref': obj_ref}]})[0] except WorkspaceError as wse: self.__LOGGER.error('Logging workspace exception') self.__LOGGER.error(str(wse)) raise return info def 
_get_genome_ref(self, assembly_or_genome_ref, params):
        if self.PARAM_IN_GENOME_REF in params and \
                params[self.PARAM_IN_GENOME_REF] is not None:
            return params[self.PARAM_IN_GENOME_REF]

        obj_type = self._get_ws_info(assembly_or_genome_ref)[2]
        if obj_type.startswith('KBaseGenomes.Genome'):
            return assembly_or_genome_ref

        raise ValueError('Alignment object does not contain genome_ref; '
                         '"{}" parameter is required'.format(
                             self.PARAM_IN_GENOME_REF))

    def _get_expression_levels(self, source_dir, genome_ref,
                               transcripts=False):
        fpkm_file_path = os.path.join(source_dir, 'genes.fpkm_tracking')
        if transcripts:
            fpkm_file_path = os.path.join(source_dir, 't_data.ctab')

        if not os.path.isfile(fpkm_file_path):
            raise ValueError('{} file is required'.format(fpkm_file_path))

        id_col = 5 if transcripts else 0
        self.__LOGGER.info(
            'Generating expression levels from {}'.format(fpkm_file_path))
        return self.expression_utils.get_expression_levels(
            fpkm_file_path, genome_ref, id_col)

    def _gen_ctab_files(self, params, alignment_ref):
        source_dir = params.get(self.PARAM_IN_SRC_DIR)
        if len(glob.glob(source_dir + '/*.ctab')) < 5:
            self.__LOGGER.info(' ======= Generating ctab files ==========')
            gtf_file = os.path.join(source_dir, 'transcripts.gtf')
            if not os.path.isfile(gtf_file):
                raise ValueError(
                    "{} file is required to generate ctab files, found missing"
                    .format(gtf_file))

            if self.PARAM_IN_BAM_FILE_PATH in params and \
                    params[self.PARAM_IN_BAM_FILE_PATH] is not None:
                bam_file_path = params[self.PARAM_IN_BAM_FILE_PATH]
            else:
                self.__LOGGER.info(
                    'Downloading bam file from alignment object')
                rau = ReadsAlignmentUtils(self.callback_url)
                alignment_retVal = rau.download_alignment(
                    {'source_ref': alignment_ref})
                alignment_dir = alignment_retVal.get('destination_dir')

                allbamfiles = glob.glob(alignment_dir + '/*.bam')
                if len(allbamfiles) == 0:
                    # Bug fix: 'd' was undefined here
                    raise ValueError(
                        'bam file does not exist in {}'.format(alignment_dir))
                elif len(allbamfiles) == 1:
                    bam_file_path = allbamfiles[0]
                elif len(allbamfiles) > 1:
                    tmp_file_path = os.path.join(alignment_dir,
                                                 'accepted_hits.bam')
                    if os.path.isfile(tmp_file_path):
                        bam_file_path = tmp_file_path
                    else:
                        tmp_file_path = os.path.join(
                            alignment_dir, 'accepted_hits_sorted.bam')
                        if os.path.isfile(tmp_file_path):
                            bam_file_path = tmp_file_path
                        else:
                            raise ValueError(
                                'accepted_hits.bam, accepted_hits_sorted.bam '
                                'or other bam file not found in {}'
                                .format(alignment_dir))

            result = self.table_maker.build_ctab_files(
                ref_genome_path=gtf_file,
                alignment_path=bam_file_path,
                output_dir=source_dir)
            if result != 0:
                raise ValueError('Tablemaker failed')
    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it
    # couldn't be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.__LOGGER = logging.getLogger('ExpressionUtils')
        self.__LOGGER.setLevel(logging.INFO)
        streamHandler = logging.StreamHandler(sys.stdout)
        formatter = logging.Formatter(
            "%(asctime)s - %(filename)s - %(lineno)d - %(levelname)s - %(message)s")
        formatter.converter = time.gmtime
        streamHandler.setFormatter(formatter)
        self.__LOGGER.addHandler(streamHandler)
        self.__LOGGER.info("Logger was set")

        self.config = config
        self.scratch = config['scratch']
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.ws_url = config['workspace-url']
        self.config['SDK_CALLBACK_URL'] = self.callback_url
        self.expression_utils = Expression_Utils(self.config)
        self.dfu = DataFileUtil(self.callback_url)
        self.table_maker = TableMaker(config, self.__LOGGER)
        self.expr_matrix_utils = ExprMatrixUtils(config, self.__LOGGER)
        #END_CONSTRUCTOR
        pass

    def
upload_expression(self, ctx, params): """ Uploads the expression * :param params: instance of type "UploadExpressionParams" (* Required input parameters for uploading a reads expression data string destination_ref - object reference of expression data. The object ref is 'ws_name_or_id/obj_name_or_id' where ws_name_or_id is the workspace name or id and obj_name_or_id is the object name or id string source_dir - directory with the files to be uploaded string alignment_ref - alignment workspace object reference *) -> structure: parameter "destination_ref" of String, parameter "source_dir" of String, parameter "alignment_ref" of String, parameter "genome_ref" of String, parameter "annotation_id" of String, parameter "bam_file_path" of String, parameter "transcripts" of type "boolean" (A boolean - 0 for false, 1 for true. @range (0, 1)), parameter "data_quality_level" of Long, parameter "original_median" of Double, parameter "description" of String, parameter "platform" of String, parameter "source" of String, parameter "external_source_date" of String, parameter "processing_comments" of String :returns: instance of type "UploadExpressionOutput" (* Output from upload expression *) -> structure: parameter "obj_ref" of String """ # ctx is the context object # return variables are: returnVal #BEGIN upload_expression self.__LOGGER.info('Starting upload expression, parsing parameters ') pprint(params) ws_name_id, obj_name_id, source_dir = self._proc_upload_expression_params( ctx, params) alignment_ref = params.get(self.PARAM_IN_ALIGNMENT_REF) try: alignment_obj = self.dfu.get_objects( {'object_refs': [alignment_ref]})['data'][0] except DFUError as e: self.__LOGGER.error( 'Logging stacktrace from workspace exception:\n' + e.data) raise alignment = alignment_obj['data'] assembly_or_genome_ref = alignment['genome_id'] genome_ref = self._get_genome_ref(assembly_or_genome_ref, params) expression_levels, tpm_expression_levels = self._get_expression_levels( source_dir, genome_ref, params.get(self.PARAM_IN_TRANSCRIPTS)) self._gen_ctab_files(params, alignment_ref) uploaded_file = self.dfu.file_to_shock({ 'file_path': source_dir, 'make_handle': 1, 'pack': 'zip' }) """ move the zipfile created in the source directory one level up """ path, dir = os.path.split(source_dir) zipfile = dir + '.zip' if os.path.isfile(os.path.join(source_dir, zipfile)): shutil.move(os.path.join(source_dir, zipfile), os.path.join(path, zipfile)) file_handle = uploaded_file['handle'] file_size = uploaded_file['size'] expression_data = { 'numerical_interpretation': 'FPKM', 'genome_id': genome_ref, 'mapped_rnaseq_alignment': { alignment['read_sample_id']: alignment_ref }, 'condition': alignment['condition'], 'file': file_handle, 'expression_levels': expression_levels, 'tpm_expression_levels': tpm_expression_levels } additional_params = [ self.PARAM_IN_ANNOTATION_ID, self.PARAM_IN_DESCRIPTION, self.PARAM_IN_DATA_QUAL_LEVEL, self.PARAM_IN_PLATFORM, self.PARAM_IN_PROC_COMMENTS, self.PARAM_IN_MAPPED_SAMPLE_ID, self.PARAM_IN_ORIG_MEDIAN, self.PARAM_IN_EXT_SRC_DATE, self.PARAM_IN_SRC ] for opt_param in additional_params: if opt_param in params and params[opt_param] is not None: expression_data[opt_param] = params[opt_param] extra_provenance_input_refs = list() extra_provenance_input_refs.append( params.get(self.PARAM_IN_ALIGNMENT_REF)) if self.PARAM_IN_GENOME_REF in params and params.get( self.PARAM_IN_GENOME_REF) is not None: extra_provenance_input_refs.append( params.get(self.PARAM_IN_GENOME_REF)) self.__LOGGER.info('=========== Adding 
extra_provenance_refs') self.__LOGGER.info(str(extra_provenance_input_refs)) self.__LOGGER.info('==========================================') res = self.dfu.save_objects({ "id": ws_name_id, "objects": [{ "type": "KBaseRNASeq.RNASeqExpression", "data": expression_data, "name": obj_name_id, "extra_provenance_input_refs": extra_provenance_input_refs }] })[0] self.__LOGGER.info('save complete') returnVal = { 'obj_ref': str(res[6]) + '/' + str(res[0]) + '/' + str(res[4]) } self.__LOGGER.info('Uploaded object: ') print(returnVal) #END upload_expression # At some point might do deeper type checking... if not isinstance(returnVal, dict): raise ValueError('Method upload_expression return value ' + 'returnVal is not type dict as required.') # return the results return [returnVal] def download_expression(self, ctx, params): """ Downloads expression * :param params: instance of type "DownloadExpressionParams" (* Required input parameters for downloading expression string source_ref - object reference of expression source. The object ref is 'ws_name_or_id/obj_name_or_id' where ws_name_or_id is the workspace name or id and obj_name_or_id is the object name or id *) -> structure: parameter "source_ref" of String :returns: instance of type "DownloadExpressionOutput" (* The output of the download method. *) -> structure: parameter "destination_dir" of String """ # ctx is the context object # return variables are: returnVal #BEGIN download_expression self.__LOGGER.info('Running download_expression with params:\n' + pformat(params)) inref = params.get(self.PARAM_IN_SRC_REF) if not inref: raise ValueError(self.PARAM_IN_SRC_REF + ' parameter is required') try: expression = self.dfu.get_objects({'object_refs': [inref]})['data'] except DFUError as e: self.__LOGGER.error( 'Logging stacktrace from workspace exception:\n' + e.data) raise # set the output dir timestamp = int( (datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds() * 1000) output_dir = os.path.join(self.scratch, 'download_' + str(timestamp)) os.mkdir(output_dir) file_ret = self.dfu.shock_to_file({ 'shock_id': expression[0]['data']['file']['id'], 'file_path': output_dir, 'unpack': 'unpack' }) if not os.listdir(output_dir): raise ValueError('No files were downloaded: ' + output_dir) for f in glob.glob(output_dir + '/*.zip'): os.remove(f) returnVal = {'destination_dir': output_dir} #END download_expression # At some point might do deeper type checking... if not isinstance(returnVal, dict): raise ValueError('Method download_expression return value ' + 'returnVal is not type dict as required.') # return the results return [returnVal] def export_expression(self, ctx, params): """ Wrapper function for use by in-narrative downloaders to download expressions from shock * :param params: instance of type "ExportParams" (* Required input parameters for exporting expression string source_ref - object reference of expression source. 
The object ref is 'ws_name_or_id/obj_name_or_id' where ws_name_or_id is the workspace name or id and obj_name_or_id is the object name or id *) -> structure: parameter "source_ref" of String :returns: instance of type "ExportOutput" -> structure: parameter "shock_id" of String """ # ctx is the context object # return variables are: output #BEGIN export_expression inref = params.get(self.PARAM_IN_SRC_REF) if not inref: raise ValueError(self.PARAM_IN_SRC_REF + ' parameter is required') try: expression = self.dfu.get_objects({'object_refs': [inref]})['data'] except DFUError as e: self.__LOGGER.error( 'Logging stacktrace from workspace exception:\n' + e.data) raise output = {'shock_id': expression[0]['data']['file']['id']} #END export_expression # At some point might do deeper type checking... if not isinstance(output, dict): raise ValueError('Method export_expression return value ' + 'output is not type dict as required.') # return the results return [output] def get_expressionMatrix(self, ctx, params): """ :param params: instance of type "getExprMatrixParams" (* Following are the required input parameters to get Expression Matrix *) -> structure: parameter "workspace_name" of String, parameter "output_obj_name" of String, parameter "expressionset_ref" of String :returns: instance of type "getExprMatrixOutput" -> structure: parameter "exprMatrix_FPKM_ref" of String, parameter "exprMatrix_TPM_ref" of String """ # ctx is the context object # return variables are: returnVal #BEGIN get_expressionMatrix fpkm_ref, tpm_ref = self.expr_matrix_utils.get_expression_matrix( params) returnVal = { 'exprMatrix_FPKM_ref': fpkm_ref, 'exprMatrix_TPM_ref': tpm_ref } #END get_expressionMatrix # At some point might do deeper type checking... if not isinstance(returnVal, dict): raise ValueError('Method get_expressionMatrix return value ' + 'returnVal is not type dict as required.') # return the results return [returnVal] def status(self, ctx): #BEGIN_STATUS returnVal = { 'state': "OK", 'message': "", 'version': self.VERSION, 'git_url': self.GIT_URL, 'git_commit_hash': self.GIT_COMMIT_HASH } #END_STATUS return [returnVal]
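# A minimal round-trip sketch for ExpressionUtils, assuming a config with
# 'scratch' and 'workspace-url', the SDK_CALLBACK_URL environment variable,
# and an SDK-provided context object (ctx). The refs, names and directories
# below are placeholders, not values confirmed by this module.
def _example_expression_roundtrip(config, ctx):
    eu = ExpressionUtils(config)

    # source_dir must hold genes.fpkm_tracking (or t_data.ctab for
    # transcripts), plus transcripts.gtf if the ctab files need generating
    upload_ret = eu.upload_expression(ctx, {
        'destination_ref': 'my_workspace/my_expression',    # placeholder
        'source_dir': '/kb/module/work/tmp/stringtie_out',  # placeholder
        'alignment_ref': 'my_workspace/my_alignment'        # placeholder
    })[0]

    # Fetch the same object back into scratch space
    download_ret = eu.download_expression(ctx, {
        'source_ref': upload_ret['obj_ref']
    })[0]
    return download_ret['destination_dir']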
class ReadsAlignmentUtils: ''' Module Name: ReadsAlignmentUtils Module Description: A KBase module: ReadsAlignmentUtils This module is intended for use by Aligners and Assemblers to upload and download alignment files. The alignment may be uploaded as a sam or bam file. If a sam file is given, it is converted to the sorted bam format and saved. Upon downloading, optional parameters may be provided to get files in sam and bai formats from the downloaded bam file. This utility also generates stats from the stored alignment. ''' ######## WARNING FOR GEVENT USERS ####### noqa # Since asynchronous IO can lead to methods - even the same method - # interrupting each other, you must be *very* careful when using global # state. A method could easily clobber the state set by another while # the latter method is running. ######################################### noqa VERSION = "0.0.1" GIT_URL = "https://github.com/kbaseapps/ReadsAlignmentUtils.git" GIT_COMMIT_HASH = "a807d122b097a4c6713a81d5a82eef335835f77a" #BEGIN_CLASS_HEADER PARAM_IN_FILE = 'file_path' PARAM_IN_SRC_REF = 'source_ref' PARAM_IN_DST_REF = 'destination_ref' PARAM_IN_CONDITION = 'condition' PARAM_IN_READ_LIB_REF = 'read_library_ref' PARAM_IN_ASM_GEN_REF = 'assembly_or_genome_ref' PARAM_IN_ALIGNED_USING = 'aligned_using' PARAM_IN_ALIGNER_VER = 'aligner_version' PARAM_IN_ALIGNER_OPTS = 'aligner_opts' PARAM_IN_REPLICATE_ID = 'replicate_id' PARAM_IN_PLATFORM = 'platform' PARAM_IN_BOWTIE2_INDEX = 'bowtie2_index' PARAM_IN_SAMPLESET_REF = 'sampleset_ref' PARAM_IN_MAPPED_SAMPLE_ID = 'mapped_sample_id' PARAM_IN_DOWNLOAD_SAM = 'downloadSAM' PARAM_IN_DOWNLOAD_BAI = 'downloadBAI' PARAM_IN_VALIDATE = 'validate' INVALID_WS_OBJ_NAME_RE = re.compile('[^\\w\\|._-]') INVALID_WS_NAME_RE = re.compile('[^\\w:._-]') def _get_file_path_info(self, file_path): """ Given a file path, returns the directory, file name, file base and file extension """ dir, file_name = os.path.split(file_path) file_base, file_ext = os.path.splitext(file_name) return dir, file_name, file_base, file_ext def _mkdir_p(self, path): """ _mkdir_p: make directory for given path """ if not path: return try: os.makedirs(path) except OSError as exc: if exc.errno == errno.EEXIST and os.path.isdir(path): pass else: raise def _check_required_param(self, in_params, param_list): """ Checks if each of the params in the list are in the input params """ for param in param_list: if (param not in in_params or not in_params[param]): raise ValueError('{} parameter is required'.format(param)) def _proc_ws_obj_params(self, ctx, params): """ Checks the validity of workspace and object params and returns them """ dst_ref = params.get(self.PARAM_IN_DST_REF) ws_name_id, obj_name_id = os.path.split(dst_ref) if not bool(ws_name_id.strip()) or ws_name_id == '/': raise ValueError("Workspace name or id is required in " + self.PARAM_IN_DST_REF) if not bool(obj_name_id.strip()): raise ValueError("Object name or id is required in " + self.PARAM_IN_DST_REF) if not isinstance(ws_name_id, int): try: ws_name_id = self.dfu.ws_name_to_id(ws_name_id) except DFUError as se: prefix = se.message.split('.')[0] raise ValueError(prefix) self.__LOGGER.info('Obtained workspace name/id ' + str(ws_name_id)) return ws_name_id, obj_name_id def _get_ws_info(self, obj_ref): ws = Workspace(self.ws_url) try: info = ws.get_object_info_new({'objects': [{'ref': obj_ref}]})[0] except WorkspaceError as wse: self.__LOGGER.error('Logging workspace exception') self.__LOGGER.error(str(wse)) raise return info def _proc_upload_alignment_params(self, 
ctx, params): """ Checks the presence and validity of upload alignment params """ self._check_required_param(params, [ self.PARAM_IN_DST_REF, self.PARAM_IN_FILE, self.PARAM_IN_CONDITION, self.PARAM_IN_READ_LIB_REF, self.PARAM_IN_ASM_GEN_REF ]) ws_name_id, obj_name_id = self._proc_ws_obj_params(ctx, params) file_path = params.get(self.PARAM_IN_FILE) if not (os.path.isfile(file_path)): raise ValueError('File does not exist: ' + file_path) lib_type = self._get_ws_info(params.get(self.PARAM_IN_READ_LIB_REF))[2] if lib_type.startswith('KBaseFile.SingleEndLibrary') or \ lib_type.startswith('KBaseFile.PairedEndLibrary') or \ lib_type.startswith('KBaseAssembly.SingleEndLibrary') or \ lib_type.startswith('KBaseAssembly.PairedEndLibrary'): pass else: raise ValueError(self.PARAM_IN_READ_LIB_REF + ' parameter should be of type' + ' KBaseFile.SingleEndLibrary or' + ' KBaseFile.PairedEndLibrary or' + ' KBaseAssembly.SingleEndLibrary or' + ' KBaseAssembly.PairedEndLibrary') obj_type = self._get_ws_info(params.get(self.PARAM_IN_ASM_GEN_REF))[2] if obj_type.startswith('KBaseGenomes.Genome') or \ obj_type.startswith('KBaseGenomeAnnotations.Assembly') or \ obj_type.startswith('KBaseGenomes.ContigSet'): pass else: raise ValueError(self.PARAM_IN_ASM_GEN_REF + ' parameter should be of type' + ' KBaseGenomes.Genome or' + ' KBaseGenomeAnnotations.Assembly or' + ' KBaseGenomes.ContigSet') return ws_name_id, obj_name_id, file_path, lib_type def _get_aligner_stats(self, bam_file): """ Gets the aligner stats from BAM file """ path, file = os.path.split(bam_file) return self.samtools.get_stats(file, path) def _validate(self, params): samt = SamTools(self.config, self.__LOGGER) if 'ignore' in params: path, file = os.path.split(params['file_path']) rval = samt.validate(ifile=file, ipath=path, ignore=params['ignore']) else: path, file = os.path.split(params['file_path']) rval = samt.validate(ifile=file, ipath=path) return rval #END_CLASS_HEADER # config contains contents of config file in a hash or None if it couldn't # be found def __init__(self, config): #BEGIN_CONSTRUCTOR self.config = config self.__LOGGER = logging.getLogger('KBaseRNASeq') if 'log_level' in config: self.__LOGGER.setLevel(config['log_level']) else: self.__LOGGER.setLevel(logging.INFO) streamHandler = logging.StreamHandler(sys.stdout) formatter = logging.Formatter( "%(asctime)s - %(filename)s - %(lineno)d - \ %(levelname)s - %(message)s") formatter.converter = time.gmtime streamHandler.setFormatter(formatter) self.__LOGGER.addHandler(streamHandler) self.__LOGGER.info("Logger was set") script_utils.check_sys_stat(self.__LOGGER) self.scratch = config['scratch'] self.callback_url = os.environ['SDK_CALLBACK_URL'] self.ws_url = config['workspace-url'] self.dfu = DataFileUtil(self.callback_url) self.samtools = SamTools(config) #END_CONSTRUCTOR pass def validate_alignment(self, ctx, params): """ :param params: instance of type "ValidateAlignmentParams" (* Input parameters for validating a reads alignment. For validation errors to ignore, see http://broadinstitute.github.io/picard/command-line-overview.html#V alidateSamFile) -> structure: parameter "file_path" of String, parameter "ignore" of list of String :returns: instance of type "ValidateAlignmentOutput" (* Results from validate alignment *) -> structure: parameter "validated" of type "boolean" (A boolean - 0 for false, 1 for true. 
@range (0, 1)) """ # ctx is the context object # return variables are: returnVal #BEGIN validate_alignment rval = self._validate(params) if rval == 0: returnVal = {'validated': True} else: returnVal = {'validated': False} #END validate_alignment # At some point might do deeper type checking... if not isinstance(returnVal, dict): raise ValueError('Method validate_alignment return value ' + 'returnVal is not type dict as required.') # return the results return [returnVal] def upload_alignment(self, ctx, params): """ Validates and uploads the reads alignment * :param params: instance of type "UploadAlignmentParams" (* Required input parameters for uploading a reads alignment string destination_ref - object reference of alignment destination. The object ref is 'ws_name_or_id/obj_name_or_id' where ws_name_or_id is the workspace name or id and obj_name_or_id is the object name or id file_path - File with the path of the sam or bam file to be uploaded. If a sam file is provided, it will be converted to the sorted bam format before being saved read_library_ref - workspace object ref of the read sample used to make the alignment file condition - assembly_or_genome_ref - workspace object ref of genome assembly or genome object that was used to build the alignment *) -> structure: parameter "destination_ref" of String, parameter "file_path" of String, parameter "read_library_ref" of String, parameter "condition" of String, parameter "assembly_or_genome_ref" of String, parameter "aligned_using" of String, parameter "aligner_version" of String, parameter "aligner_opts" of mapping from String to String, parameter "replicate_id" of String, parameter "platform" of String, parameter "bowtie2_index" of type "ws_bowtieIndex_id", parameter "sampleset_ref" of type "ws_Sampleset_ref", parameter "mapped_sample_id" of mapping from String to mapping from String to String, parameter "validate" of type "boolean" (A boolean - 0 for false, 1 for true. 
@range (0, 1)), parameter "ignore" of list of String :returns: instance of type "UploadAlignmentOutput" (* Output from uploading a reads alignment *) -> structure: parameter "obj_ref" of String """ # ctx is the context object # return variables are: returnVal #BEGIN upload_alignment self.__LOGGER.info( 'Starting upload Reads Alignment, parsing parameters ') pprint(params) ws_name_id, obj_name_id, file_path, lib_type = self._proc_upload_alignment_params( ctx, params) dir, file_name, file_base, file_ext = self._get_file_path_info( file_path) if self.PARAM_IN_VALIDATE in params and params[ self.PARAM_IN_VALIDATE] is True: if self._validate(params) == 1: raise Exception('{0} failed validation'.format(file_path)) bam_file = file_path if file_ext.lower() == '.sam': bam_file = os.path.join(dir, file_base + '.bam') self.samtools.convert_sam_to_sorted_bam(ifile=file_name, ipath=dir, ofile=bam_file) uploaded_file = self.dfu.file_to_shock({ 'file_path': bam_file, 'make_handle': 1 }) file_handle = uploaded_file['handle'] file_size = uploaded_file['size'] aligner_stats = self._get_aligner_stats(file_path) aligner_data = { 'file': file_handle, 'size': file_size, 'condition': params.get(self.PARAM_IN_CONDITION), 'read_sample_id': params.get(self.PARAM_IN_READ_LIB_REF), 'library_type': lib_type, 'genome_id': params.get(self.PARAM_IN_ASM_GEN_REF), 'alignment_stats': aligner_stats } optional_params = [ self.PARAM_IN_ALIGNED_USING, self.PARAM_IN_ALIGNER_VER, self.PARAM_IN_ALIGNER_OPTS, self.PARAM_IN_REPLICATE_ID, self.PARAM_IN_PLATFORM, self.PARAM_IN_BOWTIE2_INDEX, self.PARAM_IN_SAMPLESET_REF, self.PARAM_IN_MAPPED_SAMPLE_ID ] for opt_param in optional_params: if opt_param in params and params[opt_param] is not None: aligner_data[opt_param] = params[opt_param] self.__LOGGER.info('========= Adding extra_provenance_refs') self.__LOGGER.info(params.get(self.PARAM_IN_READ_LIB_REF)) self.__LOGGER.info(params.get(self.PARAM_IN_ASM_GEN_REF)) self.__LOGGER.info('=======================================') res = self.dfu.save_objects({ "id": ws_name_id, "objects": [{ "type": "KBaseRNASeq.RNASeqAlignment", "data": aligner_data, "name": obj_name_id, "extra_provenance_input_refs": [ params.get(self.PARAM_IN_READ_LIB_REF), params.get(self.PARAM_IN_ASM_GEN_REF) ] }] })[0] self.__LOGGER.info('save complete') returnVal = { 'obj_ref': str(res[6]) + '/' + str(res[0]) + '/' + str(res[4]) } self.__LOGGER.info('Uploaded object: ') self.__LOGGER.info(returnVal) #END upload_alignment # At some point might do deeper type checking... if not isinstance(returnVal, dict): raise ValueError('Method upload_alignment return value ' + 'returnVal is not type dict as required.') # return the results return [returnVal] def download_alignment(self, ctx, params): """ Downloads alignment files in .bam, .sam and .bai formats. Also downloads alignment stats * :param params: instance of type "DownloadAlignmentParams" (* Required input parameters for downloading a reads alignment string source_ref - object reference of alignment source. The object ref is 'ws_name_or_id/obj_name_or_id' where ws_name_or_id is the workspace name or id and obj_name_or_id is the object name or id *) -> structure: parameter "source_ref" of String, parameter "downloadSAM" of type "boolean" (A boolean - 0 for false, 1 for true. @range (0, 1)), parameter "downloadBAI" of type "boolean" (A boolean - 0 for false, 1 for true. @range (0, 1)), parameter "validate" of type "boolean" (A boolean - 0 for false, 1 for true. 
@range (0, 1)), parameter "ignore" of list of String :returns: instance of type "DownloadAlignmentOutput" (* The output of the download method. *) -> structure: parameter "destination_dir" of String, parameter "stats" of type "AlignmentStats" -> structure: parameter "properly_paired" of Long, parameter "multiple_alignments" of Long, parameter "singletons" of Long, parameter "alignment_rate" of Double, parameter "unmapped_reads" of Long, parameter "mapped_reads" of Long, parameter "total_reads" of Long """ # ctx is the context object # return variables are: returnVal #BEGIN download_alignment self.__LOGGER.info('Running download_alignment with params:\n' + pformat(params)) inref = params.get(self.PARAM_IN_SRC_REF) if not inref: raise ValueError('{} parameter is required'.format( self.PARAM_IN_SRC_REF)) try: alignment = self.dfu.get_objects({'object_refs': [inref]})['data'] except DFUError as e: self.__LOGGER.error( 'Logging stacktrace from workspace exception:\n' + e.data) raise # set the output dir uuid_str = str(uuid.uuid4()) output_dir = os.path.join(self.scratch, 'download_' + uuid_str) self._mkdir_p(output_dir) file_ret = self.dfu.shock_to_file({ 'shock_id': alignment[0]['data']['file']['id'], 'file_path': output_dir }) if zipfile.is_zipfile(file_ret.get('file_path')): with zipfile.ZipFile(file_ret.get('file_path')) as z: z.extractall(output_dir) for f in glob.glob(output_dir + '/*.zip'): os.remove(f) bam_files = glob.glob(output_dir + '/*.bam') uuid_prefix = uuid_str[:8] if len(bam_files) == 0: raise ValueError("Alignment object does not contain a bam file") for bam_file_path in bam_files: dir, file_name, file_base, file_ext = self._get_file_path_info( bam_file_path) if params.get(self.PARAM_IN_VALIDATE, False): validate_params = {'file_path': bam_file_path} if self._validate(validate_params) == 1: raise Exception( '{0} failed validation'.format(bam_file_path)) if params.get('downloadBAI', False): bai_file = uuid_prefix + '_' + file_base + '.bai' bai_file_path = os.path.join(output_dir, bai_file) self.samtools.create_bai_from_bam(ifile=file_name, ipath=output_dir, ofile=bai_file) if not os.path.isfile(bai_file_path): raise ValueError('Error creating {}'.format(bai_file_path)) if params.get('downloadSAM', False): sam_file = uuid_prefix + '_' + file_base + '.sam' sam_file_path = os.path.join(output_dir, sam_file) self.samtools.convert_bam_to_sam(ifile=file_name, ipath=output_dir, ofile=sam_file) if not os.path.isfile(sam_file_path): raise ValueError('Error creating {}'.format(sam_file_path)) returnVal = { 'destination_dir': output_dir, 'stats': alignment[0]['data']['alignment_stats'] } #END download_alignment # At some point might do deeper type checking... if not isinstance(returnVal, dict): raise ValueError('Method download_alignment return value ' + 'returnVal is not type dict as required.') # return the results return [returnVal] def export_alignment(self, ctx, params): """ Wrapper function for use by in-narrative downloaders to download alignments from shock * :param params: instance of type "ExportParams" (* Required input parameters for exporting a reads alignment string source_ref - object reference of alignment source. The object ref is 'ws_name_or_id/obj_name_or_id' where ws_name_or_id is the workspace name or id and obj_name_or_id is the object name or id *) -> structure: parameter "source_ref" of String, parameter "exportSAM" of type "boolean" (A boolean - 0 for false, 1 for true. @range (0, 1)), parameter "exportBAI" of type "boolean" (A boolean - 0 for false, 1 for true. 
@range (0, 1)), parameter "validate" of type "boolean" (A boolean - 0 for false, 1 for true. @range (0, 1)), parameter "ignore" of list of String :returns: instance of type "ExportOutput" -> structure: parameter "shock_id" of String """ # ctx is the context object # return variables are: output #BEGIN export_alignment inref = params.get(self.PARAM_IN_SRC_REF) if not inref: raise ValueError('{} parameter is required'.format( self.PARAM_IN_SRC_REF)) if params.get(self.PARAM_IN_VALIDATE, False) or \ params.get('exportBAI', False) or \ params.get('exportSAM', False): """ Need to validate or convert files. Use download_alignment """ download_params = {} for key, val in params.iteritems(): download_params[key.replace('export', 'download')] = val download_retVal = self.download_alignment(ctx, download_params)[0] export_dir = download_retVal['destination_dir'] # package and load to shock ret = self.dfu.package_for_download({ 'file_path': export_dir, 'ws_refs': [inref] }) output = {'shock_id': ret['shock_id']} else: """ return shock id from the object """ try: alignment = self.dfu.get_objects({'object_refs': [inref]})['data'] except DFUError as e: self.__LOGGER.error( 'Logging stacktrace from workspace exception:\n' + e.data) raise output = {'shock_id': alignment[0]['data']['file']['id']} #END export_alignment # At some point might do deeper type checking... if not isinstance(output, dict): raise ValueError('Method export_alignment return value ' + 'output is not type dict as required.') # return the results return [output] def status(self, ctx): #BEGIN_STATUS returnVal = { 'state': "OK", 'message': "", 'version': self.VERSION, 'git_url': self.GIT_URL, 'git_commit_hash': self.GIT_COMMIT_HASH } #END_STATUS return [returnVal]
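# A minimal round-trip sketch for ReadsAlignmentUtils, assuming a config with
# 'scratch' and 'workspace-url', the SDK_CALLBACK_URL environment variable,
# and an SDK-provided context object (ctx). Refs and paths are placeholders.
def _example_alignment_roundtrip(config, ctx):
    rau = ReadsAlignmentUtils(config)

    # A .sam input is converted to sorted .bam before saving. Note the
    # uploader checks "params['validate'] is True", so pass a real bool.
    upload_ret = rau.upload_alignment(ctx, {
        'destination_ref': 'my_workspace/my_alignment',        # placeholder
        'file_path': '/kb/module/work/tmp/accepted_hits.sam',  # placeholder
        'read_library_ref': 'my_workspace/my_reads',           # placeholder
        'condition': 'wild_type',
        'assembly_or_genome_ref': 'my_workspace/my_genome',    # placeholder
        'validate': True
    })[0]

    # Download the bam back, plus derived .sam and .bai files
    download_ret = rau.download_alignment(ctx, {
        'source_ref': upload_ret['obj_ref'],
        'downloadSAM': 1,
        'downloadBAI': 1
    })[0]
    return download_ret['destination_dir'], download_ret['stats']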
class Utils: def __init__(self, config): self.cfg = config self.scratch = config['scratch'] self.callback_url = os.environ['SDK_CALLBACK_URL'] self.dfu = DataFileUtil(self.callback_url) self.kbse = KBaseSearchEngine(config['search-url']) self.gen_api = GenericsAPI(self.callback_url) self.DEFAULT_ONTOLOGY_REF = "KbaseOntologies/Custom" self.DEFAULT_ONTOLOGY_ID = "Custom:Term" self.DEFAULT_UNIT_ID = "Custom:Unit" @staticmethod def validate_params(params, expected, opt_param=set()): """Validates that required parameters are present. Warns if unexpected parameters appear""" expected = set(expected) opt_param = set(opt_param) pkeys = set(params) if expected - pkeys: raise ValueError( "Required keys {} not in supplied parameters".format( ", ".join(expected - pkeys))) defined_param = expected | opt_param for param in params: if param not in defined_param: logging.warning( "Unexpected parameter {} supplied".format(param)) def get_conditions(self, params): data = self.dfu.get_objects( {'object_refs': [params['condition_set_ref']]})['data'][0]['data'] conditions = {} keep_keys = params.get('conditions', data['conditions'].keys()) for key in keep_keys: conditions[key] = defaultdict(list) for factor, val in zip(data['factors'], data['conditions'][key]): ont_abriv = factor['factor_ont_id'].split(":")[0] factor['value'] = val conditions[key][ont_abriv].append(copy.copy(factor)) return {"conditions": conditions} def file_to_condition_set(self, params): """Convert a user supplied file to a compound set""" if 'input_file_path' in params: scratch_file_path = params['input_file_path'] elif 'input_shock_id' in params: scratch_file_path = self.dfu.shock_to_file({ 'shock_id': params['input_shock_id'], 'file_path': self.scratch }).get('file_path') else: raise ValueError( "Must supply either a input_shock_id or input_file_path") try: df = pd.read_excel(scratch_file_path, dtype='str') except XLRDError: df = pd.read_csv(scratch_file_path, sep="\t", dtype='str') comp_set = self._df_to_cs_obj(df) info = self.dfu.save_objects({ "id": params['output_ws_id'], "objects": [{ "type": "KBaseExperiments.ConditionSet", "data": comp_set, "name": params['output_obj_name'] }] })[0] return {"condition_set_ref": "%s/%s/%s" % (info[6], info[0], info[4])} def _conditionset_data_to_df(self, data): """ Converts a compound set object data to a dataframe """ factors = pd.DataFrame(data['factors']) factors.rename(columns=lambda x: x.replace("ont", "ontology"). 
capitalize().replace("_", " "), inplace=True)
        # Bug fix: without inplace=True (or assignment) the rename result
        # was silently discarded
        conditions = pd.DataFrame(data['conditions'])
        cs_df = factors.join(conditions)
        return cs_df

    def _clusterset_data_to_df(self, data):
        """
        Converts a cluster set object data to a dataframe
        """
        original_matrix_ref = data.get('original_data')
        data_matrix = self.gen_api.fetch_data({
            'obj_ref': original_matrix_ref
        }).get('data_matrix')

        data_df = pd.read_json(data_matrix)
        clusters = data.get('clusters')

        id_name_list = [
            cluster.get('id_to_data_position').keys() for cluster in clusters
        ]
        id_names = [item for sublist in id_name_list for item in sublist]

        if set(data_df.columns.tolist()) == set(id_names):
            # cluster is based on condition data
            data_df = data_df.T

        cluster_names = [None] * data_df.index.size
        cluster_id = 0
        for cluster in clusters:
            item_ids = cluster.get('id_to_data_position').keys()
            item_idx = [data_df.index.get_loc(item_id)
                        for item_id in item_ids]
            for idx in item_idx:
                cluster_names[idx] = cluster_id
            cluster_id += 1

        data_df['cluster'] = cluster_names
        return data_df

    def _ws_obj_to_df(self, input_ref):
        """Converts workspace obj to a dataframe"""
        res = self.dfu.get_objects({'object_refs': [input_ref]})['data'][0]
        name = res['info'][1]
        obj_type = res['info'][2]

        if "KBaseExperiments.ConditionSet" in obj_type:
            cs_df = self._conditionset_data_to_df(res['data'])
        elif "KBaseExperiments.ClusterSet" in obj_type:
            cs_df = self._clusterset_data_to_df(res['data'])
        else:
            err_msg = 'Ooops! [{}] is not supported.\n'.format(obj_type)
            err_msg += 'Please supply KBaseExperiments.ConditionSet or KBaseExperiments.ClusterSet'
            # Bug fix: raise the message variable, not the literal string
            # "err_msg"
            raise ValueError(err_msg)
        return name, cs_df, obj_type

    def _df_to_cs_obj(self, cs_df):
        """Converts a dataframe from a user file to a compound set object"""
        condition_set = {'ontology_mapping_method': "User Curation"}
        cs_df.fillna('', inplace=True)
        if not len(cs_df):
            raise ValueError("No factors in supplied files")

        factor_df = cs_df.filter(regex="[Uu]nit|[Ff]actor")
        condition_df = cs_df.drop(factor_df.columns, axis=1)
        if not len(condition_df.columns):
            raise ValueError(
                "Unable to find any condition columns in supplied file")

        factor_df.rename(
            columns=lambda x: x.lower().replace(" ontology ", "_ont_").strip(),
            inplace=True)
        if "factor" not in factor_df.columns:
            raise ValueError(
                "Unable to find a 'Factor' column in supplied file")

        factor_fields = ('factor', 'unit', 'factor_ont_id', 'unit_ont_id')
        factors = factor_df.filter(items=factor_fields).to_dict('records')

        condition_set['factors'] = [
            self._add_ontology_info(f) for f in factors
        ]
        condition_set['conditions'] = condition_df.to_dict('list')
        return condition_set

    def _search_ontologies(self, term, closest=False):
        """
        Match to an existing KBase ontology term
        :param term: Text to match
        :param closest: if false, term must exactly match an ontology ID
        :return: dict(ontology_ref, id)
        """
        params = {
            "object_types": ["OntologyTerm"],
            "match_filter": {
                "lookup_in_keys": {
                    "id": {"value": term}
                }
            },
            "access_filter": {
                "with_private": 0,
                "with_public": 1
            },
            "pagination": {"count": 1},
            "post_processing": {"skip_data": 1}
        }
        if closest:
            params['match_filter'] = {"full_text_in_all": term}
        res = self.kbse.search_objects(params)
        if not res['objects']:
            return None
        term = res['objects'][0]
        return {
            "ontology_ref": term['guid'].split(":")[1],
            "id": term['key_props']['id']
        }

    def _add_ontology_info(self, factor):
        """Searches KBase ontologies for terms matching the user supplied factors and units.
Add the references if found""" optionals = { "unit", "unit_ont_id", "unit_ont_ref", } factor = { k: v for k, v in factor.items() if k not in optionals or v != "" } ont_info = self._search_ontologies( factor.get('factor_ont_id', "").replace("_", ":")) if ont_info: factor['factor_ont_ref'] = ont_info['ontology_ref'] factor['factor_ont_id'] = ont_info['id'] else: factor['factor_ont_ref'] = self.DEFAULT_ONTOLOGY_REF factor['factor_ont_id'] = self.DEFAULT_ONTOLOGY_ID if factor.get('unit'): ont_info = self._search_ontologies( factor.get('unit_ont_id', '').replace("_", ":")) if ont_info: factor['unit_ont_ref'] = ont_info['ontology_ref'] factor['unit_ont_id'] = ont_info['id'] else: factor['unit_ont_ref'] = self.DEFAULT_ONTOLOGY_REF factor['unit_ont_id'] = self.DEFAULT_UNIT_ID return factor def to_tsv(self, params): """Convert an compound set to TSV file""" files = {} _id, df, obj_type = self._ws_obj_to_df(params['input_ref']) files['file_path'] = os.path.join(params['destination_dir'], _id + ".tsv") df.to_csv(files['file_path'], sep="\t", index=False) return _id, files def to_excel(self, params): """Convert an compound set to Excel file""" files = {} _id, df, obj_type = self._ws_obj_to_df(params['input_ref']) files['file_path'] = os.path.join(params['destination_dir'], _id + ".xlsx") writer = pd.ExcelWriter(files['file_path']) if "KBaseExperiments.ConditionSet" in obj_type: df.to_excel(writer, "Conditions", index=False) elif "KBaseExperiments.ClusterSet" in obj_type: df.to_excel(writer, "ClusterSet", index=True) # else is checked in `_ws_obj_to_df` writer.save() return _id, files def export(self, file, name, input_ref): """Saves a set of files to SHOCK for export""" export_package_dir = os.path.join(self.scratch, name + str(uuid.uuid4())) os.makedirs(export_package_dir) shutil.move(file, os.path.join(export_package_dir, os.path.basename(file))) # package it up and be done package_details = self.dfu.package_for_download({ 'file_path': export_package_dir, 'ws_refs': [input_ref] }) return {'shock_id': package_details['shock_id']}
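# A minimal sketch of the ConditionSet import/export path in Utils above,
# assuming a config with 'scratch' and 'search-url' plus SDK_CALLBACK_URL in
# the environment. The file path, workspace id and object name below are
# placeholders.
def _example_condition_set_import(config):
    utils = Utils(config)

    # Import a TSV (or Excel) file of factors and conditions; ontology terms
    # are matched via KBaseSearchEngine where possible
    result = utils.file_to_condition_set({
        'input_file_path': '/kb/module/work/tmp/conditions.tsv',  # placeholder
        'output_ws_id': 12345,                                    # placeholder
        'output_obj_name': 'my_condition_set'
    })

    # Export the saved object back out as a TSV
    name, files = utils.to_tsv({
        'input_ref': result['condition_set_ref'],
        'destination_dir': '/kb/module/work/tmp'
    })
    return files['file_path']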
class EmmaxUtil:
    def __init__(self, config):
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['token']
        self.shock_url = config['shock-url']
        self.scratch = os.path.join(config['scratch'],
                                    'emmax_assoc_' + str(uuid.uuid4()))
        os.mkdir(self.scratch)
        self.config = config
        self.dfu = DataFileUtil(self.callback_url)
        self.kbr = KBaseReport(self.callback_url, token=self.config['token'])

    def _validate_phenotype_file(self, phenotype_filepath, tfam_filepath,
                                 case_control=False):
        # TODO: check if case/control. If so, recode to 2/1 if necessary.
        # TODO: verify the number of samples in the pheno file matches the tfam count.
        # TODO: verify the order of the two files matches. If not, reorder the pheno file.
        # TODO: verify the FIDs and within-family IDs (IIDs) match the tfam.
        pass

    def _validate_emmax_params(self, params):
        # TODO: all manner of param validation
        pass

    def _create_tsv_file(self, top_snp_filepath, tsv_filename):
        log("Generating tsv file from {}".format(top_snp_filepath))
        cols = ['SNP', 'CHR', 'BP', 'P']
        snpData = pd.read_csv(top_snp_filepath, delimiter='\t', header=None,
                              names=['ID', 'SE', 'p'])
        tsvData = pd.DataFrame(columns=cols)
        tsvData['SNP'] = snpData.ID
        # parse the chromosome from the SNP ID (second character)
        tsvData['CHR'] = snpData.ID.str[1:2]
        tsvData['P'] = snpData.p
        tsvData.sort_values(by='CHR', inplace=True)
        # number the SNPs sequentially within each chromosome to serve as
        # base positions for plotting
        bps = []
        current_chr = tsvData.iloc[0]['CHR']
        count = 1
        for idx, row in tsvData.iterrows():
            if current_chr != row['CHR']:
                current_chr = row['CHR']
                count = 1
            bps.append(count)
            count = count + 1
        tsvData['BP'] = bps
        tsv_filepath = os.path.join(self.scratch, tsv_filename)
        tsvData.to_csv(tsv_filepath, sep='\t', index=False)
        return tsv_filepath

    def _copyDirectory(self, src, dest):
        try:
            shutil.copytree(src, dest)
        # Raised when the source and destination directories are the same
        except shutil.Error as e:
            print('Directory not copied. Error: %s' % e)
        # Raised when the source directory doesn't exist
        except OSError as e:
            print('Directory not copied. Error: %s' % e)
    def _run_subprocess(self, command, print_output=False, use_shell=False):
        log("Executing command:\n{}\n".format(command))
        p = subprocess.Popen(command,
                             cwd=self.scratch,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=use_shell)
        output, errors = p.communicate()
        if print_output:
            print("Command output:\n{}".format(output))
        if errors:
            # stderr is merged into stdout above, so this is normally None;
            # kept for the case where that redirection is removed
            print("Command error output:\n{}".format(errors))
        if p.returncode != 0:
            error_msg = "Error running command:\n{}\nReturn code: {}".format(
                command, p.returncode)
            raise ValueError(error_msg)

    def _download_variation_file(self, variation_obj_ref):
        log("Retrieving Variation Object {}...".format(variation_obj_ref))
        try:
            variation_shock_id = self.dfu.get_objects({
                'object_refs': [variation_obj_ref]
            })['data'][0]['data']['variation_file_reference']
            self.dfu.shock_to_file({
                'shock_id': variation_shock_id,
                'file_path': self.scratch,
                'unpack': 'unpack'
            })
        except Exception as e:
            log("Error while retrieving Variation Object {}".format(
                variation_obj_ref))
            log(e)
            raise ValueError(e)
        variation_filename = [f for f in os.listdir(self.scratch)
                              if f.endswith('.vcf')][0]
        variation_filepath = os.path.join(self.scratch, variation_filename)
        log("Variation file successfully downloaded to {}".format(
            variation_filepath))
        return variation_filepath

    def _move_phenotype_data(self, pheno_filename):
        """This is here until we get a Phenotype/Trait object working"""
        pheno_filepath = os.path.join(self.scratch, pheno_filename)
        shutil.copy('/kb/module/data/' + pheno_filename, pheno_filepath)
        return pheno_filepath

    def _convert_vcf_to_plink(self, variation_filepath, fam_id='--double-id',
                              plink_file_prefix='plink_out'):
        # FIXME: This function should probably be in VCF utils
        log("Generating PLINK .tfam and .tped from VCF...")
        plink_cmd = ['plink']
        plink_cmd.append('--vcf')
        plink_cmd.append(variation_filepath)
        plink_cmd.append('--recode12')
        plink_cmd.append('transpose')
        plink_cmd.append('--output-missing-genotype')
        plink_cmd.append('0')
        plink_cmd.append(fam_id)
        plink_cmd.append('--out')
        plink_cmd.append(plink_file_prefix)
        self._run_subprocess(plink_cmd, print_output=True)
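    # For reference, with the default arguments the command assembled above is
    # equivalent to running (the VCF path is illustrative):
    #
    #     plink --vcf variation.vcf --recode12 transpose \
    #           --output-missing-genotype 0 --double-id --out plink_out
    #
    # i.e. recode genotypes as 1/2, emit transposed .tped/.tfam output, code
    # missing genotypes as 0, and use the sample ID for both FID and IID.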
Aborting") raise ValueError("Invalid matrix type specified") kinship_cmd.append(tped_prefix) self._run_subprocess(kinship_cmd, print_output=True) kinship_matrix_filename = [f for f in os.listdir(self.scratch) if f.endswith('.kinf')][0] kinship_matrix_filepath = os.path.join(self.scratch, kinship_matrix_filename) log("Variation file successfully downloaded to {}".format(kinship_matrix_filepath)) return kinship_matrix_filepath def _emmax_association(self, plink_prefix, pheno_filepath, kinship_filepath, emmax_params): log("Running EMMAX association analysis...") emmax_cmd = ['emmax'] for param in emmax_params: emmax_cmd.append(param) emmax_cmd.append('-t') emmax_cmd.append(plink_prefix) emmax_cmd.append('-p') emmax_cmd.append(pheno_filepath) emmax_cmd.append('-k') emmax_cmd.append(kinship_filepath) emmax_cmd.append('-o') emmax_cmd.append(plink_prefix) self._run_subprocess(emmax_cmd) emmax_filenames = [f for f in os.listdir(self.scratch) if f.endswith('.reml') or f.endswith('.ps')] return emmax_filenames def _select_top_snps(self, count, ps_filepath, output_filepath): #select_cmd = ['awk', '{print $NF, $0}', ps_filepath, "|", "sort", "-n", "|", "cut", "-f2-", "-d' '"] select_cmd = ["awk '{{print $NF,$0}}' {} | sort -n | cut -f2- -d' ' | sed -n -e '1,{}p' > {}".format(ps_filepath, str(count), output_filepath)] self._run_subprocess(select_cmd, print_output=False, use_shell=True) def _generate_output_files(self): log('Ziping EMMAX .reml and .ps files...') output_files = list() allowed_extensions = ['.ps', '.reml', '.tsv'] result_file = os.path.join(self.scratch, 'emmax_results.zip') with zipfile.ZipFile(result_file, 'w', zipfile.ZIP_DEFLATED, allowZip64=True) as zip_file: for root, dirs, files in os.walk(self.scratch): for file in files: if (file.endswith(tuple(allowed_extensions))): if file in zip_file.namelist(): continue zip_file.write(os.path.join(root, file), file) output_files.append({'path': result_file, 'name': os.path.basename(result_file), 'label': os.path.basename(result_file), 'description': 'File(s) generated by EMMAX'}) log("Importer output generated: {}".format(output_files)) return output_files def _generate_html_report(self, template_dir, tsv_filepath): log("Generating HTML report...") html_report = [] output_dir = os.path.join(self.scratch, 'html') self._copyDirectory(template_dir, output_dir) result_file_path = os.path.join(output_dir, 'index.html') shutil.copyfile(tsv_filepath, os.path.join(output_dir, 'emmax_top.tsv')) report_shock_id = self.dfu.file_to_shock({'file_path': output_dir, 'pack': 'zip'})['shock_id'] html_report.append({'shock_id': report_shock_id, 'name': os.path.basename(result_file_path), 'label': os.path.basename(result_file_path), 'description': 'Manhattan plot'}) return html_report def run_emmax_association(self, params): variation_filepath = self._download_variation_file(params['variation_obj_ref']) self._convert_vcf_to_plink(variation_filepath, '--double-id', params['output_file_prefix']) pheno_filepath = self._move_phenotype_data('flcReordered.pheno') kinship_filepath = self._generate_kinship_matrix(params['output_file_prefix']) emmax_params = ['-v', '-d', '10'] emmax_assoc_files = self._emmax_association(params['output_file_prefix'], pheno_filepath, kinship_filepath, emmax_params) full_result_filepath = os.path.join(self.scratch, params['output_file_prefix'] + '.ps') top_snp_filepath = os.path.join(self.scratch, TOP_SNP_FN) self._select_top_snps(params['snp_return_count'], full_result_filepath, top_snp_filepath) tsv_filepath = 
        output_html_files = self._generate_html_report(TEMPLATE_DIRECTORY,
                                                       tsv_filepath)
        output_emmax_files = self._generate_output_files()
        report_params = {
            'message': '',
            'workspace_name': params.get('workspace_name'),
            'file_links': output_emmax_files,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 333,
            'report_object_name': 'emmax_assoc_html_report_' + str(uuid.uuid4())
        }
        kbr_output = self.kbr.create_extended_report(report_params)
        report_output = {
            'report_name': kbr_output['name'],
            'report_ref': kbr_output['ref'],
        }
        log("EMMAX report generated successfully!")
        return report_output
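# Hedged usage sketch (illustrative only, not part of the module): how
# run_emmax_association might be driven from an SDK method. The config keys
# mirror __init__ above; the object ref and workspace name are hypothetical,
# and TOP_SNP_FN / TEMPLATE_DIRECTORY are assumed to be module-level constants
# defined elsewhere in this file.
def _example_run_emmax(config):
    util = EmmaxUtil(config)
    return util.run_emmax_association({
        'variation_obj_ref': '1/2/3',        # hypothetical Variation object ref
        'output_file_prefix': 'plink_out',   # prefix for .tped/.tfam/.ps files
        'snp_return_count': 1000,            # keep the 1000 lowest p-values
        'workspace_name': 'my_workspace',    # hypothetical workspace
    })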