def collect(self):
    """
    Run the Ballgown differential-expression step and publish the results.

    Inputs come from self.method_params / self.common_params (workspace
    client, handle-service client, auth token) and self.directory. Steps:
      1. download the StringTie expression set data for Ballgown,
      2. write the sample/group table and run the Ballgown R script,
      3. save the differential-expression object to the workspace,
      4. filter the FPKM expression matrix by the diff-exp cutoffs and save it,
      5. save a volcano-plot report object.
    The result summary is stored on self.returnVal (method returns None).

    Raises:
        Exception: if the expression matrix cannot be fetched or the
            filtered matrix cannot be saved.
    """
    params = self.method_params
    ws_client = self.common_params['ws_client']
    hs_client = self.common_params['hs_client']
    ws_id = params['ws_id']
    # Hard-coded rather than taken from common_params --
    # NOTE(review): presumably the deployed container path; confirm.
    #rscripts_dir = self.common_params['rscripts_dir']
    rscripts_dir = '/kb/module/rscripts'
    token = self.common_params['user_token']
    diffexp_dir = self.directory
    logger = self.logger
    logger.info('in DiffExpforBallgown.collect, method params (params) are')
    logger.info(pformat(params))

    output_object_name = params['output_obj_name']
    output_csv = "ballgown_diffexp.tsv"
    volcano_plot_file = "volcano_plot.png"
    stringtie_dir_prefix = "StringTie_outdir_"

    #
    # 1) need a pattern RE to match all the StringTie subdirs, so prefix all
    #    unzipped dirs with "stringtie_out_"
    # 2) need a group identifier string i.e. "111000"
    #
    ballgown_set_info = rnaseq_util.get_info_and_download_for_ballgown(
        logger, ws_client, hs_client, ws_id, self.urls, diffexp_dir,
        stringtie_dir_prefix, params['expressionset_id'], token)
    logger.info('back from download_for_ballgown(), ballgown_set_info are')
    logger.info(pformat(ballgown_set_info))

    sample_dir_group_file = "sample_dir_group_table"  # output file
    group_list = rnaseq_util.create_sample_dir_group_file(
        logger, ws_client, ws_id, ballgown_set_info['subdirs'],
        params['group_name1'], params['expr_ids1'], params['group_name2'],
        params['expr_ids2'], sample_dir_group_file)

    ballgown_output_dir = os.path.join(diffexp_dir, "ballgown_out")
    logger.info("ballgown output dir is {0}".format(ballgown_output_dir))
    handler_util.setupWorkingDir(logger, ballgown_output_dir)

    logger.info("about to run_ballgown_diff_exp")
    rnaseq_util.run_ballgown_diff_exp(logger, rscripts_dir, diffexp_dir,
                                      sample_dir_group_file,
                                      ballgown_output_dir, output_csv,
                                      volcano_plot_file)

    logger.info(
        "back from run_ballgown_diff_exp, about to load diff exp matrix file")
    # read file before its zipped
    diff_expr_matrix = rnaseq_util.load_diff_expr_matrix(
        ballgown_output_dir, output_csv)

    logger.info("about to load ballgout output into workspace")
    de_ws_save_obj_data = rnaseq_util.load_ballgown_output_into_ws(
        logger, ws_id, ws_client, hs_client, token, diffexp_dir,
        ballgown_output_dir, self.details["used_tool"],
        self.details["tool_version"],
        ballgown_set_info['sample_expression_ids'],  # for sample ids? Is this good?
        group_list,                                  # conditions
        ballgown_set_info['genome_id'],              # genome_id
        ballgown_set_info['expressionset_id'],       # expressionset_id
        ballgown_set_info['alignmentSet_id'],        # alignmentset_id
        ballgown_set_info['sampleset_id'],           # sampleset_id
        output_object_name)
    logger.info(
        "back from loading ballgown output into workspace, object save data is ")
    logger.info(pformat(de_ws_save_obj_data))

    # Cap on the number of genes kept by the filter; default is "no cap".
    # (was: nested `'maximum_num_genes' in params` + `!= None` checks;
    # params.get() keeps identical semantics)
    max_num_genes = sys.maxint  # default
    if params.get('maximum_num_genes') is not None:
        max_num_genes = params['maximum_num_genes']

    # this returns a list of gene ids passing the specified cuts, ordered by
    # descending fold_change
    selected_gene_list = rnaseq_util.filter_genes_diff_expr_matrix(
        diff_expr_matrix, params['fold_scale_type'], params['alpha_cutoff'],
        params['fold_change_cutoff'], max_num_genes)
    # !!!!! TODO(review): if selected_gene_list is empty, print some kind of
    # message and take no further action instead of proceeding.

    # get the unfiltered expression matrix
    expression_set_id_name = script_util.ws_get_obj_name(
        logger, ws_client, ws_id, params['expressionset_id'])
    em_name = expression_set_id_name + "_FPKM_ExpressionMatrix"
    logger.info("about to fetch expression matrix {0}".format(em_name))
    try:
        #emw = ws_client.get_objects( [ { "name": em_name, "workspace": ws_id } ] )[0]
        emw = script_util.ws_get_obj(logger, ws_client, ws_id, em_name)[0]
    except Exception:  # was a bare except:, which also swallowed ^C / SystemExit
        raise Exception(
            "unable to retrieve expression matrix object {0} from workspace {1}"
            .format(em_name, ws_id))
    emo = emw["data"]

    # filter it
    filtered_emo = rnaseq_util.filter_expr_matrix_object(
        emo, selected_gene_list)

    # save it
    logger.info("saving emo em_name {0}".format(
        params["filtered_expr_matrix"]))
    try:
        ret = ws_client.save_objects({
            'workspace': ws_id,
            'objects': [{
                'type': 'KBaseFeatureValues.ExpressionMatrix',
                'data': filtered_emo,
                'name': params["filtered_expr_matrix"]
            }]
        })
    except Exception:  # was a bare except:
        raise Exception("failed to save object ")
    logger.info("ws save return:\n" + pformat(ret))

    logger.info("saving volcano plot as report object")
    report_object_name = expression_set_id_name + "_plot_report"
    output_obj_ref = script_util.ws_get_ref(logger, ws_client, ws_id,
                                            output_object_name)
    em_obj_ref = script_util.ws_get_ref(logger, ws_client, ws_id,
                                        params["filtered_expr_matrix"])
    plot_report_object_name = rnaseq_util.create_and_save_volcano_plot_report(
        logger, ws_client, ws_id, self.urls['callback_url'], token,
        ballgown_output_dir, volcano_plot_file, output_obj_ref, em_obj_ref,
        report_object_name)

    # TODO: the filtered-matrix name and remaining filter parameters should
    # come from the spec file (see VCS history for the removed
    # create_and_save_filtered_expr_matrix stub).
    # plot_report_object_name is indexed like a workspace object_info tuple:
    # [0]=objid, [1]=name, [4]=version, [6]=wsid -- presumably; confirm
    # against create_and_save_volcano_plot_report.
    returnVal = {
        'diff_expr_object': output_object_name,
        # NOTE: the 'maxtrix' typo is part of the published interface; kept.
        'filtered_expression_maxtrix': params["filtered_expr_matrix"],
        'report_name': plot_report_object_name[1],
        'report_ref': "{0}/{1}/{2}".format(plot_report_object_name[6],
                                           plot_report_object_name[0],
                                           plot_report_object_name[4]),
        'workspace': ws_id
    }
    self.returnVal = returnVal
def runEach(self, task_params): ws_client = self.common_params['ws_client'] hs = self.common_params['hs_client'] params = self.method_params logger = self.logger token = self.common_params['user_token'] read_sample = task_params['job_id'] condition = task_params['label'] directory = task_params['hisat2_dir'] ws_id = task_params['ws_id'] genome_id = task_params['annotation_id'] sampleset_id = task_params['sampleset_id'] print "Downloading Read Sample{0}".format(read_sample) logger.info("Downloading Read Sample{0}".format(read_sample)) try: #r_sample = ws_client.get_objects( # [{ 'name' : read_sample, 'workspace' : ws_id}])[0] #r_sample_info = ws_client.get_object_info_new({"objects": [{'name': read_sample, 'workspace': ws_id}]})[0] #sample_type = r_sample_info[2].split('-')[0] r_sample = script_util.ws_get_obj(self.logger, ws_client, ws_id, read_sample)[0] sample_type = script_util.ws_get_type_name(self.logger, ws_client, ws_id, read_sample) sample_name = script_util.ws_get_obj_name4file( self.logger, ws_client, ws_id, read_sample) input_direc = os.path.join( directory, sample_name.split('.')[0] + "_hisat2_input") if not os.path.exists(input_direc): os.mkdir(input_direc) output_name = sample_name.split('.')[0] + "_hisat2_alignment" output_dir = os.path.join(directory, output_name) if not os.path.exists(output_dir): os.mkdir(output_dir) print directory base = handler_util.get_file_with_suffix(directory, ".1.ht2") print base hisat2_base = os.path.join(directory, base) ### Adding advanced options to Bowtie2Call hisat2_cmd = '' hisat2_cmd += (' -p {0}'.format(self.num_threads)) if ('quality_score' in params and params['quality_score'] is not None): hisat2_cmd += (' --' + params['quality_score']) if ('alignment_type' in params and params['alignment_type'] is not None): hisat2_cmd += (' --' + params['alignment_type']) if ('trim5' in params and params['trim5'] is not None): hisat2_cmd += (' --trim5 ' + str(params['trim5'])) if ('trim3' in params and params['trim3'] is not 
None): hisat2_cmd += (' --trim3 ' + str(params['trim3'])) if ('np' in params and params['np'] is not None): hisat2_cmd += (' --np ' + str(params['np'])) if ('minins' in params and params['minins'] is not None): hisat2_cmd += (' --minins ' + str(params['minins'])) if ('maxins' in params and params['maxins'] is not None): hisat2_cmd += (' --maxins ' + str(params['maxins'])) #if('orientation' in params and params['orientation'] is not None): hisat2_cmd += ( ' --'+params['orientation']) if ('min_intron_length' in params and params['min_intron_length'] is not None): hisat2_cmd += (' --min-intronlen ' + str(params['min_intron_length'])) if ('max_intron_length' in params and params['max_intron_length'] is not None): hisat2_cmd += (' --max-intronlen ' + str(params['max_intron_length'])) if ('no_spliced_alignment' in params and params['no_spliced_alignment'] != 0): hisat2_cmd += (' --no-spliced-alignment') if ('transcriptome_mapping_only' in params and params['transcriptome_mapping_only'] != 0): hisat2_cmd += (' --transcriptome-mapping-only') if ('tailor_alignments' in params and params['tailor_alignments'] is not None): hisat2_cmd += (' --' + params['tailor_alignments']) out_file = output_dir + "/accepted_hits.sam" #### try: sample_ref = script_util.ws_get_ref(self.logger, ws_client, ws_id, read_sample) ds = script_util.ru_reads_download(self.logger, sample_ref, input_direc, token) self.logger.info(ds) except Exception, e: self.logger.exception(e) raise Exception( "Unable to download reads file , {0}".format(read_sample)) if sample_type == 'KBaseAssembly.SingleEndLibrary' or sample_type == 'KBaseFile.SingleEndLibrary': lib_type = 'SingleEnd' hisat2_cmd += " -U {0} -x {1} -S {2}".format( ds['fwd'], hisat2_base, out_file) if sample_type == 'KBaseAssembly.PairedEndLibrary' or sample_type == 'KBaseFile.PairedEndLibrary': lib_type = 'PairedEnd' if sample_type == 'KBaseAssembly.PairedEndLibrary': if ('orientation' in params and params['orientation'] is not None): hisat2_cmd += 
(' --' + params['orientation']) else: # TODO: the following can be read from PEL object if ('orientation' in params and params['orientation'] is not None): hisat2_cmd += (' --' + params['orientation']) hisat2_cmd += " -1 {0} -2 {1} -x {2} -S {3}".format( ds['fwd'], ds['rev'], hisat2_base, out_file) #if sample_type == 'KBaseAssembly.SingleEndLibrary' or sample_type == 'KBaseFile.SingleEndLibrary': # lib_type = 'SingleEnd' # if sample_type == 'KBaseAssembly.SingleEndLibrary': # read_id = r_sample['data']['handle']['id'] # read_name = r_sample['data']['handle']['file_name'] # else: # read_id = r_sample['data']['lib']['file']['id'] # read_name = r_sample['data']['lib']['file']['file_name'] # try: # script_util.download_file_from_shock(self.logger, shock_service_url=self.urls['shock_service_url'], shock_id=read_id,filename=read_name, directory=input_direc,token=token) # hisat2_cmd += " -U {0} -x {1} -S {2}".format(os.path.join(input_direc,read_name),hisat2_base,out_file) # except Exception,e: # self.logger.exception(e) # raise Exception( "Unable to download shock file , {0}".format(read_name)) #if sample_type == 'KBaseAssembly.PairedEndLibrary' or sample_type == 'KBaseFile.PairedEndLibrary': # lib_type = 'PairedEnd' # if sample_type == 'KBaseAssembly.PairedEndLibrary': # if('orientation' in params and params['orientation'] is not None): hisat2_cmd += ( ' --'+params['orientation']) # read1_id = r_sample['data']['handle_1']['id'] # read1_name = r_sample['data']['handle_1']['file_name'] # read2_id = r_sample['data']['handle_2']['id'] # read2_name = r_sample['data']['handle_2']['file_name'] # else: # # TODO: the following can be read from PEL object # if('orientation' in params and params['orientation'] is not None): hisat2_cmd += ( ' --'+params['orientation']) # read1_id = r_sample['data']['lib1']['file']['id'] # read1_name = r_sample['data']['lib1']['file']['file_name'] # read2_id = r_sample['data']['lib2']['file']['id'] # read2_name = 
r_sample['data']['lib2']['file']['file_name'] # try: # script_util.download_file_from_shock(self.logger, shock_service_url=self.urls['shock_service_url'], shock_id=read1_id,filename=read1_name, directory=input_direc,token=token) # script_util.download_file_from_shock(self.logger, shock_service_url=self.urls['shock_service_url'], shock_id=read2_id,filename=read2_name, directory=input_direc,token=token) # hisat2_cmd += " -1 {0} -2 {1} -x {2} -S {3}".format(os.path.join(input_direc,read1_name),os.path.join(input_direc,read2_name),hisat2_base,out_file) # except Exception,e: # logger.exception(e) # raise Exception( "Unable to download shock file , {0} or {1}".format(read1_name,read2_name)) try: self.logger.info("Executing: hisat2 {0}".format(hisat2_cmd)) cmdline_output = script_util.runProgram( self.logger, "hisat2", hisat2_cmd, None, directory) except Exception, e: logger.exception(e) raise Exception("Failed to run command {0}".format(hisat2_cmd))
def runEach(self,task_params): ws_client = self.common_params['ws_client'] hs = self.common_params['hs_client'] params = self.method_params logger = self.logger token = self.common_params['user_token'] read_sample = task_params['job_id'] condition = task_params['label'] directory = task_params['tophat_dir'] ws_id = task_params['ws_id'] genome_id = task_params['annotation_id'] sampleset_id = task_params['sampleset_id'] gtf_file = task_params['gtf_file'] print "Downloading Read Sample{0}".format(read_sample) logger.info("Downloading Read Sample{0}".format(read_sample)) try: #r_sample = ws_client.get_objects( # [{ 'name' : read_sample, 'workspace' : ws_id}])[0] r_sample = script_util.ws_get_obj(logger,ws_client, ws_id, read_sample)[0] #r_sample_info = ws_client.get_object_info_new({"objects": [{'name': read_sample, 'workspace': ws_id}]})[0] #sample_type = r_sample_info[2].split('-')[0] sample_type = script_util.ws_get_type_name(logger, ws_client, ws_id, read_sample) sample_name = script_util.ws_get_obj_name4file(self.logger, ws_client, ws_id, read_sample) output_name = sample_name.split('.')[0]+"_tophat_alignment" output_dir = os.path.join(directory,output_name) #if not os.path.exists(output_dir): os.makedirs(output_dir) #out_file = output_dir +"/accepted_hits.sam" bowtie2_base =os.path.join(directory,handler_util.get_file_with_suffix(directory,".rev.1.bt2")) ### Adding advanced options to Bowtie2Call tophat_cmd = (' -p '+str(self.num_threads)) if('max_intron_length' in params and params['max_intron_length'] is not None ) : tophat_cmd += (' -I '+str(params['max_intron_length'])) if('min_intron_length' in params and params['min_intron_length'] is not None ): tophat_cmd += (' -i '+str(params['min_intron_length'])) if('min_anchor_length' in params and params['min_anchor_length'] is not None ): tophat_cmd += (' -a '+str(params['min_anchor_length'])) if('read_edit_dist' in params and params['read_edit_dist'] is not None ) : tophat_cmd += (' --read-edit-dist 
'+str(params['read_edit_dist'])) if('read_gap_length' in params and params['read_gap_length'] is not None) : tophat_cmd += (' --read-gap-length '+str(params['read_gap_length'])) if('read_mismatches' in params and params['read_mismatches'] is not None) : tophat_cmd += (' -N '+str(params['read_mismatches'])) if('library_type' in params and params['library_type'] is not None ) : tophat_cmd += (' --library-type ' + params['library_type']) if('report_secondary_alignments' in params and int(params['report_secondary_alignments']) == 1) : tophat_cmd += ' --report-secondary-alignments' if('no_coverage_search' in params and int(params['no_coverage_search']) == 1): tophat_cmd += ' --no-coverage-search' if('preset_options' in params and params['preset_options'] is not None ): tophat_cmd += ' --'+params['preset_options'] #out_file = output_dir +"/accepted_hits.sam" try: sample_ref = script_util.ws_get_ref(self.logger, ws_client, ws_id, read_sample) ds = script_util.ru_reads_download(self.logger, sample_ref,directory, token) except Exception,e: self.logger.exception(e) raise Exception( "Unable to download reads file , {0}".format(read_sample)) if sample_type == 'KBaseAssembly.SingleEndLibrary' or sample_type == 'KBaseFile.SingleEndLibrary': lib_type = 'SingleEnd' tophat_cmd += ' -o {0} -G {1} {2} {3}'.format(output_dir,gtf_file,bowtie2_base,ds['fwd']) if sample_type == 'KBaseAssembly.PairedEndLibrary' or sample_type == 'KBaseFile.PairedEndLibrary': lib_type = 'PairedEnd' if sample_type == 'KBaseAssembly.PairedEndLibrary': if('orientation' in params and params['orientation'] is not None): tophat_cmd += ( ' --'+params['orientation']) else: # TODO: the following can be read from PEL object if('orientation' in params and params['orientation'] is not None): tophat_cmd += ( ' --'+params['orientation']) tophat_cmd += ' -o {0} -G {1} {2} {3} {4}'.format(output_dir,gtf_file,bowtie2_base,ds['fwd'],ds['rev']) # if sample_type == 'KBaseAssembly.SingleEndLibrary' or sample_type == 
'KBaseFile.SingleEndLibrary': # lib_type = 'SingleEnd' # if sample_type == 'KBaseAssembly.SingleEndLibrary': # read_id = r_sample['data']['handle']['id'] # read_name = r_sample['data']['handle']['file_name'] # else: # read_id = r_sample['data']['lib']['file']['id'] # read_name = r_sample['data']['lib']['file']['file_name'] # try: # script_util.download_file_from_shock(self.logger, shock_service_url=self.urls['shock_service_url'], shock_id=read_id,filename=read_name, directory=directory,token=token) # tophat_cmd += ' -o {0} -G {1} {2} {3}'.format(output_dir,gtf_file,bowtie2_base,os.path.join(directory,read_name)) # except Exception,e: # self.logger.exception(e) # raise Exception( "Unable to download shock file , {0}".format(read_name)) # if sample_type == 'KBaseAssembly.PairedEndLibrary' or sample_type == 'KBaseFile.PairedEndLibrary': # lib_type = 'PairedEnd' # if sample_type == 'KBaseAssembly.PairedEndLibrary': # if('orientation' in params and params['orientation'] is not None): tophat_cmd += ( ' --'+params['orientation']) # read1_id = r_sample['data']['handle_1']['id'] # read1_name = r_sample['data']['handle_1']['file_name'] # read2_id = r_sample['data']['handle_2']['id'] # read2_name = r_sample['data']['handle_2']['file_name'] # else: # # TODO: the following can be read from PEL object # if('orientation' in params and params['orientation'] is not None): tophat_cmd += ( ' --'+params['orientation']) # read1_id = r_sample['data']['lib1']['file']['id'] # read1_name = r_sample['data']['lib1']['file']['file_name'] # read2_id = r_sample['data']['lib2']['file']['id'] # read2_name = r_sample['data']['lib2']['file']['file_name'] # try: # script_util.download_file_from_shock(self.logger, shock_service_url=self.urls['shock_service_url'], shock_id=read1_id,filename=read1_name, directory=directory,token=token) # script_util.download_file_from_shock(self.logger, shock_service_url=self.urls['shock_service_url'], shock_id=read2_id,filename=read2_name, 
directory=directory,token=token) # tophat_cmd += ' -o {0} -G {1} {2} {3} {4}'.format(output_dir,gtf_file,bowtie2_base,os.path.join(directory,read1_name),os.path.join(directory,read2_name)) # except Exception,e: # raise Exception( "Unable to download shock file , {0} or {1}".format(read1_name,read2_name)) try: self.logger.info("Executing: tophat {0}".format(tophat_cmd)) cmdline_output, cmd_err = script_util.runProgram(self.logger,"tophat",tophat_cmd,None,directory) except Exception,e: raise Exception("Failed to run command {0}\n{1}\n{2}".format(tophat_cmd,cmdline_output,cmd_err))
class HiSat2Sample(HiSat2):
    """HISAT2 runner for a single read-library sample (num_jobs == 1)."""

    def __init__(self, logger, directory, urls, max_cores):
        super(HiSat2Sample, self).__init__(logger, directory, urls, max_cores)
        # user defined shared variables across methods
        self.sample_info = None  # workspace object_info tuple, set in prepare()
        self.num_threads = 1

    def prepare(self):
        """
        Stage inputs for a single-sample HISAT2 run.

        Fetches the read-library and genome objects from the workspace,
        validates that the sample is a Single/PairedEndLibrary, downloads
        the genome FASTA and builds the HISAT2 index in self.directory
        with `hisat2-build`.

        Raises:
            ValueError: if the workspace objects cannot be downloaded.
            HiSat2SampleException: if the sample is not a supported
                library type.
            Exception: if `hisat2-build` fails.
        """
        # for quick testing, we recover parameters here
        ws_client = self.common_params['ws_client']
        hs = self.common_params['hs_client']
        params = self.method_params
        logger = self.logger
        token = self.common_params['user_token']
        hisat2_dir = self.directory
        try:
            sample = script_util.ws_get_obj(logger, ws_client,
                                            params['ws_id'],
                                            params['sampleset_id'])[0]
            annotation_name = script_util.ws_get_obj(logger, ws_client,
                                                     params['ws_id'],
                                                     params['genome_id'])[0]
            self.sample = sample
        except Exception:  # exception variable was unused; full traceback logged
            logger.exception("".join(traceback.format_exc()))
            raise ValueError(" Error Downloading objects from the workspace ")
        ### Get object Info and IDs
        sample_info = script_util.ws_get_obj_info(logger, ws_client,
                                                  params['ws_id'],
                                                  params['sampleset_id'])[0]
        self.sample_info = sample_info
        ### Get the workspace object ids for the objects ###
        # object_info indices: [6]=wsid, [0]=objid, [4]=version -- standard
        # workspace ref "wsid/objid/ver"; confirm against script_util.
        sample_id = str(sample_info[6]) + '/' + str(
            sample_info[0]) + '/' + str(sample_info[4])
        annotation_id = script_util.ws_get_ref(logger, ws_client,
                                               params['ws_id'],
                                               params['genome_id'])
        sample_type = sample_info[2].split('-')[0]
        lib_types = [
            'KBaseAssembly.SingleEndLibrary',
            'KBaseAssembly.PairedEndLibrary',
            'KBaseFile.SingleEndLibrary',
            'KBaseFile.PairedEndLibrary'
        ]
        ### Check if the Library objects exist in the same workspace
        # (was `if not sample_type in lib_types`; `not in` is the idiom)
        if sample_type not in lib_types:
            raise HiSat2SampleException(
                'Either of the Library typed objects SingleEndLibrary or PairedEndLibrary is required'
            )
        r_label = 'Single'
        self.num_jobs = 1
        ### Get the Genome Id for the genome selected and get fasta file
        ref_id, fasta_file = rnaseq_util.get_fa_from_genome(
            logger, ws_client, self.urls, params['ws_id'], hisat2_dir,
            params['genome_id'])
        ### Build Index for the fasta file
        hisat2base = os.path.basename(fasta_file)
        hisat2base_cmd = '{0} {1}'.format(fasta_file, hisat2base)
        try:
            logger.info("Building Index for Hisat2 {0}".format(hisat2base_cmd))
            cmdline_output = script_util.runProgram(logger, "hisat2-build",
                                                    hisat2base_cmd, None,
                                                    hisat2_dir)
        except Exception:
            raise Exception("Failed to run command {0}".format(hisat2base_cmd))
class TophatSampleSet(Tophat):
    """Tophat runner for an RNA-seq sample set (one job per read library)."""

    def __init__(self, logger, directory, urls, max_cores):
        super(TophatSampleSet, self).__init__(logger, directory, urls,
                                              max_cores)
        # user defined shared variables across methods
        self.sample = None           # sample-set workspace object, set in prepare()
        self.bowtie2index_id = None  # ws ref of the Bowtie2 index, set in prepare()
        #self.num_threads = None

    def prepare(self):
        """
        Stage shared inputs for a Tophat run over a sample set.

        Fetches the sample-set and Bowtie2-index objects from the workspace,
        validates the set type, resolves reads/conditions, then downloads and
        unzips the Bowtie2 index archive into self.directory.

        Raises:
            ValueError: if the workspace objects cannot be downloaded.
            TophatSampleSetException: if the set is not a supported type.
            Exception: if unzipping the index archive fails.
        """
        # for quick testing, we recover parameters here
        ws_client = self.common_params['ws_client']
        hs = self.common_params['hs_client']
        params = self.method_params
        logger = self.logger
        token = self.common_params['user_token']
        tophat_dir = self.directory
        try:
            sample = script_util.ws_get_obj(logger, ws_client,
                                            params['ws_id'],
                                            params['sampleset_id'])[0]
            bowtie_index = script_util.ws_get_obj(logger, ws_client,
                                                  params['ws_id'],
                                                  params['bowtie_index'])[0]
            self.sample = sample
        except Exception, e:
            logger.exception("".join(traceback.format_exc()))
            raise ValueError(" Error Downloading objects from the workspace ")
        ### Get object Info and IDs
        sample_info = script_util.ws_get_obj_info(logger, ws_client,
                                                  params['ws_id'],
                                                  params['sampleset_id'])[0]
        sample_type = sample_info[2].split('-')[0]  # SampleSet
        if not (sample_type == 'KBaseRNASeq.RNASeqSampleSet'
                or sample_type == 'KBaseSets.ReadsSet'):
            raise TophatSampleSetException(
                'RNASeqSampleSet or ReadsSet is required')
        # reads: library refs in the set; r_label: their condition labels
        (reads, r_label) = rnaseq_util.get_reads_conditions(logger, sample,
                                                            sample_type)
        # Note: do not need an existence check for the libraries as we
        # support ws references (legacy check removed for readability).
        ### Get object IDs (ws refs resolved via script_util helpers)
        self.bowtie2index_id = script_util.ws_get_ref(logger, ws_client,
                                                      params['ws_id'],
                                                      params['bowtie_index'])
        sampleset_id = script_util.ws_get_ref(logger, ws_client,
                                              params['ws_id'],
                                              params['sampleset_id'])
        # Shock handle of the zipped Bowtie2 index archive
        bw_id = bowtie_index['data']['handle']['id']
        bw_name = bowtie_index['data']['handle']['file_name']
        genome_id = bowtie_index['data']['genome_id']
        # object name of the annotation (index [1] of object_info)
        annotation_gtf = ws_client.get_object_info(
            [{"ref": genome_id}], includeMetadata=None)[0][1]
        shared_files = {}
        shared_files[bw_name] = bw_id
        script_util.download_shock_files(logger,
                                         self.urls['shock_service_url'],
                                         tophat_dir, shared_files, token)
        try:
            logger.info("Unzipping Bowtie2 Indices")
            script_util.unzip_files(logger, os.path.join(tophat_dir, bw_name),
                                    tophat_dir)
            # if the archive unpacked into a subdirectory, flatten it
            mv_dir = handler_util.get_dir(tophat_dir)
            if mv_dir is not None:
                script_util.move_files(logger, mv_dir, tophat_dir)
        except Exception, e:
            logger.error("".join(traceback.format_exc()))
            raise Exception("Unzip indexfile error")
class HiSat2SampleSet(HiSat2):
    """HISAT2 runner for an RNA-seq sample set (one job per read library)."""

    def __init__(self, logger, directory, urls, max_cores):
        super(HiSat2SampleSet, self).__init__(logger, directory, urls,
                                              max_cores)
        # user defined shared variables across methods
        self.sample = None          # sample-set workspace object, set in prepare()
        self.sampleset_info = None  # workspace object_info tuple, set in prepare()
        #self.num_threads = None

    def prepare(self):
        """
        Stage shared inputs for a HISAT2 run over a sample set.

        Fetches the sample-set and genome objects from the workspace,
        validates the set type, resolves reads/conditions to size the job
        list, downloads the genome FASTA and builds the HISAT2 index in
        self.directory with `hisat2-build`.

        Raises:
            ValueError: if the workspace objects cannot be downloaded.
            HiSat2SampleSetException: if the set is not a supported type.
            Exception: if `hisat2-build` fails.
        """
        # for quick testing, we recover parameters here
        ws_client = self.common_params['ws_client']
        hs = self.common_params['hs_client']
        params = self.method_params
        logger = self.logger
        token = self.common_params['user_token']
        hisat2_dir = self.directory
        try:
            sample = script_util.ws_get_obj(logger, ws_client,
                                            params['ws_id'],
                                            params['sampleset_id'])[0]
            annotation_name = script_util.ws_get_obj(logger, ws_client,
                                                     params['ws_id'],
                                                     params['genome_id'])[0]
            self.sample = sample
        except Exception, e:
            logger.exception("".join(traceback.format_exc()))
            raise ValueError(" Error Downloading objects from the workspace ")
        ### Get object Info and IDs
        sampleset_info = script_util.ws_get_obj_info(logger, ws_client,
                                                     params['ws_id'],
                                                     params['sampleset_id'])[0]
        self.sampleset_info = sampleset_info
        ### Get the workspace object ids for the objects ###
        # object_info indices: [6]=wsid, [0]=objid, [4]=version -- standard
        # workspace ref "wsid/objid/ver"; confirm against script_util.
        sampleset_id = str(sampleset_info[6]) + '/' + str(
            sampleset_info[0]) + '/' + str(sampleset_info[4])
        annotation_id = script_util.ws_get_ref(logger, ws_client,
                                               params['ws_id'],
                                               params['genome_id'])
        sample_type = sampleset_info[2].split('-')[0]
        ### Check if the Library objects exist in the same workspace
        if not (sample_type == 'KBaseRNASeq.RNASeqSampleSet'
                or sample_type == 'KBaseSets.ReadsSet'):
            raise HiSat2SampleSetException(
                'RNASeqSampleSet or ReadsSet is required')
        # reads: library refs in the set; r_label: their condition labels
        (reads, r_label) = rnaseq_util.get_reads_conditions(logger, sample,
                                                            sample_type)
        # one alignment job per read library (legacy existence-check code
        # removed for readability; ws references are supported)
        self.num_jobs = len(reads)
        ref_id, fasta_file = rnaseq_util.get_fa_from_genome(
            logger, ws_client, self.urls, params['ws_id'], hisat2_dir,
            params['genome_id'])
        hisat2base = os.path.basename(fasta_file)
        hisat2base_cmd = '{0} {1}'.format(fasta_file, hisat2base)
        try:
            logger.info("Building Index for Hisat2 {0}".format(hisat2base_cmd))
            cmdline_output = script_util.runProgram(logger, "hisat2-build",
                                                    hisat2base_cmd, None,
                                                    hisat2_dir)
        except Exception, e:
            raise Exception("Failed to run command {0}".format(hisat2base_cmd))
class TophatSample(Tophat):
    """Tophat runner for a single read-library sample."""

    def __init__(self, logger, directory, urls, max_cores):
        super(TophatSample, self).__init__(logger, directory, urls, max_cores)
        # user defined shared variables across methods
        self.bowtie2index_id = None  # ws ref of the Bowtie2 index, set in prepare()
        self.num_threads = 1

    def prepare(self):
        """
        Stage inputs for a single-sample Tophat run.

        Fetches the read-library and Bowtie2-index objects from the
        workspace, then downloads and unzips the Bowtie2 index archive
        into self.directory.

        Raises:
            ValueError: if the workspace objects cannot be downloaded.
            Exception: if unzipping the index archive fails.
        """
        # for quick testing, we recover parameters here
        ws_client = self.common_params['ws_client']
        hs = self.common_params['hs_client']
        params = self.method_params
        logger = self.logger
        token = self.common_params['user_token']
        tophat_dir = self.directory
        try:
            sample = script_util.ws_get_obj(logger, ws_client,
                                            params['ws_id'],
                                            params['sampleset_id'])[0]
            bowtie_index = script_util.ws_get_obj(logger, ws_client,
                                                  params['ws_id'],
                                                  params['bowtie_index'])[0]
            self.sample = sample
        except Exception, e:
            logger.exception("".join(traceback.format_exc()))
            raise ValueError(" Error Downloading objects from the workspace ")
        ### Get object Info and IDs
        sample_type = script_util.ws_get_type_name(logger, ws_client,
                                                   params['ws_id'],
                                                   params['sampleset_id'])
        ### Get object IDs (ws refs resolved via script_util helpers)
        self.bowtie2index_id = script_util.ws_get_ref(logger, ws_client,
                                                      params['ws_id'],
                                                      params['bowtie_index'])
        sampleset_id = script_util.ws_get_ref(logger, ws_client,
                                              params['ws_id'],
                                              params['sampleset_id'])
        # Shock handle of the zipped Bowtie2 index archive
        bw_id = bowtie_index['data']['handle']['id']
        bw_name = bowtie_index['data']['handle']['file_name']
        genome_id = bowtie_index['data']['genome_id']
        # object name of the annotation (index [1] of object_info)
        annotation_gtf = ws_client.get_object_info([{
            "ref": genome_id
        }], includeMetadata=None)[0][1]
        shared_files = {}
        shared_files[bw_name] = bw_id
        script_util.download_shock_files(logger,
                                         self.urls['shock_service_url'],
                                         tophat_dir, shared_files, token)
        try:
            logger.info("Unzipping Bowtie2 Indices")
            script_util.unzip_files(logger, os.path.join(tophat_dir, bw_name),
                                    tophat_dir)
            # if the archive unpacked into a subdirectory, flatten it
            mv_dir = handler_util.get_dir(tophat_dir)
            if mv_dir is not None:
                script_util.move_files(logger, mv_dir, tophat_dir)
        except Exception, e:
            logger.error("".join(traceback.format_exc()))
            raise Exception("Unzip indexfile error")
def runEach(self,task_params): ws_client = self.common_params['ws_client'] hs = self.common_params['hs_client'] params = self.method_params logger = self.logger token = self.common_params['user_token'] read_sample = task_params['job_id'] condition = task_params['label'] directory = task_params['bowtie2_dir'] ws_id = task_params['ws_id'] genome_id = task_params['annotation_id'] sampleset_id = task_params['sampleset_id'] print "Downloading Read Sample{0}".format(read_sample) logger.info("Downloading Read Sample{0}".format(read_sample)) try: #r_sample = ws_client.get_objects( # [{ 'name' : read_sample, 'workspace' : ws_id}])[0] r_sample = script_util.ws_get_obj(logger,ws_client, ws_id, read_sample)[0] #r_sample_info = ws_client.get_object_info_new({"objects": [{'name': read_sample, 'workspace': ws_id}]})[0] #sample_type = r_sample_info[2].split('-')[0] sample_type = script_util.ws_get_type_name(logger, ws_client, ws_id, read_sample) sample_name = script_util.ws_get_obj_name4file(self.logger, ws_client, ws_id, read_sample) input_direc = os.path.join(directory,sample_name.split('.')[0]+"_bowtie2_input") if not os.path.exists(input_direc): os.mkdir(input_direc) output_name = sample_name.split('.')[0]+"_bowtie2_alignment" output_dir = os.path.join(directory,output_name) if not os.path.exists(output_dir): os.mkdir(output_dir) base = handler_util.get_file_with_suffix(directory,".rev.1.bt2") bowtie2_base =os.path.join(directory,base) ### Adding advanced options to Bowtie2Call bowtie2_cmd = '' bowtie2_cmd += ( ' -p {0}'.format(self.num_threads)) if('quality_score' in params and params['quality_score'] is not None): bowtie2_cmd += ( ' --'+params['quality_score']) if('alignment_type' in params and params['alignment_type'] is not None): bowtie2_cmd += ( ' --'+params['alignment_type'] ) if('preset_options' in params and params['preset_options'] is not None ) and ('alignment_type' in params and params['alignment_type'] is not None): if (params['alignment_type'] == 'local'): 
bowtie2_cmd += (' --'+params['preset_options']+'-local') else: bowtie2_cmd += (' --'+params['preset_options'] ) if('trim5' in params and params['trim5'] is not None): bowtie2_cmd += ( ' --trim5 '+str(params['trim5'])) if('trim3' in params and params['trim3'] is not None): bowtie2_cmd += ( ' --trim3 '+str(params['trim3'])) if('np' in params and params['np'] is not None): bowtie2_cmd += ( ' --np '+str(params['np'])) if('minins' in params and params['minins'] is not None): bowtie2_cmd += ( ' --minins '+str(params['minins'])) if('maxins' in params and params['maxins'] is not None): bowtie2_cmd += ( ' --maxins '+str(params['maxins'])) out_file = output_dir +"/accepted_hits.sam" #### try: sample_ref = script_util.ws_get_ref(self.logger, ws_client, ws_id, read_sample) ds = script_util.ru_reads_download(self.logger, sample_ref,input_direc, token) except Exception,e: self.logger.exception(e) raise Exception( "Unable to download reads file , {0}".format(read_sample)) if sample_type == 'KBaseAssembly.SingleEndLibrary' or sample_type == 'KBaseFile.SingleEndLibrary': lib_type = 'SingleEnd' bowtie2_cmd += " -U {0} -x {1} -S {2}".format(ds['fwd'],bowtie2_base,out_file) if sample_type == 'KBaseAssembly.PairedEndLibrary' or sample_type == 'KBaseFile.PairedEndLibrary': lib_type = 'PairedEnd' if sample_type == 'KBaseAssembly.PairedEndLibrary': if('orientation' in params and params['orientation'] is not None): hisat2_cmd += ( ' --'+params['orientation']) else: # TODO: the following can be read from PEL object if('orientation' in params and params['orientation'] is not None): hisat2_cmd += ( ' --'+params['orientation']) hisat2_cmd += " -1 {0} -2 {1} -x {2} -S {3}".format(ds['fwd'], ds['rev'],hisat2_base,out_file) bowtie2_cmd += " -1 {0} -2 {1} -x {2} -S {3}".format(ds['fwd'], ds['rev'],bowtie2_base,out_file) ### # if sample_type == 'KBaseAssembly.SingleEndLibrary' or sample_type == 'KBaseFile.SingleEndLibrary': # lib_type = 'SingleEnd' # if sample_type == 
'KBaseAssembly.SingleEndLibrary': # read_id = r_sample['data']['handle']['id'] # read_name = r_sample['data']['handle']['file_name'] # else: # read_id = r_sample['data']['lib']['file']['id'] # read_name = r_sample['data']['lib']['file']['file_name'] # try: # script_util.download_file_from_shock(self.logger, shock_service_url=self.urls['shock_service_url'], shock_id=read_id,filename=read_name, directory=input_direc,token=token) # bowtie2_cmd += " -U {0} -x {1} -S {2}".format(os.path.join(input_direc,read_name),bowtie2_base,out_file) # except Exception,e: # self.logger.exception(e) # raise Exception( "Unable to download shock file , {0}".format(read_name)) # if sample_type == 'KBaseAssembly.PairedEndLibrary' or sample_type == 'KBaseFile.PairedEndLibrary': # lib_type = 'PairedEnd' # if sample_type == 'KBaseAssembly.PairedEndLibrary': # if('orientation' in params and params['orientation'] is not None): bowtie2_cmd += ( ' --'+params['orientation']) # read1_id = r_sample['data']['handle_1']['id'] # read1_name = r_sample['data']['handle_1']['file_name'] # read2_id = r_sample['data']['handle_2']['id'] # read2_name = r_sample['data']['handle_2']['file_name'] # else: # # TODO: the following can be read from PEL object # if('orientation' in params and params['orientation'] is not None): bowtie2_cmd += ( ' --'+params['orientation']) # read1_id = r_sample['data']['lib1']['file']['id'] # read1_name = r_sample['data']['lib1']['file']['file_name'] # read2_id = r_sample['data']['lib2']['file']['id'] # read2_name = r_sample['data']['lib2']['file']['file_name'] # try: # script_util.download_file_from_shock(self.logger, shock_service_url=self.urls['shock_service_url'], shock_id=read1_id,filename=read1_name, directory=input_direc,token=token) # script_util.download_file_from_shock(self.logger, shock_service_url=self.urls['shock_service_url'], shock_id=read2_id,filename=read2_name, directory=input_direc,token=token) # bowtie2_cmd += " -1 {0} -2 {1} -x {2} -S 
{3}".format(os.path.join(input_direc,read1_name),os.path.join(input_direc,read2_name),bowtie2_base,out_file) # except Exception,e: # raise Exception( "Unable to download shock file , {0} or {1}".format(read1_name,read2_name)) try: self.logger.info("Executing: bowtie2 {0}".format(bowtie2_cmd)) cmdline_output = script_util.runProgram(self.logger,"bowtie2",bowtie2_cmd,None,directory) except Exception,e: raise Exception("Failed to run command {0}".format(bowtie2_cmd))
def runEach(self, task_params): ws_client = self.common_params['ws_client'] hs = self.common_params['hs_client'] params = self.method_params logger = self.logger token = self.common_params['user_token'] read_sample = task_params['job_id'] condition = task_params['label'] directory = task_params['tophat_dir'] ws_id = task_params['ws_id'] genome_id = task_params['annotation_id'] sampleset_id = task_params['sampleset_id'] gtf_file = task_params['gtf_file'] print "Downloading Read Sample{0}".format(read_sample) logger.info("Downloading Read Sample{0}".format(read_sample)) try: #r_sample = ws_client.get_objects( # [{ 'name' : read_sample, 'workspace' : ws_id}])[0] r_sample = script_util.ws_get_obj(logger, ws_client, ws_id, read_sample)[0] #r_sample_info = ws_client.get_object_info_new({"objects": [{'name': read_sample, 'workspace': ws_id}]})[0] #sample_type = r_sample_info[2].split('-')[0] sample_type = script_util.ws_get_type_name(logger, ws_client, ws_id, read_sample) sample_name = script_util.ws_get_obj_name4file( self.logger, ws_client, ws_id, read_sample) output_name = sample_name.split('.')[0] + "_tophat_alignment" output_dir = os.path.join(directory, output_name) #if not os.path.exists(output_dir): os.makedirs(output_dir) #out_file = output_dir +"/accepted_hits.sam" bowtie2_base = os.path.join( directory, handler_util.get_file_with_suffix(directory, ".rev.1.bt2")) ### Adding advanced options to Bowtie2Call tophat_cmd = (' -p ' + str(self.num_threads)) if ('max_intron_length' in params and params['max_intron_length'] is not None): tophat_cmd += (' -I ' + str(params['max_intron_length'])) if ('min_intron_length' in params and params['min_intron_length'] is not None): tophat_cmd += (' -i ' + str(params['min_intron_length'])) if ('min_anchor_length' in params and params['min_anchor_length'] is not None): tophat_cmd += (' -a ' + str(params['min_anchor_length'])) if ('read_edit_dist' in params and params['read_edit_dist'] is not None): tophat_cmd += (' --read-edit-dist 
' + str(params['read_edit_dist'])) if ('read_gap_length' in params and params['read_gap_length'] is not None): tophat_cmd += (' --read-gap-length ' + str(params['read_gap_length'])) if ('read_mismatches' in params and params['read_mismatches'] is not None): tophat_cmd += (' -N ' + str(params['read_mismatches'])) if ('library_type' in params and params['library_type'] is not None): tophat_cmd += (' --library-type ' + params['library_type']) if ('report_secondary_alignments' in params and int(params['report_secondary_alignments']) == 1): tophat_cmd += ' --report-secondary-alignments' if ('no_coverage_search' in params and int(params['no_coverage_search']) == 1): tophat_cmd += ' --no-coverage-search' if ('preset_options' in params and params['preset_options'] is not None): tophat_cmd += ' --' + params['preset_options'] #out_file = output_dir +"/accepted_hits.sam" try: sample_ref = script_util.ws_get_ref(self.logger, ws_client, ws_id, read_sample) ds = script_util.ru_reads_download(self.logger, sample_ref, directory, token) except Exception, e: self.logger.exception(e) raise Exception( "Unable to download reads file , {0}".format(read_sample)) if sample_type == 'KBaseAssembly.SingleEndLibrary' or sample_type == 'KBaseFile.SingleEndLibrary': lib_type = 'SingleEnd' tophat_cmd += ' -o {0} -G {1} {2} {3}'.format( output_dir, gtf_file, bowtie2_base, ds['fwd']) if sample_type == 'KBaseAssembly.PairedEndLibrary' or sample_type == 'KBaseFile.PairedEndLibrary': lib_type = 'PairedEnd' if sample_type == 'KBaseAssembly.PairedEndLibrary': if ('orientation' in params and params['orientation'] is not None): tophat_cmd += (' --' + params['orientation']) else: # TODO: the following can be read from PEL object if ('orientation' in params and params['orientation'] is not None): tophat_cmd += (' --' + params['orientation']) tophat_cmd += ' -o {0} -G {1} {2} {3} {4}'.format( output_dir, gtf_file, bowtie2_base, ds['fwd'], ds['rev']) # if sample_type == 'KBaseAssembly.SingleEndLibrary' or 
sample_type == 'KBaseFile.SingleEndLibrary': # lib_type = 'SingleEnd' # if sample_type == 'KBaseAssembly.SingleEndLibrary': # read_id = r_sample['data']['handle']['id'] # read_name = r_sample['data']['handle']['file_name'] # else: # read_id = r_sample['data']['lib']['file']['id'] # read_name = r_sample['data']['lib']['file']['file_name'] # try: # script_util.download_file_from_shock(self.logger, shock_service_url=self.urls['shock_service_url'], shock_id=read_id,filename=read_name, directory=directory,token=token) # tophat_cmd += ' -o {0} -G {1} {2} {3}'.format(output_dir,gtf_file,bowtie2_base,os.path.join(directory,read_name)) # except Exception,e: # self.logger.exception(e) # raise Exception( "Unable to download shock file , {0}".format(read_name)) # if sample_type == 'KBaseAssembly.PairedEndLibrary' or sample_type == 'KBaseFile.PairedEndLibrary': # lib_type = 'PairedEnd' # if sample_type == 'KBaseAssembly.PairedEndLibrary': # if('orientation' in params and params['orientation'] is not None): tophat_cmd += ( ' --'+params['orientation']) # read1_id = r_sample['data']['handle_1']['id'] # read1_name = r_sample['data']['handle_1']['file_name'] # read2_id = r_sample['data']['handle_2']['id'] # read2_name = r_sample['data']['handle_2']['file_name'] # else: # # TODO: the following can be read from PEL object # if('orientation' in params and params['orientation'] is not None): tophat_cmd += ( ' --'+params['orientation']) # read1_id = r_sample['data']['lib1']['file']['id'] # read1_name = r_sample['data']['lib1']['file']['file_name'] # read2_id = r_sample['data']['lib2']['file']['id'] # read2_name = r_sample['data']['lib2']['file']['file_name'] # try: # script_util.download_file_from_shock(self.logger, shock_service_url=self.urls['shock_service_url'], shock_id=read1_id,filename=read1_name, directory=directory,token=token) # script_util.download_file_from_shock(self.logger, shock_service_url=self.urls['shock_service_url'], shock_id=read2_id,filename=read2_name, 
directory=directory,token=token) # tophat_cmd += ' -o {0} -G {1} {2} {3} {4}'.format(output_dir,gtf_file,bowtie2_base,os.path.join(directory,read1_name),os.path.join(directory,read2_name)) # except Exception,e: # raise Exception( "Unable to download shock file , {0} or {1}".format(read1_name,read2_name)) try: self.logger.info("Executing: tophat {0}".format(tophat_cmd)) cmdline_output, cmd_err = script_util.runProgram( self.logger, "tophat", tophat_cmd, None, directory) except Exception, e: raise Exception("Failed to run command {0}\n{1}\n{2}".format( tophat_cmd, cmdline_output, cmd_err))