def run_skip(self, reads_file):
    """
    Stand-in for an RQCFilter run that performs no filtering.

    Produces the same result structure a real run returns so that the later
    pipeline steps keep working. The "filtered_fastq_file" is simply the input
    reads file, gzipped and moved into a fresh output directory, and "run_log"
    is an empty (but existing) file.

    :param reads_file: path to the reads file to pass through.
    :returns: dict with keys "output_directory", "filtered_fastq_file",
        "run_log", "command" and "version_string".
    """
    print("NOT running RQCFilter, just putting together some results.")

    # placeholder output directory, named with a millisecond timestamp
    stamp = int(time() * 1000)
    outdir = os.path.join(self.scratch_dir, "dummy_rqcfilter_output_{}".format(stamp))
    mkdir(outdir)

    # the log only has to exist; create it empty
    log_path = os.path.join(outdir, "dummy_rqcfilter_log.txt")
    with open(log_path, 'w'):
        pass

    # gzip the untouched reads, then relocate the archive into the output dir
    # (moving is not strictly needed, but keeps the layout consistent)
    packer = DataFileUtil(self.callback_url)
    pack_result = packer.pack_file({
        "file_path": reads_file,
        "pack": "gzip"
    })
    gzipped_path = pack_result["file_path"]
    passthrough_reads = os.path.join(outdir, os.path.basename(gzipped_path))
    os.rename(gzipped_path, passthrough_reads)

    return {
        "output_directory": outdir,
        "filtered_fastq_file": passthrough_reads,
        "run_log": log_path,
        "command": "BBTools.run_RQCFilter_local -- skipped. No command run.",
        "version_string": "KBase BBTools module"
    }
def upload_html_set(self, ctx, params):
    """
    Upload an HTML file set to the KBase data stores.
    :param params: instance of type "UploadHTMLSetInput" (Input to the
       upload_html_set function. Required arguments: One of: wsid - the
       id of the workspace where the reads will be saved (preferred).
       wsname - the name of the workspace where the reads will be saved.
       One of: objid - the id of the workspace object to save over name -
       the name to which the workspace object will be saved path - the
       path to the directory with the HTML files. This directory will be
       compressed and loaded into the KBase stores.) -> structure:
       parameter "wsid" of Long, parameter "wsname" of String, parameter
       "objid" of Long, parameter "name" of String, parameter "path" of
       String
    :returns: instance of type "UploadHTMLSetOutput" (Output of the
       upload_html_set function. obj_ref - a reference to the new
       Workspace object in the form X/Y/Z, where X is the workspace ID, Y
       is the object ID, and Z is the version.) -> structure: parameter
       "obj_ref" of String
    :raises ValueError: if the workspace or object identifiers are not
       given exactly once each, if wsname is not a string, if path is
       missing or not a directory, or if the zipped directory exceeds
       MAX_ZIP_SIZE.
    """
    # ctx is the context object
    # return variables are: out
    #BEGIN upload_html_set
    del ctx
    wsid = params.get('wsid')
    wsname = params.get('wsname')
    if not self.xor(wsid, wsname):
        raise ValueError(
            'Exactly one of the workspace ID or name must be provided')
    dfu = DataFileUtil(self.callback_url)
    if wsname:
        self.log('Translating workspace name to id')
        if not isinstance(wsname, six.string_types):
            raise ValueError('wsname must be a string')
        wsid = dfu.ws_name_to_id(wsname)
        self.log('translation done')
    del wsname
    objid = params.get('objid')
    name = params.get('name')
    if not self.xor(objid, name):
        raise ValueError(
            'Exactly one of the object ID or name must be provided')
    htmlpath = params.get('path')
    if not htmlpath:
        raise ValueError('path parameter is required')
    htmlpath = os.path.abspath(os.path.expanduser(htmlpath))
    if not os.path.isdir(htmlpath):
        raise ValueError('path must be a directory')

    zip_path = dfu.pack_file({
        'file_path': htmlpath,
        'pack': 'zip'
    })['file_path']
    tf = None
    try:
        if os.path.getsize(zip_path) > self.MAX_ZIP_SIZE:
            raise ValueError('Zipfile from specified directory is greater ' +
                             'than maximum size allowed: ' +
                             str(self.MAX_ZIP_SIZE))
        fh, tf = tempfile.mkstemp(dir=self.scratch)
        os.close(fh)
        # Base64 output of consecutive chunks can only be concatenated when
        # every chunk (except the last) is a multiple of 3 bytes long;
        # otherwise '=' padding ends up in the middle of the stream.
        chunk_size = max(3, self.CHUNKSIZE - self.CHUNKSIZE % 3)
        with open(tf, 'w') as objfile, open(zip_path, 'rb') as z:
            objfile.write('{"file":"')
            d = z.read(chunk_size)
            while d:
                # b64encode returns bytes; decode so the write also works
                # on a text-mode file under Python 3.
                objfile.write(base64.b64encode(d).decode('ascii'))
                d = z.read(chunk_size)
            objfile.write('"}')
        # the zip is no longer needed; free the space before uploading
        os.remove(zip_path)
        so = {
            'type': 'HTMLFileSetUtils.HTMLFileSet-0.1',  # TODO release
            'data_json_file': tf
        }
        if name:
            so['name'] = name
        else:
            so['objid'] = objid
        wsio = WsLargeDataIO(self.callback_url, service_ver='dev')  # TODO remove dev @IgnorePep8
        ret = wsio.save_objects({'id': wsid, 'objects': [so]})[0]
    finally:
        # never leave the intermediate files behind, even on failure
        for leftover in (zip_path, tf):
            if leftover is not None and os.path.exists(leftover):
                os.remove(leftover)

    out = {'obj_ref': str(ret[6]) + '/' + str(ret[0]) + '/' + str(ret[4])}
    #END upload_html_set

    # At some point might do deeper type checking...
    if not isinstance(out, dict):
        raise ValueError('Method upload_html_set return value ' +
                         'out is not type dict as required.')
    # return the results
    return [out]
class STARUtils:
    """
    Helper for running the STAR aligner inside a KBase module.

    Validates and converts app parameters, builds the STAR command lines for
    genome indexing and reads mapping, runs them through the program runner,
    uploads the resulting alignments and assembles the KBase reports.
    """

    STAR_VERSION = 'STAR 2.5.3a'
    STAR_BIN = '/kb/deployment/bin/STAR'
    STAR_IDX_DIR = 'STAR_Genome_index'
    STAR_OUT_DIR = 'STAR_Output'

    PARAM_IN_WS = 'output_workspace'
    PARAM_IN_FASTA_FILES = 'genomeFastaFiles'
    PARAM_IN_OUTFILE_PREFIX = 'outFileNamePrefix'
    PARAM_IN_STARMODE = 'runMode'
    PARAM_IN_THREADN = 'runThreadN'
    PARAM_IN_READS_FILES = 'readFilesIn'
    PARAM_IN_READS = 'readsset_ref'
    PARAM_IN_GENOME = 'genome_ref'
    SET_READS = 'set_reads_refs'

    def __init__(self, scratch_dir, workspace_url, callback_url, srv_wiz_url, provenance):
        """Store the service URLs / scratch directory and create the service clients."""
        self.workspace_url = workspace_url
        self.callback_url = callback_url
        self.srv_wiz_url = srv_wiz_url
        self.au = AssemblyUtil(self.callback_url)
        self.dfu = DataFileUtil(self.callback_url, service_ver='beta')
        self.scratch = scratch_dir
        self.working_dir = scratch_dir
        self.prog_runner = Program_Runner(self.STAR_BIN, self.scratch)
        self.provenance = provenance
        self.ws_client = Workspace(self.workspace_url)

        self.parallel_runner = KBParallel(self.callback_url)
        self.qualimap = kb_QualiMap(self.callback_url, service_ver='dev')
        self.set_api_client = SetAPI(self.srv_wiz_url, service_ver='dev')
        self.eu = ExpressionUtils(self.callback_url, service_ver='beta')

    # ------------------------------------------------------------------
    # parameter handling
    # ------------------------------------------------------------------
    def process_params(self, params):
        """
        process_params: checks params passed to run_star method and set default values

        Mutates ``params`` in place to fill in the run mode, thread count,
        output prefix and ``create_report`` defaults, then returns a deep copy
        extended with the STAR option defaults.

        :raises ValueError: when a required parameter is missing or invalid.
        """
        log('Start validating run_star parameters')

        # check for required parameters
        if params.get(self.PARAM_IN_WS, None) is None:
            raise ValueError(self.PARAM_IN_WS + ' parameter is required')

        if params.get(self.PARAM_IN_STARMODE, None) is None:
            params[self.PARAM_IN_STARMODE] = 'alignReads'

        if params.get(self.PARAM_IN_GENOME, None) is None:
            raise ValueError(self.PARAM_IN_GENOME +
                             ' parameter is required for generating genome index')

        if params[self.PARAM_IN_STARMODE] != "genomeGenerate":
            if params.get(self.PARAM_IN_READS, None) is None:
                raise ValueError(self.PARAM_IN_READS +
                                 ' parameter is required for reads mapping')
            if not valid_string(params[self.PARAM_IN_READS], is_ref=True):
                raise ValueError("Parameter readsset_ref must be a valid Workspace object reference, "
                                 "not {}".format(params.get(self.PARAM_IN_READS, None)))

        if params.get(self.PARAM_IN_THREADN, None) is not None:
            if not isinstance(params[self.PARAM_IN_THREADN], int):
                # the original referenced an undefined PARAM_IN_HASH_THREADN here,
                # which raised AttributeError instead of the intended ValueError
                raise ValueError(self.PARAM_IN_THREADN + ' must be of type int')
        else:
            params[self.PARAM_IN_THREADN] = 2

        if "alignment_suffix" not in params or not valid_string(params["alignment_suffix"]):
            raise ValueError("Parameter alignment_suffix must be a valid Workspace object string, "
                             "not {}".format(params.get("alignment_suffix", None)))

        if params.get(self.PARAM_IN_OUTFILE_PREFIX, None) is not None:
            if params[self.PARAM_IN_OUTFILE_PREFIX].find('/') != -1:
                raise ValueError(self.PARAM_IN_OUTFILE_PREFIX + ' cannot contain subfolder(s).')
        else:
            params[self.PARAM_IN_OUTFILE_PREFIX] = 'star_'

        if params.get('create_report', None) is None:
            params['create_report'] = 0

        return self._setDefaultParameters(params)

    def _setDefaultParameters(self, params_in):
        """
        Return a deep copy of ``params_in`` with defaults filled in for the
        STAR output/filter options and for the list of reads references.
        """
        params = copy.deepcopy(params_in)
        defaults = (
            ('outFilterType', "\"BySJout\""),
            ('outFilterMultimapNmax', 20),
            ('outSAMtype', 'BAM'),
            ('outSAMattrIHstart', 0),
            ('outSAMstrandField', 'intronMotif'),
            ('outFilterIntronMotifs', 'RemoveNoncanonical'),
        )
        for key, value in defaults:
            if params.get(key, None) is None:
                params[key] = value

        if params.get(self.SET_READS, None) is None:
            params[self.SET_READS] = self._get_reads_refs_from_setref(params)

        return params

    def _get_genome_gtf_file(self, gnm_ref, gtf_file_dir):
        """
        Get data from genome object ref and return the GTF filename (with path)
        for STAR indexing and mapping.
        STAR uses the reference annotation to guide assembly and for creating alignment

        :returns: the 'file_path' reported by GenomeFileUtil, or None when the
            conversion raised a ValueError (the error is logged).
        """
        log("Converting genome {0} to GFF file in folder {1}".format(gnm_ref, gtf_file_dir))
        gfu = GenomeFileUtil(self.callback_url)
        try:
            gfu_ret = gfu.genome_to_gff({self.PARAM_IN_GENOME: gnm_ref,
                                         'is_gtf': 1,
                                         'target_dir': gtf_file_dir})
        except ValueError as egfu:
            log('GFU getting GTF file raised error:\n')
            pprint(egfu)
            return None
        return gfu_ret.get('file_path')

    # ------------------------------------------------------------------
    # command construction
    # ------------------------------------------------------------------
    @staticmethod
    def _append_int_option(cmd, params, name, minimum):
        """Append ``--name value`` to cmd when params[name] is an int >= minimum."""
        value = params.get(name, None)
        if value is not None and isinstance(value, int) and value >= minimum:
            cmd.append('--' + name)
            cmd.append(str(value))

    @staticmethod
    def _append_str_option(cmd, params, name):
        """Append ``--name value`` to cmd when params[name] is a str."""
        value = params.get(name, None)
        if value is not None and isinstance(value, str):
            cmd.append('--' + name)
            cmd.append(value)

    @staticmethod
    def _append_splice_db_options(cmd, params):
        """Append the --sjdbGTFfile / --sjdbOverhang options shared by both run modes."""
        if params.get('sjdbGTFfile', None) is not None:
            cmd.append('--sjdbGTFfile')
            cmd.append(params['sjdbGTFfile'])
        if params.get('sjdbOverhang', None) is not None and params['sjdbOverhang'] > 0:
            cmd.append('--sjdbOverhang')
            cmd.append(str(params['sjdbOverhang']))

    def _construct_indexing_cmd(self, params):
        """Build the argument list for `STAR --runMode genomeGenerate ...`."""
        # STEP 1: construct the command for running `STAR --runMode genomeGenerate...`
        idx_cmd = [
            self.STAR_BIN,
            '--genomeDir', params[self.STAR_IDX_DIR],
            '--' + self.PARAM_IN_STARMODE, 'genomeGenerate',
            '--' + self.PARAM_IN_THREADN, str(params[self.PARAM_IN_THREADN]),
        ]

        if params.get(self.PARAM_IN_FASTA_FILES, None) is not None:
            idx_cmd.append('--' + self.PARAM_IN_FASTA_FILES)
            idx_cmd.extend(params[self.PARAM_IN_FASTA_FILES])

        # STEP 2: append the standard optional inputs
        self._append_splice_db_options(idx_cmd, params)

        return idx_cmd

    def _construct_mapping_cmd(self, params):
        """
        Build the argument list for a STAR mapping run.

        Sets params['runMode'] to 'alignReads' when it is missing. Output goes
        under params['align_output'], or the scratch directory when that is
        not given.
        """
        if params.get(self.PARAM_IN_STARMODE, None) is None:
            params[self.PARAM_IN_STARMODE] = 'alignReads'

        # STEP 1: set the working folder housing the STAR output results as well as the reads info
        if params.get('align_output', None) is None:
            star_out_dir = self.scratch
        else:
            star_out_dir = params['align_output']

        # STEP 2: construct the command for running STAR mapping
        mp_cmd = [
            self.STAR_BIN,
            '--genomeDir', params[self.STAR_IDX_DIR],
            '--' + self.PARAM_IN_STARMODE, params[self.PARAM_IN_STARMODE],
            '--' + self.PARAM_IN_THREADN, str(params[self.PARAM_IN_THREADN]),
        ]

        reads_files = params.get(self.PARAM_IN_READS_FILES, None)
        if reads_files is not None:
            mp_cmd.append('--' + self.PARAM_IN_READS_FILES)
            mp_cmd.extend(reads_files)
            # The decompression command applies to every input file, so it is
            # emitted once *after* the file list. Emitting it per file would
            # split a paired-end file list with another option in the middle.
            extensions = {os.path.splitext(reads_file)[1] for reads_file in reads_files}
            if '.gz' in extensions:
                mp_cmd.extend(['--readFilesCommand', 'gunzip', '-c'])
            elif '.bz2' in extensions:
                mp_cmd.extend(['--readFilesCommand', 'bunzip2', '-c'])

        # STEP 3: appending the advanced optional inputs
        mp_cmd.append('--' + self.PARAM_IN_OUTFILE_PREFIX)
        mp_cmd.append(os.path.join(star_out_dir, params[self.PARAM_IN_OUTFILE_PREFIX]))

        self._append_splice_db_options(mp_cmd, params)
        self._append_str_option(mp_cmd, params, 'outFilterType')
        self._append_int_option(mp_cmd, params, 'outFilterMultimapNmax', 0)

        # output sorted file:Aligned.sortedByCoord.out.bam
        # allowed values of --outSAMtype are BAM Unsorted or SortedByCoordinate or both
        if params.get('outSAMtype', None) is not None:
            mp_cmd.append('--outSAMtype')
            mp_cmd.append(params['outSAMtype'])
            if params.get('outSAMtype', None) == 'BAM':
                mp_cmd.append('SortedByCoordinate')

        # 'It is recommended to remove the non-canonical junctions for Cnks runs using
        # --outFilterIntronMotifs RemoveNoncanonical'
        # NOTE: the supplied value is only used as an on/off switch; the option
        # is always passed as RemoveNoncanonical.
        if params.get('outFilterIntronMotifs', None) is not None:
            mp_cmd.append('--outFilterIntronMotifs')
            mp_cmd.append('RemoveNoncanonical')

        self._append_int_option(mp_cmd, params, 'outSAMattrIHstart', 0)
        self._append_str_option(mp_cmd, params, 'outSAMstrandField')

        quant_mode = params.get('quantMode', None)
        if quant_mode is not None and quant_mode in ("TranscriptomeSAM", "GeneCounts", "Both"):
            mp_cmd.append('--quantMode')
            if quant_mode == "Both":
                mp_cmd.append("TranscriptomeSAM")
                mp_cmd.append("GeneCounts")
            else:
                mp_cmd.append(quant_mode)

        # (option name, smallest accepted value) -- order is the command-line order
        for option, minimum in (('alignSJoverhangMin', 1),
                                ('alignSJDBoverhangMin', 1),
                                ('outFilterMismatchNmax', 1),
                                ('alignIntronMin', 1),
                                ('alignIntronMax', 0),
                                ('alignMatesGapMax', 0)):
            self._append_int_option(mp_cmd, params, option, minimum)

        return mp_cmd

    # ------------------------------------------------------------------
    # execution
    # ------------------------------------------------------------------
    def _exec_indexing(self, params):
        """Run STAR genome index generation and return the runner's exit code."""
        log('Running STAR index generating with params:\n' + pformat(params))
        idx_cmd = self._construct_indexing_cmd(params)
        return self.prog_runner.run(idx_cmd, self.scratch)

    def _exec_mapping(self, params):
        """Run STAR reads mapping and return the runner's exit code."""
        log('Running STAR mapping with params:\n' + pformat(params))
        mp_cmd = self._construct_mapping_cmd(params)
        return self.prog_runner.run(mp_cmd, self.scratch)

    def _exec_star_pipeline(self, params, rds_files, rds_name, idx_dir, out_dir):
        """
        Run genome indexing (only when runMode is 'genomeGenerate') followed by
        reads mapping.

        :returns: {'star_idx': idx_dir, 'star_output': <alignment output dir>}
            on success, or None when either step raised ValueError or exited
            with a non-zero code (the error is logged).
        """
        # build the parameters
        params_idx = self._get_indexing_params(params, idx_dir)
        params_mp = self._get_mapping_params(params, rds_files, rds_name, idx_dir, out_dir)

        # execute indexing and then mapping
        # NOTE: the original spun in `while ret != 0: sleep(1)` without ever
        # updating ret, i.e. hung forever on a non-zero exit code; such a code
        # is now reported as a failure instead.
        try:
            if params[self.PARAM_IN_STARMODE] == 'genomeGenerate':
                exit_code = self._exec_indexing(params_idx)
                if exit_code != 0:
                    raise ValueError(
                        'STAR genome indexing exited with code {}'.format(exit_code))
        except ValueError as eidx:
            log('STAR genome indexing raised error:\n')
            pprint(eidx)
            return None

        # indexing succeeded (or was not requested), so run mapping
        params_mp[self.PARAM_IN_STARMODE] = 'alignReads'
        try:
            exit_code = self._exec_mapping(params_mp)
            if exit_code != 0:
                raise ValueError('STAR mapping exited with code {}'.format(exit_code))
        except ValueError as emp:
            log('STAR mapping raised error:\n')
            pprint(emp)
            return None

        return {'star_idx': idx_dir, 'star_output': params_mp.get('align_output')}

    # ------------------------------------------------------------------
    # upload and reporting
    # ------------------------------------------------------------------
    def upload_STARalignment(self, input_params, reads_ref, reads_info, output_bam_file):
        """
        Uploads the alignment file + metadata.
        Returns the result of ReadsAlignmentUtils.upload_alignment; the new
        alignment reference is under its "obj_ref" key.
        """
        aligner_opts = {k: str(input_params[k]) for k in input_params}
        pprint(reads_info)

        alignment_name = reads_ref['alignment_output_name']
        align_upload_params = {
            "destination_ref": "{}/{}".format(input_params[self.PARAM_IN_WS], alignment_name),
            "file_path": output_bam_file,
            "assembly_or_genome_ref": input_params[self.PARAM_IN_GENOME],
            "read_library_ref": reads_info['object_ref'],
            "library_type": reads_info['style'],
            "condition": reads_info['condition'],
            "aligned_using": 'STAR',
            "aligner_version": self.STAR_VERSION,
            "aligner_opts": aligner_opts
        }
        pprint(align_upload_params)

        ra_util = ReadsAlignmentUtils(self.callback_url, service_ver='beta')
        rau_upload_ret = ra_util.upload_alignment(align_upload_params)
        alignment_ref = rau_upload_ret["obj_ref"]
        print("STAR alignment uploaded as object {}".format(alignment_ref))
        return rau_upload_ret

    def generate_report_for_single_run(self, run_output_info, params):
        """
        Run QualiMap bamqc on the uploaded alignment and create an extended
        KBase report for a single-library run.

        :returns: the report info returned by KBaseReport.create_extended_report.
        """
        input_ref = run_output_info['upload_results']['obj_ref']
        index_dir = run_output_info['index_dir']
        output_dir = run_output_info['output_dir']
        output_files = self._generate_output_file_list(index_dir, output_dir)

        # first run qualimap
        qualimap_report = self.qualimap.run_bamqc({'input_ref': input_ref})
        qc_result_zip_info = qualimap_report['qc_result_zip_info']

        # create report
        report_text = 'Ran on a single reads library.\n\n'
        alignment_info = self.get_obj_infos(input_ref)[0]
        # append (the original used '=' here and silently dropped the first line)
        report_text += 'Created ReadsAlignment: ' + str(alignment_info[1]) + '\n'
        report_text += '                        ' + input_ref + '\n'

        kbr = KBaseReport(self.callback_url)
        report_info = kbr.create_extended_report({
            'message': report_text,
            'file_links': output_files,
            'objects_created': [{'ref': input_ref,
                                 'description': 'ReadsAlignment'}],
            'report_object_name': 'kb_STAR_report_' + str(uuid.uuid4()),
            'direct_html_link_index': 0,
            'html_links': [{'shock_id': qc_result_zip_info['shock_id'],
                            'name': qc_result_zip_info['index_html_file_name'],
                            'label': qc_result_zip_info['name']}],
            'html_window_height': 366,
            'workspace_name': params['output_workspace']
        })
        return report_info  # {'report_name': report_info['name'], 'report_ref': report_info['ref']}

    # ------------------------------------------------------------------
    # input fetching
    # ------------------------------------------------------------------
    def _get_reads_info(self, reads, readsSet_ref):
        '''
        _get_reads_info:fetches the detailed info for each reads with ref in list reads_refs
        return an object of the following structure:
        {
            "style": "paired", "single", or "interleaved",
            "file_fwd": path_to_file,
            "name": name of the reads,
            "file_rev": path_to_file, only if paired end,
            "object_ref": reads reference for downstream convenience,
            "condition": the condition for the reads.
        }
        The condition defaults to 'unspecified', and the set reference is
        recorded under 'readsset_ref' when it differs from reads['object_ref'].
        '''
        try:
            print("Fetching FASTA file from reads reference {}".format(reads['ref']))
            ret_reads_info = fetch_reads_from_reference(reads['ref'], self.callback_url)
        except ValueError:
            print("Incorrect object type for fetching a FASTA file!")
            raise

        if ret_reads_info.get("file_fwd", None) is None:
            raise RuntimeError(
                "FASTA file fetched from reads {} doesn't seem to exist!".format(reads['ref']))

        if reads.get('condition', None) is not None:
            ret_reads_info['condition'] = reads['condition']
        else:
            ret_reads_info['condition'] = 'unspecified'
        if reads.get('object_ref', None) != readsSet_ref:
            ret_reads_info[self.PARAM_IN_READS] = readsSet_ref

        return ret_reads_info

    def _get_genome_fasta(self, gnm_ref):
        """
        Fetch the FASTA file for a genome/assembly reference.

        :returns: a list holding the FASTA path, or an empty list when
            ``gnm_ref`` is None.
        :raises RuntimeError: when the fetch result carries no path.
        """
        genome_fasta_files = list()
        if gnm_ref is not None:
            try:
                print("Fetching FASTA file from object {}".format(gnm_ref))
                genome_fasta_file = fetch_fasta_from_object(
                    gnm_ref, self.workspace_url, self.callback_url)
                print("Done fetching FASTA file! Path = {}".format(
                    genome_fasta_file.get("path", None)))
            except ValueError:
                print("Incorrect object type for fetching a FASTA file!")
                raise

            if genome_fasta_file.get("path", None) is None:
                raise RuntimeError(
                    "FASTA file fetched from object {} doesn't seem exist!".format(gnm_ref))
            genome_fasta_files.append(genome_fasta_file["path"])
        return genome_fasta_files

    def convert_params(self, validated_params):
        """
        Convert input parameters with KBase ref format into STAR parameters,
        and add the advanced options.

        Returns a deep copy with runMode forced to 'genomeGenerate' and
        defaults for sjdbGTFfile (derived from the genome), sjdbOverhang (100)
        and quantMode ('Both').
        """
        # the deep copy already carries create_report, concurrent_*_tasks,
        # alignmentset_suffix and every other caller-supplied option
        params = copy.deepcopy(validated_params)
        params['runMode'] = 'genomeGenerate'

        # Add advanced options from validated_params to params
        if params.get("sjdbGTFfile", None) is None:
            params['sjdbGTFfile'] = self._get_genome_gtf_file(
                params[self.PARAM_IN_GENOME],
                os.path.join(self.scratch, self.STAR_IDX_DIR))

        if params.get('sjdbOverhang', None) is None:
            params['sjdbOverhang'] = 100

        if params.get('quantMode', None) not in ("TranscriptomeSAM", "GeneCounts", "Both"):
            params['quantMode'] = 'Both'

        return params

    def _get_indexing_params(self, params, star_idx_dir):
        """Extract the subset of params needed for genome index generation."""
        params_idx = {
            'runMode': 'genomeGenerate',
            'runThreadN': params[self.PARAM_IN_THREADN],
            self.STAR_IDX_DIR: star_idx_dir,
            'genomeFastaFiles': params[self.PARAM_IN_FASTA_FILES]
        }
        for optional in ('sjdbGTFfile', 'sjdbOverhang'):
            if params.get(optional, None) is not None:
                params_idx[optional] = params[optional]
        return params_idx

    def _get_mapping_params(self, params, rds_files, rds_name, idx_dir, out_dir):
        ''' build the mapping parameters'''
        aligndir = out_dir
        if rds_name:
            # each reads library gets its own sub-folder of the output dir
            aligndir = os.path.join(out_dir, rds_name)
            self._mkdir_p(aligndir)

        params_mp = copy.deepcopy(params)
        params_mp['runMode'] = 'alignReads'
        params_mp['readFilesIn'] = rds_files
        params_mp[self.STAR_IDX_DIR] = idx_dir
        params_mp['align_output'] = aligndir
        return params_mp

    def determine_input_info(self, validated_params):
        '''
        get info on the readsset_ref object and determine if we run once or run on a set
        input info provides information on the input and tells us if we should
        run as a single_library or as a set:
             input_info = {'run_mode': '', 'info': [..], 'ref': '55/1/2'}
        '''
        reads_ref = validated_params[self.PARAM_IN_READS]
        info = self.get_obj_infos(reads_ref)[0]
        obj_type = self.get_type_from_obj_info(info)
        if obj_type in ('KBaseAssembly.PairedEndLibrary', 'KBaseAssembly.SingleEndLibrary',
                        'KBaseFile.PairedEndLibrary', 'KBaseFile.SingleEndLibrary'):
            return {'run_mode': 'single_library', 'info': info, 'ref': reads_ref}
        if obj_type in ('KBaseRNASeq.RNASeqSampleSet', 'KBaseSets.ReadsSet'):
            return {'run_mode': 'sample_set', 'info': info, 'ref': reads_ref}

        raise ValueError('Object type of readsset_ref is not valid, was: ' + str(obj_type))

    def determine_unique_reads_names(self, validated_params):
        """Return get_unique_names() applied to the object infos of readsset_ref."""
        infos = self.get_obj_infos(validated_params[self.PARAM_IN_READS])
        return get_unique_names(infos)

    # ------------------------------------------------------------------
    # workspace helpers
    # ------------------------------------------------------------------
    def get_type_from_obj_info(self, info):
        """Return the type string (index 2 of the info) without its '-<version>' suffix."""
        return info[2].split('-')[0]

    def get_name_from_obj_info(self, info):
        """Return index 1 of the object info (used as the object name)."""
        return info[1]

    def get_obj_infos(self, ref):
        """Return the 'infos' list from Workspace.get_object_info3 for one reference."""
        return self.ws_client.get_object_info3({'objects': [{'ref': ref}]})['infos']

    def get_object_names(self, ref_list):
        """
        From a list of workspace references, returns a mapping from ref -> name of the object.
        """
        obj_ids = [{"ref": ref} for ref in ref_list]
        info = self.ws_client.get_object_info3({"objects": obj_ids})
        # we already have the refs as passed previously, so use those for mapping, as they're in
        # the same order as what's returned.
        return {ref: obj_info[1] for ref, obj_info in zip(ref_list, info["infos"])}

    # ------------------------------------------------------------------
    # filesystem helpers
    # ------------------------------------------------------------------
    def _mkdir_p(self, dir):
        """
        _mkdir_p: make directory for given path

        Does nothing for an empty/None path. (The parameter keeps its original
        name, which shadows the builtin, to stay compatible with callers.)
        """
        # check first: the original built the log message before this test and
        # raised TypeError on a None path
        if not dir:
            return
        log('Creating a new dir: ' + dir)
        if not os.path.exists(dir):
            os.makedirs(dir)
        else:
            log('{} has existed, so skip creating.'.format(dir))

    def create_star_dirs(self, star_home):
        '''creating the directories for STAR; returns (index_dir, output_dir)'''
        # the index directory
        idxdir = os.path.join(star_home, self.STAR_IDX_DIR)
        self._mkdir_p(idxdir)
        # the output directory
        outdir = os.path.join(star_home, self.STAR_OUT_DIR)
        self._mkdir_p(outdir)

        return (idxdir, outdir)

    def _get_reads_refs_from_setref(self, params):
        """Return the reads references contained in params['readsset_ref']."""
        readsSet_ref = params[self.PARAM_IN_READS]
        try:
            reads_refs = fetch_reads_refs_from_sampleset(
                readsSet_ref, self.workspace_url, self.callback_url, params)
        except ValueError:
            print("Incorrect object type for fetching reads ref(s)!")
            raise

        return reads_refs

    def _generate_output_file_list(self, idx_dir, out_dir):
        """
        _generate_output_file_list: zip result files and generate file_links for report
        """
        log('start packing result files')

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)

        star_index = os.path.join(output_directory, 'star_index.zip')
        star_output = os.path.join(output_directory, 'star_output.zip')
        self.zip_folder(idx_dir, star_index)
        self.zip_folder(out_dir, star_output)

        return [
            {'path': star_index,
             'name': os.path.basename(star_index),
             'label': os.path.basename(star_index),
             'description': 'Index file(s) generated by STAR'},
            {'path': star_output,
             'name': os.path.basename(star_output),
             'label': os.path.basename(star_output),
             'description': 'Output file(s) generated by STAR'},
        ]

    def zip_folder_withDFU(self, folder_path, output_name):
        """
        Zip ``folder_path + '/' + output_name`` with DataFileUtil.pack_file and
        return the path of the resulting archive.
        """
        output_path = self.dfu.pack_file(
            {'file_path': folder_path + '/' + output_name,
             'pack': 'zip'})['file_path']

        # print() call form works on both Python 2 and 3
        print("{} created successfully.".format(output_path))
        return output_path

    def zip_folder(self, folder_path, output_path):
        """
        Zip the files found under ``folder_path`` into ``output_path``.

        Each file is stored as ``<name of its immediate parent folder>/<file>``;
        empty sub-folders are not added to the archive.
        """
        with zipfile.ZipFile(output_path, 'w',
                             zipfile.ZIP_DEFLATED,
                             allowZip64=True) as ziph:
            for root, _folders, files in os.walk(folder_path):
                for f in files:
                    absolute_path = os.path.join(root, f)
                    relative_path = os.path.join(os.path.basename(root), f)
                    ziph.write(absolute_path, relative_path)

        # print() call form works on both Python 2 and 3
        print("{} created successfully.".format(output_path))

    # ------------------------------------------------------------------
    # HTML / summary report
    # ------------------------------------------------------------------
    @staticmethod
    def _is_type(obj_type, *type_names):
        """True when obj_type is '<one of type_names>-<major>.<minor>...'."""
        return any(re.match(re.escape(type_name) + r'-\d+\.\d+', obj_type)
                   for type_name in type_names)

    @staticmethod
    def _single_object_overview(kind, obj_info, obj_ref, obj_data):
        """HTML table describing one generated Alignment/Expression object."""
        return ''.join([
            '<br/><table><tr><th>Generated {} Object</th>'.format(kind),
            '<th></th></tr>',
            '<tr><th>{} Name</th><th>Condition</th></tr>'.format(kind),
            '<tr><td>{} ({})</td>'.format(obj_info[1], obj_ref),
            '<td>{}</td></tr>'.format(obj_data['condition']),
            '</table>',
        ])

    def _set_overview(self, kind, obj_info, obj_ref, obj_data):
        """HTML tables describing a generated set object and each of its items."""
        return ''.join([
            '<br/><table><tr><th>Generated {}Set Object</th></tr>'.format(kind),
            '<tr><td>{} ({})'.format(obj_info[1], obj_ref),
            '</td></tr></table>',
            '<p><br/></p>',
            '<table><tr><th>Generated {} Objects</th>'.format(kind),
            '<th></th></tr>',
            self._fill_html_trs('{} Name'.format(kind), obj_data),
            '</table>',
        ])

    def _generate_html_report(self, out_dir, obj_ref):
        """
        _generate_html_report: generate html summary report

        Fills the '<p>Overview_Content</p>' placeholder of report_template.html
        (located next to this module) and writes the result to a new folder
        under scratch. ``out_dir`` is currently unused.
        """
        log('start generating html report')

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, 'report.html')

        star_obj = self.ws_client.get_objects2({'objects': [{'ref': obj_ref}]})['data'][0]
        star_obj_info = star_obj['info']
        star_obj_data = star_obj['data']
        star_obj_type = star_obj_info[2]

        Overview_Content = ''
        if self._is_type(star_obj_type, 'KBaseRNASeq.RNASeqAlignment'):
            Overview_Content = self._single_object_overview(
                'Alignment', star_obj_info, obj_ref, star_obj_data)
        elif self._is_type(star_obj_type,
                           'KBaseRNASeq.RNASeqAlignmentSet',
                           'KBaseSets.ReadsAlignmentSet',
                           'KBaseSet.RNASeqAlignmentSet'):
            Overview_Content = self._set_overview(
                'Alignment', star_obj_info, obj_ref, star_obj_data)
        elif self._is_type(star_obj_type, 'KBaseRNASeq.RNASeqExpression'):
            Overview_Content = self._single_object_overview(
                'Expression', star_obj_info, obj_ref, star_obj_data)
        elif self._is_type(star_obj_type, 'KBaseSets.ExpressionSet'):
            Overview_Content = self._set_overview(
                'Expression', star_obj_info, obj_ref, star_obj_data)

        template_path = os.path.join(os.path.dirname(__file__), 'report_template.html')
        with open(template_path, 'r') as report_template_file:
            report_template = report_template_file.read()
        report_template = report_template.replace('<p>Overview_Content</p>',
                                                  Overview_Content)
        with open(result_file_path, 'w') as result_file:
            result_file.write(report_template)

        return [{'path': result_file_path,
                 'name': os.path.basename(result_file_path),
                 'label': os.path.basename(result_file_path),
                 'description': 'HTML summary report for STAR'}]

    def _fill_html_trs(self, col_caption, obj_data):
        '''
        _fill_html_trs: simple creates an html string that has rows (tr) of td for a table
        '''
        rows = ['<tr><th>{}</th><th>Condition</th></tr>'.format(col_caption)]

        for item in obj_data['items']:
            item_obj = self.ws_client.get_objects2(
                {'objects': [{'ref': item['ref']}]})['data'][0]
            obj_name = item_obj['info'][1]
            rows.append('<tr><td>{} ({})</td>'.format(obj_name, item['ref']))
            rows.append('<td>{}</td></tr>'.format(item_obj['data']['condition']))

        return ''.join(rows)

    def _generate_star_report(self, obj_ref, report_text, html_links, workspace_name,
                              index_dir, output_dir):
        """
        _generate_star_report: generate summary report

        :returns: the result of KBaseReport.create_extended_report.
        """
        log('creating STAR report')

        output_files = self._generate_output_file_list(index_dir, output_dir)
        output_html_files = self._generate_html_report(output_dir, obj_ref)
        output_html_files += html_links

        star_obj = self.ws_client.get_objects2({'objects': [{'ref': obj_ref}]})['data'][0]
        star_obj_info = star_obj['info']
        star_obj_data = star_obj['data']
        star_obj_type = star_obj_info[2]

        # start empty so an unrecognised object type no longer raises
        # UnboundLocalError when the report parameters are assembled
        objects_created = []
        if self._is_type(star_obj_type, 'KBaseRNASeq.RNASeqAlignment'):
            objects_created = [{'ref': obj_ref,
                                'description': 'RNASeqAlignment generated by STAR'}]
        elif self._is_type(star_obj_type,
                           'KBaseRNASeq.RNASeqAlignmentSet',
                           'KBaseSets.ReadsAlignmentSet',
                           'KBaseSet.RNASeqAlignmentSet'):
            objects_created = [{'ref': obj_ref,
                                'description': '{} generated by STAR'.format(
                                    re.sub(r"-\d+\.\d+", "", star_obj_type))}]
            for item in star_obj_data['items']:
                objects_created.append({'ref': item['ref'],
                                        'description': 'Alignment generated by STAR'})
        elif self._is_type(star_obj_type, 'KBaseRNASeq.RNASeqExpression'):
            objects_created = [{'ref': obj_ref,
                                'description': 'Expression generated by STAR'}]
        elif self._is_type(star_obj_type, 'KBaseSets.ExpressionSet'):
            objects_created = [{'ref': obj_ref,
                                'description': 'ExpressionSet generated by STAR'}]
            for item in star_obj_data['items']:
                objects_created.append({'ref': item['ref'],
                                        'description': 'Expression generated by STAR'})

        report_params = {'message': report_text,
                         'workspace_name': workspace_name,
                         'file_links': output_files,
                         'objects_created': objects_created,
                         'html_links': output_html_files,
                         'direct_html_link_index': 0,
                         'html_window_height': 366,
                         'report_object_name': 'kb_STAR_report_' + str(uuid.uuid4())}

        kbase_report_client = KBaseReport(self.callback_url)
        report_output = kbase_report_client.create_extended_report(report_params)

        return report_output

    def upload_alignment_set(self, alignment_items, alignmentset_name, ws_name):
        """
        Compiles and saves a set of alignment references (+ other stuff) into a
        reads alignment set via SetAPI.save_reads_alignment_set_v1.
        Returns the result of that call.
        alignment_items: [{
            "ref": alignment_ref,
            "label": condition label.
        }]
        """
        print("Uploading completed alignment set")
        alignment_set = {
            "description": "Alignments using STAR, v.{}".format(self.STAR_VERSION),
            "items": alignment_items
        }
        set_info = self.set_api_client.save_reads_alignment_set_v1({
            "workspace": ws_name,
            "output_object_name": alignmentset_name,
            "data": alignment_set
        })
        return set_info