class DownloadFastqUtils:

    def __init__(self):
        self.callbackURL = os.environ['SDK_CALLBACK_URL']
        self.au = AssemblyUtil(self.callbackURL)
        self.ru = ReadsUtils(self.callbackURL)

    def _stage_input_file(self, ref, reads_type):
        # Note: membership tests replace the original `== A or 'B'` comparisons,
        # which were always truthy and so never reached the error branch.
        if reads_type in ('KBaseFile.PairedEndLibrary', 'KBaseAssembly.PairedEndLibrary'):
            input_file_info = self.ru.download_reads({
                'read_libraries': [ref],
                'interleaved': 'true'
            })['files'][ref]
        elif reads_type in ('KBaseFile.SingleEndLibrary', 'KBaseAssembly.SingleEndLibrary'):
            input_file_info = self.ru.download_reads({'read_libraries': [ref]})['files'][ref]
        else:
            raise ValueError("Can't download_reads() for object type: '" + str(reads_type) + "'")
        input_file_info['input_ref'] = ref
        file_location = input_file_info['files']['fwd']
        interleaved = False
        if input_file_info['files']['type'] == 'interleaved':
            interleaved = True
        return input_file_info

    def download_genome(self, genomeref):
        file = self.au.get_assembly_as_fasta({'ref': genomeref})
        return file
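# Sketch of the per-library payload returned by ReadsUtils.download_reads() that
# _stage_input_file relies on above. This is assembled from the fields the code reads,
# not an exhaustive schema; the paths and reference are illustrative assumptions.
example_file_info = {
    'files': {
        'fwd': '/kb/module/work/tmp/reads.inter.fastq',  # always present
        'rev': None,                                     # set only for de-interleaved paired-end
        'type': 'interleaved',                           # 'single', 'paired', or 'interleaved'
    },
    'input_ref': '12345/6/7',  # added by _stage_input_file for downstream convenience
}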
    def prepare_single_run(self, input_info, assembly_or_genome_ref, bwa_index_info, ws_for_cache):
        ''' Given a reads ref and an assembly, setup the bwa index '''
        # first setup the bwa index of the assembly
        input_configuration = {'bwa_index_info': bwa_index_info}
        if not bwa_index_info:
            bwaIndexBuilder = BwaIndexBuilder(self.scratch_dir, self.workspace_url,
                                              self.callback_url, self.srv_wiz_url,
                                              self.provenance)
            index_result = bwaIndexBuilder.get_index({
                'ref': assembly_or_genome_ref,
                'ws_for_cache': ws_for_cache
            })
            input_configuration['bwa_index_info'] = index_result

        # next download the reads
        read_lib_ref = input_info['ref']
        read_lib_info = input_info['info']
        reads_params = {'read_libraries': [read_lib_ref],
                        'interleaved': 'false',
                        'gzipped': None}
        ru = ReadsUtils(self.callback_url)
        reads = ru.download_reads(reads_params)['files']

        input_configuration['reads_lib_type'] = self.get_type_from_obj_info(read_lib_info).split('.')[1]
        input_configuration['reads_files'] = reads[read_lib_ref]
        input_configuration['reads_lib_ref'] = read_lib_ref

        return input_configuration
    def get_reads_RU(self, refs, console):
        readcli = ReadsUtils(self.callbackURL, token=self.token)

        typeerr = ('Supported types: KBaseFile.SingleEndLibrary ' +
                   'KBaseFile.PairedEndLibrary ' +
                   'KBaseAssembly.SingleEndLibrary ' +
                   'KBaseAssembly.PairedEndLibrary')
        try:
            reads = readcli.download_reads({'read_libraries': refs,
                                            'interleaved': 'true',
                                            'gzipped': None
                                            })['files']
        except ServerError as se:
            self.log(console, 'logging stacktrace from dynamic client error')
            self.log(console, se.data)
            if typeerr in se.message:
                prefix = se.message.split('.')[0]
                raise ValueError(
                    prefix + '. Only the types ' +
                    'KBaseAssembly.PairedEndLibrary ' +
                    'and KBaseFile.PairedEndLibrary are supported')
            else:
                raise
        self.log(console, 'Got reads data from converter:\n' + pformat(reads))
        return reads
def fetch_reads_from_reference(ref, callback_url):
    """
    Fetch a FASTQ file (or 2 for paired-end) from a reads reference.
    Returns the following structure:
    {
        "style": "paired", "single", or "interleaved",
        "file_fwd": path_to_file,
        "file_rev": path_to_file,  # only if paired end
        "object_ref": reads reference for downstream convenience
    }
    """
    try:
        print("Fetching reads from object {}".format(ref))
        reads_client = ReadsUtils(callback_url)
        reads_dl = reads_client.download_reads({
            "read_libraries": [ref],
            "interleaved": "false"
        })
        pprint(reads_dl)
        reads_files = reads_dl['files'][ref]['files']
        ret_reads = {
            "object_ref": ref,
            "style": reads_files["type"],
            "file_fwd": reads_files["fwd"]
        }
        if reads_files.get("rev", None) is not None:
            ret_reads["file_rev"] = reads_files["rev"]
        return ret_reads
    except Exception:
        print("Unable to fetch a file from expected reads object {}".format(ref))
        raise
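# Usage sketch (assumptions: a KBase job environment with SDK_CALLBACK_URL set, and
# '12345/6/7' as a hypothetical paired-end reads reference; neither comes from the code above):
reads = fetch_reads_from_reference('12345/6/7', os.environ['SDK_CALLBACK_URL'])
print(reads['style'])         # 'paired', 'single', or 'interleaved'
print(reads['file_fwd'])      # forward FASTQ path on scratch
print(reads.get('file_rev'))  # reverse FASTQ path, present only for paired-end libraries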
def download_interleaved_reads(callback_url, reads_upa):
    ru = ReadsUtils(callback_url)
    reads_info = ru.download_reads({
        'read_libraries': [reads_upa],
        'interleaved': 'true',
        'gzipped': None
    })['files'][reads_upa]
    return reads_info
    def download_reads(self, token, reads_ref):
        try:
            readsUtils_Client = ReadsUtils(url=self.callback_url, token=token)  # SDK local
            readsLibrary = readsUtils_Client.download_reads({'read_libraries': [reads_ref],
                                                             'interleaved': 'true'})
            reads_file_path = readsLibrary['files'][reads_ref]['files']['fwd']
        except Exception as e:
            raise ValueError('Unable to get reads library object from workspace: (' +
                             reads_ref + ")\n" + str(e))
        return reads_file_path
    def run_mash_sketch(self, ctx, params):
        """
        Generate a sketch file from a fasta/fastq file
        :param params: instance of type "MashSketchParams" (*
           * Pass in **one of** input_path, assembly_ref, or reads_ref
           *   input_path - string - local file path to an input fasta/fastq
           *   assembly_ref - string - workspace reference to an Assembly type
           *   reads_ref - string - workspace reference to a Reads type
           * Optionally, pass in a boolean indicating whether you are using paired-end reads.
           *   paired_ends - boolean - whether you are passing in paired ends) -> structure:
           parameter "input_path" of String, parameter "assembly_ref" of String,
           parameter "reads_ref" of String, parameter "paired_ends" of type "boolean"
           (params: input_upa: workspace reference to an assembly object
           workspace_name: name of current workspace
           search_db: database to search
           n_max_results: number of results to return, integer between 1 and 100)
        :returns: instance of type "MashSketchResults" (*
           * Returns the local scratch file path of the generated sketch file.
           * Will have the extension '.msh') -> structure: parameter "sketch_path" of String
        """
        # ctx is the context object
        # return variables are: results
        #BEGIN run_mash_sketch
        if 'reads_ref' in params:
            reads_utils = ReadsUtils(self.callbackURL)
            result = reads_utils.download_reads({
                'read_libraries': [params['reads_ref']],
                'interleaved': 'true'
            })
            input_path = result['files'][params['reads_ref']]['files']['fwd']
        elif 'assembly_ref' in params:
            assembly_util = AssemblyUtil(self.callbackURL)
            result = assembly_util.get_assembly_as_fasta({'ref': params['assembly_ref']})
            input_path = result['path']
        elif 'input_path' in params:
            input_path = params['input_path']
        else:
            raise ValueError(
                'Invalid params; must provide one of `reads_ref`, `assembly_ref`, or `input_path`.'
            )
        mash_utils = MashUtils(self.config, self.auth_token)
        output_file_path = mash_utils.mash_sketch(input_path, paired_ends=params.get('paired_ends'))
        results = {'sketch_path': output_file_path}
        #END run_mash_sketch

        # At some point might do deeper type checking...
        if not isinstance(results, dict):
            raise ValueError('Method run_mash_sketch return value ' +
                             'results is not type dict as required.')
        # return the results
        return [results]
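# Example parameter dicts for run_mash_sketch, sketched from the docstring above.
# The reference values and file path are placeholders, not taken from the source.
sketch_from_reads = {'reads_ref': '12345/6/7', 'paired_ends': True}
sketch_from_assembly = {'assembly_ref': '12345/8/1'}
sketch_from_file = {'input_path': '/kb/module/work/tmp/my_reads.fastq'}
# Each call returns [{'sketch_path': '<scratch path ending in .msh>'}]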
    def fetch_reads_files(self, reads_upas):
        """
        From a list of reads UPAs, uses ReadsUtils to fetch the reads as files.
        Returns them as a dictionary from reads_upa -> filename
        """
        if reads_upas is None:
            raise ValueError("reads_upas must be a list of UPAs")
        if len(reads_upas) == 0:
            raise ValueError("reads_upas must contain at least one UPA")
        ru = ReadsUtils(self.callback_url)
        reads_info = ru.download_reads({
            'read_libraries': reads_upas,
            'interleaved': 'true',
            'gzipped': None
        })['files']
        file_set = dict()
        for reads in reads_info:
            file_set[reads] = reads_info[reads]['files']['fwd']
        return file_set
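# Usage sketch, called from within the owning class (the UPAs are hypothetical): fetch
# several libraries at once and map each UPA to its interleaved FASTQ on scratch.
fastq_by_upa = self.fetch_reads_files(['12345/6/7', '12345/9/1'])
for upa, fastq_path in fastq_by_upa.items():
    print(upa, '->', fastq_path)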
class masurca_utils: """ masurca_utils: defining a system of utils for running masurca """ MaSuRCA_VERSION = 'MaSuRCA-3.2.9' MaSuRCA_BIN = '/kb/module/' + MaSuRCA_VERSION + '/bin/masurca' PARAM_IN_WS = 'workspace_name' PARAM_IN_THREADN = 'num_threads' PARAM_IN_READS_LIBS = 'reads_libraries' PARAM_IN_JUMP_LIBS = 'jump_libraries' PARAM_IN_JF_SIZE = 'jf_size' PARAM_IN_CS_NAME = 'output_contigset_name' INVALID_WS_OBJ_NAME_RE = re.compile('[^\\w\\|._-]') def __init__(self, prj_dir, config): self.workspace_url = config['workspace-url'] self.callback_url = config['SDK_CALLBACK_URL'] self.token = config['KB_AUTH_TOKEN'] if 'shock-url' in config: self.shock_url = config['shock-url'] if 'handle-service-url' in config: self.handle_url = config['handle-service-url'] self.ws_client = Workspace(self.workspace_url, token=self.token) self.ru = ReadsUtils(self.callback_url, token=self.token) self.au = AssemblyUtil(self.callback_url, token=self.token) self.kbr = KBaseReport(self.callback_url) self.kbq = kb_quast(self.callback_url) self.proj_dir = prj_dir self.prog_runner = Program_Runner(self.MaSuRCA_BIN, self.proj_dir) def _has_long_reads(self, params): """ _has_long_reads: check if a long reads input exists in the parameters """ return (params.get('pacbio_reads', None) or params.get('nanopore_reads', None) or params.get('other_frg_file', None)) def _get_data_portion(self, pe_reads_data, jp_reads_data=None, pacbio_reads_file='', nanopore_reads_file='', other_frg_file=''): """ _get_data_portion: build the 'DATA...END' portion for the config.txt file """ data_str = '' if pe_reads_data: # log('PE reads data details:\n{}'.format(json.dumps(pe_reads_data, indent=1))) for pe in pe_reads_data: if data_str != '': data_str += '\n' data_str += 'PE= ' + pe['pe_prefix'] + ' ' + str(pe['pe_mean']) + ' ' + \ str(pe['pe_stdev']) + ' ' + pe['fwd_file'] if pe.get('rev_file', None): data_str += ' ' + pe['rev_file'] if jp_reads_data: # log('JUMP reads data details:\n{}'.format(json.dumps(jp_reads_data, indent=1))) for jp in jp_reads_data: if data_str != '': data_str += '\n' data_str += 'JUMP= ' + jp['jp_prefix'] + ' ' + str(jp['jp_mean']) + ' ' + \ str(jp['jp_stdev']) + ' ' + jp['fwd_file'] if jp.get('rev_file', None): data_str += ' ' + jp['rev_file'] # Adding the pacbio_reads # Note that pcbio reads must be in a single fasta file! # For example: # data_str +='\nPACBIO= /pool/genomics/frandsenp/masurca/PacBio/pacbio_reads.fasta' # ***if you have both types of reads supply them both as NANOPORE type*** if pacbio_reads_file != '': if data_str != '': data_str += '\n' if nanopore_reads_file != '': data_str += 'NANOPORE=' + pacbio_reads_file else: data_str += 'PACBIO=' + pacbio_reads_file # Adding the nanopore_reads and note that nanopore reads must be in a single fasta file! 
# For example: # data_str +='\nNANOPORE= /pool/genomics/frandsenp/masurca/NanoPore/nanopore_reads.fasta' if nanopore_reads_file != '': if data_str != '': data_str += '\n' data_str += 'NANOPORE= ' + nanopore_reads_file # Adding the other_frg_file inputs if any # any OTHER sequence data (454, Sanger, Ion torrent, etc) must be first converted into # Celera Assembler compatible .frg file # (see http://wgsassembler.sourceforge.com) and supplied as OTHER=file.frg if other_frg_file != '': if data_str != '': data_str += '\n' data_str += 'OTHER=' + other_frg_file return data_str def _get_parameters_portion(self, params): """ build the 'PARAMETERS...END' portion for the config.txt file """ # set the default parameters as suggested in the example configuration file param_str = ( "EXTEND_JUMP_READS=0\nUSE_GRID=0\nGRID_QUEUE=all.q\nGRID_BATCH_SIZE" + "=300000000\nLHE_COVERAGE=25\nMEGA_READS_ONE_PASS=0") if (params.get('graph_kmer_size', None) and type(params['graph_kmer_size']) == int): if param_str != '': param_str += '\n' param_str += 'GRAPH_KMER_SIZE=' + str(params['graph_kmer_size']) else: if param_str != '': param_str += '\n' param_str += 'GRAPH_KMER_SIZE=auto' if params.get('use_linking_mates', None): if param_str != '': param_str += '\n' if params['use_linking_mates'] == 1 and not self._has_long_reads( params): param_str += 'USE_LINKING_MATES=1' else: param_str += 'USE_LINKING_MATES=0' if params.get('limit_jump_coverage', None): if param_str != '': param_str += '\n' param_str += 'LIMIT_JUMP_COVERAGE = ' + str( params['limit_jump_coverage']) if params.get('cgwErrorRate', None): if param_str != '': param_str += '\n' param_str += 'CA_PARAMETERS = cgwErrorRate=' + str( params['cgwErrorRate']) if params.get(self.PARAM_IN_THREADN, None): if param_str != '': param_str += '\n' param_str += 'NUM_THREADS = ' + str(params[self.PARAM_IN_THREADN]) if params.get('jf_size', None): if param_str != '': param_str += '\n' param_str += 'JF_SIZE=' + str(params['jf_size']) if params.get('kmer_count_threshold', None): if param_str != '': param_str += '\n' param_str += 'KMER_COUNT_THRESHOLD=' + str( params['kmer_count_threshold']) if params.get('do_homopolymer_trim', None): if param_str != '': param_str += '\n' if params['do_homopolymer_trim'] == 1: param_str += 'DO_HOMOPOLYMER_TRIM=1' else: param_str += 'DO_HOMOPOLYMER_TRIM=0' if params.get('close_gaps', None): if param_str != '': param_str += '\n' if params['close_gaps'] == 1: param_str += 'CLOSE_GAPS=1' else: param_str += 'CLOSE_GAPS=0' if params.get('soap_assembly', None): if param_str != '': param_str += '\n' if params['soap_assembly'] == 1: param_str += 'SOAP_ASSEMBLY=1' else: param_str += 'SOAP_ASSEMBLY=0' return param_str def _replaceSectionText(self, orig_txt, begin_patn, end_patn, repl_txt): """ replace a section of text of orig_txt between lines begin-patn and end-patn with repl_text examples of parameters: begin_patn1 = "DATA\n" begin_patn2 = "PARAMETERS\n" end_patn1 = "END\nPARAMETERS\n" end_patn2 = "END\n" repl_txt1 = ('PE= pe 500 50 /kb/module/work/testReads/small.forward.fq' + ' /kb/module/work/testReads/small.reverse.fq\n') repl_txt2 = ('GRAPH_KMER_SIZE=auto\nUSE_LINKING_MATES=1\nLIMIT_JUMP_COVERAGE = 60\n' + 'CA_PARAMETERS = cgwErrorRate=0.15\nNUM_THREADS= 64\nJF_SIZE=100000000\n DO_HOMOPOLYMER_TRIM=0\n') """ if repl_txt != '': # create regular expression pattern repl = re.compile(begin_patn + '.*?' 
+ end_patn, re.DOTALL) repl_txt = begin_patn + repl_txt + '\n' + end_patn # replace the text between begin_patn and end_patn with repl_txt txt_replaced = repl.sub(repl_txt, orig_txt) # pprint(txt_replaced) return txt_replaced else: return orig_txt def _unique_prefix_check(self, pfix, refs): prefix_lookup = {} for ref in refs: pre = ref[pfix][0:2] if pre not in prefix_lookup: prefix_lookup[pre] = 1 else: raise ValueError('The first two characters in \'' + ref[pfix] + '\' has been used.') def _get_pereads_info(self, input_params): """ _get_pereads_info--from a list of paired_readsParams structures fetches the corresponding reads info with the paired_readsParams[pe_id] returns a list of reads data in the following structure: reads_data = { 'fwd_file': path_to_fastq_file, 'pe_prefix': the two-letter prefix for the reads library, 'pe_mean': the average reads length for the reads library, 'pe_stdev': the standard deviation for the reads library, 'type': reads_type, #('interleaved', 'paired', or 'single' 'seq_tech': sequencing_tech, 'reads_ref': KBase object ref for downstream convenience, 'reads_name': KBase object name for downstream convenience, 'rev_file': path_to_fastq_file, #only if paired end } """ rds_params = copy.deepcopy(input_params) wsname = rds_params[self.PARAM_IN_WS] rds_refs = [] rds_data = [] # reads_libraries grouped params if rds_params.get(self.PARAM_IN_READS_LIBS, None): pe_reads_libs = rds_params[self.PARAM_IN_READS_LIBS] for pe_lib in pe_reads_libs: if pe_lib.get('pe_id', None): rds_refs.append(pe_lib['pe_id']) rds_data = self._get_kbreads_info(wsname, rds_refs) for pe_lib in pe_reads_libs: i = 0 for rds in rds_data: i += 1 if 'pe_id' in pe_lib and pe_lib['pe_id'] == rds[ 'reads_ref']: if pe_lib.get('pe_prefix', None): rds['pe_prefix'] = pe_lib['pe_prefix'][0] else: rds['pe_prefix'] = 'p' rds['pe_prefix'] += str(i) pe_lib['pe_prefix'] = rds['pe_prefix'] if pe_lib.get('pe_mean', None) is None: pe_lib['pe_mean'] = 500 rds['pe_mean'] = pe_lib['pe_mean'] if pe_lib.get('pe_stdev', None) is None: pe_lib['pe_stdev'] = 50 rds['pe_stdev'] = pe_lib['pe_stdev'] self._unique_prefix_check('pe_prefix', pe_reads_libs) else: raise ValueError("Parameter {} is required.".format( self.PARAM_IN_READS_LIBS)) return rds_data def _get_jpreads_info(self, input_params): """ _get_jpreads_info--from a list of jump_readsParams structures fetches the corresponding reads info with the paired_readsParams[pe_id] returns a list of reads data in the following structure: reads_data = { 'fwd_file': path_to_fastq_file, 'jp_prefix': the two-letter prefix for the reads library, 'jp_mean': the average reads length for the reads library, 'jp_stdev': the standard deviation for the reads library, 'type': reads_type, #('interleaved', 'paired', or 'single' 'seq_tech': sequencing_tech, 'reads_ref': KBase object ref for downstream convenience, 'reads_name': KBase object name for downstream convenience, 'rev_file': path_to_fastq_file, #only if paired end } """ rds_params = copy.deepcopy(input_params) wsname = rds_params[self.PARAM_IN_WS] rds_refs = [] rds_data = [] # jump_libraries grouped params if rds_params.get(self.PARAM_IN_JUMP_LIBS, None): jp_reads_libs = rds_params[self.PARAM_IN_JUMP_LIBS] for jp_lib in jp_reads_libs: if jp_lib.get('jp_id', None): rds_refs.append(jp_lib['jp_id']) rds_data = self._get_kbreads_info(wsname, rds_refs) for jp_lib in jp_reads_libs: i = 0 for rds in rds_data: i += 1 if 'jp_id' in jp_lib and jp_lib['jp_id'] == rds[ 'reads_ref']: if jp_lib.get('jp_prefix', None): rds['jp_prefix'] = 
jp_lib['jp_prefix'][0] else: rds['jp_prefix'] = 's' rds['jp_prefix'] += str(i) jp_lib['jp_prefix'] = rds['jp_prefix'] if jp_lib.get('jp_mean', None) is None: jp_lib['jp_mean'] = 3600 rds['jp_mean'] = jp_lib['jp_mean'] if jp_lib.get('jp_stdev', None) is None: jp_lib['jp_stdev'] = 200 rds['jp_stdev'] = jp_lib['jp_stdev'] self._unique_prefix_check('jp_prefix', jp_reads_libs) return rds_data def _get_kbreads_info(self, wsname, reads_refs): """ _get_kbreads_info--from a set of given KBase reads refs, fetches the corresponding reads info with as deinterleaved fastq files and returns a list of reads data in the following structure: reads_data = { 'fwd_file': path_to_fastq_file, 'type': reads_type, #('interleaved', 'paired', or 'single' 'seq_tech': sequencing_tech, 'reads_ref': KBase object ref for downstream convenience, 'reads_name': KBase object name for downstream convenience, 'rev_file': path_to_fastq_file, #only if paired end } """ obj_ids = [] for r in reads_refs: if r: obj_ids.append({'ref': r if '/' in r else (wsname + '/' + r)}) if not obj_ids: return [] ws_info = self.ws_client.get_object_info_new({'objects': obj_ids}) reads_params = [] reftoname = {} for wsi, oid in zip(ws_info, obj_ids): ref = oid['ref'] reads_params.append(ref) obj_name = wsi[1] reftoname[ref] = wsi[7] + '/' + obj_name typeerr = ('Supported types: KBaseFile.SingleEndLibrary ' + 'KBaseFile.PairedEndLibrary ' + 'KBaseAssembly.SingleEndLibrary ' + 'KBaseAssembly.PairedEndLibrary') try: reads = self.ru.download_reads({ 'read_libraries': reads_params, 'interleaved': 'false' })['files'] except ServerError as se: log('logging stacktrace from dynamic client error') log(se.data) if typeerr in se.message: prefix = se.message.split('.')[0] raise ValueError( prefix + '. Only the types ' + 'KBaseAssembly.SingleEndLibrary ' + 'KBaseAssembly.PairedEndLibrary ' + 'KBaseFile.SingleEndLibrary ' + 'and KBaseFile.PairedEndLibrary are supported') else: raise # log('Downloaded reads data from KBase:\n' + pformat(reads)) reads_data = [] for ref in reads_refs: reads_name = reftoname[ref] f = reads[ref]['files'] seq_tech = reads[ref]['sequencing_tech'] rds_info = { 'fwd_file': f['fwd'], 'reads_ref': ref, 'type': f['type'], 'seq_tech': seq_tech, 'reads_name': reads_name } if f.get('rev', None) is not None: rds_info['rev_file'] = f['rev'] reads_data.append(rds_info) return reads_data def _generate_output_file_list(self, out_dir): """ _generate_output_file_list: zip result files and generate file_links for report """ log('start packing result files') output_files = list() output_directory = os.path.join(self.proj_dir, str(uuid.uuid4())) mkdir_p(output_directory) masurca_output = os.path.join(output_directory, 'masurca_output.zip') self._zip_folder(out_dir, masurca_output) output_files.append({ 'path': masurca_output, 'name': os.path.basename(masurca_output), 'label': os.path.basename(masurca_output), 'description': 'Output file(s) generated by MaSuRCA' }) return output_files def _zip_folder(self, folder_path, output_path): """ _zip_folder: Zip the contents of an entire folder (with that folder included in the archive). Empty subfolders could be included in the archive as well if the commented portion is used. 
""" with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED, allowZip64=True) as ziph: for root, folders, files in os.walk(folder_path): for f in files: absolute_path = os.path.join(root, f) relative_path = os.path.join(os.path.basename(root), f) # print "Adding {} to archive.".format(absolute_path) ziph.write(absolute_path, relative_path) print("{} created successfully.".format(output_path)) # with zipfile.ZipFile(output_path, "r") as f: # print 'Checking the zipped file......\n' # for info in f.infolist(): # print info.filename, info.date_time, info.file_size, info.compress_size def _load_stats(self, input_file_name): log('Starting conversion of FASTA to KBaseGenomeAnnotations.Assembly') log('Building Object.') if not os.path.isfile(input_file_name): raise Exception('The input file name {0} is not a file!'.format( input_file_name)) with open(input_file_name, 'r') as input_file_handle: contig_id = None sequence_len = 0 fasta_dict = dict() first_header_found = False # Pattern for replacing white space pattern = re.compile(r'\s+') for current_line in input_file_handle: if current_line[0] == '>': # found a header line # Wrap up previous fasta sequence if not first_header_found: first_header_found = True else: fasta_dict[contig_id] = sequence_len sequence_len = 0 fasta_header = current_line.replace('>', '').strip() try: contig_id = fasta_header.strip().split(' ', 1)[0] except (IndexError, KeyError, ValueError): contig_id = fasta_header.strip() else: sequence_len += len(re.sub(pattern, '', current_line)) # wrap up last fasta sequence if not first_header_found: raise Exception("There are no contigs in this file") else: fasta_dict[contig_id] = sequence_len return fasta_dict def _check_reference(self, ref): """ Tests the given ref string to make sure it conforms to the expected object reference format. Returns True if it passes, False otherwise. """ obj_ref_regex = re.compile( "^(?P<wsid>\d+)\/(?P<objid>\d+)(\/(?P<ver>\d+))?$") ref_path = ref.strip().split(";") for step in ref_path: if not obj_ref_regex.match(step): return False return True def _check_ref_type(self, ref, allowed_types): """ Validates the object type of ref against the list of allowed types. If it passes, this returns True, otherwise False. Really, all this does is verify that at least one of the strings in allowed_types is a substring of the ref object type name. Ex1: ref = "KBaseGenomes.Genome-4.0" allowed_types = ["assembly", "KBaseFile.Assembly"] returns False Ex2: ref = "KBaseGenomes.Genome-4.0" allowed_types = ["assembly", "genome"] returns True """ obj_type = self._get_object_type(ref).lower() for t in allowed_types: if t.lower() in obj_type: return True return False def _get_object_type(self, ref): """ Fetches and returns the typed object name of ref from the given workspace url. If that object doesn't exist, or there's another Workspace error, this raises a RuntimeError exception. """ info = self.ws_client.get_object_info3({'objects': [{'ref': ref}]}) obj_info = info.get('infos', [[]])[0] if len(obj_info) == 0: raise RuntimeError( "An error occurred while fetching type info from the Workspace. " "No information returned for reference {}".format(ref)) return obj_info[2] def _get_fasta_from_assembly(self, assembly_ref): """ From an assembly or contigset, this uses a data file to build a FASTA file and return the path to it. 
""" allowed_types = [ 'KBaseFile.Assembly', 'KBaseGenomeAnnotations.Assembly', 'KBaseGenomes.ContigSet' ] if not self._check_ref_type(assembly_ref, allowed_types): raise ValueError( "The reference {} cannot be used to fetch a FASTA file".format( assembly_ref)) au = AssemblyUtil(self.callback_url) return au.get_assembly_as_fasta({'ref': assembly_ref}) def generate_report(self, contig_file_name, params, out_dir, wsname): """ generate_report: reporting results """ log('Generating and saving report') contig_file_with_path = os.path.join(out_dir, contig_file_name) fasta_stats = self._load_stats(contig_file_with_path) lengths = [fasta_stats[contig_id] for contig_id in fasta_stats] assembly_ref = params[self.PARAM_IN_WS] + '/' + params[ self.PARAM_IN_CS_NAME] report_text = '' report_text += 'MaSuRCA results saved to: ' + wsname + '/' + out_dir + '\n' report_text += 'Assembly saved to: ' + assembly_ref + '\n' report_text += 'Assembled into ' + str(len(lengths)) + ' contigs.\n' report_text += 'Avg Length: ' + str( sum(lengths) / float(len(lengths))) + ' bp.\n' # compute a simple contig length distribution bins = 10 counts, edges = np.histogram(lengths, bins) report_text += 'Contig Length Distribution (# of contigs -- min to max ' + 'basepairs):\n' for c in range(bins): report_text += (' ' + str(counts[c]) + '\t--\t' + str(edges[c]) + ' to ' + str(edges[c + 1]) + ' bp\n') print('Running QUAST') quastret = self.kbq.run_QUAST({ 'files': [{ 'path': contig_file_with_path, 'label': params[self.PARAM_IN_CS_NAME] }] }) output_files = self._generate_output_file_list(out_dir) print('Saving report') report_output = self.kbr.create_extended_report({ 'message': report_text, 'objects_created': [{ 'ref': assembly_ref, 'description': 'Assembled contigs' }], 'direct_html_link_index': 0, 'file_links': output_files, 'html_links': [{ 'shock_id': quastret['shock_id'], 'name': 'report.html', 'label': 'QUAST report' }], 'report_object_name': 'kb_masurca_report_' + str(uuid.uuid4()), 'workspace_name': params[self.PARAM_IN_WS] }) report_name = report_output['name'] report_ref = report_output['ref'] return report_name, report_ref def validate_params(self, params): """ validate_params: checks params passed to run_masurca_app method and set default values """ # log('Start validating run_masurca_app parameters:\n{}'.format( # json.dumps(params, indent=1))) # check for mandatory parameters if params.get(self.PARAM_IN_WS, None) is None: raise ValueError(self.PARAM_IN_WS + ' parameter is mandatory') if self.PARAM_IN_THREADN not in params: raise ValueError(self.PARAM_IN_THREADN + ' parameter is mandatory') if params.get(self.PARAM_IN_JF_SIZE, None) is None: raise ValueError(self.PARAM_IN_JF_SIZE + ' parameter is mandatory') if params.get(self.PARAM_IN_READS_LIBS, None) is None: raise ValueError(self.PARAM_IN_READS_LIBS + ' parameter is mandatory') if type(params[self.PARAM_IN_READS_LIBS]) != list: raise ValueError(self.PARAM_IN_READS_LIBS + ' must be a list') if params.get(self.PARAM_IN_CS_NAME, None) is None: raise ValueError('Parameter {} is mandatory!'.format( self.PARAM_IN_CS_NAME)) if self.INVALID_WS_OBJ_NAME_RE.search(params[self.PARAM_IN_CS_NAME]): raise ValueError('Invalid workspace object name: {}.'.format( params[self.PARAM_IN_CS_NAME])) if 'dna_source' in params: dna_src = params.get('dna_source') if dna_src == 'bacteria': params['limit_jump_coverage'] = 60 params['cgwErrorRate'] = 0.25 else: params['limit_jump_coverage'] = 300 params['cgwErrorRate'] = 0.15 if params.get('create_report', None) is None: 
params['create_report'] = 0 return params def construct_masurca_assembler_cfg(self, params): # STEP 1: get the working folder housing the config.txt file and the masurca results wsname = params[self.PARAM_IN_WS] config_file_path = os.path.join(self.proj_dir, 'config.txt') # STEP 2.1: retrieve the reads data from input parameter pe_reads_data = self._get_pereads_info(params) jp_reads_data = [] if params.get(self.PARAM_IN_JUMP_LIBS, None): jp_reads_data = self._get_jpreads_info(params) if 'jp_mean' not in params or type(params['jp_mean']) != int: params['jp_mean'] = 3600 if 'jp_stdev' not in params or type(params['jp_stdev']) != int: params['jp_stdev'] = 200 # STEP 2.2: PACBIO reads must be in a single FASTA file and supplied as PACBIO=reads.fa; assbl_types = [ 'KBaseFile.Assembly', 'KBaseGenomeAnnotations.Assembly', 'KBaseGenomes.ContigSet' ] reads_types = [ 'KBaseAssembly.SingleEndLibrary', 'KBaseFile.SingleEndLibrary', 'KBaseAssembly.PairedEndLibrary', 'KBaseFile.PairedEndLibrary' ] pb_reads_file = '' if params.get('pacbio_reads', None): pb_ref = params['pacbio_reads'] if self._check_ref_type(pb_ref, assbl_types): pb_reads_file = (self._get_fasta_from_assembly(pb_ref)).get( 'path', '') else: if self._check_ref_type(pb_ref, reads_types): pb_rd = self._get_kbreads_info(wsname, [pb_ref]) pb_reads_file = pb_rd[0]['fwd_file'] if pb_rd[0].get('rev_file', None): pb_reads_file += ' ' + pb_rd[0]['rev_file'] # STEP 2.3: NANOPORE reads must be in a single FASTA/FASTQ file and supplied # as NANOPORE=reads.fa np_reads_file = '' if params.get('nanopore_reads', None): np_ref = params['nanopore_reads'] if self._check_ref_type(np_ref, assbl_types): np_reads_file = (self._get_fasta_from_assembly(np_ref)).get( 'path', '') else: if self._check_ref_type(np_ref, reads_types): np_rd = self._get_kbreads_info(wsname, [np_ref]) np_reads_file = np_rd[0]['fwd_file'] if np_rd[0].get('rev_file', None): np_reads_file += ' ' + np_rd[0]['rev_file'] # STEP 2.4: any OTHER sequence data (454, Sanger, Ion torrent, etc) must be first # converted into Celera Assembler compatible .frg files # (see http://wgsassembler.sourceforge.com) and supplied as OTHER=file.frg other_frg = '' if params.get('other_frg_file', None): other_frg = params['other_frg_file'] # STEP 3: construct and save the config.txt file for running masurca try: # STEP 3.1: replace the 'DATA...END' portion of the config_template.txt file data_str = self._get_data_portion(pe_reads_data, jp_reads_data, pb_reads_file, np_reads_file, other_frg) if data_str == '': # no reads libraries are specified, no further actions return '' config_template = '' with codecs.open(os.path.join(os.path.dirname(__file__), 'config_template.txt'), mode='r', encoding='utf-8') as config_template_file: config_template = config_template_file.read() begin_patn1 = "DATA\n" end_patn1 = "END\nPARAMETERS\n" config_with_data = self._replaceSectionText( config_template, begin_patn1, end_patn1, data_str) # log("\n***After DATA section replacement:\n{}\nSaved at {}".format( # config_with_data.encode('utf-8').decode('utf-8'), config_file_path)) with codecs.open(config_file_path, mode='w', encoding='utf-8') as config_file: config_file.write(config_with_data) # STEP 3.2: replace the 'PARAMETERS...END' portion of the config_file file saved above param_str = self._get_parameters_portion(params) if param_str == '': # no parameters are specified, no further actions return '' previous_config = '' with codecs.open(config_file_path, mode='r', encoding='utf-8') as previous_config_file: previous_config = 
previous_config_file.read() begin_patn2 = "PARAMETERS\n" end_patn2 = "END\n" final_config = self._replaceSectionText(previous_config, begin_patn2, end_patn2, param_str) log("\n***Configuration file content:\n{}\nSaved at {}".format( final_config.encode('utf-8').decode('utf-8'), config_file_path)) with codecs.open(config_file_path, mode='w', encoding='utf-8') as config_file: config_file.write(final_config) except IOError as ioerr: log('Creation of the config.txt file raised error:\n') pprint(ioerr) return '' else: return config_file_path def generate_assemble_script(self, config_file): if os.path.isfile(config_file): f_dir, f_nm = os.path.split(config_file) m_cmd = [self.MaSuRCA_BIN] m_cmd.append(config_file) try: self.prog_runner.run(m_cmd, f_dir) assemble_file = os.path.join(f_dir, 'assemble.sh') log('Created the assemble.sh file at {}.\n'.format( assemble_file)) return assemble_file except ValueError as ve: log('Error generating assemble.sh file: \n{}'.format(ve)) raise ValueError('Failed to generate assemble.sh file!') else: log("The config file {} is not found.\n".format(config_file)) log('NO assemble.sh file created.\n') return '' def run_assemble(self, asmbl_file): exit_code = 1 if os.path.isfile(asmbl_file): log("The assemble.sh file exists at {}\n".format(asmbl_file)) f_dir, f_nm = os.path.split(asmbl_file) a_cmd = ['/bin/bash'] a_cmd.append(asmbl_file) log("The working directory is {}\n".format(f_dir)) log("The assembling command is {}\n".format(' '.join(a_cmd))) try: exit_code = self.prog_runner.run(a_cmd, f_dir) except ValueError as ve: log('Error running assemble: \n{}'.format(ve)) else: log("The assemble.sh file {} is not found.".format(asmbl_file)) return exit_code def save_assembly(self, contig_fa, wsname, a_name): if os.path.isfile(contig_fa): log('Uploading FASTA file to Assembly...') self.au.save_assembly_from_fasta({ 'file': { 'path': contig_fa }, 'workspace_name': wsname, 'assembly_name': a_name }) else: log("The contig file {} is not found.".format(contig_fa))
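# Abbreviated sketch of the config.txt that construct_masurca_assembler_cfg assembles from
# the DATA and PARAMETERS replacements above. The PE line and parameter values mirror the
# examples in the _replaceSectionText docstring and the defaults set in _get_pereads_info /
# _get_jpreads_info / _get_parameters_portion; the jump-library file path is a placeholder,
# and the full default parameter block (USE_GRID, GRID_QUEUE, etc.) is omitted here.
#
# DATA
# PE= p1 500 50 /kb/module/work/testReads/small.forward.fq /kb/module/work/testReads/small.reverse.fq
# JUMP= s1 3600 200 /kb/module/work/testReads/jump.fq
# END
# PARAMETERS
# GRAPH_KMER_SIZE=auto
# USE_LINKING_MATES=1
# LIMIT_JUMP_COVERAGE = 60
# CA_PARAMETERS = cgwErrorRate=0.15
# NUM_THREADS = 64
# JF_SIZE=100000000
# END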
    def download_long(self, console, warnings, token, wsname, lib, min_long_read_length):
        try:
            # object info
            try:
                wsClient = Workspace(self.workspaceURL, token=token)
            except Exception as e:
                raise ValueError("unable to instantiate wsClient. " + str(e))

            [OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I,
             WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I, META_I] = range(11)  # object_info tuple
            obj_id = {'ref': lib if '/' in lib else (wsname + '/' + lib)}
            lib_obj_info = wsClient.get_object_info_new({'objects': [obj_id]})[0]
            lib_obj_type = lib_obj_info[TYPE_I]
            lib_obj_type = re.sub(r'-[0-9]+\.[0-9]+$', "", lib_obj_type)  # remove trailing version
            lib_ref = str(lib_obj_info[WSID_I]) + '/' + \
                str(lib_obj_info[OBJID_I]) + '/' + str(lib_obj_info[VERSION_I])
            if lib_obj_type == 'KBaseGenomes.ContigSet' or lib_obj_type == 'KBaseGenomeAnnotations.Assembly':
                # download using assembly util / data file util
                self.log(console, "Getting long reads (from contigs object).\n")
                auClient = AssemblyUtil(url=self.callbackURL, token=token)
                dfuClient = DataFileUtil(url=self.callbackURL, token=token)
                # fixed: the path was assigned to contigFile but then referenced as contig_file
                contig_file = auClient.get_assembly_as_fasta({'ref': lib_ref}).get('path')
                long_reads_path = dfuClient.unpack_file({'file_path': contig_file})['file_path']
                self.log(
                    warnings,
                    "Warning: Long reads are in FASTA format, so short read check was not performed."
                )
            else:
                ruClient = ReadsUtils(url=self.callbackURL, token=token)
                self.log(console, "Getting long reads (from reads library object).\n")
                result = ruClient.download_reads({
                    'read_libraries': [lib_ref],
                    'interleaved': 'false'
                })
                long_reads_path = result['files'][lib_ref]['files']['fwd']
                [n_reads, n_reads_short] = self.filter_short_fastq(
                    console, long_reads_path, min_long_read_length)
                if (n_reads_short > 0):
                    self.log(
                        warnings,
                        "Warning: Of " + str(n_reads) + " long reads, " + str(n_reads_short) +
                        " are shorter than " + str(min_long_read_length) +
                        "; consider using the filtlong app to filter out shorter reads.")
        except Exception as e:
            raise ValueError('Unable to download long reads\n' + str(e))
        return long_reads_path
def download_short_unpaired(self, console, token, wsname, short_unpaired_libraries): try: self.log(console, "Getting short unpaired reads.\n") ruClient = ReadsUtils(url=self.callbackURL, token=token) # first, unpack any ReadsSets into the actual SingleEndLibrary referencs reads_refs = [] # object info try: wsClient = Workspace(self.workspaceURL, token=token) except Exception as e: raise ValueError("unable to instantiate wsClient. " + str(e)) [ OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I, WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I, META_I ] = range(11) # object_info tuple for lib in short_unpaired_libraries: try: obj_id = { 'ref': lib if '/' in lib else (wsname + '/' + lib) } lib_obj_info = wsClient.get_object_info_new( {'objects': [obj_id]})[0] lib_obj_type = lib_obj_info[TYPE_I] # remove trailing version lib_obj_type = re.sub('-[0-9]+\.[0-9]+$', "", lib_obj_type) lib_ref = str(lib_obj_info[WSID_I])+'/' + \ str(lib_obj_info[OBJID_I])+'/'+str(lib_obj_info[VERSION_I]) if lib_obj_type == 'KBaseSets.ReadsSet': # unpack it try: setAPIClient = SetAPI(url=self.serviceWizardURL, token=token) self.log(console, 'getting reads set ' + lib_ref) readsSet = setAPIClient.get_reads_set_v1({ 'ref': lib_ref, 'include_item_info': 1 }) except Exception as e: raise ValueError( 'SetAPI FAILURE: Unable to get read library set object: (' + lib_ref + ')\n' + str(e)) for readsLibrary in readsSet['data']['items']: reads_refs.append(readsLibrary['ref']) else: # use other reads objects "as is" reads_refs.append(lib_ref) except Exception as e: raise ValueError('Unable to get read library object: (' + str(lib) + ')' + str(e)) result = ruClient.download_reads({ 'read_libraries': reads_refs, 'interleaved': 'false' }) # combine outputs short_unpaired_path = os.path.join( self.scratch, "short_unpaired_" + str(uuid.uuid4()) + ".fastq") self.log(console, "Combining short unpaired reads.\n") for reads_ref in reads_refs: files = result['files'][reads_ref]['files'] if 'fwd' in files: path = files['fwd'] if path.endswith('.gz'): cmd = 'gzip -dc ' + path + ' >> ' + short_unpaired_path else: cmd = 'cat ' + path + ' >> ' + short_unpaired_path self.log(console, "command: " + cmd) cmdProcess = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True) cmdProcess.wait() if cmdProcess.returncode != 0: raise ValueError('Error running ' + cmd) os.remove(path) else: raise ValueError('File ' + reads_ref + ' missing forward reads file') except Exception as e: raise ValueError('Unable to download short unpaired reads\n' + str(e)) return short_unpaired_path
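# Design note: download_short_unpaired concatenates each forward FASTQ with a shell command
# ('gzip -dc ... >>' or 'cat ... >>'). The same step could be done in-process; a minimal
# alternative sketch (an assumption for illustration, not the module's actual code), taking
# the same result['files'][ref]['files']['fwd'] paths:
import gzip
import shutil

def append_fastq(src_path, dest_handle):
    """Append a possibly gzip-compressed FASTQ file to an open binary file handle."""
    opener = gzip.open if src_path.endswith('.gz') else open
    with opener(src_path, 'rb') as src:
        shutil.copyfileobj(src, dest_handle)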
class nmdc_mg_assembly: def __init__(self, callbaack_url, scratch, wdl='../../metaAssembly/'): self.callback_url = callbaack_url self.scratch = scratch self.special = special(self.callback_url) self.ru = ReadsUtils(self.callback_url) self.au = AssemblyUtil(self.callback_url) self.report = KBaseReport(self.callback_url) self.wdl_base = wdl def validate_params(self, params): pass def fetch_reads_files(self, reads_upas): """ From a list of reads UPAs, uses ReadsUtils to fetch the reads as files. Returns them as a dictionary from reads_upa -> filename """ if reads_upas is None: raise ValueError("reads_upas must be a list of UPAs") if len(reads_upas) == 0: raise ValueError("reads_upas must contain at least one UPA") reads_info = self.ru.download_reads(({ 'read_libraries': reads_upas, 'interleaved': 'true', 'gzipped': None }))['files'] file_set = dict() for reads in reads_info: file_set[reads] = reads_info[reads]['files']['fwd'] return file_set def run_wdl(self, rf): print(os.getcwd()) wdl_files = ['jgi_assembly.wdl'] for f in wdl_files: src = self.wdl_base + f dst = './' + f shutil.copy(src, dst) ins = { "jgi_metaASM.input_file": [rf.replace(self.scratch, './')], "jgi_metaASM.rename_contig_prefix": "contig", "jgi_metaASM.outdir": "/out/" } input_file = os.path.join(self.scratch, 'inputs.json') with open(input_file, 'w') as f: f.write(json.dumps(ins)) p = {'workflow': wdl_files[0], 'inputs': 'inputs.json'} res = self.special.wdl(p) print('wdl: ' + str(res)) def _fix_path(self, orig): ind = orig.find('cromwell-executions') return os.path.join(self.scratch, orig[ind:]) def upload_assembly(self, file_path_orig, workspace_name, assembly_name): """ From a list of file paths, uploads them to KBase, generates Assembly objects, then returns the generated UPAs. """ file_path = self._fix_path(file_path_orig) if not file_path: raise ValueError("file_path must be defined") if not os.path.exists(file_path): raise ValueError( "The given assembly file '{}' does not exist".format( file_path)) if not workspace_name: raise ValueError("workspace_name must be defined") if not assembly_name: raise ValueError("assembly_name must be defined") assembly_upa = self.au.save_assembly_from_fasta({ "file": { "path": file_path }, "workspace_name": workspace_name, "assembly_name": assembly_name }) return assembly_upa def _upload_pipeline_result(self, pipeline_result, workspace_name, assembly_name, filtered_reads_name=None, cleaned_reads_name=None, skip_rqcfilter=False, input_reads=None): """ This is very tricky and uploads (optionally!) a few things under different cases. 1. Uploads assembly - this always happens after a successful run. 2. Cleaned reads - passed RQCFilter / BFC / SeqTK - optional, if cleaned_reads_name isn't None 3. Filtered reads - passed RQCFilter - optional, if filtered_reads_name isn't None AND skip_rqcfilter is False returns a dict of UPAs with the following keys: - assembly_upa - the assembly (always) - filtered_reads_upa - the RQCFiltered reads (optionally) - cleaned_reads_upa - the RQCFiltered -> BFC -> SeqTK cleaned reads (optional) """ # upload the assembly uploaded_assy_upa = self.file_util.upload_assembly( pipeline_result["spades"]["contigs_file"], workspace_name, assembly_name) upload_result = {"assembly_upa": uploaded_assy_upa} # upload filtered reads if we didn't skip RQCFilter (otherwise it's just a copy) if filtered_reads_name and not skip_rqcfilter: # unzip the cleaned reads because ReadsUtils won't do it for us. 
decompressed_reads = os.path.join(self.output_dir, "filtered_reads.fastq") pigz_command = "{} -d -c {} > {}".format( PIGZ, pipeline_result["rqcfilter"]["filtered_fastq_file"], decompressed_reads) p = subprocess.Popen(pigz_command, cwd=self.scratch_dir, shell=True) exit_code = p.wait() if exit_code != 0: raise RuntimeError( "Unable to decompress filtered reads for validation! Can't upload them, either!" ) filtered_reads_upa = self.file_util.upload_reads( decompressed_reads, workspace_name, filtered_reads_name, input_reads) upload_result["filtered_reads_upa"] = filtered_reads_upa # upload the cleaned reads if cleaned_reads_name: # unzip the cleaned reads because ReadsUtils won't do it for us. decompressed_reads = os.path.join(self.output_dir, "cleaned_reads.fastq") pigz_command = "{} -d -c {} > {}".format( PIGZ, pipeline_result["seqtk"]["cleaned_reads"], decompressed_reads) p = subprocess.Popen(pigz_command, cwd=self.scratch_dir, shell=True) exit_code = p.wait() if exit_code != 0: raise RuntimeError( "Unable to decompress cleaned reads for validation! Can't upload them, either!" ) cleaned_reads_upa = self.file_util.upload_reads( decompressed_reads, workspace_name, cleaned_reads_name, input_reads) upload_result["cleaned_reads_upa"] = cleaned_reads_upa return upload_result def assemble(self, params): self.validate_params(params) workspace_name = params['workspace_name'] assembly_name = params['output_assembly_name'] # Stage Data files = self.fetch_reads_files([params["reads_upa"]]) reads_files = list(files.values()) # Run WDL self.run_wdl(reads_files[0]) # Check if things ran mfile = os.path.join(self.scratch, 'meta.json') print(mfile) if not os.path.exists(mfile): raise OSError("Failed to run workflow") with open(mfile) as f: pipeline_output = json.loads(f.read()) out = pipeline_output["calls"]["jgi_metaASM.create_agp"][0]["outputs"] print(out) # Generate Output Objects contigs_fn = out['outcontigs'] upa = self.upload_assembly(contigs_fn, workspace_name, assembly_name) upload_kwargs = {} print("upload complete") # Do report report_info = self.report.create({ 'report': { 'objects_created': [], 'text_message': "Assemble metagenomic reads" }, 'workspace_name': workspace_name }) return { 'report_name': report_info['name'], 'report_ref': report_info['ref'], }
def run_megahit(self, ctx, params): """ :param params: instance of type "MegaHitParams" (Run MEGAHIT. Most parameters here are just passed forward to MEGAHIT workspace_name - the name of the workspace for input/output read_library_ref - the name of the PE read library (SE library support in the future) output_contig_set_name - the name of the output contigset megahit_parameter_preset - override a group of parameters; possible values: meta '--min-count 2 --k-list 21,41,61,81,99' (generic metagenomes, default) meta-sensitive '--min-count 2 --k-list 21,31,41,51,61,71,81,91,99' (more sensitive but slower) meta-large '--min-count 2 --k-list 27,37,47,57,67,77,87' (large & complex metagenomes, like soil) bulk '--min-count 3 --k-list 31,51,71,91,99 --no-mercy' (experimental, standard bulk sequencing with >= 30x depth) single-cell '--min-count 3 --k-list 21,33,55,77,99,121 --merge_level 20,0.96' (experimental, single cell data) min_count - minimum multiplicity for filtering (k_min+1)-mers, default 2 k_min - minimum kmer size (<= 255), must be odd number, defaults to 21 k_max - maximum kmer size (<= 255), must be odd number, defaults to 141 k_step - increment of kmer size of each iteration (<= 28), must be even number, defaults to 10 k_list - list of kmer sizes (all must be odd, in the range 15-255, increment <= 28); override using `--k-min', `--k-max' and `--k-step' min_contig_length - minimum length of contigs to output, default is 2000 max_mem_percent - maximum memory to make available to MEGAHIT, as a percentage of available system memory (optional, default = 0.9 or 90%) @optional megahit_parameter_preset @optional min_count @optional k_min @optional k_max @optional k_step @optional k_list @optional min_contig_length @optional max_mem_percent) -> structure: parameter "workspace_name" of String, parameter "read_library_ref" of String, parameter "output_contigset_name" of String, parameter "megahit_parameter_preset" of String, parameter "min_count" of Long, parameter "k_min" of Long, parameter "k_max" of Long, parameter "k_step" of Long, parameter "k_list" of list of Long, parameter "min_contig_length" of Long, parameter "max_mem_percent" of Double :returns: instance of type "MegaHitOutput" -> structure: parameter "report_name" of String, parameter "report_ref" of String """ # ctx is the context object # return variables are: output #BEGIN run_megahit print('Running run_megahit with params=') pprint(params) # STEP 1: basic parameter checks + parsing if 'workspace_name' not in params: raise ValueError('workspace_name parameter is required') if 'read_library_ref' not in params: raise ValueError('read_library_ref parameter is required') if 'output_contigset_name' not in params: raise ValueError('output_contigset_name parameter is required') # STEP 2: get the read library as deinterleaved fastq files input_ref = params['read_library_ref'] reads_params = {'read_libraries': [input_ref], 'interleaved': 'false', 'gzipped': None } ru = ReadsUtils(self.callbackURL) reads = ru.download_reads(reads_params)['files'] print('Input reads files:') fwd = reads[input_ref]['files']['fwd'] rev = reads[input_ref]['files']['rev'] pprint('forward: ' + fwd) pprint('reverse: ' + rev) # STEP 3: run megahit # construct the command megahit_cmd = [self.MEGAHIT] # we only support PE reads, so add that megahit_cmd.append('-1') megahit_cmd.append(fwd) megahit_cmd.append('-2') megahit_cmd.append(rev) # if a preset is defined, use that: if 'megahit_parameter_preset' in params: if params['megahit_parameter_preset']: 
megahit_cmd.append('--presets') megahit_cmd.append(params['megahit_parameter_preset']) if 'min_count' in params: if params['min_count']: megahit_cmd.append('--min-count') megahit_cmd.append(str(params['min_count'])) if 'k_min' in params: if params['k_min']: megahit_cmd.append('--k-min') megahit_cmd.append(str(params['k_min'])) if 'k_max' in params: if params['k_max']: megahit_cmd.append('--k-max') megahit_cmd.append(str(params['k_max'])) if 'k_step' in params: if params['k_step']: megahit_cmd.append('--k-step') megahit_cmd.append(str(params['k_step'])) if 'k_list' in params: if params['k_list']: k_list = [] for k_val in params['k_list']: k_list.append(str(k_val)) megahit_cmd.append('--k-list') megahit_cmd.append(','.join(k_list)) min_contig_length = self.DEFAULT_MIN_CONTIG_LENGTH if 'min_contig_length' in params: if params['min_contig_length']: if str(params['min_contig_length']).isdigit(): min_contig_length = params['min_contig_length'] else: raise ValueError('min_contig_length parameter must be a non-negative integer') megahit_cmd.append('--min-contig-len') megahit_cmd.append(str(min_contig_length)) # Set the number of CPUs to the number of cores minus 1 megahit_cmd.append('--num-cpu-threads') megahit_cmd.append(str(max([(multiprocessing.cpu_count() - 1), 1]))) # set mem usage # Note: this just sets the default value - 90% of available system memory allocated # to the container. Exposing it here as a place to later expose as a parameter. max_mem_percent = params.get('max_mem_percent', 0.9) megahit_cmd.append('-m') megahit_cmd.append(str(max_mem_percent)) # set the output location timestamp = int((datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds() * 1000) output_dir = os.path.join(self.scratch, 'output.' + str(timestamp)) megahit_cmd.append('-o') megahit_cmd.append(output_dir) # run megahit print('running megahit:') print(' ' + ' '.join(megahit_cmd)) p = subprocess.Popen(megahit_cmd, cwd=self.scratch, shell=False) retcode = p.wait() print('Return code: ' + str(retcode)) if p.returncode != 0: error_str = report_megahit_error(output_dir, retcode) raise RuntimeError(error_str) output_contigs = os.path.join(output_dir, 'final.contigs.fa') # on macs, we cannot run megahit in the shared host scratch space, so we need to move the file there if self.mac_mode: shutil.move(output_contigs, os.path.join(self.host_scratch, 'final.contigs.fa')) output_contigs = os.path.join(self.host_scratch, 'final.contigs.fa') # STEP 4: save the resulting assembly assemblyUtil = AssemblyUtil(self.callbackURL) output_data_ref = assemblyUtil.save_assembly_from_fasta({ 'file': {'path': output_contigs}, 'workspace_name': params['workspace_name'], 'assembly_name': params['output_contigset_name'] }) # STEP 5: generate and save the report # compute a simple contig length distribution for the report lengths = [] for seq_record in SeqIO.parse(output_contigs, 'fasta'): lengths.append(len(seq_record.seq)) report = '' report += 'ContigSet saved to: ' + params['workspace_name'] + '/' + params['output_contigset_name'] + '\n' report += 'Assembled into ' + str(len(lengths)) + ' contigs.\n' report += 'Avg Length: ' + str(sum(lengths) / float(len(lengths))) + ' bp.\n' bins = 10 counts, edges = np.histogram(lengths, bins) report += 'Contig Length Distribution (# of contigs -- min to max basepairs):\n' for c in range(bins): report += ' ' + str(counts[c]) + '\t--\t' + str(edges[c]) + ' to ' + str(edges[c + 1]) + ' bp\n' print('Running QUAST') kbq = kb_quast(self.callbackURL) try: quastret = kbq.run_QUAST({'files': 
[{'path': output_contigs, 'label': params['output_contigset_name']}]}) except ServerError as qe: # not really any way to test this, all inputs have been checked earlier and should be # ok print('Logging exception from running QUAST') print(str(qe)) # TODO delete shock node raise print('Saving report') kbr = KBaseReport(self.callbackURL) try: report_info = kbr.create_extended_report( {'message': report, 'objects_created': [{'ref': output_data_ref, 'description': 'Assembled contigs'}], 'direct_html_link_index': 0, 'html_links': [{'shock_id': quastret['shock_id'], 'name': 'report.html', 'label': 'QUAST report'} ], 'report_object_name': 'kb_megahit_report_' + str(uuid.uuid4()), 'workspace_name': params['workspace_name'] }) except ServerError as re: # not really any way to test this, all inputs have been checked earlier and should be # ok print('Logging exception from creating report object') print(str(re)) # TODO delete shock node raise # STEP 6: contruct the output to send back output = {'report_name': report_info['name'], 'report_ref': report_info['ref']} #END run_megahit # At some point might do deeper type checking... if not isinstance(output, dict): raise ValueError('Method run_megahit return value ' + 'output is not type dict as required.') # return the results return [output]
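# Example params dict for run_megahit, drawn from the parameter docstring above; the
# workspace name and reads reference are placeholders:
megahit_params = {
    'workspace_name': 'my_workspace',
    'read_library_ref': '12345/6/7',               # paired-end library
    'output_contigset_name': 'megahit.contigs',
    'megahit_parameter_preset': 'meta-sensitive',  # optional; overrides min_count/k_list
    'min_contig_length': 2000,
    'max_mem_percent': 0.9,
}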
def run_FamaReadProfiling(self, ctx, params): """ Run metagenome functional profiling module of Fama. :param params: instance of type "FamaReadProfilingParams" (Parameters for metagenome functional profiling. workspace_name - the name of the workspace for input/output read_library_refs - references to the name of the PE read library or SE read library ref_dataset - the name of Fama reference dataset is_paired_end - 1 for paired-end library, 0 for single-end library output_functional_profile_name - the name of the output functional profile output_read_library_ref - the name of the output filtered PE or SE read library) -> structure: parameter "workspace_name" of String, parameter "read_library_refs" of list of String, parameter "ref_dataset" of String, parameter "is_paired_end" of type "bool" (A boolean - 0 for false, 1 for true. @range (0, 1)), parameter "output_functional_profile_name" of String, parameter "output_read_library_name" of String :returns: instance of type "ReportResults" (Output report parameters report_name - the name of the report object report_ref - the reference to the report object) -> structure: parameter "report_name" of String, parameter "report_ref" of String """ # ctx is the context object # return variables are: output #BEGIN run_FamaReadProfiling # Import Read Library and save as two paired-end FASTQ files input_refs = params['read_library_refs'] fama_reference = params['ref_dataset'] ws_client = Workspace(self.ws_url) ru = ReadsUtils(self.callback_url) ret = ws_client.get_object_info3( {'objects': [{ 'ref': ref } for ref in input_refs]}) name2ref = {} input_reads = {} for input_ref in input_refs: ret = ws_client.get_object_info3({'objects': [{'ref': input_ref}]}) obj_name = ret['infos'][0][1] name2ref[obj_name] = input_ref reads_params = { 'read_libraries': [input_ref], 'interleaved': 'false', 'gzipped': None } reads = ru.download_reads(reads_params)['files'] print('Input reads files downloaded:') print(reads) fwd_reads_file = reads[input_ref]['files']['fwd'] rev_reads_file = reads[input_ref]['files']['rev'] print('forward: ' + str(fwd_reads_file)) print('reverse: ' + str(rev_reads_file)) input_reads[obj_name] = {} input_reads[obj_name]['fwd'] = fwd_reads_file input_reads[obj_name]['rev'] = rev_reads_file fama_params = { 'input_reads': input_reads, 'work_dir': self.shared_folder, 'reference': fama_reference, 'is_paired_end': params['is_paired_end'], 'name2ref': name2ref, 'ws_name': params['workspace_name'], 'ws_client': ws_client, 'output_read_library_name': params['output_read_library_name'], 'output_functional_profile_name': params['output_functional_profile_name'], 'input_read_refs': params['read_library_refs'] } # Run Fama fama_output = functional_profiling_pipeline(fama_params) # Write filtered reads to workspace reads_params = { 'fwd_file': fama_output['fwd_reads'], 'sequencing_tech': reads[input_ref]['sequencing_tech'], 'single_genome': '0', 'wsname': params['workspace_name'], 'name': params['output_read_library_name'] } if 'rev_reads' in fama_output: reads_params['rev_file'] = fama_output['rev_reads'] reads_params['interleaved'] = '0' ru_ret = ru.upload_reads(reads_params) print('reads_params', reads_params) print('ru_ret', ru_ret) output_reads_ref = ru_ret['obj_ref'] # Write HTML output to workspace message = 'Fama functional profiling finished successfully' dfu = DataFileUtil(self.callback_url) try: dfu_output = dfu.file_to_shock( {'file_path': fama_output['html_report']}) except ServerError as dfue: # not really any way to test this block 
self.log('Logging exception loading results to shock') self.log(str(dfue)) raise html_links = [{ 'shock_id': dfu_output['shock_id'], 'description': 'HTML report for Fama App', 'name': 'fama_report.html', 'label': 'Fama_report' }] for krona_file in fama_output['krona_charts']: try: dfu_output = dfu.file_to_shock({'file_path': krona_file}) html_links.append({ 'shock_id': dfu_output['shock_id'], 'description': 'Krona chart for function taxonomy profile', 'name': fama_output['krona_charts'][krona_file][0], 'label': fama_output['krona_charts'][krona_file][1] }) except ServerError as dfue: # not really any way to test this block self.log('Logging exception loading results to shock') self.log(str(dfue)) raise self.log('Krona chart saved: ' + str(dfu_output)) # Save report report_params = { 'message': message, 'objects_created': [{ 'ref': output_reads_ref, 'description': 'Filtered Read Library' }, { 'ref': fama_output['trait_matrix_ref'], 'description': 'Raw counts matrix' }, { 'ref': fama_output['functional_profile_ref'], 'description': 'Functional profile' }], 'direct_html_link_index': 0, 'html_links': html_links, 'file_links': fama_output['report_files'], 'report_object_name': 'fama_profiling_report_' + str(uuid.uuid4()), 'workspace_name': params['workspace_name'], 'html_window_height': 460 } try: report = KBaseReport(self.callback_url) report_info = report.create_extended_report(report_params) except ServerError as kre: # not really any way to test this block self.log('Logging exception saving report') self.log(str(kre)) raise report_info['report_params'] = report_params self.log(str(report_info)) output = { 'report_name': report_info['name'], 'report_ref': report_info['ref'] } #END run_FamaReadProfiling # At some point might do deeper type checking... if not isinstance(output, dict): raise ValueError('Method run_FamaReadProfiling return value ' + 'output is not type dict as required.') # return the results return [output]
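# Example params dict for run_FamaReadProfiling, drawn from the parameter docstring above;
# the workspace name, library reference, and reference dataset name are placeholders:
fama_params_example = {
    'workspace_name': 'my_workspace',
    'read_library_refs': ['12345/6/7'],
    'ref_dataset': 'nitrogen',            # placeholder Fama reference dataset name
    'is_paired_end': 1,
    'output_functional_profile_name': 'fama_profile',
    'output_read_library_name': 'filtered_reads',
}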
class CocacolaUtil: CONCOCT_BASE_PATH = '/kb/deployment/bin/CONCOCT' COCACOLA_BASE_PATH = '/kb/module/lib/kb_cocacola/bin/COCACOLA-python' BINNER_RESULT_DIRECTORY = 'cocacola_output_dir' BINNER_BIN_RESULT_DIR = 'final_bins' MAPPING_THREADS = 16 BBMAP_MEM = '30g' def __init__(self, config): self.callback_url = os.environ['SDK_CALLBACK_URL'] self.scratch = config['scratch'] self.shock_url = config['shock-url'] self.ws_url = config['workspace-url'] self.dfu = DataFileUtil(self.callback_url) self.ru = ReadsUtils(self.callback_url) self.au = AssemblyUtil(self.callback_url) self.mgu = MetagenomeUtils(self.callback_url) def _validate_run_cocacola_params(self, task_params): """ _validate_run_cocacola_params: validates params passed to run_cocacola method """ log('Start validating run_cocacola params') # check for required parameters for p in ['assembly_ref', 'binned_contig_name', 'workspace_name', 'reads_list', 'read_mapping_tool']: if p not in task_params: raise ValueError('"{}" parameter is required, but missing'.format(p)) def _mkdir_p(self, path): """ _mkdir_p: make directory for given path """ if not path: return try: os.makedirs(path) except OSError as exc: if exc.errno == errno.EEXIST and os.path.isdir(path): pass else: raise def _run_command(self, command): """ _run_command: run command and print result """ os.chdir(self.scratch) log('Start executing command:\n{}'.format(command)) log('Command is running from:\n{}'.format(self.scratch)) pipe = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True) output, stderr = pipe.communicate() exitCode = pipe.returncode if (exitCode == 0): log('Executed command:\n{}\n'.format(command) + 'Exit Code: {}\n'.format(exitCode)) else: error_msg = 'Error running command:\n{}\n'.format(command) error_msg += 'Exit Code: {}\nOutput:\n{}\nStderr:\n{}'.format(exitCode, output, stderr) raise ValueError(error_msg) sys.exit(1) return (output, stderr) # this function has been customized to return read_type variable (interleaved vs single-end library) def stage_reads_list_file(self, reads_list): """ stage_reads_list_file: download fastq file associated to reads to scratch area and return result_file_path """ log('Processing reads object list: {}'.format(reads_list)) result_file_path = [] read_type = [] # getting from workspace and writing to scratch. The 'reads' dictionary now has file paths to scratch. reads = self.ru.download_reads({'read_libraries': reads_list, 'interleaved': None})['files'] # reads_list is the list of file paths on workspace? (i.e. 12804/1/1). # "reads" is the hash of hashes where key is "12804/1/1" or in this case, read_obj and # "files" is the secondary key. The tertiary keys are "fwd" and "rev", as well as others. for read_obj in reads_list: files = reads[read_obj]['files'] # 'files' is dictionary where 'fwd' is key of file path on scratch. 
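# Hedged illustration (comments only): the 'reads' value returned by
# ReadsUtils.download_reads() above has roughly this shape; the ref key matches the
# example in the comments above, while the file path is a hypothetical scratch path.
# reads = {
#     '12804/1/1': {
#         'files': {
#             'fwd': '/kb/module/work/tmp/xxxx.inter.fastq',  # forward or interleaved FASTQ on scratch
#             'rev': None,                                    # populated only for non-interleaved paired data
#             'type': 'interleaved'                           # 'interleaved', 'paired', or 'single'
#         },
#         'sequencing_tech': 'Illumina'
#     }
# }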
result_file_path.append(files['fwd']) read_type.append(files['type']) if 'rev' in files and files['rev'] is not None: result_file_path.append(files['rev']) return result_file_path, read_type def _get_contig_file(self, assembly_ref): """ _get_contig_file: get contig file from GenomeAssembly object """ contig_file = self.au.get_assembly_as_fasta({'ref': assembly_ref}).get('path') sys.stdout.flush() contig_file = self.dfu.unpack_file({'file_path': contig_file})['file_path'] return contig_file def retrieve_and_clean_assembly(self, task_params): if os.path.exists(task_params['contig_file_path']): assembly = task_params['contig_file_path'] print("FOUND ASSEMBLY ON LOCAL SCRATCH") else: # we are on njsw so lets copy it over to scratch assembly = self._get_contig_file(task_params['assembly_ref']) # remove spaces from fasta headers because that breaks bedtools assembly_clean = os.path.abspath(assembly).split('.fa')[0] + "_clean.fa" command = '/bin/bash reformat.sh in={} out={} addunderscore overwrite=true'.format(assembly, assembly_clean) log('running reformat command: {}'.format(command)) out, err = self._run_command(command) return assembly_clean def fasta_filter_contigs_generator(self, fasta_record_iter, min_contig_length): """ generates SeqRecords iterator for writing from a legacy contigset object """ rows = 0 rows_added = 0 for record in fasta_record_iter: rows += 1 if len(record.seq) >= min_contig_length: rows_added += 1 yield record def filter_contigs_by_length(self, fasta_file_path, min_contig_length): """ removes all contigs less than the min_contig_length provided """ filtered_fasta_file_path = os.path.abspath(fasta_file_path).split('.fa')[0] + "_filtered.fa" fasta_record_iter = SeqIO.parse(fasta_file_path, 'fasta') SeqIO.write(self.fasta_filter_contigs_generator(fasta_record_iter, min_contig_length), filtered_fasta_file_path, 'fasta') return filtered_fasta_file_path def generate_stats_for_genome_bins(self, task_params, genome_bin_fna_file, bbstats_output_file): """ generate_command: bbtools stats.sh command """ log("running generate_stats_for_genome_bins on {}".format(genome_bin_fna_file)) genome_bin_fna_file = os.path.join(self.scratch, self.BINNER_RESULT_DIRECTORY, genome_bin_fna_file) command = '/bin/bash stats.sh in={} format=3 > {}'.format(genome_bin_fna_file, bbstats_output_file) self._run_command(command) bbstats_output = open(bbstats_output_file, 'r').readlines()[1] n_scaffolds = bbstats_output.split('\t')[0] n_contigs = bbstats_output.split('\t')[1] scaf_bp = bbstats_output.split('\t')[2] contig_bp = bbstats_output.split('\t')[3] gap_pct = bbstats_output.split('\t')[4] scaf_N50 = bbstats_output.split('\t')[5] scaf_L50 = bbstats_output.split('\t')[6] ctg_N50 = bbstats_output.split('\t')[7] ctg_L50 = bbstats_output.split('\t')[8] scaf_N90 = bbstats_output.split('\t')[9] scaf_L90 = bbstats_output.split('\t')[10] ctg_N90 = bbstats_output.split('\t')[11] ctg_L90 = bbstats_output.split('\t')[12] scaf_max = bbstats_output.split('\t')[13] ctg_max = bbstats_output.split('\t')[14] scaf_n_gt50K = bbstats_output.split('\t')[15] scaf_pct_gt50K = bbstats_output.split('\t')[16] gc_avg = float(bbstats_output.split('\t')[17]) * 100 # need to figure out if correct gc_std = float(bbstats_output.split('\t')[18]) * 100 # need to figure out if correct log('Generated generate_stats_for_genome_bins command: {}'.format(command)) return {'n_scaffolds': n_scaffolds, 'n_contigs': n_contigs, 'scaf_bp': scaf_bp, 'contig_bp': contig_bp, 'gap_pct': gap_pct, 'scaf_N50': scaf_N50, 'scaf_L50': scaf_L50, 
'ctg_N50': ctg_N50, 'ctg_L50': ctg_L50, 'scaf_N90': scaf_N90, 'scaf_L90': scaf_L90, 'ctg_N90': ctg_N90, 'ctg_L90': ctg_L90, 'scaf_max': scaf_max, 'ctg_max': ctg_max, 'scaf_n_gt50K': scaf_n_gt50K, 'scaf_pct_gt50K': scaf_pct_gt50K, 'gc_avg': gc_avg, 'gc_std': gc_std } def deinterlace_raw_reads(self, fastq): fastq_forward = fastq.split('.fastq')[0] + "_forward.fastq" fastq_reverse = fastq.split('.fastq')[0] + "_reverse.fastq" command = 'reformat.sh in={} out1={} out2={} overwrite=true'.format(fastq, fastq_forward, fastq_reverse) self._run_command(command) return (fastq_forward, fastq_reverse) def run_read_mapping_interleaved_pairs_mode(self, task_params, assembly_clean, fastq, sam): read_mapping_tool = task_params['read_mapping_tool'] log("running {} mapping in interleaved mode.".format(read_mapping_tool)) if task_params['read_mapping_tool'] == 'bbmap': command = 'bbmap.sh -Xmx{} '.format(self.BBMAP_MEM) command += 'threads={} '.format(self.MAPPING_THREADS) command += 'ref={} '.format(assembly_clean) command += 'in={} '.format(fastq) command += 'out={} '.format(sam) command += 'fast interleaved=true mappedonly nodisk overwrite' elif task_params['read_mapping_tool'] == 'bwa': (fastq_forward, fastq_reverse) = self.deinterlace_raw_reads(fastq) command = 'bwa index {} && '.format(assembly_clean) command += 'bwa mem -t {} '.format(self.MAPPING_THREADS) command += '{} '.format(assembly_clean) command += '{} '.format(fastq_forward) command += '{} > '.format(fastq_reverse) command += '{}'.format(sam) elif task_params['read_mapping_tool'] == 'bowtie2_default': (fastq_forward, fastq_reverse) = self.deinterlace_raw_reads(fastq) bt2index = os.path.basename(assembly_clean) + '.bt2' command = 'bowtie2-build -f {} '.format(assembly_clean) command += '--threads {} '.format(self.MAPPING_THREADS) command += '{} && '.format(bt2index) command += 'bowtie2 -x {} '.format(bt2index) command += '-1 {} '.format(fastq_forward) command += '-2 {} '.format(fastq_reverse) command += '--threads {} '.format(self.MAPPING_THREADS) command += '-S {}'.format(sam) elif task_params['read_mapping_tool'] == 'bowtie2_very_sensitive': (fastq_forward, fastq_reverse) = self.deinterlace_raw_reads(fastq) bt2index = os.path.basename(assembly_clean) + '.bt2' command = 'bowtie2-build -f {} '.format(assembly_clean) command += '--threads {} '.format(self.MAPPING_THREADS) command += '{} && '.format(bt2index) command += 'bowtie2 --very-sensitive -x {} '.format(bt2index) command += '-1 {} '.format(fastq_forward) command += '-2 {} '.format(fastq_reverse) command += '--threads {} '.format(self.MAPPING_THREADS) command += '-S {}'.format(sam) elif task_params['read_mapping_tool'] == 'minimap2': (fastq_forward, fastq_reverse) = self.deinterlace_raw_reads(fastq) command = 'minimap2 -ax sr -t {} '.format(self.MAPPING_THREADS) command += '{} '.format(assembly_clean) command += '{} '.format(fastq_forward) command += '{} > '.format(fastq_reverse) command += '{}'.format(sam) elif task_params['read_mapping_tool'] == 'hisat2': (fastq_forward, fastq_reverse) = self.deinterlace_raw_reads(fastq) ht2index = os.path.basename(assembly_clean) + '.ht2' command = 'hisat2-build {} '.format(assembly_clean) command += '{} && '.format(ht2index) command += 'hisat2 -x {} '.format(ht2index) command += '-1 {} '.format(fastq_forward) command += '-2 {} '.format(fastq_reverse) command += '-S {} '.format(sam) command += '--threads {}'.format(self.MAPPING_THREADS) log('running alignment command: {}'.format(command)) out, err = self._run_command(command) def 
run_read_mapping_unpaired_mode(self, task_params, assembly_clean, fastq, sam): read_mapping_tool = task_params['read_mapping_tool'] log("running {} mapping in single-end (unpaired) mode.".format(read_mapping_tool)) if task_params['read_mapping_tool'] == 'bbmap': command = 'bbmap.sh -Xmx{} '.format(self.BBMAP_MEM) command += 'threads={} '.format(self.MAPPING_THREADS) command += 'ref={} '.format(assembly_clean) command += 'in={} '.format(fastq) command += 'out={} '.format(sam) command += 'fast interleaved=false mappedonly nodisk overwrite' # BBMap is deterministic without the deterministic flag if using single-ended reads elif task_params['read_mapping_tool'] == 'bwa': command = 'bwa index {} && '.format(assembly_clean) command += 'bwa mem -t {} '.format(self.MAPPING_THREADS) command += '{} '.format(assembly_clean) command += '{} > '.format(fastq) command += '{}'.format(sam) elif task_params['read_mapping_tool'] == 'bowtie2_default': bt2index = os.path.basename(assembly_clean) + '.bt2' command = 'bowtie2-build -f {} '.format(assembly_clean) command += '--threads {} '.format(self.MAPPING_THREADS) command += '{} && '.format(bt2index) command += 'bowtie2 -x {} '.format(bt2index) command += '-U {} '.format(fastq) command += '--threads {} '.format(self.MAPPING_THREADS) command += '-S {}'.format(sam) elif task_params['read_mapping_tool'] == 'bowtie2_very_sensitive': bt2index = os.path.basename(assembly_clean) + '.bt2' command = 'bowtie2-build -f {} '.format(assembly_clean) command += '--threads {} '.format(self.MAPPING_THREADS) command += '{} && '.format(bt2index) command += 'bowtie2 --very-sensitive -x {} '.format(bt2index) command += '-U {} '.format(fastq) command += '--threads {} '.format(self.MAPPING_THREADS) command += '-S {}'.format(sam) elif task_params['read_mapping_tool'] == 'minimap2': command = 'minimap2 -ax sr -t {} '.format(self.MAPPING_THREADS) command += '{} '.format(assembly_clean) command += '{} > '.format(fastq) command += '{}'.format(sam) elif task_params['read_mapping_tool'] == 'hisat2': ht2index = os.path.basename(assembly_clean) + '.ht2' command = 'hisat2-build {} '.format(assembly_clean) command += '{} && '.format(ht2index) command += 'hisat2 -x {} '.format(ht2index) command += '-U {} '.format(fastq) command += '-S {} '.format(sam) command += '--threads {}'.format(self.MAPPING_THREADS) log('running alignment command: {}'.format(command)) out, err = self._run_command(command) def convert_sam_to_sorted_and_indexed_bam(self, sam): # create bam files from sam files sorted_bam = os.path.abspath(sam).split('.sam')[0] + "_sorted.bam" command = 'samtools view -F 0x04 -uS {} | '.format(sam) command += 'samtools sort - -o {}'.format(sorted_bam) log('running samtools command to generate sorted bam: {}'.format(command)) self._run_command(command) # verify we got bams if not os.path.exists(sorted_bam): log('Failed to find bam file\n{}'.format(sorted_bam)) sys.exit(1) elif(os.stat(sorted_bam).st_size == 0): log('Bam file is empty\n{}'.format(sorted_bam)) sys.exit(1) # index the bam file command = 'samtools index {}'.format(sorted_bam) log('running samtools command to index sorted bam: {}'.format(command)) self._run_command(command) return sorted_bam def generate_alignment_bams(self, task_params, assembly_clean): """ This function runs the selected read mapper and creates the sorted and indexed bam files from sam files using samtools. 
""" reads_list = task_params['reads_list'] (read_scratch_path, read_type) = self.stage_reads_list_file(reads_list) sorted_bam_file_list = [] # list of reads files, can be 1 or more. assuming reads are either type unpaired or interleaved # will not handle unpaired forward and reverse reads input as seperate (non-interleaved) files for i in range(len(read_scratch_path)): fastq = read_scratch_path[i] fastq_type = read_type[i] sam = os.path.basename(fastq).split('.fastq')[0] + ".sam" sam = os.path.join(self.BINNER_RESULT_DIRECTORY, sam) if fastq_type == 'interleaved': # make sure working - needs tests log("Running interleaved read mapping mode") self.run_read_mapping_interleaved_pairs_mode(task_params, assembly_clean, fastq, sam) else: # running read mapping in single-end mode log("Running unpaired read mapping mode") self.run_read_mapping_unpaired_mode(task_params, assembly_clean, fastq, sam) sorted_bam = self.convert_sam_to_sorted_and_indexed_bam(sam) sorted_bam_file_list.append(sorted_bam) return sorted_bam_file_list def generate_make_coverage_table_command(self, task_params, sorted_bam_file_list): # create the depth file for this bam # min_contig_length = task_params['min_contig_length'] sorted_bam = task_params['sorted_bam'] depth_file_path = os.path.join(self.scratch, str('cocacola_depth.txt')) command = '/kb/module/lib/kb_cocacola/bin/jgi_summarize_bam_contig_depths ' command += '--outputDepth {} '.format(depth_file_path) command += '--minContigLength {} '.format(min_contig_length) command += '--minContigDepth 1 {}'.format(sorted_bam) log('running summarize_bam_contig_depths command: {}'.format(command)) self._run_command(command) return depth_file_path def generate_cocacola_cut_up_fasta_command(self, task_params): """ generate_command: cocacola cut_up_fasta """ contig_file_path = task_params['contig_file_path'] contig_split_size = task_params['contig_split_size'] contig_split_overlap = task_params['contig_split_overlap'] log("\n\nRunning generate_cocacola_cut_up_fasta_command") command = 'python {}/scripts/cut_up_fasta.py '.format(self.CONCOCT_BASE_PATH) command += '{} '.format(contig_file_path) command += '-c {} '.format(contig_split_size) command += '-o {} '.format(contig_split_overlap) command += '--merge_last -b temp.bed > {}/split_contigs.fa'.format(self.BINNER_RESULT_DIRECTORY) log('Generated cocacola_cut_up_fasta command: {}'.format(command)) self._run_command(command) def generate_cocacola_input_table_from_bam(self, task_params): """ generate_command: cocacola generate input table """ log("\n\nRunning generate_cocacola_input_table_from_bam") command = 'python {}/scripts/gen_input_table.py '.format(self.CONCOCT_BASE_PATH) command += '{}/split_contigs.fa '.format(self.BINNER_RESULT_DIRECTORY) command += '{}/*_sorted.bam > '.format(self.BINNER_RESULT_DIRECTORY) command += '{}/coverage_table.tsv'.format(self.BINNER_RESULT_DIRECTORY) log('Generated cocacola generate input table from bam command: {}'.format(command)) calc_contigs = 0 for line in open('{}/split_contigs.fa'.format(self.BINNER_RESULT_DIRECTORY)): if line.startswith(">"): calc_contigs += 1 task_params['calc_contigs'] = calc_contigs self._run_command(command) def generate_cocacola_kmer_composition_table(self, task_params): """ generate_command: cocacola generate kmer composition table """ log("\n\nRunning generate_cocacola_kmer_composition_table") calc_contigs = task_params['calc_contigs'] kmer_size = task_params['kmer_size'] command = 'python {}/scripts/fasta_to_features.py '.format(self.CONCOCT_BASE_PATH) command += 
'{}/split_contigs.fa '.format(self.BINNER_RESULT_DIRECTORY) command += '{} '.format(calc_contigs) command += '{} '.format(kmer_size) command += '{}/split_contigs_kmer_{}.csv'.format(self.BINNER_RESULT_DIRECTORY, kmer_size) log('Generated cocacola generate input table from bam command: {}'.format(command)) self._run_command(command) def generate_cocacola_command(self, task_params): """ generate_command: cocacola """ min_contig_length = task_params['min_contig_length'] kmer_size = task_params['kmer_size'] log("\n\nRunning generate_cocacola_command") command = 'python {}/cocacola.py '.format(self.COCACOLA_BASE_PATH) command += '--contig_file {}/split_contigs.fa '.format(self.BINNER_RESULT_DIRECTORY) command += '--abundance_profiles {}/coverage_table.tsv '.format(self.BINNER_RESULT_DIRECTORY) command += '--composition_profiles {}/split_contigs_kmer_{}.csv '.format(self.BINNER_RESULT_DIRECTORY, kmer_size) command += '--output {}/cocacola_output_clusters_min{}.csv'.format(self.BINNER_RESULT_DIRECTORY, min_contig_length) log('Generated cocacola command: {}'.format(command)) self._run_command(command) def add_header_to_post_clustering_file(self, task_params): min_contig_length = task_params['min_contig_length'] header = "contig_id,cluster_id" with open('{}/cocacola_output_clusters_min{}_headers.csv'.format(self.BINNER_RESULT_DIRECTORY, min_contig_length), 'w') as outfile: outfile.write(header) with open('{}/cocacola_output_clusters_min{}.csv'.format(self.BINNER_RESULT_DIRECTORY, min_contig_length), 'r') as datafile: for line in datafile: outfile.write(line) def generate_cocacola_post_clustering_merging_command(self, task_params): """ generate_command: cocacola post cluster merging """ min_contig_length = task_params['min_contig_length'] log("\n\nRunning generate_cocacola_post_clustering_merging_command") command = 'python {}/scripts/merge_cutup_clustering.py '.format(self.CONCOCT_BASE_PATH) command += '{}/cocacola_output_clusters_min{}_headers.csv > '.format(self.BINNER_RESULT_DIRECTORY, min_contig_length) command += '{}/clustering_merged_min{}.csv'.format(self.BINNER_RESULT_DIRECTORY, min_contig_length) log('Generated generate_cocacola_post_clustering_merging command: {}'.format(command)) self._run_command(command) def generate_cocacola_extract_fasta_bins_command(self, task_params): """ generate_command: cocacola extract_fasta_bins """ log("\n\nRunning generate_cocacola_extract_fasta_bins_command") contig_file_path = task_params['contig_file_path'] min_contig_length = task_params['min_contig_length'] bin_result_directory = self.BINNER_RESULT_DIRECTORY + '/' + self.BINNER_BIN_RESULT_DIR self._mkdir_p(bin_result_directory) command = 'python {}/scripts/extract_fasta_bins.py '.format(self.CONCOCT_BASE_PATH) command += '{} '.format(contig_file_path) command += '{}/clustering_merged_min{}.csv '.format(self.BINNER_RESULT_DIRECTORY, min_contig_length) command += '--output_path {}/{}'.format(self.BINNER_RESULT_DIRECTORY, self.BINNER_BIN_RESULT_DIR) log('Generated generate_cocacola_extract_fasta_bins_command command: {}'.format(command)) self._run_command(command) def rename_and_standardize_bin_names(self, task_params): """ generate_command: generate renamed bins """ log("\n\nRunning rename_and_standardize_bin_names") path_to_cocacola_result_bins = os.path.abspath(self.BINNER_RESULT_DIRECTORY) + \ '/' + self.BINNER_BIN_RESULT_DIR + '/' for dirname, subdirs, files in os.walk(path_to_cocacola_result_bins): for file in files: if file.endswith('.fa'): os.rename(os.path.abspath(path_to_cocacola_result_bins) + 
'/' + file, os.path.abspath(path_to_cocacola_result_bins) + '/bin.' + file.split('.fa')[0].zfill(3) + '.fasta') # need to change to 4 digits def make_binned_contig_summary_file_for_binning_apps(self, task_params): """ generate_command: generate binned contig summary command """ log("\n\nRunning make_binned_contig_summary_file_for_binning_apps") path_to_cocacola_result = os.path.abspath(self.BINNER_RESULT_DIRECTORY) path_to_cocacola_result_bins = '{}/{}/'.format(path_to_cocacola_result, self.BINNER_BIN_RESULT_DIR) path_to_summary_file = path_to_cocacola_result_bins + 'binned_contig.summary' with open(path_to_summary_file, 'w+') as f: f.write("Bin name\tCompleteness\tGenome size\tGC content\n") for dirname, subdirs, files in os.walk(path_to_cocacola_result_bins): for file in files: if file.endswith('.fasta'): genome_bin_fna_file = os.path.join(self.BINNER_BIN_RESULT_DIR, file) bbstats_output_file = os.path.join(self.scratch, self.BINNER_RESULT_DIRECTORY, genome_bin_fna_file).split('.fasta')[0] + ".bbstatsout" bbstats_output = self.generate_stats_for_genome_bins(task_params, genome_bin_fna_file, bbstats_output_file) f.write('{}\t0\t{}\t{}\n'.format(genome_bin_fna_file.split("/")[-1], bbstats_output['contig_bp'], bbstats_output['gc_avg'])) f.close() log('Finished make_binned_contig_summary_file_for_binning_apps function') def generate_output_file_list(self, result_directory): """ generate_output_file_list: zip result files and generate file_links for report """ log('Start packing result files') output_files = list() output_directory = os.path.join(self.scratch, str(uuid.uuid4())) self._mkdir_p(output_directory) result_file = os.path.join(output_directory, 'cocacola_result.zip') with zipfile.ZipFile(result_file, 'w', zipfile.ZIP_DEFLATED, allowZip64=True) as zip_file: for dirname, subdirs, files in os.walk(result_directory): for file in files: if (file.endswith('.sam') or file.endswith('.bam') or file.endswith('.bai') or file.endswith('.summary')): continue if (dirname.endswith(self.BINNER_BIN_RESULT_DIR)): continue zip_file.write(os.path.join(dirname, file), file) if (dirname.endswith(self.BINNER_BIN_RESULT_DIR)): baseDir = os.path.basename(dirname) for file in files: full = os.path.join(dirname, file) zip_file.write(full, os.path.join(baseDir, file)) output_files.append({'path': result_file, 'name': os.path.basename(result_file), 'label': os.path.basename(result_file), 'description': 'Files generated by CONCOCT App'}) return output_files def generate_html_report(self, result_directory, assembly_ref, binned_contig_obj_ref): """ generate_html_report: generate html summary report """ log('Start generating html report') html_report = list() output_directory = os.path.join(self.scratch, str(uuid.uuid4())) self._mkdir_p(output_directory) result_file_path = os.path.join(output_directory, 'report.html') # get summary data from existing assembly object and bins_objects Summary_Table_Content = '' Overview_Content = '' (binned_contig_count, input_contig_count, total_bins_count) = \ self.generate_overview_info(assembly_ref, binned_contig_obj_ref, result_directory) Overview_Content += '<p>Binned contigs: {}</p>'.format(binned_contig_count) Overview_Content += '<p>Input contigs: {}</p>'.format(input_contig_count) Overview_Content += '<p>Number of bins: {}</p>'.format(total_bins_count) with open(result_file_path, 'w') as result_file: with open(os.path.join(os.path.dirname(__file__), 'report_template.html'), 'r') as report_template_file: report_template = report_template_file.read() report_template = 
report_template.replace('<p>Overview_Content</p>', Overview_Content) report_template = report_template.replace('Summary_Table_Content', Summary_Table_Content) result_file.write(report_template) html_report.append({'path': result_file_path, 'name': os.path.basename(result_file_path), 'label': os.path.basename(result_file_path), 'description': 'HTML summary report for kb_cocacola App'}) return html_report def generate_overview_info(self, assembly_ref, binned_contig_obj_ref, result_directory): """ _generate_overview_info: generate overview information from assembly and binnedcontig """ # get assembly and binned_contig objects that already have some data populated in them assembly = self.dfu.get_objects({'object_refs': [assembly_ref]})['data'][0] binned_contig = self.dfu.get_objects({'object_refs': [binned_contig_obj_ref]})['data'][0] input_contig_count = assembly.get('data').get('num_contigs') binned_contig_count = 0 total_bins_count = 0 total_bins = binned_contig.get('data').get('bins') total_bins_count = len(total_bins) for bin in total_bins: binned_contig_count += len(bin.get('contigs')) return (binned_contig_count, input_contig_count, total_bins_count) def generate_report(self, binned_contig_obj_ref, task_params): """ generate_report: generate summary report """ log('Generating report') result_directory = os.path.join(self.scratch, "cocacola_output_dir") task_params['result_directory'] = result_directory output_files = self.generate_output_file_list(task_params['result_directory']) output_html_files = self.generate_html_report(task_params['result_directory'], task_params['assembly_ref'], binned_contig_obj_ref) report_params = { 'message': '', 'workspace_name': task_params['workspace_name'], 'file_links': output_files, 'html_links': output_html_files, 'direct_html_link_index': 0, 'html_window_height': 266, 'report_object_name': 'kb_cocacola_report_' + str(uuid.uuid4()) } kbase_report_client = KBaseReport(self.callback_url) output = kbase_report_client.create_extended_report(report_params) report_output = {'report_name': output['name'], 'report_ref': output['ref']} return report_output def create_dict_from_depth_file(self, depth_file_path): # keep contig order (required by metabat2) depth_file_dict = {} with open(depth_file_path, 'r') as f: header = f.readline().rstrip().split("\t") # print('HEADER1 {}'.format(header)) # map(str.strip, header) for line in f: # deal with cases were fastq name has spaces.Assume first # non white space word is unique and use this as ID. # line = line.rstrip() vals = line.rstrip().split("\t") if ' ' in vals[0]: ID = vals[0].split()[0] else: ID = vals[0] depth_file_dict[ID] = vals[1:] depth_file_dict['header'] = header return depth_file_dict def run_cocacola(self, task_params): """ run_cocacola: cocacola app required params: assembly_ref: Metagenome assembly object reference binned_contig_name: BinnedContig object name and output file header workspace_name: the name of the workspace it gets saved to. 
reads_list: list of reads object (PairedEndLibrary/SingleEndLibrary) upon which COCACOLA will be run optional params: min_contig_length: minimum contig length; default 1000 ref: https://github.com/BinPro/CONCOCT/blob/develop/README.md """ log('--->\nrunning CocacolaUtil.run_cocacola\n' + 'task_params:\n{}'.format(json.dumps(task_params, indent=1))) self._validate_run_cocacola_params(task_params) # get assembly contig_file = self._get_contig_file(task_params['assembly_ref']) task_params['contig_file_path'] = contig_file # clean the assembly file so that there are no spaces in the fasta headers assembly_clean = self.retrieve_and_clean_assembly(task_params) assembly_clean_temp = self.filter_contigs_by_length(assembly_clean, task_params['min_contig_length']) task_params['contig_file_path'] = assembly_clean_temp assembly_clean = assembly_clean_temp # need to clean this up, ugly redundant variable usage # get reads (reads_list_file, read_type) = self.stage_reads_list_file(task_params['reads_list']) task_params['read_type'] = read_type task_params['reads_list_file'] = reads_list_file # prep result directory result_directory = os.path.join(self.scratch, self.BINNER_RESULT_DIRECTORY) self._mkdir_p(result_directory) cwd = os.getcwd() log('changing working dir to {}'.format(result_directory)) os.chdir(result_directory) # run alignments, and update input contigs to use the clean file # this function has an internal loop to generate a sorted bam file for each input read file self.generate_alignment_bams(task_params, assembly_clean) # not used right now # depth_file_path = self.generate_make_coverage_table_command(task_params, sorted_bam_file_list) # depth_dict = self.create_dict_from_depth_file(depth_file_path) # run cocacola prep, cut up fasta input self.generate_cocacola_cut_up_fasta_command(task_params) # run cocacola prep, generate coverage tables from bam self.generate_cocacola_input_table_from_bam(task_params) # run cocacola prep, generate kmer table self.generate_cocacola_kmer_composition_table(task_params) # run cocacola prep and cocacola self.generate_cocacola_command(task_params) # run command to add header to output file self.add_header_to_post_clustering_file(task_params) # run cocacola post cluster merging command self.generate_cocacola_post_clustering_merging_command(task_params) # run extract bins command self.generate_cocacola_extract_fasta_bins_command(task_params) # run fasta renaming self.rename_and_standardize_bin_names(task_params) # make binned contig summary file self.make_binned_contig_summary_file_for_binning_apps(task_params) # file handling and management os.chdir(cwd) log('changing working dir to {}'.format(cwd)) log('Saved result files to: {}'.format(result_directory)) log('Generated files:\n{}'.format('\n'.join(os.listdir(result_directory)))) # make new BinnedContig object and upload to KBase generate_binned_contig_param = { 'file_directory': os.path.join(result_directory, self.BINNER_BIN_RESULT_DIR), 'assembly_ref': task_params['assembly_ref'], 'binned_contig_name': task_params['binned_contig_name'], 'workspace_name': task_params['workspace_name'] } binned_contig_obj_ref = \ self.mgu.file_to_binned_contigs(generate_binned_contig_param).get('binned_contig_obj_ref') # generate report reportVal = self.generate_report(binned_contig_obj_ref, task_params) returnVal = { 'result_directory': result_directory, 'binned_contig_obj_ref': binned_contig_obj_ref } returnVal.update(reportVal) return returnVal
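# --- Illustrative sketch (not part of the module): a minimal task_params dict for
# CocacolaUtil.run_cocacola(), assembled from _validate_run_cocacola_params() and the
# keys read elsewhere in this class. All refs, names, and numeric values are hypothetical.
example_cocacola_task_params = {
    'assembly_ref': '12345/7/1',            # hypothetical Assembly object reference
    'binned_contig_name': 'my_bins',        # output BinnedContig object name
    'workspace_name': 'my_workspace',
    'reads_list': ['12345/8/1'],            # hypothetical PairedEndLibrary/SingleEndLibrary ref(s)
    'read_mapping_tool': 'bbmap',           # one of: bbmap, bwa, bowtie2_default, bowtie2_very_sensitive, minimap2, hisat2
    'min_contig_length': 1000,              # default suggested by the docstring above
    'contig_split_size': 10000,             # assumption: a typical cut_up_fasta chunk size
    'contig_split_overlap': 0,              # assumption: no overlap between chunks
    'kmer_size': 4                          # assumption: tetranucleotide composition
}
# Hypothetically, CocacolaUtil(config).run_cocacola(example_cocacola_task_params) would
# return a dict with 'result_directory', 'binned_contig_obj_ref', 'report_name', and 'report_ref'.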
def run_centrifuge(self, ctx, params): """ This example function accepts any number of parameters and returns results in a KBaseReport :param params: instance of mapping from String to unspecified object :returns: instance of type "ReportResults" -> structure: parameter "report_name" of String, parameter "report_ref" of String """ # ctx is the context object # return variables are: output #BEGIN run_centrifuge # Step 2 - Download the input data as a Fasta and # We can use the AssemblyUtils module to download a FASTA file from our Assembly data object. # The return object gives us the path to the file that was created. logging.info('Downloading reads data as a Fastq file.') readsUtil = ReadsUtils(self.callback_url) download_reads_output = readsUtil.download_reads({'read_libraries': params['input_refs']}) #print(f"Input parameters {params['input_refs']}, {params['db_type']} download_reads_output {download_reads_output}") fastq_files = [] fastq_files_name = [] for key,val in download_reads_output['files'].items(): if 'fwd' in val['files'] and val['files']['fwd']: fastq_files.append(val['files']['fwd']) fastq_files_name.append(val['files']['fwd_name']) if 'rev' in val['files'] and val['files']['rev']: fastq_files.append(val['files']['rev']) fastq_files_name.append(val['files']['rev_name']) #logging.info(f"fastq files {fastq_files}") fastq_files_string = ','.join(fastq_files) output_dir = os.path.join(self.scratch, 'centrifuge_out') if not os.path.exists(output_dir): os.makedirs(output_dir) outprefix = "centrifuge" # Checking db cmd0 = ["ls", "-al", '/data/centrifuge/'] #logging.info(f'cmd {cmd0}') pls = subprocess.Popen(cmd0, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) logging.info(f'subprocess {pls.communicate()}') cmd = ['/kb/module/lib/centrifuge/Utils/uge-centrifuge.sh', '-i', fastq_files_string, '-o', output_dir, '-p', 'centrifuge', '-d', '/data/centrifuge/' + params['db_type']] logging.info(f'cmd {cmd}') p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) logging.info(f'subprocess {p.communicate()}') summary_file = os.path.join(output_dir, outprefix + '.report.txt') # generate report directory and html file report_dir = os.path.join(output_dir, 'html_report') if not os.path.exists(report_dir): os.makedirs(report_dir) summary_file_dt = os.path.join(report_dir, 'centrifuge.datatable.html') self._generate_DataTable(summary_file,summary_file_dt) shutil.copy2('/kb/module/lib/centrifuge/Utils/index.html',os.path.join(report_dir,'index.html')) shutil.copy2(os.path.join(output_dir,outprefix+'.krona.html'),os.path.join(report_dir,'centrifuge.krona.html')) shutil.move(os.path.join(output_dir,outprefix+'.tree.svg'),os.path.join(report_dir,'centrifuge.tree.svg')) html_zipped = self.package_folder(report_dir, 'index.html', 'index.html') # Step 5 - Build a Report and return objects_created = [] output_files = os.listdir(output_dir) output_files_list = [] for output in output_files: output_files_list.append({'path': os.path.join(output_dir, output), 'name': output }) # not used output_html_files = [{'path': os.path.join(report_dir, 'index.html'), 'name': 'index.html'}, {'path': os.path.join(report_dir, 'centrifuge.krona.html'), 'name': 'centrifuge.krona.html'}, {'path': os.path.join(report_dir, 'centrifuge.datatable.html'), 'name': 'centrifuge.datatable.html'}, {'path': os.path.join(report_dir, 'centrifuge.tree.svg'), 'name': 'centrifuge.tree.svg'} ] message = 'Centrifuge run finished on %s against %s.' 
% (','.join(fastq_files_name), params['db_type']) report_params = {'message': message, 'workspace_name': params.get('workspace_name'), 'objects_created': objects_created, 'file_links': output_files_list, 'html_links': [html_zipped], 'direct_html_link_index': 0, 'html_window_height': 480} # STEP 6: construct the output to send back kbase_report_client = KBaseReport(self.callback_url) report_info = kbase_report_client.create_extended_report(report_params) report_info['report_params'] = report_params logging.info(report_info) output = { 'report_name': report_info['name'], 'report_ref': report_info['ref'] } #END run_centrifuge # At some point might do deeper type checking... if not isinstance(output, dict): raise ValueError('Method run_centrifuge return value ' + 'output is not type dict as required.') # return the results return [output]
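# --- Illustrative sketch (not part of the module): the params keys that run_centrifuge
# reads above. The ref, workspace name, and database name are hypothetical placeholders;
# 'db_type' must match a directory staged under /data/centrifuge/.
example_centrifuge_params = {
    'input_refs': ['12345/9/1'],      # hypothetical reads library reference(s)
    'db_type': 'p_compressed',        # assumption: name of a pre-staged Centrifuge index
    'workspace_name': 'my_workspace'
}
# Hypothetically, impl.run_centrifuge(ctx, example_centrifuge_params) returns
# [{'report_name': ..., 'report_ref': ...}].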
def run_velvet(self, ctx, params): """ Definition of run_velvet :param params: instance of type "VelvetParams" (Arguments for run_velvet string workspace_name - the name of the workspace from which to take input and store output. int hash_length - an odd integer (if even, it will be decremented) <= 31 string output_contigset_name - the name of the output contigset list<paired_end_lib> read_libraries - Illumina PairedEndLibrary files to assemble min_contig_length - integer to filter out contigs with length < min_contig_length from the Velvet output. Default value is 500 (where 0 implies no filter). @optional min_contig_length @optional cov_cutoff @optional ins_length @optional read_trkg @optional amos_file @optional exp_cov @optional long_cov_cutoff) -> structure: parameter "workspace_name" of String, parameter "hash_length" of Long, parameter "read_libraries" of list of type "read_lib" (The workspace object name of a SingleEndLibrary or PairedEndLibrary file, whether of the KBaseAssembly or KBaseFile type.), parameter "output_contigset_name" of String, parameter "min_contig_length" of Long, parameter "cov_cutoff" of Double, parameter "ins_length" of Long, parameter "read_trkg" of type "bool" (A boolean - 0 for false, 1 for true. @range (0, 1)), parameter "amos_file" of type "bool" (A boolean - 0 for false, 1 for true. @range (0, 1)), parameter "exp_cov" of Double, parameter "long_cov_cutoff" of Double :returns: instance of type "VelvetResults" (Output parameter items for run_velvet report_name - the name of the KBaseReport.Report workspace object. report_ref - the workspace reference of the report.) -> structure: parameter "report_name" of String, parameter "report_ref" of String """ # ctx is the context object # return variables are: output #BEGIN run_velvet self.log('Running run_velvet with params:\n' + pformat(params)) token = ctx['token'] wsname = params[self.PARAM_IN_WS] self.process_params(params) input_reads_refs = params[self.PARAM_IN_LIB] # STEP 0: preprocess the reads in KBase way obj_ids = [] for r in input_reads_refs: obj_ids.append({'ref': r if '/' in r else (wsname + '/' + r)}) ws = workspaceService(self.workspaceURL, token=token) ws_info = ws.get_object_info_new({'objects': obj_ids}) reads_params = [] reftoname = {} for wsi, oid in zip(ws_info, obj_ids): ref = oid['ref'] reads_params.append(ref) obj_name = wsi[1] reftoname[ref] = wsi[7] + '/' + obj_name readcli = ReadsUtils(self.callbackURL, token=token) typeerr = ('Supported types: KBaseFile.SingleEndLibrary ' + 'KBaseFile.PairedEndLibrary ' + 'KBaseAssembly.SingleEndLibrary ' + 'KBaseAssembly.PairedEndLibrary') try: reads = readcli.download_reads({'read_libraries': reads_params})['files'] except ServerError as se: self.log('logging stacktrace from dynamic client error') self.log(se.data) if typeerr in se.message: prefix = se.message.split('.')[0] raise ValueError( prefix + '. 
Only the types ' + 'KBaseAssembly.SingleEndLibrary ' + 'KBaseAssembly.PairedEndLibrary ' + 'KBaseFile.SingleEndLibrary ' + 'and KBaseFile.PairedEndLibrary are supported') else: raise self.log('Got reads data from converter:\n' + pformat(reads)) reads_data = [] reads_name = '' for ref in input_reads_refs: reads_name = reftoname[ref] f = reads[ref]['files'] seq_tech = reads[ref]["sequencing_tech"] if f['type'] == 'interleaved': reads_data.append({ 'fwd_file': f['fwd'], 'type': 'interleaved', 'seq_tech': seq_tech }) elif f['type'] == 'paired': reads_data.append({ 'fwd_file': f['fwd'], 'rev_file': f['rev'], 'type': 'paired', 'seq_tech': seq_tech }) elif f['type'] == 'single': reads_data.append({ 'fwd_file': f['fwd'], 'type': 'single', 'seq_tech': seq_tech }) else: raise ValueError('Something is very wrong with read lib ' + reads_name) # STEP 1: run velveth and velvetg sequentially velvet_out = self.exec_velvet(params, reads_data) #self.log('Velvet final return: ' + str(velvet_out)) # STEP 2: parse the output and save back to KBase, create report at the same time if isinstance(velvet_out, str) and velvet_out != '': output_contigs = os.path.join(velvet_out, 'contigs.fa') min_contig_len = params.get(self.PARAM_IN_MIN_CONTIG_LENGTH, 0) if (os.path.isfile(output_contigs) and os.path.getsize(output_contigs) == 0): self.log( 'Given the minimum contig length of {} bp, Velvet could not find any ' 'contig in the input reads library.'.format( str(min_contig_len))) output = {'report_name': 'Empty contigs', 'report_ref': None} elif (os.path.isfile(output_contigs) and os.path.getsize(output_contigs) > 0): self.log('Uploading FASTA file to Assembly') assemblyUtil = AssemblyUtil(self.callbackURL, token=ctx['token'], service_ver='release') if min_contig_len > 0: assemblyUtil.save_assembly_from_fasta({ 'file': { 'path': output_contigs }, 'workspace_name': wsname, 'assembly_name': params[self.PARAM_IN_CS_NAME], 'min_contig_length': min_contig_len }) else: assemblyUtil.save_assembly_from_fasta({ 'file': { 'path': output_contigs }, 'workspace_name': wsname, 'assembly_name': params[self.PARAM_IN_CS_NAME] }) # generate report from contigs.fa report_name, report_ref = self.generate_report( output_contigs, params, velvet_out, wsname) # STEP 3: construct the output to send back output = {'report_name': report_name, 'report_ref': report_ref} else: output = { 'report_name': 'Velvet found empty contig file', 'report_ref': None } else: output = {'report_name': 'Velvet aborted', 'report_ref': None} #END run_velvet # At some point might do deeper type checking... if not isinstance(output, dict): raise ValueError('Method run_velvet return value ' + 'output is not type dict as required.') # return the results return [output]
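# --- Illustrative sketch (not part of the module): a minimal run_velvet params dict,
# following the VelvetParams docstring above. The workspace, library name, and output
# name are hypothetical placeholders; hash_length must be an odd integer <= 31.
example_velvet_params = {
    'workspace_name': 'my_workspace',
    'hash_length': 21,
    'read_libraries': ['my_paired_end_lib'],   # PairedEndLibrary object name or ref
    'output_contigset_name': 'velvet.contigs',
    'min_contig_length': 500                   # optional; 0 disables the length filter
}
# Hypothetically, impl.run_velvet(ctx, example_velvet_params) returns
# [{'report_name': ..., 'report_ref': ...}].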
class SPAdesUtils: """ Define the SPAdesUtils functions """ SPADES_VERSION = '3.13.0' SPADES_BIN = '/opt/SPAdes-' + SPADES_VERSION + '-Linux/bin' DISABLE_SPADES_OUTPUT = False # should be False in production # Basic options PARAM_IN_SINGLE_CELL = 'single_cell' # --sc PARAM_IN_METAGENOME = 'metagenomic' # --meta PARAM_IN_PLASMID = 'plasmid' # --plasmid PARAM_IN_RNA = 'rna' # --rna PARAM_IN_IONTORRENT = 'iontorrent' # --iontorrent # Pipeline options PARAM_IN_ONLY_ERROR_CORR = 'only-error-correction' # --only-error-correction PARAM_IN_ONLY_ASSEMBLER = 'only-assembler' # --only-assembler PARAM_IN_CAREFUL = 'careful' # --careful PARAM_IN_CONTINUE = 'continue' # --continue PARAM_IN_DISABLE_GZIP = 'disable-gzip-output' # --disable-gzip-output # Input parameters PARAM_IN_WS = 'workspace_name' PARAM_IN_CS_NAME = 'output_contigset_name' PARAM_IN_READS = 'reads_libraries' PARAM_IN_LONG_READS = 'long_reads_libraries' PARAM_IN_KMER_SIZES = 'kmer_sizes' PARAM_IN_SKIP_ERR_CORRECT = 'skip_error_correction' PARAM_IN_MIN_CONTIG_LENGTH = 'min_contig_length' PARAM_IN_DNA_SOURCE = 'dna_source' PARAM_IN_PIPELINE_OPTION = 'pipeline_options' ASSEMBLE_RESULTS_DIR = 'assemble_results' INVALID_WS_OBJ_NAME_RE = re.compile('[^\\w\\|._-]') INVALID_WS_NAME_RE = re.compile('[^\\w:._-]') THREADS_PER_CORE = 3 MAX_THREADS = 64 # per email thread with Anton Korobeynikov MAX_THREADS_META = 128 # Increase threads for metagenomic assemblies MEMORY_OFFSET_GB = 1 # 1GB MIN_MEMORY_GB = 5 MAX_MEMORY_GB_SPADES = 500 MAX_MEMORY_GB_META_SPADES = 1000 GB = 1000000000 # private method definition def __init__(self, prj_dir, config): self.workspace_url = config['workspace-url'] self.callback_url = config['SDK_CALLBACK_URL'] self.token = config['KB_AUTH_TOKEN'] if 'shock-url' in config: self.shock_url = config['shock-url'] if 'handle-service-url' in config: self.handle_url = config['handle-service-url'] self.ws_client = Workspace(self.workspace_url, token=self.token) self.ru = ReadsUtils(self.callback_url, token=self.token, service_ver='release') self.au = AssemblyUtil(self.callback_url, token=self.token, service_ver='release') self.kbr = KBaseReport(self.callback_url) self.kbq = kb_quast(self.callback_url) self.proj_dir = prj_dir self.spades_version = 'SPAdes-' + os.environ['SPADES_VERSION'] def _get_kbreads_info(self, wsname, reads_refs): """ _get_kbreads_info--from a set of given KBase reads refs, fetches the corresponding reads info with as interleaved fastq files and returns a list of reads data in the following structure: reads_data = { 'fwd_file': path_to_fastq_file, 'type': reads_type, # ('interleaved', 'paired', or 'single') 'seq_tech': sequencing_tech, 'reads_ref': KBase object ref for downstream convenience, 'reads_name': KBase object name for downstream convenience, 'rev_file': path_to_fastq_file, # only if paired end } """ obj_ids = [] for r in reads_refs: if r: obj_ids.append({'ref': r if '/' in r else (wsname + '/' + r)}) if not obj_ids: return [] ws_info = self.ws_client.get_object_info_new({'objects': obj_ids}) reads_params = [] reftoname = {} for wsi, oid in zip(ws_info, obj_ids): ref = oid['ref'] reads_params.append(ref) obj_name = wsi[1] reftoname[ref] = wsi[7] + '/' + obj_name typeerr = ('Supported types: KBaseFile.SingleEndLibrary ' + 'KBaseFile.PairedEndLibrary ' + 'KBaseAssembly.SingleEndLibrary ' + 'KBaseAssembly.PairedEndLibrary') try: reads = self.ru.download_reads({ 'read_libraries': reads_params, 'interleaved': 'false' })['files'] except ServerError as se: log('logging stacktrace from dynamic client error') 
log(se.data) if typeerr in se.message: prefix = se.message.split('.')[0] raise ValueError( prefix + '. Only the types ' + 'KBaseAssembly.SingleEndLibrary ' + 'KBaseAssembly.PairedEndLibrary ' + 'KBaseFile.SingleEndLibrary ' + 'and KBaseFile.PairedEndLibrary are supported') else: raise # log('Downloaded reads data from KBase:\n' + pformat(reads)) reads_data = [] for ref in reads_refs: reads_name = reftoname[ref] f = reads[ref]['files'] seq_tech = reads[ref]['sequencing_tech'] rds_info = { 'fwd_file': f['fwd'], 'reads_ref': ref, 'type': f['type'], 'seq_tech': seq_tech, 'reads_name': reads_name } if f.get('rev', None): rds_info['rev_file'] = f['rev'] reads_data.append(rds_info) return reads_data def _generate_output_file_list(self, out_dir): """ _generate_output_file_list: zip result files and generate file_links for report """ log('start packing result files') output_files = list() output_directory = os.path.join(self.proj_dir, str(uuid.uuid4())) _mkdir_p(output_directory) spades_output = os.path.join(output_directory, 'spades_output.zip') self._zip_folder(out_dir, spades_output) output_files.append({'path': spades_output, 'name': os.path.basename(spades_output), 'label': os.path.basename(spades_output), 'description': 'Output file(s) generated by {}'.format( self.spades_version)}) return output_files def _zip_folder(self, folder_path, output_path): """ _zip_folder: Zip the contents of an entire folder (with that folder included in the archive). Empty subfolders could be included in the archive as well if the commented portion is used. """ with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED, allowZip64=True) as ziph: for root, folders, files in os.walk(folder_path): for f in files: absolute_path = os.path.join(root, f) relative_path = os.path.join(os.path.basename(root), f) # print "Adding {} to archive.".format(absolute_path) ziph.write(absolute_path, relative_path) print("{} created successfully.".format(output_path)) # with zipfile.ZipFile(output_path, "r") as f: # print 'Checking the zipped file......\n' # for info in f.infolist(): # print info.filename, info.date_time, info.file_size, info.compress_size def _load_stats(self, input_file_name): log('Starting conversion of FASTA to KBaseGenomeAnnotations.Assembly') log('Building Object.') if not os.path.isfile(input_file_name): raise Exception('The input file name {0} is not a file!'.format(input_file_name)) with open(input_file_name, 'r') as input_file_handle: contig_id = None sequence_len = 0 fasta_dict = dict() first_header_found = False # Pattern for replacing white space pattern = re.compile(r'\s+') for current_line in input_file_handle: if (current_line[0] == '>'): # found a header line # Wrap up previous fasta sequence if not first_header_found: first_header_found = True else: fasta_dict[contig_id] = sequence_len sequence_len = 0 fasta_header = current_line.replace('>', '').strip() try: contig_id = fasta_header.strip().split(' ', 1)[0] except (IndexError, ValueError, KeyError): contig_id = fasta_header.strip() else: sequence_len += len(re.sub(pattern, '', current_line)) # wrap up last fasta sequence if not first_header_found: raise Exception("There are no contigs in this file") else: fasta_dict[contig_id] = sequence_len return fasta_dict def _parse_single_reads(self, reads_type, reads_list): """ _parse_single_reads: given the reads_type and a list of reads, return an object defining the type and a list of fastq files. 
""" single_reads_fqs = [] ret_obj = {} if reads_list and isinstance(reads_list, list): for rds in reads_list: single_reads_fqs.append(rds['fwd_file']) if single_reads_fqs: ret_obj = { "type": reads_type, "single reads": single_reads_fqs } return ret_obj def _parse_pair_reads(self, reads_type, reads_list): """ _parse_pair_reads: given the reads_type and a list of reads, return an object defining the type and a list of fastq files. """ right_reads_fqs = [] left_reads_fqs = [] ret_obj = {} if reads_list and isinstance(reads_list, list): for rds in reads_list: right_reads_fqs.append(rds['fwd_file']) if rds.get('rev_file', None): left_reads_fqs.append(rds['rev_file']) orent = reads_list[0]['orientation'] if right_reads_fqs: ret_obj["right reads"] = right_reads_fqs ret_obj["orientation"] = orent ret_obj["type"] = reads_type if left_reads_fqs: ret_obj["left reads"] = left_reads_fqs return ret_obj # end of private methods # public method definitions def check_spades_params(self, params): """ check_spades_params: checks params passed to run_HybridSPAdes method and set default values """ # log('Start validating run_HybridSPAdes parameters:\n{}'.format( # json.dumps(params, indent=1))) # check for mandatory parameters if params.get(self.PARAM_IN_WS, None) is None: raise ValueError('Parameter {} is mandatory!'.format(self.PARAM_IN_WS)) if self.INVALID_WS_NAME_RE.search(params[self.PARAM_IN_WS]): raise ValueError('Invalid workspace name: {}.'.format(params[self.PARAM_IN_WS])) if params.get(self.PARAM_IN_CS_NAME, None) is None: raise ValueError('Parameter {} is mandatory!'.format(self.PARAM_IN_CS_NAME)) if self.INVALID_WS_OBJ_NAME_RE.search(params[self.PARAM_IN_CS_NAME]): raise ValueError('Invalid workspace object name: {}.'.format( params[self.PARAM_IN_CS_NAME])) if params.get(self.PARAM_IN_READS, None) is None: raise ValueError('Parameter {} is mandatory!'.format(self.PARAM_IN_READS)) if type(params[self.PARAM_IN_READS]) != list: raise ValueError('Input reads {} must be a list.'.format(self.PARAM_IN_READS)) if len(params[self.PARAM_IN_READS]) == 0: raise ValueError('Input parameter {} should have at least one reads.'.format( self.PARAM_IN_READS)) if self.PARAM_IN_MIN_CONTIG_LENGTH in params: if not isinstance(params[self.PARAM_IN_MIN_CONTIG_LENGTH], int): raise ValueError('{} must be of type int.'.format(self.PARAM_IN_MIN_CONTIG_LENGTH)) if not params.get(self.PARAM_IN_KMER_SIZES, None): params[self.PARAM_IN_KMER_SIZES] = [21, 33, 55] kmer_sstr = ",".join(str(num) for num in params[self.PARAM_IN_KMER_SIZES]) params[self.PARAM_IN_KMER_SIZES] = kmer_sstr print("KMER_SIZES: " + kmer_sstr) if params.get(self.PARAM_IN_SKIP_ERR_CORRECT, None): print("SKIP ERR CORRECTION: " + str(params[self.PARAM_IN_SKIP_ERR_CORRECT])) # check for basic option parameters if params.get(self.PARAM_IN_DNA_SOURCE, None): dna_src = params[self.PARAM_IN_DNA_SOURCE] if dna_src not in [self.PARAM_IN_SINGLE_CELL, self.PARAM_IN_METAGENOME, self.PARAM_IN_PLASMID, self.PARAM_IN_RNA, self.PARAM_IN_IONTORRENT]: params[self.PARAM_IN_DNA_SOURCE] = None else: params[self.PARAM_IN_DNA_SOURCE] = None # a list of basic options0 params['basic_options'] = ['-o', self.ASSEMBLE_RESULTS_DIR] dna_src = params.get(self.PARAM_IN_DNA_SOURCE) if dna_src == self.PARAM_IN_SINGLE_CELL: params['basic_options'].append('--sc') elif dna_src == self.PARAM_IN_METAGENOME: params['basic_options'].append('--meta') elif dna_src == self.PARAM_IN_PLASMID: params['basic_options'].append('--plasmid') elif dna_src == self.PARAM_IN_RNA: 
params['basic_options'].append('--rna') elif dna_src == self.PARAM_IN_IONTORRENT: params['basic_options'].append('--iontorrent') # processing pipeline option parameters if params.get(self.PARAM_IN_PIPELINE_OPTION, None): pipe_opts = params[self.PARAM_IN_PIPELINE_OPTION] opts = [self.PARAM_IN_ONLY_ERROR_CORR, self.PARAM_IN_ONLY_ASSEMBLER, self.PARAM_IN_CONTINUE, self.PARAM_IN_DISABLE_GZIP, self.PARAM_IN_CAREFUL] if any(elem in opts for elem in pipe_opts): pass else: params[self.PARAM_IN_PIPELINE_OPTION] = [self.PARAM_IN_CAREFUL] else: params[self.PARAM_IN_PIPELINE_OPTION] = [self.PARAM_IN_CAREFUL] if '--meta' in params['basic_options']: # you cannot specify --careful, --mismatch-correction # or --cov-cutoff in metagenomic mode! for opt in (self.PARAM_IN_CAREFUL, 'mismatch-correction', 'cov-cutoff'): try: params[self.PARAM_IN_PIPELINE_OPTION].remove(opt) except ValueError: pass if params.get('create_report', None) is None: params['create_report'] = 0 return params def generate_report(self, fa_file_name, params, out_dir, wsname): """ Generating and saving report """ log('Generating and saving report') fa_file_with_path = os.path.join(out_dir, fa_file_name) fasta_stats = self._load_stats(fa_file_with_path) lengths = [fasta_stats[contig_id] for contig_id in fasta_stats] assembly_ref = wsname + '/' + params[self.PARAM_IN_CS_NAME] report_text = '' report_text += 'SPAdes results saved to: ' + wsname + '/' + out_dir + '\n' report_text += 'Assembly saved to: ' + assembly_ref + '\n' report_text += 'Assembled into ' + str(len(lengths)) + ' contigs.\n' report_text += 'Avg Length: ' + str(sum(lengths) / float(len(lengths))) + ' bp.\n' # compute a simple contig length distribution bins = 10 counts, edges = np.histogram(lengths, bins) report_text += 'Contig Length Distribution (# of contigs -- min to max ' + 'basepairs):\n' for c in range(bins): report_text += (' ' + str(counts[c]) + '\t--\t' + str(edges[c]) + ' to ' + str(edges[c + 1]) + ' bp\n') print('Running QUAST') quastret = self.kbq.run_QUAST( {'files': [{'path': fa_file_with_path, 'label': params[self.PARAM_IN_CS_NAME]}]}) output_files = self._generate_output_file_list(out_dir) print('Saving report') report_output = self.kbr.create_extended_report( {'message': report_text, 'objects_created': [{'ref': assembly_ref, 'description': 'Assembled contigs'}], 'direct_html_link_index': 0, 'file_links': output_files, 'html_links': [{'shock_id': quastret['shock_id'], 'name': 'report.html', 'label': 'QUAST report'} ], 'report_object_name': 'kb_spades_report_' + str(uuid.uuid4()), 'workspace_name': params[self.PARAM_IN_WS]}) return report_output['name'], report_output['ref'] def get_hybrid_reads_info(self, input_params): """ get_hybrid_reads_info--from a list of ReadsParams structures fetches the corresponding reads info using ReadsParams[lib_ref] and returns an empty tuple or a tuple of nine lists of reads data, where each entry has the following structure: { 'fwd_file': path_to_fastq_file, 'orientation': (default value is "fr" (forward-reverse) for paired-end libraries "rf" (reverse-forward) for mate-pair libraries), None for others 'lib_type': ("paired-end", "mate-pairs", "hq-mate-pairs", "single", "pacbio", "nanopore", "sanger", "trusted-contigs", "untrusted-contigs"), 'type': reads_type, # 'interleaved', 'paired', or 'single' 'seq_tech': sequencing_tech, 'reads_ref': KBase object ref for downstream convenience, 'reads_name': KBase object name for downstream convenience, 'rev_file': path_to_fastq_file # only 
if paired end } OR: { 'fwd_file': path_to_fastq_file, 'long_reads_type': ("pacbio-ccs", "pacbio-clr", "nanopore", "sanger", "trusted-contigs", "untrusted-contigs"), 'type': reads_type, # 'interleaved', 'paired', or 'single' 'seq_tech': sequencing_tech, 'reads_ref': KBase object ref for downstream convenience, 'reads_name': KBase object name for downstream convenience } """ rds_params = copy.deepcopy(input_params) if rds_params.get(self.PARAM_IN_READS, None) is None: return () # an empty tuple wsname = rds_params[self.PARAM_IN_WS] sgl_rds_data = [] # single pe_rds_data = [] # paired-end mp_rds_data = [] # mate-pairs pb_ccs_data = [] # pacbio-ccs pb_clr_data = [] # pacbio-clr np_rds_data = [] # nanopore sgr_rds_data = [] # sanger tr_ctg_data = [] # trusted-contigs ut_ctg_data = [] # untrusted-contigs # a list of Illumina or IonTorrent paired-end/high-quality mate-pairs/unpaired reads rds_refs = [] rds_libs = rds_params[self.PARAM_IN_READS] for rds_lib in rds_libs: if rds_lib.get('lib_ref', None): rds_refs.append(rds_lib['lib_ref']) kb_rds_data = self._get_kbreads_info(wsname, rds_refs) for rds_lib in rds_libs: for kb_d in kb_rds_data: if 'lib_ref' in rds_lib and rds_lib['lib_ref'] == kb_d['reads_ref']: if rds_lib['lib_type'] == 'single': # single end reads grouped params kb_d['orientation'] = None kb_d['lib_type'] = 'single' sgl_rds_data.append(kb_d) elif rds_lib['lib_type'] == 'paired-end': # pairedEnd reads grouped params kb_d['orientation'] = ('fr' if rds_lib.get('orientation', None) is None else rds_lib['orientation']) kb_d['lib_type'] = 'paired-end' pe_rds_data.append(kb_d) elif rds_lib['lib_type'] == 'mate-pairs': # mate-pairs reads grouped params kb_d['orientation'] = ('rf' if rds_lib.get('orientation', None) is None else rds_lib['orientation']) kb_d['lib_type'] = 'mate-pairs' mp_rds_data.append(kb_d) # a list of PacBio (CCS or CLR), Oxford Nanopore Sanger reads # and/or additional contigs long_rds_refs = [] if rds_params.get(self.PARAM_IN_LONG_READS, None): long_rds_libs = rds_params[self.PARAM_IN_LONG_READS] for lrds_lib in long_rds_libs: if lrds_lib.get('long_reads_ref', None): long_rds_refs.append(lrds_lib['long_reads_ref']) kb_lrds_data = self._get_kbreads_info(wsname, long_rds_refs) for lrds_lib in long_rds_libs: for kb_ld in kb_lrds_data: if ('long_reads_ref' in lrds_lib and lrds_lib['long_reads_ref'] == kb_ld['reads_ref']): if lrds_lib['long_reads_type'] == 'pacbio-ccs': kb_ld['long_reads_type'] = lrds_lib['long_reads_type'] pb_ccs_data.append(kb_ld) elif lrds_lib['long_reads_type'] == 'pacbio-clr': kb_ld['long_reads_type'] = lrds_lib['long_reads_type'] pb_clr_data.append(kb_ld) elif lrds_lib['long_reads_type'] == 'nanopore': kb_ld['long_reads_type'] = lrds_lib['long_reads_type'] np_rds_data.append(kb_ld) elif lrds_lib['long_reads_type'] == 'sanger': kb_ld['long_reads_type'] = lrds_lib['long_reads_type'] sgr_rds_data.append(kb_ld) elif lrds_lib['long_reads_type'] == 'trusted-contigs': kb_ld['long_reads_type'] = lrds_lib['long_reads_type'] tr_ctg_data.append(kb_ld) elif lrds_lib['long_reads_type'] == 'untrusted-contigs': kb_ld['long_reads_type'] = lrds_lib['long_reads_type'] ut_ctg_data.append(kb_ld) return (sgl_rds_data, pe_rds_data, mp_rds_data, pb_ccs_data, pb_clr_data, np_rds_data, sgr_rds_data, tr_ctg_data, ut_ctg_data) def construct_yaml_dataset_file(self, sgl_libs=None, pe_libs=None, mp_libs=None, pb_ccs=None, pb_clr=None, np_libs=None, sgr_libs=None, tr_ctgs=None, ut_ctgs=None): """ construct_yaml_dataset_file: Specifying input data with YAML data set file (advanced) An 
alternative way to specify an input data set for SPAdes is to create a YAML data set file.
        By using a YAML file you can provide an unlimited number of paired-end, mate-pair
        and unpaired libraries. A YAML data set file is a text file in which input libraries
        are provided as a comma-separated list in square brackets. Each library is provided
        in braces as a comma-separated list of attributes. The following attributes are available:
            - orientation ("fr", "rf", "ff")
            - type ("paired-end", "mate-pairs", "hq-mate-pairs", "single", "pacbio", "nanopore",
                    "sanger", "trusted-contigs", "untrusted-contigs")
            - interlaced reads (comma-separated list of files with interlaced reads)
            - left reads (comma-separated list of files with left reads)
            - right reads (comma-separated list of files with right reads)
            - single reads (comma-separated list of files with single reads or unpaired reads
                            from a paired library)
            - merged reads (comma-separated list of files with merged reads)
        To properly specify a library you should provide its type and at least one file with reads.
        For ONT, PacBio, Sanger and contig libraries you can provide only single reads.
        Orientation is an optional attribute. Its default value is "fr" (forward-reverse) for
        paired-end libraries and "rf" (reverse-forward) for mate-pair libraries.
        The value for each attribute is given after a colon. Comma-separated lists of files should
        be given in square brackets. For each file you should provide its full path in double quotes.
        Make sure that files with right reads are given in the same order as the corresponding
        files with left reads.
        For example, if you have one paired-end library split into two pairs of files:
            lib_pe1_left_1.fastq
            lib_pe1_right_1.fastq
            lib_pe1_left_2.fastq
            lib_pe1_right_2.fastq
        one mate-pair library:
            lib_mp1_left.fastq
            lib_mp1_right.fastq
        and PacBio CCS and CLR reads:
            pacbio_ccs.fastq
            pacbio_clr.fastq
        the YAML file should look like this:
        ------------------------------------------------
        [
            {
                orientation: "fr",
                type: "paired-end",
                right reads: [
                    "/FULL_PATH_TO_DATASET/lib_pe1_right_1.fastq",
                    "/FULL_PATH_TO_DATASET/lib_pe1_right_2.fastq"
                ],
                left reads: [
                    "/FULL_PATH_TO_DATASET/lib_pe1_left_1.fastq",
                    "/FULL_PATH_TO_DATASET/lib_pe1_left_2.fastq"
                ]
            },
            {
                orientation: "rf",
                type: "mate-pairs",
                right reads: [
                    "/FULL_PATH_TO_DATASET/lib_mp1_right.fastq"
                ],
                left reads: [
                    "/FULL_PATH_TO_DATASET/lib_mp1_left.fastq"
                ]
            },
            {
                type: "single",
                single reads: [
                    "/FULL_PATH_TO_DATASET/pacbio_ccs.fastq"
                ]
            },
            {
                type: "pacbio",
                single reads: [
                    "/FULL_PATH_TO_DATASET/pacbio_clr.fastq"
                ]
            }
        ]
        ------------------------------------------------
        Once you have created a YAML file, save it with a .yaml extension (e.g.
as my_data_set.yaml) and run SPAdes using the --dataset option: e.g., <SPAdes_bin_dir>/spades.py --dataset <your YAML file> -o spades_output """ # STEP 1: get the working folder housing the .yaml file and the SPAdes results if not os.path.exists(self.proj_dir): os.makedirs(self.proj_dir) yaml_file_path = os.path.join(self.proj_dir, 'input_data_set.yaml') # STEP 2: construct and save the 'input_data_set.yaml' file # generate the object array input_data_set = [] if pe_libs: pair_libs = self._parse_pair_reads('paired-end', pe_libs) if pair_libs: input_data_set.append(pair_libs) if mp_libs: pair_libs = self._parse_pair_reads('mate-pairs', mp_libs) if pair_libs: input_data_set.append(pair_libs) # for reads_type = 'single' if sgl_libs: single_libs = self._parse_single_reads("single", sgl_libs) if single_libs: input_data_set.append(single_libs) # for long_reads_type = 'pacbio-ccs', treated as type of 'single' if pb_ccs: single_libs = self._parse_single_reads("single", pb_ccs) if single_libs: input_data_set.append(single_libs) # for long_reads_type = 'pacbio-clr' if pb_clr: single_libs = self._parse_single_reads("pacbio", pb_clr) if single_libs: input_data_set.append(single_libs) # for long_reads_type = 'nanopore' if np_libs: single_libs = self._parse_single_reads("nanopore", np_libs) if single_libs: input_data_set.append(single_libs) # for long_reads_type = 'sanger' if sgr_libs: single_libs = self._parse_single_reads("sanger", sgr_libs) if single_libs: input_data_set.append(single_libs) # for long_reads_type = 'trusted-contigs' if tr_ctgs: single_libs = self._parse_single_reads("trusted-contigs", tr_ctgs) if single_libs: input_data_set.append(single_libs) # for long_reads_type = 'untrusted-contigs' if ut_ctgs: single_libs = self._parse_single_reads("untrusted-contigs", ut_ctgs) if single_libs: input_data_set.append(single_libs) if input_data_set == []: print('Empty input data set!!') return '' pprint(input_data_set) try: with open(yaml_file_path, 'w') as yaml_file: json.dump(input_data_set, yaml_file) except IOError as ioerr: log('Creation of the {} file raised error:\n'.format(yaml_file_path)) pprint(ioerr) return '' else: return yaml_file_path def run_assemble(self, yaml_file, kmer_sizes, dna_source=None, basic_opts=None, pipeline_opts=['careful']): """ run_assemble: run the SPAdes assemble with given input parameters/options """ exit_code = 1 if not os.path.isfile(yaml_file): log("The input data set yaml file DOES NOT exist at {}\n".format(yaml_file)) return exit_code log("The input data set yaml file exists at {}\n".format(yaml_file)) yf_dir, yf_nm = os.path.split(yaml_file) mem = (psutil.virtual_memory().available / self.GB - self.MEMORY_OFFSET_GB) if mem < self.MIN_MEMORY_GB: raise ValueError( 'Only ' + str(psutil.virtual_memory().available) + ' bytes of memory are available. 
The SPAdes wrapper will' + ' not run without at least ' + str(self.MIN_MEMORY_GB + self.MEMORY_OFFSET_GB) + ' gigabytes available') if dna_source and dna_source == self.PARAM_IN_METAGENOME: max_mem = self.MAX_MEMORY_GB_META_SPADES max_threads = self.MAX_THREADS_META else: max_mem = self.MAX_MEMORY_GB_SPADES max_threads = self.MAX_THREADS threads = min(max_threads, psutil.cpu_count() * self.THREADS_PER_CORE) if mem > max_mem: mem = max_mem tmpdir = os.path.join(self.proj_dir, 'spades_tmp_dir') if not os.path.exists(tmpdir): os.makedirs(tmpdir) a_cmd = [os.path.join(self.SPADES_BIN, 'spades.py')] a_cmd += ['--threads', str(threads), '--memory', str(mem)] a_cmd += ['--tmp-dir', tmpdir] a_cmd += ['--dataset', yaml_file] if kmer_sizes is not None: a_cmd += ['-k ' + kmer_sizes] if basic_opts is None: basic_opts = ['-o', self.ASSEMBLE_RESULTS_DIR] if isinstance(basic_opts, list): a_cmd += basic_opts if pipeline_opts and isinstance(pipeline_opts, list): for p_opt in pipeline_opts: if p_opt == self.PARAM_IN_CAREFUL: a_cmd += ['--careful'] if p_opt == self.PARAM_IN_ONLY_ERROR_CORR: a_cmd += ['--only-error-correction'] if p_opt == self.PARAM_IN_ONLY_ASSEMBLER: a_cmd += ['--only-assembler'] if p_opt == self.PARAM_IN_CONTINUE: a_cmd += ['--continue'] if p_opt == self.PARAM_IN_DISABLE_GZIP: a_cmd += ['--disable-gzip-output'] # Last check of command options before the call if '--meta' in a_cmd: # you cannot specify --careful, --mismatch-correction # or --cov-cutoff in metagenomic mode! try: a_cmd.remove(self.PARAM_IN_CAREFUL) a_cmd.remove('mismatch-correction') a_cmd.remove('cov-cutoff') except ValueError: pass log("**************The HybridSPAdes assembling command is:\n{}".format(' '.join(a_cmd))) assemble_out_dir = os.path.join(self.proj_dir, self.ASSEMBLE_RESULTS_DIR) if not os.path.exists(assemble_out_dir): os.makedirs(assemble_out_dir) p = subprocess.Popen(a_cmd, cwd=yf_dir, shell=False) exit_code = p.wait() log('Return code: ' + str(exit_code)) if p.returncode != 0: raise ValueError('Error running spades.py, return code: ' + str(p.returncode) + '\n') else: exit_code = p.returncode return exit_code def save_assembly(self, fa_file_path, wsname, a_name, min_ctg_length=0): """ save_assembly: save the assembly to KBase workspace """ if os.path.isfile(fa_file_path): log('Uploading FASTA file to Assembly...') if min_ctg_length > 0: self.au.save_assembly_from_fasta( {'file': {'path': fa_file_path}, 'workspace_name': wsname, 'assembly_name': a_name, 'min_contig_length': min_ctg_length}) else: self.au.save_assembly_from_fasta( {'file': {'path': fa_file_path}, 'workspace_name': wsname, 'assembly_name': a_name}) else: log("The resulting sequence file {} is not found.".format(fa_file_path))
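# --- Illustrative sketch (not part of the original module) ---
# The construct_yaml_dataset_file docstring above describes the SPAdes YAML data
# set format. Below is a minimal example of building such a data set in Python
# and serializing it the same way the method does: because JSON is a subset of
# YAML, json.dump produces a file that SPAdes' --dataset option can read. The
# file paths are the placeholder paths from the docstring.
import json

input_data_set = [
    {
        'orientation': 'fr',
        'type': 'paired-end',
        'left reads': ['/FULL_PATH_TO_DATASET/lib_pe1_left_1.fastq'],
        'right reads': ['/FULL_PATH_TO_DATASET/lib_pe1_right_1.fastq'],
    },
    {
        'type': 'pacbio',
        'single reads': ['/FULL_PATH_TO_DATASET/pacbio_clr.fastq'],
    },
]

with open('input_data_set.yaml', 'w') as yaml_file:
    json.dump(input_data_set, yaml_file)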
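# --- Illustrative sketch (not part of the original module) ---
# run_assemble above appends the k-mer sizes as a single combined element
# ('-k 21,33,55'). With subprocess and shell=False, a flag and its value are
# usually passed as separate argv elements so parsing does not depend on how
# the tool splits a combined token; a hedged alternative, with a hypothetical
# kmer_sizes value:
kmer_sizes = '21,33,55'  # hypothetical value built from the user's k-mer list
a_cmd = ['spades.py', '--dataset', 'input_data_set.yaml', '-o', 'assemble_results']
if kmer_sizes is not None:
    a_cmd += ['-k', kmer_sizes]  # two argv tokens rather than one combined string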
def run_metaphlan2(self, ctx, params): """ This example function accepts any number of parameters and returns results in a KBaseReport :param params: instance of mapping from String to unspecified object :returns: instance of type "ReportResults" -> structure: parameter "report_name" of String, parameter "report_ref" of String """ # ctx is the context object # return variables are: output #BEGIN run_metaphlan2 # Check parameters logging.info(f'params {params}') # Check for presence of input file types in params input_genomes = 'input_genomes' in params and len( params['input_genomes'] ) > 0 and None not in params['input_genomes'] input_refs = 'input_ref' in params and len( params['input_ref']) > 0 and None not in params['input_ref'] # for name in ['workspace_name', 'db_type']: # if name not in params: # raise ValueError( # 'Parameter "' + name + '" is required but missing') if not input_genomes and not input_refs: raise ValueError( 'You must enter either an input genome or input reads') if input_refs and input_genomes: raise ValueError( 'You must enter either an input genome or input reads, ' 'but not both') if input_genomes and (not isinstance(params['input_genomes'][0], str)): raise ValueError('Pass in a valid input genome string') if input_refs and (not isinstance(params['input_ref'], list) or not len(params['input_ref'])): raise ValueError('Pass in a list of input references') # Start with base cmd and add parameters based on user input cmd = [ 'metaphlan2.py', '--bowtie2db', '/data/metaphlan2/mpa_v20_m200', '--mpa_pkl', '/data/metaphlan2/mpa_v20_m200.pkl' ] if input_genomes: assembly_util = AssemblyUtil(self.callback_url) fasta_file_obj = assembly_util.get_assembly_as_fasta( {'ref': params['input_genomes'][0]}) logging.info(fasta_file_obj) fasta_file = fasta_file_obj['path'] cmd.extend(['--input_type', 'fasta', fasta_file]) if input_refs: logging.info('Downloading Reads data as a Fastq file.') logging.info(f"Input parameters {params.items()}") readsUtil = ReadsUtils(self.callback_url) download_reads_output = readsUtil.download_reads( {'read_libraries': params['input_ref']}) print( f"Input refs {params['input_ref']} download_reads_output {download_reads_output}" ) fastq_files = [] fastq_files_name = [] for key, val in download_reads_output['files'].items(): if 'fwd' in val['files'] and val['files']['fwd']: fastq_files.append(val['files']['fwd']) fastq_files_name.append(val['files']['fwd_name']) if 'rev' in val['files'] and val['files']['rev']: fastq_files.append(val['files']['rev']) fastq_files_name.append(val['files']['rev_name']) logging.info(f"fastq files {fastq_files}") fastq_files_string = ' '.join(fastq_files) cmd.extend(['--input_type', 'fastq', fastq_files_string]) output_dir = os.path.join(self.scratch, 'metaphlan2_output') if not os.path.exists(output_dir): os.makedirs(output_dir) # insert into second to last position, before input file(s) cmd.insert( -1, '--min_alignment_len') if params['min_alignment_len'] > 0 else cmd cmd.insert(-1, str(params['min_alignment_len']) ) if params['min_alignment_len'] > 0 else cmd cmd.insert( -1, '--ignore_viruses') if params['ignore_viruses'] == 1 else cmd cmd.insert( -1, '--ignore_bacteria') if params['ignore_bacteria'] == 1 else cmd cmd.insert( -1, '--ignore_eukaryotes') if params['ignore_eukaryotes'] == 1 else cmd cmd.insert( -1, '--ignore_archaea') if params['ignore_archaea'] == 1 else cmd cmd.insert(-1, '--stat_q') cmd.insert(-1, str(params['stat_q'])) cmd.insert(-1, '--min_cu_len') cmd.insert(-1, str(params['min_cu_len'])) # append 
output file cmd.extend(['--bowtie2out', os.path.join(output_dir, 'report.txt')]) cmd00 = ["ls", '-la', '/data/metaphlan2/'] logging.info(f'cmd00 {cmd00}') pls = subprocess.Popen(cmd00, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) logging.info(f'subprocess {pls.communicate()}') # run pipeline logging.info(f'cmd {" ".join(cmd)}') p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) logging.info(f'subprocess {p.communicate()}') cmd = [ '/kb/module/lib/metaphlan2/src/accessories.sh', os.path.join(output_dir, 'report.txt'), output_dir, 'metaphlan2' ] p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) logging.info(f'subprocess {p.communicate()}') # get output file and convert to format for report # logging.info(f"params['input_ref'] {params['input_ref']}") report_df = pd.read_csv(os.path.join(output_dir, 'report.txt'), sep='\t') taxa_list = [ 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'strain', 'unclassified' ] abbrev_list = ['k', 'p', 'c', 'o', 'f', 'g', 's', 't', 'unclassified'] for taxa in taxa_list: report_df[taxa] = None tax_dict = dict(zip(abbrev_list, taxa_list)) # split dunderscores to get tax level and name report_df['taxonomy'] = report_df['#SampleID'].apply( lambda x: x.split('|')).apply(lambda x: [y.split('__') for y in x]) for idx, row in report_df.iterrows(): for col in row['taxonomy']: try: report_df.loc[idx, tax_dict[col[0]]] = col[1] except IndexError: report_df.loc[idx, tax_dict[col[0]]] = col[0] report_df.drop(['taxonomy', '#SampleID'], axis=1, inplace=True) report_html_file = os.path.join(output_dir, 'report.html') self._generate_report_table(report_df, report_html_file, output_dir) # report_df.to_html(report_html_file, classes='Metaphlan2_report', # index=False) html_zipped = self.package_folder(output_dir, 'report.html', 'report') # Step 5 - Build a Report and return objects_created = [] output_files = os.listdir(output_dir) output_files_list = [] for output in output_files: if not os.path.isdir(output): output_files_list.append({ 'path': os.path.join(output_dir, output), 'name': output }) message = f"MetaPhlAn2 run finished." report_params = { 'message': message, 'workspace_name': params.get('workspace_name'), 'objects_created': objects_created, 'file_links': output_files_list, 'html_links': [html_zipped], 'direct_html_link_index': 0, 'html_window_height': 460 } kbase_report_client = KBaseReport(self.callback_url) report_output = kbase_report_client.create_extended_report( report_params) report_output['report_params'] = report_params logging.info(report_output) # Return references which will allow inline display of # the report in the Narrative output = { 'report_name': report_output['name'], 'report_ref': report_output['ref'], 'report_params': report_output['report_params'] } #END run_metaphlan2 # At some point might do deeper type checking... if not isinstance(output, dict): raise ValueError('Method run_metaphlan2 return value ' + 'output is not type dict as required.') # return the results return [output]
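# --- Illustrative sketch (not part of the original module) ---
# run_metaphlan2 above threads optional flags into its command with
# conditional-expression statements such as
#     cmd.insert(-1, '--min_alignment_len') if params['min_alignment_len'] > 0 else cmd
# A hedged, more conventional way to assemble the same argv is to collect the
# optional flags first and splice them in before the input file. The parameter
# names mirror the ones used above; the values and paths are placeholders, and
# the database options are omitted for brevity.
params = {'min_alignment_len': 100, 'ignore_viruses': 1, 'stat_q': 0.1, 'min_cu_len': 2000}
fasta_file = '/path/to/input.fasta'  # placeholder input

opts = []
if params.get('min_alignment_len', 0) > 0:
    opts += ['--min_alignment_len', str(params['min_alignment_len'])]
if params.get('ignore_viruses') == 1:
    opts.append('--ignore_viruses')
opts += ['--stat_q', str(params['stat_q']), '--min_cu_len', str(params['min_cu_len'])]

cmd = ['metaphlan2.py', *opts, '--input_type', 'fasta', fasta_file]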
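# --- Illustrative sketch (not part of the original module) ---
# run_metaphlan2 above expands the '#SampleID' lineage strings
# (e.g. "k__Bacteria|p__Firmicutes|...") into one column per rank by splitting
# on '|' and then on '__'. A standalone restatement of that parsing step:
def split_lineage(sample_id):
    """Return {rank_name: taxon} for one MetaPhlAn2 lineage string."""
    tax_dict = {'k': 'kingdom', 'p': 'phylum', 'c': 'class', 'o': 'order',
                'f': 'family', 'g': 'genus', 's': 'species', 't': 'strain',
                'unclassified': 'unclassified'}
    out = {}
    for part in sample_id.split('|'):
        pieces = part.split('__')
        rank = tax_dict[pieces[0]]
        out[rank] = pieces[1] if len(pieces) > 1 else pieces[0]
    return out

# split_lineage("k__Bacteria|p__Firmicutes")
# -> {'kingdom': 'Bacteria', 'phylum': 'Firmicutes'}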
def run_SPAdes(self, ctx, params): """ Run SPAdes on paired end libraries :param params: instance of type "SPAdesParams" (Input parameters for running SPAdes. workspace_name - the name of the workspace from which to take input and store output. output_contigset_name - the name of the output contigset read_libraries - a list of Illumina PairedEndLibrary files in FASTQ or BAM format. dna_source - (optional) the source of the DNA used for sequencing 'single_cell': DNA amplified from a single cell via MDA anything else: Standard DNA sample from multiple cells. Default value is None. min_contig_length - (optional) integer to filter out contigs with length < min_contig_length from the SPAdes output. Default value is 0 implying no filter. kmer_sizes - (optional) K-mer sizes, Default values: 33, 55, 77, 99, 127 (all values must be odd, less than 128 and listed in ascending order) In the absence of these values, K values are automatically selected. skip_error_correction - (optional) Assembly only (No error correction). By default this is disabled.) -> structure: parameter "workspace_name" of String, parameter "output_contigset_name" of String, parameter "read_libraries" of list of type "paired_end_lib" (The workspace object name of a PairedEndLibrary file, whether of the KBaseAssembly or KBaseFile type.), parameter "dna_source" of String, parameter "min_contig_length" of Long, parameter "kmer_sizes" of list of Long, parameter "skip_error_correction" of type "bool" (A boolean. 0 = false, anything else = true.) :returns: instance of type "SPAdesOutput" (Output parameters for SPAdes run. report_name - the name of the KBaseReport.Report workspace object. report_ref - the workspace reference of the report.) -> structure: parameter "report_name" of String, parameter "report_ref" of String """ # ctx is the context object # return variables are: output #BEGIN run_SPAdes # A whole lot of this is adapted or outright copied from # https://github.com/msneddon/MEGAHIT self.log('Running run_SPAdes with params:\n' + pformat(params)) token = ctx['token'] # the reads should really be specified as a list of absolute ws refs # but the narrative doesn't do that yet self.process_params(params) # get absolute refs from ws wsname = params[self.PARAM_IN_WS] obj_ids = [] for r in params[self.PARAM_IN_LIB]: obj_ids.append({'ref': r if '/' in r else (wsname + '/' + r)}) ws = Workspace(self.workspaceURL, token=token) ws_info = ws.get_object_info_new({'objects': obj_ids}) reads_params = [] reftoname = {} for wsi, oid in zip(ws_info, obj_ids): ref = oid['ref'] reads_params.append(ref) obj_name = wsi[1] reftoname[ref] = wsi[7] + '/' + obj_name readcli = ReadsUtils(self.callbackURL, token=ctx['token']) typeerr = ('Supported types: KBaseFile.SingleEndLibrary ' + 'KBaseFile.PairedEndLibrary ' + 'KBaseAssembly.SingleEndLibrary ' + 'KBaseAssembly.PairedEndLibrary') try: reads = readcli.download_reads({'read_libraries': reads_params, 'interleaved': 'false', 'gzipped': None })['files'] except ServerError as se: self.log('logging stacktrace from dynamic client error') self.log(se.data) if typeerr in se.message: prefix = se.message.split('.')[0] raise ValueError( prefix + '. 
Only the types ' + 'KBaseAssembly.PairedEndLibrary ' + 'and KBaseFile.PairedEndLibrary are supported') else: raise self.log('Got reads data from converter:\n' + pformat(reads)) phred_type = self.check_reads(params, reads, reftoname) reads_data = [] for ref in reads: reads_name = reftoname[ref] f = reads[ref]['files'] # print ("REF:" + str(ref)) # print ("READS REF:" + str(reads[ref])) seq_tech = reads[ref]["sequencing_tech"] if f['type'] == 'interleaved': reads_data.append({'fwd_file': f['fwd'], 'type': 'paired', 'seq_tech': seq_tech}) elif f['type'] == 'paired': reads_data.append({'fwd_file': f['fwd'], 'rev_file': f['rev'], 'type': 'paired', 'seq_tech': seq_tech}) elif f['type'] == 'single': reads_data.append({'fwd_file': f['fwd'], 'type': 'single', 'seq_tech': seq_tech}) else: raise ValueError('Something is very wrong with read lib' + reads_name) kmer_sizes = None if self.PARAM_IN_KMER_SIZES in params and params[self.PARAM_IN_KMER_SIZES] is not None: if (len(params[self.PARAM_IN_KMER_SIZES])) > 0: kmer_sizes = ",".join(str(num) for num in params[self.PARAM_IN_KMER_SIZES]) skip_error_correction = 0 if self.PARAM_IN_SKIP_ERR_CORRECT in params and params[self.PARAM_IN_SKIP_ERR_CORRECT] is not None: if params[self.PARAM_IN_SKIP_ERR_CORRECT] == 1: skip_error_correction = 1 spades_out = self.exec_spades(params[self.PARAM_IN_DNA_SOURCE], reads_data, phred_type, kmer_sizes, skip_error_correction) self.log('SPAdes output dir: ' + spades_out) # parse the output and save back to KBase output_contigs = os.path.join(spades_out, 'scaffolds.fasta') self.log('Uploading FASTA file to Assembly') assemblyUtil = AssemblyUtil(self.callbackURL, token=ctx['token'], service_ver='release') if params.get('min_contig_length', 0) > 0: assemblyUtil.save_assembly_from_fasta( {'file': {'path': output_contigs}, 'workspace_name': wsname, 'assembly_name': params[self.PARAM_IN_CS_NAME], 'min_contig_length': params['min_contig_length'] }) # load report from scaffolds.fasta.filtered.fa report_name, report_ref = self.load_report( output_contigs + '.filtered.fa', params, wsname) else: assemblyUtil.save_assembly_from_fasta( {'file': {'path': output_contigs}, 'workspace_name': wsname, 'assembly_name': params[self.PARAM_IN_CS_NAME] }) # load report from scaffolds.fasta report_name, report_ref = self.load_report( output_contigs, params, wsname) output = {'report_name': report_name, 'report_ref': report_ref } #END run_SPAdes # At some point might do deeper type checking... if not isinstance(output, dict): raise ValueError('Method run_SPAdes return value ' + 'output is not type dict as required.') # return the results return [output]
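# --- Illustrative sketch (not part of the original module) ---
# run_SPAdes above maps each downloaded library onto a small dict keyed by the
# file layout that ReadsUtils reports ('interleaved', 'paired' or 'single').
# A minimal standalone version of that classification step:
def classify_reads(reads_name, files, seq_tech):
    """Return the reads_data entry that run_SPAdes builds for one library."""
    if files['type'] == 'interleaved':
        return {'fwd_file': files['fwd'], 'type': 'paired', 'seq_tech': seq_tech}
    if files['type'] == 'paired':
        return {'fwd_file': files['fwd'], 'rev_file': files['rev'],
                'type': 'paired', 'seq_tech': seq_tech}
    if files['type'] == 'single':
        return {'fwd_file': files['fwd'], 'type': 'single', 'seq_tech': seq_tech}
    raise ValueError('Something is very wrong with read lib ' + reads_name)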
class MetabolicUtil():
    '''
    Utilities for running METABOLIC
    '''

    def __init__(self, config, callback_url, workspace_id, cpus):
        self.shared_folder = config['scratch']
        self.callback_url = callback_url
        self.cpus = cpus
        self.ru = ReadsUtils(self.callback_url)
        logging.basicConfig(format='%(created)s %(levelname)s: %(message)s',
                            level=logging.INFO)

    def stage_reads_list_file(self, reads_list):
        """
        stage_reads_list_file: download the fastq files associated with the reads
        objects to the scratch area and return result_file_path
        """
        logging.info('Processing reads object list: {}'.format(reads_list))
        result_file_path = []
        read_type = []
        # Download from the workspace and write to scratch; the 'reads' dictionary
        # then holds file paths on scratch.
        reads = self.ru.download_reads({
            'read_libraries': reads_list,
            'interleaved': None
        })['files']
        # reads_list is the list of workspace references (e.g. 12804/1/1).
        # "reads" is a hash of hashes keyed by that reference (read_obj below);
        # "files" is the secondary key, and its keys include "fwd", "rev" and others.
        for read_obj in reads_list:
            files = reads[read_obj]['files']  # 'fwd' maps to the file path on scratch
            result_file_path.append(files['fwd'])
            read_type.append(files['type'])
            if 'rev' in files and files['rev'] is not None:
                result_file_path.append(files['rev'])
        return result_file_path, read_type

    def _run_command(self, command):
        """
        _run_command: run a shell command, log the result and raise on failure
        """
        os.chdir(self.shared_folder)
        logging.info('Start executing command:\n{}'.format(command))
        logging.info('Command is running from:\n{}'.format(self.shared_folder))
        pipe = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
        output, stderr = pipe.communicate()
        exitCode = pipe.returncode
        if exitCode == 0:
            logging.info('Executed command:\n{}\n'.format(command) +
                         'Exit Code: {}\n'.format(exitCode))
        else:
            error_msg = 'Error running command:\n{}\n'.format(command)
            error_msg += 'Exit Code: {}\nOutput:\n{}\nStderr:\n{}'.format(
                exitCode, output, stderr)
            raise ValueError(error_msg)
        return (output, stderr)

    def deinterlace_raw_reads(self, fastq):
        fastq_forward = fastq.split('.fastq')[0] + "_forward.fastq"
        fastq_reverse = fastq.split('.fastq')[0] + "_reverse.fastq"
        command = 'reformat.sh in={} out1={} out2={} overwrite=true'.format(
            fastq, fastq_forward, fastq_reverse)
        self._run_command(command)
        return (fastq_forward, fastq_reverse)

    def make_metabolic_reads_file_input(self, params):
        """
        Download the reads in params['reads_list'] and write the
        omic_reads_parameters.txt file that METABOLIC-C expects, deinterleaving
        any interleaved libraries first.
        """
        reads_list = params['reads_list']
        (read_scratch_path, read_type) = self.stage_reads_list_file(reads_list)
        omic_reads_parameter_file = os.path.abspath(
            self.shared_folder) + '/omic_reads_parameters.txt'
        with open(omic_reads_parameter_file, 'w+') as f:
            f.write("#Reads pair name with complete pathway: \n")
            for i in range(len(read_scratch_path)):
                fastq = read_scratch_path[i]
                fastq_type = read_type[i]
                if fastq_type == 'interleaved':  # make sure working - needs tests
                    logging.info("Running interleaved read mapping mode")
                    (fastq_forward, fastq_reverse) = self.deinterlace_raw_reads(fastq)
                    f.write(fastq_forward + ',' + fastq_reverse)
                else:
                    # running read mapping in single-end mode
                    logging.info("Running unpaired read mapping mode")
                    f.write(fastq)
        return omic_reads_parameter_file

    def run_metabolic_without_reads(self, params):
        '''
        Run the METABOLIC-G workflow (not using raw reads)
        '''
        out_dir = os.path.join(self.shared_folder, "output")
        metabolic_cmd = " ".join([
            "perl", "/kb/module/bin/METABOLIC/METABOLIC-G.pl",
            "-in-gn", self.shared_folder,
            "-t", str(self.cpus),
            "-m-cutoff", str(params['kegg_module_cutoff']),
            "-p", params['prodigal_method'],
            "-o", out_dir,
            "-m", "/data/METABOLIC"
        ])
        logging.info("Starting Command:\n" + metabolic_cmd)
        output = subprocess.check_output(metabolic_cmd, shell=True).decode('utf-8')
        logging.info(output)
        # self._process_output_files(out_dir)
        return output

    def run_metabolic_with_reads(self, params):
        '''
        Run the METABOLIC-C workflow (using raw reads)
        '''
        out_dir = os.path.join(self.shared_folder, "output")
        omic_reads_parameter_file = self.make_metabolic_reads_file_input(params)
        metabolic_cmd = " ".join([
            "perl", "/kb/module/bin/METABOLIC/METABOLIC-C.pl",
            "-in-gn", self.shared_folder,
            "-t", str(self.cpus),
            "-m-cutoff", str(params['kegg_module_cutoff']),
            "-p", params['prodigal_method'],
            "-o", out_dir,
            "-m", "/data/METABOLIC",
            "-r", omic_reads_parameter_file
        ])
        logging.info("Starting Command:\n" + metabolic_cmd)
        output = subprocess.check_output(metabolic_cmd, shell=True).decode('utf-8')
        logging.info(output)
        # self._process_output_files(out_dir)
        return output
def run_kraken2(self, ctx, params): """ This example function accepts any number of parameters and returns results in a KBaseReport :param params: instance of mapping from String to unspecified object :returns: instance of type "ReportResults" -> structure: parameter "report_name" of String, parameter "report_ref" of String """ # ctx is the context object # return variables are: output #BEGIN run_kraken2 # Download input data as FASTA or FASTQ logging.info('Calling run_kraken2') logging.info(f'params {params}') # Check for presence of input file types in params input_genomes = 'input_genomes' in params and len( params['input_genomes'] ) > 0 and None not in params['input_genomes'] input_refs = 'input_refs' in params and len( params['input_refs']) > 0 and None not in params['input_refs'] input_paired_refs = 'input_paired_refs' in params and len( params['input_paired_refs'] ) > 0 and None not in params['input_paired_refs'] for name in ['workspace_name', 'db_type']: if name not in params: raise ValueError('Parameter "' + name + '" is required but missing') if not input_genomes and not input_refs and not input_paired_refs: raise ValueError( 'You must enter either an input genome or input reads') if input_refs and input_paired_refs: raise ValueError( 'You must enter either single-end or paired-end reads, ' 'but not both') if input_genomes and (input_refs or input_paired_refs): raise ValueError( 'You must enter either an input genome or input reads, ' 'but not both') if input_genomes and (not isinstance(params['input_genomes'][0], str)): raise ValueError('Pass in a valid input genome string') if input_refs and (not isinstance(params['input_refs'], list)): raise ValueError('Pass in a list of input references') if input_paired_refs and (not isinstance(params['input_paired_refs'], list)): raise ValueError('Pass in a list of input references') logging.info(params['db_type']) logging.info( f'input_genomes {input_genomes} input_refs {input_refs} input_paired_refs {input_paired_refs}' ) input_string = [] if input_genomes: assembly_util = AssemblyUtil(self.callback_url) fasta_file_obj = assembly_util.get_assembly_as_fasta( {'ref': params['input_genomes'][0]}) logging.info(fasta_file_obj) fasta_file = fasta_file_obj['path'] input_string.append(fasta_file) if input_refs: logging.info('Downloading Reads data as a Fastq file.') logging.info(f"input_refs {params['input_refs']}") readsUtil = ReadsUtils(self.callback_url) download_reads_output = readsUtil.download_reads( {'read_libraries': params['input_refs']}) print( f"Input parameters {params['input_refs']}, {params['db_type']}" f"download_reads_output {download_reads_output}") fastq_files = [] fastq_files_name = [] for key, val in download_reads_output['files'].items(): if 'fwd' in val['files'] and val['files']['fwd']: fastq_files.append(val['files']['fwd']) fastq_files_name.append(val['files']['fwd_name']) if 'rev' in val['files'] and val['files']['rev']: fastq_files.append(val['files']['rev']) fastq_files_name.append(val['files']['rev_name']) logging.info(f"fastq files {fastq_files}") input_string.append(' '.join(fastq_files)) if input_paired_refs: logging.info('Downloading Reads data as a Fastq file.') logging.info(f"input_refs {params['input_paired_refs']}") readsUtil = ReadsUtils(self.callback_url) download_reads_output = readsUtil.download_reads( {'read_libraries': params['input_paired_refs']}) print( f"Input parameters {params['input_paired_refs']}, {params['db_type']}" f"download_reads_output {download_reads_output}") fastq_files = [] 
fastq_files_name = [] # input_string.append('--paired') for key, val in download_reads_output['files'].items(): if 'fwd' in val['files'] and val['files']['fwd']: fastq_files.append(val['files']['fwd']) fastq_files_name.append(val['files']['fwd_name']) if 'rev' in val['files'] and val['files']['rev']: fastq_files.append(val['files']['rev']) fastq_files_name.append(val['files']['rev_name']) # if len(fastq_files) % 2 != 0: # raise ValueError('There must be an even number of Paired-end reads files') logging.info(f"fastq files {fastq_files}") input_string.extend(fastq_files) logging.info(f'input_string {input_string}') output_dir = os.path.join(self.shared_folder, 'kraken2_output') report_file_name = 'report.txt' report_file = os.path.join(output_dir, report_file_name) if not os.path.exists(output_dir): os.makedirs(output_dir) outprefix = "kraken2" cmd = [ '/kb/module/lib/kraken2/src/kraken2.sh', '-d', '/data/kraken2/' + params['db_type'], '-o', output_dir, '-p', outprefix, '-t', '1', '-i' ] cmd.extend(input_string) # cmd = ['kraken2', '--db', '/data/kraken2/' + params['db_type'], # '--output', output_dir, '--report', report_file, # '--threads', '1'] # cmd.extend(['--confidence', str(params['confidence'])]) if 'confidence' in params else cmd logging.info(f'cmd {cmd}') p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) logging.info(f'subprocess {p.communicate()}') summary_file = os.path.join(output_dir, outprefix + '.report.csv') report_dir = os.path.join(output_dir, 'html_report') if not os.path.exists(report_dir): os.makedirs(report_dir) summary_file_dt = os.path.join(report_dir, 'kraken2.datatable.html') self._generate_DataTable(summary_file, summary_file_dt) shutil.copy2('/kb/module/lib/kraken2/src/index.html', os.path.join(report_dir, 'index.html')) shutil.copy2(os.path.join(output_dir, outprefix + '.krona.html'), os.path.join(report_dir, 'kraken2.krona.html')) shutil.move(os.path.join(output_dir, outprefix + '.tree.svg'), os.path.join(report_dir, 'kraken2.tree.svg')) html_zipped = self.package_folder(report_dir, 'index.html', 'index.html') # columns = [ # 'Percentage of fragments covered by the clade rooted at this taxon', # 'Number of fragments covered by the clade rooted at this taxon', # 'Number of fragments assigned directly to this taxon', 'rank code', # 'taxid', 'name'] # report_df = pd.read_csv(report_file, sep='\t', # header=None, names=columns) # code_dict = {'U': 'Unclassified', 'R': 'Root', 'D': 'Domain', # 'K': 'Kingdom', 'P': 'Phylum', 'C': 'Class', 'O': 'Order', # 'F': 'Family', 'G': 'Genus', 'S': 'Species'} # report_df['rank code'] = report_df['rank code'].apply( # lambda x: code_dict[x[0]] + x[1] if len(x) > 1 else code_dict[x]) # self._generate_report_table(report_df, report_html_file, output_dir) # report_df.to_html(report_html_file, classes='Kraken2_report', index=False) # html_zipped = self.package_folder(output_dir, 'report.html', # 'report') # Step 5 - Build a Report and return objects_created = [] output_files = os.listdir(output_dir) output_files_list = [] for output in output_files: if not os.path.isdir(output): output_files_list.append({ 'path': os.path.join(output_dir, output), 'name': output }) message = f"Kraken2 run finished on {input_string} against {params['db_type']}." 
report_params = { 'message': message, 'workspace_name': params.get('workspace_name'), 'objects_created': objects_created, 'file_links': output_files_list, 'html_links': [html_zipped], 'direct_html_link_index': 0, 'html_window_height': 460 } # STEP 6: construct the output to send back kbase_report_client = KBaseReport(self.callback_url) report_output = kbase_report_client.create_extended_report( report_params) report_output['report_params'] = report_params logging.info(report_output) # Return references which will allow inline display of # the report in the Narrative output = { 'report_name': report_output['name'], 'report_ref': report_output['ref'], 'report_params': report_output['report_params'] } #END run_kraken2 # At some point might do deeper type checking... if not isinstance(output, dict): raise ValueError('Method run_kraken2 return value ' + 'output is not type dict as required.') # return the results return [output]
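# --- Illustrative sketch (not part of the original module) ---
# run_kraken2 (like the other report builders in this file) turns everything in
# the output directory into KBaseReport file_links. os.listdir returns bare file
# names, so a directory test is normally made against the joined path rather
# than the bare name; a hedged standalone sketch of that collection step:
import os

def collect_file_links(output_dir):
    links = []
    for name in os.listdir(output_dir):
        full_path = os.path.join(output_dir, name)
        if not os.path.isdir(full_path):  # check the full path, not the bare name
            links.append({'path': full_path, 'name': name})
    return links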
def runFastQC(self, ctx, input_params): """ :param input_params: instance of type "FastQCParams" -> structure: parameter "input_ws" of String, parameter "input_file" of String, parameter "input_file_ref" of String :returns: instance of type "FastQCOutput" -> structure: parameter "report_name" of String, parameter "report_ref" of String """ # ctx is the context object # return variables are: reported_output #BEGIN runFastQC token = ctx['token'] wsClient = workspaceService(self.workspaceURL, token=token) uuid_string = str(uuid.uuid4()) read_file_path = self.scratch + "/" + uuid_string os.mkdir(read_file_path) input_file_ref = self._get_input_file_ref_from_params(input_params) library = None try: library = wsClient.get_objects2( {'objects': [{ 'ref': input_file_ref }]})['data'][0] except Exception as e: raise ValueError( 'Unable to get read library object from workspace: (' + input_file_ref + ')' + str(e)) download_read_params = {'read_libraries': [], 'interleaved': "false"} if ("SingleEnd" in library['info'][2] or "PairedEnd" in library['info'][2]): download_read_params['read_libraries'].append(library['info'][7] + "/" + library['info'][1]) elif ("SampleSet" in library['info'][2]): for sample_id in library['data']['sample_ids']: if ("/" in sample_id): download_read_params['read_libraries'].append(sample_id) else: if (sample_id.isdigit()): download_read_params['read_libraries'].append( library['info'][6] + "/" + sample_id) else: download_read_params['read_libraries'].append( library['info'][7] + "/" + sample_id) ru = ReadsUtils(os.environ['SDK_CALLBACK_URL']) ret = ru.download_reads(download_read_params) read_file_list = list() for file in ret['files']: obj_info = self.dfu.get_objects({'object_refs': [file]})['data'][0]['info'] obj_name = obj_info[1] obj_ref_suffix = '_' + str(obj_info[6]) + '_' + str( obj_info[0]) + '_' + str(obj_info[4]) files = ret['files'][file]['files'] fwd_name = files['fwd'].split('/')[-1] fwd_name = fwd_name.replace('.gz', '') # using object_name + ref_suffix + suffix as file name fwd_name = obj_name + obj_ref_suffix + '.' + fwd_name.split( '.', 1)[-1] shutil.move(files['fwd'], os.path.join(read_file_path, fwd_name)) read_file_list.append(os.path.join(read_file_path, fwd_name)) if (files['rev'] is not None): rev_name = files['rev'].split('/')[-1] rev_name = rev_name.replace('.gz', '') rev_name = obj_name + obj_ref_suffix + '.' + rev_name.split( '.', 1)[-1] shutil.move(files['rev'], os.path.join(read_file_path, rev_name)) read_file_list.append(os.path.join(read_file_path, rev_name)) subprocess.check_output(["fastqc"] + read_file_list) # report = "Command run: "+" ".join(["fastqc"]+read_file_list) output = self.create_report(token, input_params['input_ws'], uuid_string, read_file_path) reported_output = { 'report_name': output['name'], 'report_ref': output['ref'] } # Remove temp reads directory shutil.rmtree(read_file_path, ignore_errors=True) #END runFastQC # At some point might do deeper type checking... if not isinstance(reported_output, dict): raise ValueError('Method runFastQC return value ' + 'reported_output is not type dict as required.') # return the results return [reported_output]
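# --- Illustrative sketch (not part of the original module) ---
# runFastQC above resolves a SampleSet's sample_ids into references that
# ReadsUtils can download: ids that already contain '/' are used as-is, numeric
# ids are prefixed with the SampleSet's workspace id, and bare names are
# prefixed with the workspace name. A standalone restatement of that rule
# (obj_info indices follow the KBase object_info tuple: [6] = workspace id,
# [7] = workspace name):
def resolve_sample_ref(sample_id, sampleset_info):
    if '/' in sample_id:
        return sample_id
    if sample_id.isdigit():
        return '{}/{}'.format(sampleset_info[6], sample_id)
    return '{}/{}'.format(sampleset_info[7], sample_id)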
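# --- Illustrative sketch (not part of the original module) ---
# runFastQC above renames each downloaded FASTQ to
# <object_name>_<wsid>_<objid>_<version>.<original suffix> so that the FastQC
# reports are traceable back to the workspace object. A standalone sketch of
# that naming rule (obj_info indices follow the KBase object_info tuple):
def fastqc_file_name(obj_info, downloaded_path):
    obj_name = obj_info[1]
    ref_suffix = '_{}_{}_{}'.format(obj_info[6], obj_info[0], obj_info[4])
    base = downloaded_path.split('/')[-1].replace('.gz', '')
    return obj_name + ref_suffix + '.' + base.split('.', 1)[-1]

# e.g. an object named 'my_reads' (workspace id 12345, object id 7, version 1)
# with a downloaded file '/scratch/abc.fwd.fastq.gz' becomes
# 'my_reads_12345_7_1.fwd.fastq'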
def test_velveth(self): # get the test data out_folder = os.path.join(self.scratch, 'velvet_output_dir') if not os.path.exists(out_folder): os.makedirs(out_folder) rc1 = { 'read_type': 'long', 'file_format': 'fastq.gz', 'file_layout': 'interleaved', 'read_file_info': { 'read_file_name': 'ecoli_ref-5m-trim.fastq.gz' } } rc2 = { 'read_type': 'longPaired', 'file_format': 'fasta.gz', 'file_layout': 'interleaved', 'read_file_info': { 'read_file_name': 'ecoli-reads-5m-dn-paired.fa.gz' } } rc3 = { 'read_type': 'shortPaired', 'file_format': 'fastq', 'file_layout': 'separate', 'read_file_info': { 'read_file_name': 'small.reverse.fq', 'left_file': 'small.forward.fq', 'right_file': 'small.reverse.fq', } } pe_lib_info = self.getPairedEndLibInfo() print(pe_lib_info) obj_ids = [{'ref': pe_lib_info[7] + '/' + pe_lib_info[1]}] ws_info = self.wsClient.get_object_info_new({'objects': obj_ids}) reads_params = [] reftoname = {} for wsi, oid in zip(ws_info, obj_ids): ref = oid['ref'] reads_params.append(ref) obj_name = wsi[1] reftoname[ref] = wsi[7] + '/' + obj_name readcli = ReadsUtils(self.callback_url, token=self.token) typeerr = ('Supported types: KBaseFile.SingleEndLibrary ' + 'KBaseFile.PairedEndLibrary ' + 'KBaseAssembly.SingleEndLibrary ' + 'KBaseAssembly.PairedEndLibrary') try: reads = readcli.download_reads({ 'read_libraries': reads_params, 'interleaved': 'false', 'gzipped': None })['files'] except ServerError as se: print('logging stacktrace from dynamic client error') print(se.data) if typeerr in se.message: prefix = se.message.split('.')[0] raise ValueError( prefix + '. Only the types ' + 'KBaseAssembly.PairedEndLibrary ' + 'and KBaseFile.PairedEndLibrary are supported') else: raise print('Got reads data from converter:\n' + pformat(reads)) reads_data = [] for ref in reads: reads_name = reftoname[ref] f = reads[ref]['files'] seq_tech = reads[ref]["sequencing_tech"] if f['type'] == 'interleaved': reads_data.append({ 'fwd_file': f['fwd'], 'type': 'interleaved', 'seq_tech': seq_tech }) elif f['type'] == 'paired': reads_data.append({ 'fwd_file': f['fwd'], 'rev_file': f['rev'], 'type': 'separated', 'seq_tech': seq_tech }) elif f['type'] == 'single': reads_data.append({ 'fwd_file': f['fwd'], 'type': 'single', 'seq_tech': seq_tech }) else: raise ValueError('Something is very wrong with read lib' + reads_name) params = { 'workspace_name': pe_lib_info[7], 'out_folder': out_folder, 'hash_length': 21, 'reads_channels': [rc1, rc2, rc3] #tests passed #'reads_files': reads_data } result = self.getImpl().exec_velveth(params) self.assertTrue( os.path.isfile( os.path.join(self.scratch, params['out_folder'] + '/Roadmaps'))) self.assertTrue( os.path.isfile( os.path.join(self.scratch, params['out_folder'] + '/Sequences'))) print('RESULT from velveth is saved in:\n' + os.path.join(self.scratch, params['out_folder'])) pprint('Returned value by Velveth is: ' + str(result)) return result
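# --- Illustrative sketch (not part of the original module) ---
# Each reads channel above bundles a read type, a file format and a file layout.
# The real translation into velveth arguments happens inside exec_velveth, which
# is not shown here; the sketch below is only an assumption about how one
# channel could be rendered into velveth-style flags:
def channel_to_args(rc):
    args = ['-' + rc['file_format'], '-' + rc['read_type']]
    if rc['file_layout'] == 'separate':
        args += ['-separate',
                 rc['read_file_info']['left_file'],
                 rc['read_file_info']['right_file']]
    else:
        args += ['-interleaved', rc['read_file_info']['read_file_name']]
    return args

# channel_to_args(rc3) -> ['-fastq', '-shortPaired', '-separate',
#                          'small.forward.fq', 'small.reverse.fq']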
def run_gottcha2(self, ctx, params): """ This example function accepts any number of parameters and returns results in a KBaseReport :param params: instance of mapping from String to unspecified object :returns: instance of type "ReportResults" -> structure: parameter "report_name" of String, parameter "report_ref" of String """ # ctx is the context object # return variables are: output #BEGIN run_gottcha2 # Step 2 - Download the input data as a FASTQ and # We can use the ReadsUtils module to download a FASTQ file from our Reads data object. # The return object gives us the path to the file that was created. logging.info('Downloading Reads data as a Fastq file.') readsUtil = ReadsUtils(self.callback_url) download_reads_output = readsUtil.download_reads( {'read_libraries': params['input_refs']}) print( f"Input parameters {params['input_refs']}, {params['db_type']} download_reads_output {download_reads_output}" ) fastq_files = [] fastq_files_name = [] for key, val in download_reads_output['files'].items(): if 'fwd' in val['files'] and val['files']['fwd']: fastq_files.append(val['files']['fwd']) fastq_files_name.append(val['files']['fwd_name']) if 'rev' in val['files'] and val['files']['rev']: fastq_files.append(val['files']['rev']) fastq_files_name.append(val['files']['rev_name']) logging.info(f"fastq files {fastq_files}") fastq_files_string = ' '.join(fastq_files) output_dir = os.path.join(self.scratch, 'gottcha2_output') if not os.path.exists(output_dir): os.makedirs(output_dir) ## default options if 'min_coverage' not in params: params['min_coverage'] = 0.005 if 'min_reads' not in params: params['min_reads'] = 3 if 'min_length' not in params: params['min_length'] = 60 if 'min_mean_linear_read_length' not in params: params['min_mean_linear_read_length'] = 1 outprefix = "gottcha2" cmd0 = ["ls", "-al", '/data/gottcha2/RefSeq90/'] logging.info(f'cmd {cmd0}') pls = subprocess.Popen(cmd0, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) logging.info(f'subprocess {pls.communicate()}') cmd = [ '/kb/module/lib/gottcha2/src/uge-gottcha2.sh', '-i', fastq_files_string, '-t', '4', '-o', output_dir, '-p', outprefix, '-d', '/data/gottcha2/RefSeq90/' + params['db_type'], '-c', str(params['min_coverage']), '-r', str(params['min_reads']), '-s', str(params['min_length']), '-m', str(params['min_mean_linear_read_length']) ] logging.info(f'cmd {cmd}') p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) logging.info(f'subprocess {p.communicate()}') summary_file = os.path.join(output_dir, outprefix + '.summary.tsv') # generate report directory and html file report_dir = os.path.join(output_dir, 'html_report') if not os.path.exists(report_dir): os.makedirs(report_dir) summary_file_dt = os.path.join(report_dir, 'gottcha2.datatable.html') self._generate_DataTable(summary_file, summary_file_dt) shutil.copy2('/kb/module/lib/gottcha2/src/index.html', os.path.join(report_dir, 'index.html')) shutil.copy2(os.path.join(output_dir, outprefix + '.krona.html'), os.path.join(report_dir, 'gottcha2.krona.html')) shutil.move(os.path.join(output_dir, outprefix + '.tree.svg'), os.path.join(report_dir, 'gottcha2.tree.svg')) html_zipped = self.package_folder(report_dir, 'index.html', 'index.html') # Step 5 - Build a Report and return objects_created = [] output_files = os.listdir(output_dir) output_files_list = [] for output in output_files: if not os.path.isdir(output): output_files_list.append({ 'path': os.path.join(output_dir, output), 'name': output }) # not used output_html_files = [{ 'path': 
os.path.join(report_dir, 'index.html'), 'name': 'index.html' }, { 'path': os.path.join(report_dir, 'gottcha2.krona.html'), 'name': 'gottcha2.krona.html' }, { 'path': os.path.join(report_dir, 'gottcha2.datatable.html'), 'name': 'gottcha2.datatable.html' }, { 'path': os.path.join(report_dir, 'gottcha2.tree.svg'), 'name': 'gottcha2.tree.svg' }] message = 'GOTTCHA2 run finished on %s against %s.' % ( ','.join(fastq_files_name), params['db_type']) report_params = { 'message': message, 'workspace_name': params.get('workspace_name'), 'objects_created': objects_created, 'file_links': output_files_list, 'html_links': [html_zipped], 'direct_html_link_index': 0, 'html_window_height': 460 } # STEP 6: construct the output to send back kbase_report_client = KBaseReport(self.callback_url) report_output = kbase_report_client.create_extended_report( report_params) report_output['report_params'] = report_params logging.info(report_output) # Return references which will allow inline display of # the report in the Narrative output = { 'report_name': report_output['name'], 'report_ref': report_output['ref'] } #END run_gottcha2 # At some point might do deeper type checking... if not isinstance(output, dict): raise ValueError('Method run_gottcha2 return value ' + 'output is not type dict as required.') # return the results return [output]
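# --- Illustrative sketch (not part of the original module) ---
# run_gottcha2 above fills in the default cutoffs one key at a time. An
# equivalent, slightly more compact way to apply the same defaults:
GOTTCHA2_DEFAULTS = {
    'min_coverage': 0.005,
    'min_reads': 3,
    'min_length': 60,
    'min_mean_linear_read_length': 1,
}

def apply_gottcha2_defaults(params):
    for key, value in GOTTCHA2_DEFAULTS.items():
        params.setdefault(key, value)
    return params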