def load_fastas(config, scratch: str, upa: str):
    """Download FASTA file(s) for a KBase object reference.

    Resolves the object behind ``upa`` and, depending on its type
    (GenomeSet, Genome, Assembly/ContigSet, AssemblySet, BinnedContigs),
    downloads one FASTA file per underlying assembly.

    :param config: dict with 'callback_url' and 'workspace-url' entries.
    :param scratch: scratch directory path used to stage bin FASTA files.
    :param upa: object reference ("ws/obj/ver") to resolve.
    :returns: list of (fasta_path, upa) tuples.
    :raises ValueError: if the object type is not one of the handled kinds.
    """
    dfu = DataFileUtil(config['callback_url'])
    au = AssemblyUtil(config['callback_url'])
    mgu = MetagenomeUtils(config['callback_url'])
    ws = Workspace(config['workspace-url'])

    obj_data = dfu.get_objects({"object_refs": [upa]})['data'][0]
    obj_type = obj_data['info'][2]

    if 'KBaseSets.GenomeSet' in obj_type:
        upas = [gsi['ref'] for gsi in obj_data['data']['items']]
    elif 'KBaseSearch.GenomeSet' in obj_type:
        upas = [gse['ref'] for gse in obj_data['data']['elements'].values()]
    elif "KBaseGenomes.Genome" in obj_type:
        upas = [upa]
    elif "KBaseGenomes.ContigSet" in obj_type or "KBaseGenomeAnnotations.Assembly" in obj_type:
        # in this case we use the assembly file util to get the fasta file
        faf = au.get_assembly_as_fasta({"ref": upa})
        return [(faf['path'], upa)]
    elif "KBaseSets.AssemblySet" in obj_type:
        fasta_paths = []
        for item_upa in obj_data['data']['items']:
            faf = au.get_assembly_as_fasta({"ref": item_upa['ref']})
            fasta_paths.append((faf['path'], item_upa['ref']))
        return fasta_paths
    elif 'KBaseMetagenomes.BinnedContigs' in obj_type:
        fasta_paths = []
        bin_file_dir = mgu.binned_contigs_to_file({
            'input_ref': upa,
            'save_to_shock': 0
        })['bin_file_directory']
        # Only the top level of the bin directory is scanned (the `break`
        # below stops after the first os.walk yield).
        for (dirpath, _dirnames, filenames) in os.walk(bin_file_dir):
            for fasta_file in filenames:
                # Copy each bin into scratch and normalize the extension
                # to ".fa".
                fasta_path = os.path.join(scratch, fasta_file)
                fasta_path = os.path.splitext(fasta_path)[0] + ".fa"
                copyfile(os.path.join(bin_file_dir, fasta_file), fasta_path)
                # NOTE(review): bins are not checked for emptiness here --
                # confirm upstream that empty bins cannot occur.
                fasta_paths.append((fasta_path, upa))
            break
        return fasta_paths
    else:
        # BUGFIX: previously `raise Error(...)` -- `Error` is undefined and
        # would itself raise a NameError; raise ValueError instead.
        raise ValueError('Input genome/metagenome reference has unhandled type')

    # Genome / GenomeSet path: resolve each genome's assembly and download it.
    fasta_paths = []
    for genome_upa in upas:
        genome_data = ws.get_objects2({'objects': [{
            "ref": genome_upa
        }]})['data'][0]['data']
        # Reference chain through the genome so access rights carry over.
        assembly_upa = genome_upa + ';' + str(
            genome_data.get('contigset_ref') or genome_data.get('assembly_ref'))
        faf = au.get_assembly_as_fasta({'ref': assembly_upa})
        fasta_paths.append((faf['path'], assembly_upa))

    return fasta_paths
def __init__(self, config):
    """Record scratch/shock locations and build the KBase service clients.

    :param config: SDK config dict with 'SDK_CALLBACK_URL', 'scratch'
        and 'shock-url' entries.
    """
    callback = config['SDK_CALLBACK_URL']
    self.callback_url = callback
    self.scratch = config['scratch']
    self.shock_url = config['shock-url']
    # All utility clients talk to the same callback server.
    self.dfu = DataFileUtil(callback)
    self.ru = ReadsUtils(callback)
    self.au = AssemblyUtil(callback)
    self.mgu = MetagenomeUtils(callback)
def setUpClass(cls):
    """One-time setup: read the deploy config, build clients, and create
    a fresh workspace for this test run.

    Removed in this revision: commented-out scratch-directory code and a
    dead triple-quoted string block (an executed no-op expression) that
    staged example data.
    """
    token = environ.get('KB_AUTH_TOKEN', None)
    config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
    cls.cfg = {}
    config = ConfigParser()
    config.read(config_file)
    for nameval in config.items('kb_Msuite'):
        cls.cfg[nameval[0]] = nameval[1]
    # Getting username from Auth profile for token
    authServiceUrl = cls.cfg['auth-service-url']
    auth_client = _KBaseAuth(authServiceUrl)
    user_id = auth_client.get_user(token)
    # WARNING: don't call any logging methods on the context object,
    # it'll result in a NoneType error
    cls.ctx = MethodContext(None)
    cls.ctx.update({'token': token,
                    'user_id': user_id,
                    'provenance': [
                        {'service': 'kb_Msuite',
                         'method': 'please_never_use_it_in_production',
                         'method_params': []
                         }],
                    'authenticated': 1})
    cls.wsURL = cls.cfg['workspace-url']
    cls.wsClient = workspaceService(cls.wsURL)
    cls.serviceImpl = kb_Msuite(cls.cfg)
    cls.callback_url = os.environ['SDK_CALLBACK_URL']
    cls.scratch = cls.cfg['scratch']
    cls.suffix = int(time.time() * 1000)
    cls.checkm_runner = CheckMUtil(cls.cfg, cls.ctx)

    # Per-run workspace keeps test runs isolated.
    cls.wsName = "test_kb_Msuite_" + str(cls.suffix)
    cls.ws_info = cls.wsClient.create_workspace({'workspace': cls.wsName})

    cls.au = AssemblyUtil(os.environ['SDK_CALLBACK_URL'])
    cls.setAPI = SetAPI(url=cls.cfg['srv-wiz-url'], token=cls.ctx['token'])
    cls.gfu = GenomeFileUtil(os.environ['SDK_CALLBACK_URL'], service_ver='dev')
    cls.mu = MetagenomeUtils(os.environ['SDK_CALLBACK_URL'])

    # prepare WS data
    cls.prepare_data()
def save_binned_contigs(self, params, assembly_ref, filtered_bins_dir):
    """Save the filtered bins directory as a new BinnedContigs object.

    :param params: dict with 'output_filtered_binnedcontigs_obj_name' and
        'workspace_name' entries.
    :param assembly_ref: reference to the assembly the bins derive from.
    :param filtered_bins_dir: directory containing the filtered bin FASTAs.
    :returns: dict with 'obj_name' and 'obj_ref' of the saved object.
    :raises ValueError: if the MetagenomeUtils client cannot be created.
    """
    try:
        mgu = MetagenomeUtils(self.callback_url)
    except Exception as e:
        # BUGFIX: was a bare `except:` that discarded the real error;
        # chain it so the root cause remains visible in the traceback.
        raise ValueError("unable to connect with MetagenomeUtils") from e

    filtered_binned_contig_obj_name = params.get(
        'output_filtered_binnedcontigs_obj_name')
    generate_binned_contig_param = {
        'file_directory': filtered_bins_dir,
        'assembly_ref': assembly_ref,
        'binned_contig_name': filtered_binned_contig_obj_name,
        'workspace_name': params.get('workspace_name')
    }
    filtered_binned_contig_obj_ref = mgu.file_to_binned_contigs(
        generate_binned_contig_param).get('binned_contig_obj_ref')

    return {
        'obj_name': filtered_binned_contig_obj_name,
        'obj_ref': filtered_binned_contig_obj_ref
    }
def setUpClass(cls):
    """Build config, auth context, and service clients once per test class."""
    token = environ.get('KB_AUTH_TOKEN', None)
    config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
    test_time_stamp = int(time.time() * 1000)

    config = ConfigParser()
    config.read(config_file)
    cls.cfg = {key: value for key, value in config.items('kb_Msuite')}

    # Resolve the username behind the token via the auth service.
    auth_client = _KBaseAuth(cls.cfg['auth-service-url'])
    user_id = auth_client.get_user(token)

    # WARNING: don't call any logging methods on the context object,
    # it'll result in a NoneType error
    cls.ctx = MethodContext(None)
    cls.ctx.update({
        'token': token,
        'user_id': user_id,
        'provenance': [{
            'service': 'kb_Msuite',
            'method': 'please_never_use_it_in_production',
            'method_params': []
        }],
        'authenticated': 1
    })

    cls.wsURL = cls.cfg['workspace-url']
    cls.wsClient = Workspace(cls.wsURL)
    cls.serviceImpl = kb_Msuite(cls.cfg)

    callback_url = os.environ['SDK_CALLBACK_URL']
    cls.callback_url = callback_url
    cls.scratch = cls.cfg['scratch']
    cls.appdir = cls.cfg['appdir']
    cls.test_data_dir = os.path.join(cls.scratch, 'test_data')
    cls.suffix = test_time_stamp
    cls.checkm_runner = CheckMUtil(cls.cfg, cls.ctx)

    # One workspace per run, stamped with the start time, keeps runs isolated.
    cls.wsName = "test_kb_Msuite_" + str(cls.suffix)
    cls.ws_info = cls.wsClient.create_workspace({'workspace': cls.wsName})

    cls.au = AssemblyUtil(callback_url)
    cls.gfu = GenomeFileUtil(callback_url, service_ver='dev')
    cls.mu = MetagenomeUtils(callback_url)
    cls.setAPI = SetAPI(url=cls.cfg['srv-wiz-url'], token=cls.ctx['token'])
    cls.kr = KBaseReport(callback_url)

    cls.data_loaded = False
def __init__(self, callback_url: str, workspace_url: str, user_token: str):
    '''
    Create the client set.

    :param callback_url: The url of the callback server.
    :param workspace_url: The url of the KBase workspace server.
    :param user_token: The user's token.
    :raises ValueError: if any argument is None, empty, or whitespace-only.
    '''
    # Validate inputs up front (resolves the old TODO) so a misconfigured
    # caller fails fast with a clear message instead of a deep client error.
    for name, value in (('callback_url', callback_url),
                        ('workspace_url', workspace_url),
                        ('user_token', user_token)):
        if not value or not value.strip():
            raise ValueError('{} cannot be None or whitespace only'.format(name))
    self._dfu = DataFileUtil(callback_url, token=user_token)
    self._au = AssemblyUtil(callback_url, token=user_token)
    self._mgu = MetagenomeUtils(callback_url, token=user_token)
    self._report = KBaseReport(callback_url, token=user_token)
    self._ws = Workspace(workspace_url, token=user_token)
def setUpClass(cls):
    """Read the deploy config and stand up clients plus a test workspace."""
    cls.token = os.environ.get('KB_AUTH_TOKEN', None)
    config_file = os.environ.get('KB_DEPLOYMENT_CONFIG', None)

    parser = ConfigParser()
    parser.read(config_file)
    cls.cfg = {key: value for key, value in parser.items('kb_das_tool')}

    # Getting username from Auth profile for token
    auth_client = _KBaseAuth(cls.cfg['auth-service-url'])
    user_id = auth_client.get_user(cls.token)

    # WARNING: don't call any logging methods on the context object,
    # it'll result in a NoneType error
    cls.ctx = MethodContext(None)
    cls.ctx.update({
        'token': cls.token,
        'user_id': user_id,
        'provenance': [{
            'service': 'kb_das_tool',
            'method': 'please_never_use_it_in_production',
            'method_params': []
        }],
        'authenticated': 1
    })

    cls.wsURL = cls.cfg['workspace-url']
    cls.wsClient = Workspace(cls.wsURL)
    cls.serviceImpl = kb_das_tool(cls.cfg)
    cls.scratch = cls.cfg['scratch']
    cls.callback_url = os.environ['SDK_CALLBACK_URL']

    # One workspace per run, stamped with the start time.
    run_stamp = int(time.time() * 1000)
    cls.wsName = "test_kb_das_tool_" + str(run_stamp)
    cls.ws_info = cls.wsClient.create_workspace({'workspace': cls.wsName})  # noqa

    callback = os.environ['SDK_CALLBACK_URL']
    cls.dfu = DataFileUtil(callback, token=cls.token)
    cls.ru = ReadsUtils(callback, token=cls.token)
    cls.au = AssemblyUtil(callback, token=cls.token)
    cls.mgu = MetagenomeUtils(callback, token=cls.token)
    cls.das_tool_runner = DASToolUtil(cls.cfg)

    cls.prepare_data()
def __init__(self, config):
    #BEGIN_CONSTRUCTOR
    # Wire the shared `app` registry with the scratch folder and the
    # KBase service clients this module uses.
    callback_url = os.environ['SDK_CALLBACK_URL']
    workspace_url = config['workspace-url']
    shared_folder = config['scratch']
    reset_globals()
    app.update({
        # BUGFIX: `shared_folder` was computed but unused; reuse it here
        # instead of reading config['scratch'] a second time.
        'shared_folder': shared_folder,
        'ws': Workspace(workspace_url),
        'dfu': DataFileUtil(callback_url),
        'mgu': MetagenomeUtils(callback_url, service_ver='dev'),
        'au': AssemblyUtil(callback_url),
        'kbr': KBaseReport(callback_url),
    })
    logging.basicConfig(format='%(created)s %(levelname)s: %(message)s',
                        level=logging.INFO)
    #END_CONSTRUCTOR
    pass
def stage_input(self, input_ref, fasta_file_extension):
    '''
    Stage input based on an input data reference for CheckM.

    input_ref can be a reference to an Assembly, ContigSet, AssemblySet,
    BinnedContigs, Genome, or GenomeSet.

    This method creates a directory in the scratch area with the set of
    Fasta files; names will have the fasta_file_extension parameter tacked on.

        ex:

        staged_input = stage_input('124/15/1', 'fna')

        staged_input
        {"input_dir": '...'}

    :param input_ref: workspace reference to the object to stage.
    :param fasta_file_extension: extension (no dot) given to every staged file.
    :returns: dict with 'input_dir', 'folder_suffix', and 'all_seq_fasta'
        (path of a concatenated FASTA containing every staged sequence).
    :raises ValueError: on client construction failure, fetch failure,
        empty FASTA output, or an unhandled object type.
    '''
    # config
    #SERVICE_VER = 'dev'
    SERVICE_VER = 'release'
    # Index names into the 11-slot workspace object_info tuple.
    [OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I, WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I, META_I] = range(11)  # object_info tuple
    ws = Workspace(self.ws_url)

    # 1) generate a folder in scratch to hold the input
    # Millisecond timestamp keeps concurrent stagings from colliding.
    suffix = str(int(time.time() * 1000))
    input_dir = os.path.join(self.scratch, 'bins_' + suffix)
    all_seq_fasta = os.path.join(self.scratch, 'all_sequences_' + suffix + '.' + fasta_file_extension)
    if not os.path.exists(input_dir):
        os.makedirs(input_dir)

    # 2) based on type, download the files
    obj_name = self.get_data_obj_name (input_ref)
    type_name = self.get_data_obj_type (input_ref)

    # auClient
    try:
        auClient = AssemblyUtil(self.callbackURL, token=self.ctx['token'], service_ver=SERVICE_VER)
    except Exception as e:
        raise ValueError('Unable to instantiate auClient with callbackURL: '+ self.callbackURL +' ERROR: ' + str(e))

    # setAPI_Client
    try:
        #setAPI_Client = SetAPI (url=self.callbackURL, token=self.ctx['token'])  # for SDK local.  local doesn't work for SetAPI
        setAPI_Client = SetAPI (url=self.serviceWizardURL, token=self.ctx['token'])  # for dynamic service
    except Exception as e:
        raise ValueError('Unable to instantiate setAPI_Client with serviceWizardURL: '+ self.serviceWizardURL +' ERROR: ' + str(e))

    # mguClient
    try:
        mguClient = MetagenomeUtils(self.callbackURL, token=self.ctx['token'], service_ver=SERVICE_VER)
    except Exception as e:
        raise ValueError('Unable to instantiate mguClient with callbackURL: '+ self.callbackURL +' ERROR: ' + str(e))

    # Standard Single Assembly: one FASTA named after the object.
    if type_name in ['KBaseGenomeAnnotations.Assembly', 'KBaseGenomes.ContigSet']:
        # create file data
        filename = os.path.join(input_dir, obj_name + '.' + fasta_file_extension)
        auClient.get_assembly_as_fasta({'ref': input_ref, 'filename': filename})
        if not os.path.isfile(filename):
            raise ValueError('Error generating fasta file from an Assembly or ContigSet with AssemblyUtil')
        # make sure fasta file isn't empty
        min_fasta_len = 1
        if not self.fasta_seq_len_at_least(filename, min_fasta_len):
            raise ValueError('Assembly or ContigSet is empty in filename: '+str(filename))

    # AssemblySet: one FASTA per member, named after the member object.
    elif type_name == 'KBaseSets.AssemblySet':

        # read assemblySet
        try:
            assemblySet_obj = setAPI_Client.get_assembly_set_v1 ({'ref':input_ref, 'include_item_info':1})
        except Exception as e:
            raise ValueError('Unable to get object from workspace: (' + input_ref +')' + str(e))
        assembly_refs = []
        assembly_names = []
        for assembly_item in assemblySet_obj['data']['items']:
            this_assembly_ref = assembly_item['ref']
            # assembly obj info -- the object name becomes the file name
            # reported in results.
            try:
                this_assembly_info = ws.get_object_info_new ({'objects':[{'ref':this_assembly_ref}]})[0]
                this_assembly_name = this_assembly_info[NAME_I]
            except Exception as e:
                raise ValueError('Unable to get object from workspace: (' + this_assembly_ref +'): ' + str(e))
            assembly_refs.append(this_assembly_ref)
            assembly_names.append(this_assembly_name)

        # create file data (name for file is what's reported in results)
        for ass_i,assembly_ref in enumerate(assembly_refs):
            this_name = assembly_names[ass_i]
            filename = os.path.join(input_dir, this_name + '.' + fasta_file_extension)
            auClient.get_assembly_as_fasta({'ref': assembly_ref, 'filename': filename})
            if not os.path.isfile(filename):
                raise ValueError('Error generating fasta file from an Assembly or ContigSet with AssemblyUtil')
            # make sure fasta file isn't empty
            min_fasta_len = 1
            if not self.fasta_seq_len_at_least(filename, min_fasta_len):
                raise ValueError('Assembly or ContigSet is empty in filename: '+str(filename))

    # Binned Contigs: MGU writes a directory of bin FASTAs which is moved
    # wholesale into place as the input dir.
    elif type_name == 'KBaseMetagenomes.BinnedContigs':

        # download the bins as fasta and set the input folder name
        bin_file_dir = mguClient.binned_contigs_to_file({'input_ref': input_ref, 'save_to_shock': 0})['bin_file_directory']
        os.rename(bin_file_dir, input_dir)
        # make sure fasta file isn't empty
        self.set_fasta_file_extensions(input_dir, fasta_file_extension)
        # Only the top level is scanned (note the `break` after the first
        # os.walk yield).
        for (dirpath, dirnames, filenames) in os.walk(input_dir):
            for fasta_file in filenames:
                fasta_path = os.path.join (input_dir,fasta_file)
                min_fasta_len = 1
                if not self.fasta_seq_len_at_least(fasta_path, min_fasta_len):
                    raise ValueError('Binned Assembly is empty for fasta_path: '+str(fasta_path))
            break

    # Genome and GenomeSet: resolve each genome's assembly/contigset ref,
    # then download one FASTA per genome named after the genome object.
    elif type_name == 'KBaseGenomes.Genome' or type_name == 'KBaseSearch.GenomeSet':
        genome_obj_names = []
        genome_sci_names = []
        genome_assembly_refs = []

        if type_name == 'KBaseGenomes.Genome':
            genomeSet_refs = [input_ref]
        else:  # get genomeSet_refs from GenomeSet object
            genomeSet_refs = []
            try:
                genomeSet_object = ws.get_objects2({'objects':[{'ref':input_ref}]})['data'][0]['data']
            except Exception as e:
                raise ValueError('Unable to fetch '+str(input_ref)+' object from workspace: ' + str(e))
                #to get the full stack trace: traceback.format_exc()

            # iterate through genomeSet members
            for genome_id in genomeSet_object['elements'].keys():
                if 'ref' not in genomeSet_object['elements'][genome_id] or \
                   genomeSet_object['elements'][genome_id]['ref'] == None or \
                   genomeSet_object['elements'][genome_id]['ref'] == '':
                    raise ValueError('genome_ref not found for genome_id: '+str(genome_id)+' in genomeSet: '+str(input_ref))
                else:
                    genomeSet_refs.append(genomeSet_object['elements'][genome_id]['ref'])

        # genome obj data
        for i,this_input_ref in enumerate(genomeSet_refs):
            try:
                objects = ws.get_objects2({'objects':[{'ref':this_input_ref}]})['data']
                genome_obj = objects[0]['data']
                genome_obj_info = objects[0]['info']
                genome_obj_names.append(genome_obj_info[NAME_I])
                genome_sci_names.append(genome_obj['scientific_name'])
            # NOTE(review): bare except hides the real failure (even
            # KeyboardInterrupt); should be `except Exception as e`.
            except:
                raise ValueError ("unable to fetch genome: "+this_input_ref)

            # Get genome_assembly_ref: prefer assembly_ref, fall back to
            # contigset_ref; fail if neither is present.
            if ('contigset_ref' not in genome_obj or genome_obj['contigset_ref'] == None) \
               and ('assembly_ref' not in genome_obj or genome_obj['assembly_ref'] == None):
                msg = "Genome "+genome_obj_names[i]+" (ref:"+input_ref+") "+genome_sci_names[i]+" MISSING BOTH contigset_ref AND assembly_ref. Cannot process. Exiting."
                raise ValueError (msg)
                # NOTE(review): this `continue` is unreachable after the
                # raise above -- dead code, safe to delete.
                continue
            elif 'assembly_ref' in genome_obj and genome_obj['assembly_ref'] != None:
                msg = "Genome "+genome_obj_names[i]+" (ref:"+input_ref+") "+genome_sci_names[i]+" USING assembly_ref: "+str(genome_obj['assembly_ref'])
                print (msg)
                genome_assembly_refs.append(genome_obj['assembly_ref'])
            elif 'contigset_ref' in genome_obj and genome_obj['contigset_ref'] != None:
                msg = "Genome "+genome_obj_names[i]+" (ref:"+input_ref+") "+genome_sci_names[i]+" USING contigset_ref: "+str(genome_obj['contigset_ref'])
                print (msg)
                genome_assembly_refs.append(genome_obj['contigset_ref'])

        # create file data (name for file is what's reported in results)
        for ass_i,assembly_ref in enumerate(genome_assembly_refs):
            this_name = genome_obj_names[ass_i]
            filename = os.path.join(input_dir, this_name + '.' + fasta_file_extension)
            auClient.get_assembly_as_fasta({'ref': assembly_ref, 'filename': filename})
            if not os.path.isfile(filename):
                raise ValueError('Error generating fasta file from an Assembly or ContigSet with AssemblyUtil')
            # make sure fasta file isn't empty
            min_fasta_len = 1
            if not self.fasta_seq_len_at_least(filename, min_fasta_len):
                raise ValueError('Assembly or ContigSet is empty in filename: '+str(filename))

    # Unknown type slipped through
    else:
        raise ValueError('Cannot stage fasta file input directory from type: ' + type_name)

    # create summary fasta file with all bins
    self.cat_fasta_files(input_dir, fasta_file_extension, all_seq_fasta)

    return {'input_dir': input_dir, 'folder_suffix': suffix, 'all_seq_fasta': all_seq_fasta}
def __init__(self, callback_url, scratch, wrkspc, token):
    """Hold workspace/scratch context and a MetagenomeUtils client.

    :param callback_url: SDK callback server URL.
    :param scratch: scratch directory for staged FASTA files.
    :param wrkspc: an already-constructed Workspace client.
    :param token: auth token forwarded to MetagenomeUtils.
    """
    self.callback_url = callback_url
    self.scratch = scratch
    self.ws = wrkspc
    # Accumulates {ref: {'paths': [...], 'type': ..., ...}} entries.
    self.fasta_dict = {}
    self.mgu = MetagenomeUtils(callback_url, token=token)
class TypeToFasta:
    """Resolve KBase object references of several types to FASTA files.

    Results accumulate in ``self.fasta_dict`` keyed by object reference:
    ``{ref: {'paths': [...], 'type': obj_type, 'parent_refs': [...]}}``
    ('parent_refs' present only for objects reached via a set/genome parent).
    """

    def __init__(self, callback_url, scratch, wrkspc, token):
        """
        :param callback_url: SDK callback server URL.
        :param scratch: scratch directory for staged FASTA files.
        :param wrkspc: an already-constructed Workspace client.
        :param token: auth token forwarded to MetagenomeUtils.
        """
        self.ws = wrkspc
        self.scratch = scratch
        self.callback_url = callback_url
        self.mgu = MetagenomeUtils(callback_url, token=token)
        self.fasta_dict = {}

    def log(self, message, prefix_newline=False):
        """Print a timestamped log line."""
        print(('\n' if prefix_newline else '') + str(_time.time()) + ': ' + message)

    def add_to_dict(self, key, val):
        """Merge ``val`` into the fasta dict under ``key``.

        If ``key`` already exists, only its 'parent_refs' list is extended
        (when present); otherwise the new entry is stored wholesale.
        """
        if key in self.fasta_dict:
            # if key is already dict, we want to add a field to the the 'parent_refs'
            if 'parent_refs' in self.fasta_dict[key]:
                # BUGFIX: `val` may legitimately lack 'parent_refs' (e.g.
                # BinnedContigs entries carry none); the old
                # `val['parent_refs']` raised KeyError in that case.
                self.fasta_dict[key]['parent_refs'] += val.get('parent_refs', [])
        else:
            self.fasta_dict[key] = val

    def genome_obj_to_fasta(self, ref, obj_type):
        """Stage FASTAs for a Genome or (KBaseSets/KBaseSearch) GenomeSet ref.

        No-op for other types. Raises TypeError if a genome has neither an
        assembly_ref nor a contigset_ref.
        """
        # Initiate needed objects
        atf = AssemblyToFasta(self.callback_url, self.scratch)
        upas = []

        if 'KBaseSets.GenomeSet' in obj_type:
            obj_data = self.ws.get_objects2({'objects': [{"ref": ref}]})['data'][0]
            upas = [gsi['ref'] for gsi in obj_data['data']['items']]
        elif 'KBaseSearch.GenomeSet' in obj_type:
            obj_data = self.ws.get_objects2({'objects': [{"ref": ref}]})['data'][0]
            upas = [gse['ref'] for gse in obj_data['data']['elements'].values()]
        elif "KBaseGenomes.Genome" in obj_type:
            upas = [ref]

        if upas:
            for genome_upa in upas:
                # Get genome object assembly_ref or contigset_ref through
                # subsetting the object -- avoids downloading whole genomes.
                genome_data = self.ws.get_objects2({'objects':
                    [{"ref": genome_upa, 'included': ['/assembly_ref/', '/contigset_ref/']}]}) \
                    ['data'][0]['data']
                # If the genome contains an assembly_ref or contigset_ref the
                # subset returns a non-empty dict; otherwise it is empty.
                if genome_data:
                    # Reference chain through the genome so access carries over.
                    assembly_upa = genome_upa + ';' + \
                        str(genome_data.get('assembly_ref') or genome_data.get('contigset_ref'))
                    faf = atf.assembly_as_fasta({'ref': assembly_upa})
                    # Input data into object dict
                    self.add_to_dict(
                        assembly_upa, {
                            'paths': [faf['path']],
                            'type': obj_type,
                            'parent_refs': [ref]
                        })
                else:
                    raise TypeError(
                        "KBase object type %s does not contain an assembly reference or contig reference."
                        % obj_type)

    def assembly_obj_to_fasta(self, ref, obj_type, input_ref=None, input_type=None):
        """Stage FASTAs for an Assembly/ContigSet or AssemblySet reference.

        `input_ref`/`input_type` let a caller (e.g. an annotated metagenome)
        record itself as the parent of a resolved assembly.
        """
        # Initiate needed objects
        atf = AssemblyToFasta(self.callback_url, self.scratch)
        obj = {"ref": ref}

        if "KBaseGenomes.ContigSet" in obj_type or "KBaseGenomeAnnotations.Assembly" in obj_type:
            # Get fasta
            faf = atf.assembly_as_fasta(obj)
            if input_ref and input_type:
                self.add_to_dict(
                    input_ref, {
                        'paths': [faf['path']],
                        'type': input_type,
                        'parent_refs': [input_ref, ref]
                    })
            else:
                self.add_to_dict(ref, {
                    'paths': [faf['path']],
                    'type': obj_type,
                    'parent_refs': [ref]
                })

        elif "KBaseSets.AssemblySet" in obj_type:
            # Get assembly set object
            obj_data = self.ws.get_objects2({'objects': [obj]})['data'][0]
            for item_upa in obj_data['data']['items']:
                # Get fasta
                faf = atf.assembly_as_fasta({"ref": item_upa['ref']})
                # Input data into object dict
                self.add_to_dict(item_upa['ref'], {
                    'paths': [faf['path']],
                    'type': obj_type,
                    'parent_refs': [ref]
                })

    def metagenome_obj_to_fasta(self, ref, obj_type):
        """Stage FASTAs for BinnedContigs or AnnotatedMetagenomeAssembly refs."""
        if 'KBaseMetagenomes.BinnedContigs' in obj_type:
            fasta_paths = []
            try:
                # binned_contigs_to_file saves fasta files to a directory in
                # scratch: scratch/binned_contig_files_EXTENSION/Bin#.fasta
                bin_file_dir = self.mgu.binned_contigs_to_file(
                    {'input_ref': ref, 'save_to_shock': 0})['bin_file_directory']
                for (dirpath, dirnames, filenames) in os.walk(bin_file_dir):
                    for fasta_file in filenames:
                        # Copy each bin fasta directly into scratch.
                        # New path: scratch/Bin#.fasta
                        fasta_path = os.path.join(self.scratch, fasta_file)
                        copyfile(os.path.join(bin_file_dir, fasta_file), fasta_path)
                        fasta_paths.append(fasta_path)
                # Input data into object dict
                self.add_to_dict(ref, {'paths': fasta_paths, 'type': obj_type})
            # Catch MetagenomeUtil Error
            except _MGUError as mgue:
                self.log('Logging exception loading binned contigs to file.')
                self.log(str(mgue))
                raise

        if 'KBaseMetagenomes.AnnotatedMetagenomeAssembly' in obj_type:
            # Resolve the underlying assembly and stage it with this object
            # recorded as the parent.
            ret = self.ws.get_objects2(
                {'objects': [{'ref': ref, 'included': ['assembly_ref']}]})['data'][0]
            assembly_ref = ret['data']['assembly_ref']
            assembly_obj_type = self.ws.get_object_info3(
                {'objects': [{'ref': assembly_ref}]})['infos'][0][2]
            self.assembly_obj_to_fasta(assembly_ref,
                                       assembly_obj_type,
                                       input_ref=ref,
                                       input_type=obj_type)

    def type_to_fasta(self, ref_lst):
        """type_to_fasta takes in a list of KBase object references.

        The object type of each reference is checked in functions:
        assembly_obj_to_fasta, metagenome_obj_to_fasta, and
        genome_obj_to_fasta. Depending on the type of KBase object input, a
        fasta file is made through one of the functions mentioned above and
        a fasta object dictionary is created with structure:
        {ref: {'paths': fasta_paths, 'type': object type}}

        For objects of type AssemblySet and GenomeSet a parent ref key-value
        pair is added such that the structure is:
        {ref: {'paths': fasta_paths, 'type': object type, 'parent_refs': [ref]}}

        For objects of type KBaseMetagenomes.BinnedContigs a unique fasta
        path is made for each bin in binnedContigs. Thus the output
        structure is:
        {ref: {'paths': [fasta_contigbin1, fasta_contigbin2], 'type': object type}}
        where 'paths' points to an array of fasta paths for each contig bin
        in ascending order.
        """
        # Get type info for each ref in ref_lst
        for ref in ref_lst:
            # Get KBase object type with get_object_info3
            obj_info = self.ws.get_object_info3({"objects": [{"ref": ref}]})
            obj_type = obj_info["infos"][0][2]

            # Route the ref to the type-specific handler (each is a no-op
            # for non-matching types).
            self.genome_obj_to_fasta(ref, obj_type)
            self.assembly_obj_to_fasta(ref, obj_type)
            self.metagenome_obj_to_fasta(ref, obj_type)

        # Return the accumulated dictionary for the whole reference list.
        return self.fasta_dict
class MaxBinUtil:
    """Driver for the MaxBin2 binning tool: stages inputs, runs
    run_MaxBin.pl, saves the resulting BinnedContigs object, and builds
    the KBase report."""

    # Install location of the MaxBin toolkit inside the SDK container.
    MAXBIN_TOOLKIT_PATH = '/kb/deployment/bin/MaxBin'

    def _validate_run_maxbin_params(self, params):
        """
        _validate_run_maxbin_params: validates params passed to run_maxbin method

        :raises ValueError: if a required parameter is missing.
        """
        log('Start validating run_maxbin params')

        # check for required parameters
        for p in [
                'assembly_ref', 'binned_contig_name', 'workspace_name',
                'reads_list'
        ]:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path (no error if it exists)
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            # Ignore "already exists"; re-raise anything else.
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _run_command(self, command):
        """
        _run_command: run command and print result

        :raises ValueError: if the command exits non-zero.
        """
        log('Start executing command:\n{}'.format(command))
        # NOTE(review): shell=True with a string command -- acceptable only
        # because the command is built internally, never from user text.
        pipe = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
        output = pipe.communicate()[0]
        exitCode = pipe.returncode

        if (exitCode == 0):
            # NOTE(review): 'commend' is a typo in the log strings here and
            # below ("command"); left untouched to preserve output.
            log('Executed commend:\n{}\n'.format(command) +
                'Exit Code: {}\nOutput:\n{}'.format(exitCode, output))
        else:
            error_msg = 'Error running commend:\n{}\n'.format(command)
            error_msg += 'Exit Code: {}\nOutput:\n{}'.format(exitCode, output)
            raise ValueError(error_msg)

    def _stage_reads_list_file(self, reads_list):
        """
        _stage_reads_list_file: download fastq file(s) associated with the
        reads objects to the scratch area and write their paths, one per
        line, to a reads-list file consumed by run_MaxBin.pl.

        :param reads_list: list of reads object references.
        :returns: path of the generated reads list file.
        """
        log('Processing reads object list: {}'.format(reads_list))

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)
        result_file = os.path.join(result_directory, 'reads_list_file.txt')

        result_file_path = []

        reads = self.ru.download_reads({
            'read_libraries': reads_list,
            'interleaved': 'true'
        })['files']

        # Forward file always; reverse file only when present.
        for read_obj in reads_list:
            files = reads[read_obj]['files']
            result_file_path.append(files['fwd'])
            if 'rev' in files and files['rev'] is not None:
                result_file_path.append(files['rev'])

        log('Saving reads file path(s) to: {}'.format(result_file))
        with open(result_file, 'w') as file_handler:
            for item in result_file_path:
                file_handler.write("{}\n".format(item))

        return result_file

    def _get_contig_file(self, assembly_ref):
        """
        _get_contig_file: get contig file from GenomeAssembly object,
        unpacking it (e.g. gunzip) via DataFileUtil.
        """
        contig_file = self.au.get_assembly_as_fasta({
            'ref': assembly_ref
        }).get('path')

        sys.stdout.flush()
        contig_file = self.dfu.unpack_file({'file_path':
                                            contig_file})['file_path']

        return contig_file

    def _generate_command(self, params):
        """
        _generate_command: generate the run_MaxBin.pl command line from params
        (optional flags are appended only when their param is truthy).
        """
        command = self.MAXBIN_TOOLKIT_PATH + '/run_MaxBin.pl '

        command += '-contig {} -out {} '.format(params.get('contig_file_path'),
                                                params.get('out_header'))

        if params.get('abund_list_file'):
            command += '-abund_list {} '.format(params.get('abund_list_file'))

        if params.get('reads_list_file'):
            command += '-reads_list {} '.format(params.get('reads_list_file'))

        if params.get('thread'):
            command += '-thread {} '.format(params.get('thread'))

        if params.get('prob_threshold'):
            command += '-prob_threshold {} '.format(
                params.get('prob_threshold'))

        if params.get('markerset'):
            command += '-markerset {} '.format(params.get('markerset'))

        if params.get('min_contig_length'):
            command += '-min_contig_length {} '.format(
                params.get('min_contig_length'))

        if params.get('plotmarker'):
            command += '-plotmarker '

        if params.get('reassembly'):
            command += '-reassembly '

        log('Generated run_MaxBin command: {}'.format(command))

        return command

    def _generate_output_file_list(self, result_directory):
        """
        _generate_output_file_list: zip result files and generate file_links
        for the report (bin FASTAs are excluded from the zip; a .marker.pdf,
        if produced, is attached separately).
        """
        log('Start packing result files')
        output_files = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file = os.path.join(output_directory, 'maxbin_result.zip')
        report_file = None

        with zipfile.ZipFile(result_file, 'w', zipfile.ZIP_DEFLATED,
                             allowZip64=True) as zip_file:
            for root, dirs, files in os.walk(result_directory):
                for file in files:
                    if not (file.endswith('.fasta')
                            or file.endswith('.DS_Store')):
                        zip_file.write(os.path.join(root, file), file)
                    if file.endswith('.marker.pdf'):
                        report_file = os.path.join(root, file)

        output_files.append({
            'path': result_file,
            'name': os.path.basename(result_file),
            'label': os.path.basename(result_file),
            'description': 'File(s) generated by MaxBin2 App'
        })

        if report_file:
            output_files.append({
                'path': report_file,
                'name': os.path.basename(report_file),
                'label': os.path.basename(report_file),
                'description': 'Visualization of the marker by MaxBin2 App'
            })

        return output_files

    def _generate_html_report(self, result_directory, assembly_ref,
                              binned_contig_obj_ref, header):
        """
        _generate_html_report: generate html summary report by filling the
        report_template.html placeholder with computed overview stats.
        """
        log('Start generating html report')
        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, 'report.html')

        Overview_Content = ''

        # NOTE(review): 'totoal_contig_length' is a misspelled variable name
        # ("total"); kept as-is since renaming is a code change.
        (binned_contig_count, input_contig_count, too_short_count,
         total_bins_count, total_binned_contig_len, totoal_contig_length,
         total_short_contig_len) = self._generate_overview_info(
             assembly_ref, binned_contig_obj_ref, result_directory)

        Overview_Content += '<p>Bins: {}</p>'.format(total_bins_count)
        Overview_Content += '<p>Input Contigs: {}</p>'.format(
            input_contig_count)
        Overview_Content += '<p>Binned Contigs: {} ({:.1%})</p>'.format(
            binned_contig_count,
            binned_contig_count / float(input_contig_count))
        Overview_Content += '<p>Unbinned Contigs: {} ({:.1%})</p>'.format(
            input_contig_count - binned_contig_count,
            1 - binned_contig_count / float(input_contig_count))
        Overview_Content += '<p>Contigs Too Short: {} ({:.1%})</p>'.format(
            too_short_count,
            too_short_count / float(input_contig_count))
        Overview_Content += '<p>Summed Length of Binned Contigs: {} ({:.1%})</p>'.format(
            total_binned_contig_len,
            total_binned_contig_len / float(totoal_contig_length))
        Overview_Content += '<p>Summed Length of Unbinned Contigs: {} ({:.1%})</p>'.format(
            totoal_contig_length - total_binned_contig_len,
            1 - total_binned_contig_len / float(totoal_contig_length))
        Overview_Content += '<p>Summed Length of Short Contigs: {} ({:.1%})</p>'.format(
            total_short_contig_len,
            total_short_contig_len / float(totoal_contig_length))

        with open(result_file_path, 'w') as result_file:
            with open(
                    os.path.join(os.path.dirname(__file__),
                                 'report_template.html'),
                    'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    '<p>Overview_Content</p>', Overview_Content)
                result_file.write(report_template)

        html_report.append({
            'path': result_file_path,
            'name': os.path.basename(result_file_path),
            'label': os.path.basename(result_file_path),
            'description': 'HTML summary report for MaxBin2 App'
        })
        return html_report

    def _generate_overview_info(self, assembly_ref, binned_contig_obj_ref,
                                result_directory):
        """
        _generate_overview_info: generate overview information from the
        assembly object, the binned contig object, and the '.tooshort'
        file(s) left in the result directory.

        :returns: 7-tuple of counts/lengths used by the HTML report.
        """
        assembly = self.dfu.get_objects({'object_refs':
                                         [assembly_ref]})['data'][0]
        binned_contig = self.dfu.get_objects(
            {'object_refs': [binned_contig_obj_ref]})['data'][0]

        input_contig_count = assembly.get('data').get('num_contigs')
        totoal_contig_length = 0
        for contig_id, contig in assembly.get('data').get('contigs').items():
            totoal_contig_length += int(contig.get('length'))

        binned_contig_count = 0
        total_bins = binned_contig.get('data').get('bins')
        total_binned_contig_len = binned_contig.get('data').get(
            'total_contig_len')
        total_bins_count = len(total_bins)
        for bin in total_bins:
            binned_contig_count += len(bin.get('contigs'))

        # Contigs below MaxBin's length cutoff end up in '.tooshort' files.
        too_short_count = 0
        total_short_contig_len = 0
        result_files = os.listdir(result_directory)
        for file_name in result_files:
            if file_name.endswith('.tooshort'):
                for record in SeqIO.parse(
                        os.path.join(result_directory, file_name), "fasta"):
                    total_short_contig_len += len(str(record.seq))
                    too_short_count += 1

        return (binned_contig_count, input_contig_count, too_short_count,
                total_bins_count, total_binned_contig_len,
                totoal_contig_length, total_short_contig_len)

    def _generate_report(self, binned_contig_obj_ref, result_directory,
                         params):
        """
        generate_report: assemble file/html links and create the extended
        KBase report for this run.
        """
        log('Generating report')

        output_files = self._generate_output_file_list(result_directory)

        output_html_files = self._generate_html_report(
            result_directory, params.get('assembly_ref'),
            binned_contig_obj_ref, params.get('out_header'))

        created_objects = []
        created_objects.append({
            "ref": binned_contig_obj_ref,
            "description": "BinnedContigs from MaxBin2"
        })

        report_params = {
            'message': '',
            'workspace_name': params.get('workspace_name'),
            'objects_created': created_objects,
            'file_links': output_files,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 266,
            'report_object_name': 'kb_maxbin_report_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    def __init__(self, config):
        # Config carries the callback/scratch/shock locations; all clients
        # talk to the same callback server.
        self.callback_url = config['SDK_CALLBACK_URL']
        self.scratch = config['scratch']
        self.shock_url = config['shock-url']
        self.dfu = DataFileUtil(self.callback_url)
        self.ru = ReadsUtils(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)
        self.mgu = MetagenomeUtils(self.callback_url)

    def run_maxbin(self, params):
        """
        run_maxbin: run_MaxBin.pl app

        required params:
            assembly_ref: Metagenome assembly object reference
            binned_contig_name: BinnedContig object name and output file header
            workspace_name: the name of the workspace it gets saved to.
            reads_list: list of reads object (PairedEndLibrary/SingleEndLibrary)
                        upon which MaxBin will be run

        optional params:
            thread: number of threads; default 1
            reassembly: specify this option if you want to reassemble the bins.
                        note that at least one reads file needs to be designated.
            prob_threshold: minimum probability for EM algorithm; default 0.8
            markerset: choose between 107 marker genes by default or 40 marker genes
            min_contig_length: minimum contig length; default 1000
            plotmarker: specify this option if you want to plot the markers in each contig

        ref: http://downloads.jbei.org/data/microbial_communities/MaxBin/README.txt

        :returns: dict with 'result_directory', 'binned_contig_obj_ref',
            'report_name', and 'report_ref'.
        """
        log('--->\nrunning MaxBinUtil.run_maxbin\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self._validate_run_maxbin_params(params)
        params['out_header'] = 'Bin'

        contig_file = self._get_contig_file(params.get('assembly_ref'))
        params['contig_file_path'] = contig_file

        reads_list_file = self._stage_reads_list_file(params.get('reads_list'))
        params['reads_list_file'] = reads_list_file

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))

        self._mkdir_p(result_directory)

        command = self._generate_command(params)

        # run_MaxBin.pl writes into the current directory, so chdir into the
        # result dir for the duration of the run and restore afterwards.
        cwd = os.getcwd()
        log('changing working dir to {}'.format(result_directory))
        os.chdir(result_directory)

        self._run_command(command)

        os.chdir(cwd)
        log('changing working dir to {}'.format(cwd))

        log('Saved result files to: {}'.format(result_directory))
        log('Generated files:\n{}'.format('\n'.join(
            os.listdir(result_directory))))

        generate_binned_contig_param = {
            'file_directory': result_directory,
            'assembly_ref': params.get('assembly_ref'),
            'binned_contig_name': params.get('binned_contig_name'),
            'workspace_name': params.get('workspace_name')
        }
        binned_contig_obj_ref = self.mgu.file_to_binned_contigs(
            generate_binned_contig_param).get('binned_contig_obj_ref')

        reportVal = self._generate_report(binned_contig_obj_ref,
                                          result_directory, params)

        returnVal = {
            'result_directory': result_directory,
            'binned_contig_obj_ref': binned_contig_obj_ref
        }
        returnVal.update(reportVal)

        return returnVal
def get_mgu():
    """Construct a MetagenomeUtils client pointed at the SDK callback server.

    The callback URL is read from the SDK_CALLBACK_URL environment variable,
    which the KBase job runner sets for every app container.
    """
    callback_url = os.environ['SDK_CALLBACK_URL']
    return MetagenomeUtils(callback_url)
    def test_fractiontate_contigs_ASSEMBLY_BINNEDCONTIGS_08(self):
        """Integration test: fractionate an Assembly against a BinnedContigs
        positive filter in 'neg' mode (keep contigs NOT in the bins).

        Uploads fixture data from ./data, creates the Assembly and
        BinnedContigs objects in the test workspace, then runs the app and
        prints the result (no assertions beyond the calls succeeding).
        """
        method = 'fractionate_contigs_pos_filter_ASSEMBLY_BINNEDCONTIGS_08'
        print("\n\nRUNNING: test_" + method + "()")
        print("==========================================================\n\n")

        # upload test data: clients for saving the fixture objects
        try:
            auClient = AssemblyUtil(self.callback_url,
                                    token=self.getContext()['token'])
        except Exception as e:
            raise ValueError(
                'Unable to instantiate auClient with callbackURL: ' +
                self.callback_url + ' ERROR: ' + str(e))
        try:
            mguClient = MetagenomeUtils(self.callback_url,
                                        token=self.getContext()['token'])
        except Exception as e:
            raise ValueError(
                'Unable to instantiate mguClient with callbackURL: ' +
                self.callback_url + ' ERROR: ' + str(e))

        # fixture names: one assembly FASTA plus a directory of binned contigs
        base_1 = 'assembly_1plus2'
        base_2 = 'assembly'
        dir_2 = 'binned_contigs'
        type_1 = 'Assembly'
        type_2 = 'BinnedContigs'

        ass_file_1_fa = base_1 + '.fa.gz'
        ass_path_1_fa = os.path.join(self.scratch, ass_file_1_fa)
        dir_2_path = os.path.join(self.scratch, dir_2)
        # copy fixtures into scratch, where the callback services can read them
        shutil.copy(os.path.join("data", ass_file_1_fa), ass_path_1_fa)
        shutil.copytree(os.path.join("data", dir_2), dir_2_path)

        ass_ref_1 = auClient.save_assembly_from_fasta({
            'file': {
                'path': ass_path_1_fa
            },
            'workspace_name': self.getWsName(),
            'assembly_name': base_1 + '.' + type_1
        })
        # the BinnedContigs object must reference the assembly saved above
        binned_contigs_ref_2 = mguClient.file_to_binned_contigs({
            'file_directory': dir_2_path,
            'workspace_name': self.getWsName(),
            'assembly_ref': ass_ref_1,
            'binned_contig_name': base_2 + '.' + type_2
        })['binned_contig_obj_ref']

        # run method
        base_output_name = method + '_output'
        fractionate_mode = 'neg'
        params = {
            'workspace_name': self.getWsName(),
            'input_assembly_ref': ass_ref_1,
            'input_pos_filter_obj_refs': [binned_contigs_ref_2],
            'fractionate_mode': fractionate_mode,
            'output_name': 'test_fractionated' + '-' + base_1 + '.' + type_1 +
            '-' + 'binned_contigs_2a2b' + '-' + fractionate_mode
        }
        result = self.getImpl().run_fractionate_contigs(
            self.getContext(), params)
        print('RESULT:')
        pprint(result)
        pass
def __init__(self, callback_url, scratch, ws_url): self.ws_url = ws_url self.callback_url = callback_url self.scratch = scratch self.dfu = DataFileUtil(callback_url) self.mgu = MetagenomeUtils(callback_url)
def load_fastas(config, scratch, upa):
    """Resolve any supported KBase object reference to assembly FASTA files.

    Supported types: GenomeSet (KBaseSets / KBaseSearch), Genome, ContigSet,
    Assembly, AssemblySet, and BinnedContigs.

    :param config: dict with 'callback_url' and 'workspace-url'
    :param scratch: writable scratch directory for downloaded files
    :param upa: object reference for any supported type
    :returns: dict mapping a file-safe UPA string to the
        AssemblyUtil.get_assembly_as_fasta result dict for that assembly
    :raises ValueError: if the object type is unsupported, or a genome has no
        assembly/contigset reference
    """
    dfu = DataFileUtil(config['callback_url'])
    au = AssemblyUtil(config['callback_url'])
    mgu = MetagenomeUtils(config['callback_url'])
    ws = Workspace(config['workspace-url'])

    obj_data = dfu.get_objects({"object_refs": [upa]})['data'][0]
    # normalize the input ref to canonical wsid/objid/version form
    upa = str(obj_data['info'][6]) + '/' + str(
        obj_data['info'][0]) + '/' + str(obj_data['info'][4])
    obj_type = obj_data['info'][2]

    id_to_assy_info = {}
    if 'KBaseSets.GenomeSet' in obj_type:
        upas = [gsi['ref'] for gsi in obj_data['data']['items']]
    elif 'KBaseSearch.GenomeSet' in obj_type:
        upas = [gse['ref'] for gse in obj_data['data']['elements'].values()]
    elif "KBaseGenomes.Genome" in obj_type:
        upas = [upa]
    elif "KBaseGenomes.ContigSet" in obj_type or "KBaseGenomeAnnotations.Assembly" in obj_type:
        # in this case we use the assembly file util to get the fasta file
        faf = au.get_assembly_as_fasta({
            "ref": upa,
            'filename': upa_to_path(scratch, upa)
        })
        return {file_safe_upa(upa): faf}
    elif "KBaseSets.AssemblySet" in obj_type:
        for item_upa in obj_data['data']['items']:
            faf = au.get_assembly_as_fasta({
                "ref": upa + ';' + item_upa['ref'],
                'filename': upa_to_path(scratch, item_upa['ref'])
            })
            id_to_assy_info[file_safe_upa(item_upa['ref'])] = faf
        return id_to_assy_info
    elif 'KBaseMetagenomes.BinnedContigs' in obj_type:
        return handle_binned_contigs(upa, mgu, scratch)
    else:
        # BUGFIX: previously there was no else branch, so an unhandled type
        # fell through to the loop below with `upas` unbound -> NameError.
        raise ValueError(
            'Input reference {} has unhandled type {}'.format(upa, obj_type))

    for genome_upa in upas:
        # this could be sped up by batching the get_objects call
        # does assy file util not take bulk calls?
        # maybe doesn't matter since Shock doesn't handle bulk calls
        if upa != genome_upa:  # for single genomes, upa and genome_upa will be the same
            genome_upa = upa + ';' + genome_upa
        genome_data = ws.get_objects2({'objects': [{
            "ref": genome_upa
        }]})['data'][0]['data']
        target_upa = genome_data.get('contigset_ref') or genome_data.get(
            'assembly_ref')
        if not target_upa:
            # BUGFIX: without this check a genome lacking both refs would
            # produce a confusing TypeError on the concatenation below.
            raise ValueError(
                'Genome {} has neither contigset_ref nor assembly_ref'.format(
                    genome_upa))
        assembly_upa = genome_upa + ';' + target_upa
        faf = au.get_assembly_as_fasta({
            'ref': assembly_upa,
            'filename': upa_to_path(scratch, target_upa)
        })
        id_to_assy_info[file_safe_upa(target_upa)] = faf
    return id_to_assy_info
class VirSorterUtils:
    """Wrapper around the VirSorter command-line tool: runs it, converts its
    output into KBase Assembly/BinnedContigs objects, and builds the report."""

    def __init__(self, config):
        # scratch is made absolute because the tool is run from varying cwds
        self.scratch = os.path.abspath(config['scratch'])
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.mgu = MetagenomeUtils(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)
        self.ws = Workspace(config['workspace-url'], token=config['token'])

    def VirSorter_help(self):
        """Run the wrapper script with --help (output goes to the job log)."""
        command = 'wrapper_phage_contigs_sorter_iPlant.pl --help'
        self._run_command(command)

    def get_fasta(self, ref):
        """Download the FASTA for an Assembly ref, or for a Genome's assembly.

        :param ref: Assembly or Genome object reference
        :returns: local path of the downloaded FASTA file
        :raises ValueError: for any other object type
        """
        # check type of object, i.e KBaseGenomeAnnotations.Assembly-3.0
        obj_type = self.ws.get_object_info3({'objects': [{
            'ref': ref
        }]})['infos'][0][2]
        if 'assembly' in obj_type.lower():
            genome_ref = ref
        elif 'kbasegenomes' in obj_type.lower():
            # Genome object: follow its assembly_ref to the actual assembly
            data = self.ws.get_objects2({
                'objects': [{
                    'ref': ref,
                    'included': ['assembly_ref'],
                    'strict_maps': 1
                }]
            })['data'][0]['data']
            genome_ref = data['assembly_ref']
        else:
            raise ValueError(
                f"Input reference {ref} is of type {obj_type}. Type KBaseGenomes.Genome or "
                f"KBaseGenomeAnnotations.Assembly required.")
        return self.au.get_assembly_as_fasta({'ref': genome_ref})['path']

    def run_VirSorter(self, params):
        """Assemble the VirSorter command line from params, run it, and build
        the report from its output directory.

        :param params: app params; reads 'genomes', 'database', optional
            'add_genomes', and the boolean flags listed in bool_args below
        :returns: report dict from _generate_report
        """
        params['SDK_CALLBACK_URL'] = self.callback_url
        params['KB_AUTH_TOKEN'] = os.environ['KB_AUTH_TOKEN']

        # Get contigs from 'assembly'
        genome_fp = self.get_fasta(params['genomes'])

        command = 'wrapper_phage_contigs_sorter_iPlant.pl --data-dir /data/virsorter-data'

        # Add in first args
        command += f' -f {genome_fp} --db {params["database"]}'

        # Check if additional genomes were submitted
        if params.get('add_genomes'):
            add_genomes_fp = self.get_fasta(params['add_genomes'])
            print(f'Added genomes DETECTED: {add_genomes_fp}')
            command += f' --cp {add_genomes_fp}'

        bool_args = ['virome', 'diamond', 'keep_db', 'no_c']  # keep_db = keep-db

        for bool_arg in bool_args:
            # NOTE(review): value 1 enables the flag here, though the original
            # comment claims the UI/json semantics are reversed — confirm
            # against the app spec.
            if params[bool_arg] == 1:
                if bool_arg == 'keep_db':
                    # the CLI spells this flag with a hyphen
                    bool_arg = 'keep-db'
                command += f' --{bool_arg}'

        self._run_command(command)

        report = self._generate_report(
            params)  # Basically, do everything that's after the tool runs

        return report

    def _run_command(self, command):
        """Run a shell command, logging stdout; raise RuntimeError on a
        non-zero exit code.

        :param command: shell command string (executed with shell=True)
        """
        log('Start executing command:\n{}'.format(command))
        pipe = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
        output, err = pipe.communicate()
        exitCode = pipe.returncode
        if exitCode == 0:
            log('Executed command:\n{}\n'.format(command) +
                'Exit Code: {}\nOutput:\n{}'.format(exitCode, output))
        else:
            error_msg = 'Error running command:\n{}\n'.format(command)
            error_msg += 'Exit Code: {}\nOutput:\n{}\nError: {}'.format(
                exitCode, output, err)
            raise RuntimeError(error_msg)

    def _parse_summary(self, virsorter_global_fp, affi_contigs_shock_id):
        """Parse VIRSorter's global phage-signal CSV into an HTML table and
        splice it into the report template.

        :param virsorter_global_fp: path to VIRSorter_global-phage-signal.csv
        :param affi_contigs_shock_id: shock id substituted into the template
        :returns: final HTML string
        :raises RuntimeError: if the summary file cannot be read
        """
        columns = [
            'Contig_id',
            'Nb genes contigs',
            'Fragment',
            'Nb genes',
            'Category',
            'Nb phage hallmark genes',
            'Phage gene enrichment sig',
            'Non-Caudovirales phage gene enrichment sig',
            'Pfam depletion sig',
            'Uncharacterized enrichment sig',
            'Strand switch depletion sig',
            'Short genes enrichment sig',
        ]
        try:
            with open(virsorter_global_fp, 'r') as vir_fh:
                data = {}
                category = ''
                for line in vir_fh:
                    if line.startswith('## Contig_id'):
                        continue
                    elif line.startswith(
                            '## '
                    ):  # If 'header' lines are consumed by 1st if, then remaining should be good
                        category = line.split('## ')[-1].split(' -')[0]
                    else:
                        values = line.strip().split(',')
                        data[values[0]] = dict(zip(columns[1:], values[1:]))
        # NOTE(review): bare except hides the real error class; it is
        # re-raised as RuntimeError with directory context below.
        except:
            vir_path = os.path.join(os.getcwd(), 'virsorter-out')
            files = os.listdir(vir_path)
            raise RuntimeError(
                f"{virsorter_global_fp} is not a file. existing files {files}."
            )

        df = pd.DataFrame().from_dict(data, orient='index')
        df.index.name = columns[0]
        df.reset_index(inplace=True)

        html = df.to_html(index=False,
                          classes='my_class table-striped" id = "my_id')

        # Need to file write below
        direct_html = html_template.substitute(
            html_table=html, affi_contigs_shock_id=affi_contigs_shock_id)

        # Find header so it can be copied to footer, as dataframe.to_html doesn't include footer
        start_header = Literal("<thead>")
        end_header = Literal("</thead>")

        text = start_header + SkipTo(end_header)

        new_text = ''
        for data, start_pos, end_pos in text.scanString(direct_html):
            new_text = ''.join(data).replace(
                ' style="text-align: right;"', '').replace(
                    'thead>', 'tfoot>\n ') + '\n</tfoot>'

        # Get start and end positions to insert new text
        end_tbody = Literal("</tbody>")
        end_table = Literal("</table>")

        insertion_pos = end_tbody + SkipTo(end_table)

        final_html = ''
        for data, start_pos, end_pos in insertion_pos.scanString(direct_html):
            # +8 skips past the "</tbody>" tag itself
            final_html = direct_html[:start_pos + 8] + '\n' + new_text + direct_html[
                start_pos + 8:]

        return final_html

    def get_assembly_contig_ids(self, assembly_ref):
        """get contig ids from assembly_ref"""
        contigs = self.ws.get_objects2(
            {'objects': [{
                'ref': assembly_ref,
                'included': ['contigs']
            }]})['data'][0]['data']['contigs']
        return contigs.keys()

    def _generate_report(self, params):
        """Collect VirSorter outputs, save Assembly/BinnedContigs objects, and
        create the extended KBase report.

        :param params: run params carrying 'genomes', 'workspace_name',
            'binned_contig_name', 'SDK_CALLBACK_URL', 'KB_AUTH_TOKEN'
        :returns: dict with report name/ref, result directory and the
            BinnedContigs object ref
        """
        # Get URL
        self.dfu = dfu(params['SDK_CALLBACK_URL'])

        # Output directory should be $PWD/virsorter-out - ASSUMES that's the output location
        virsorter_outdir = os.path.join(os.getcwd(), 'virsorter-out')

        print(
            f'VIRSorter output directory contents: {os.listdir(virsorter_outdir)}'
        )

        # Replacing individual download files with BinnedContigs
        # kb_deseq adds output files, then builds report files and sends all of them to the workspace
        output_files = []  # Appended list of dicts containing attributes

        # Collect all the files needed to report to end-user
        # Get all predicted viral sequences
        pred_fnas = glob.glob(
            os.path.join(virsorter_outdir,
                         'Predicted_viral_sequences/VIRSorter_*.fasta'))
        pred_gbs = glob.glob(
            os.path.join(virsorter_outdir,
                         'Predicted_viral_sequences/VIRSorter_*.gb'))
        # Summary 'table'
        glob_signal = os.path.join(virsorter_outdir,
                                   'VIRSorter_global-phage-signal.csv')

        print('Identified the following predicted viral sequences:\n{}'.format(
            '\n\t'.join(pred_fnas)))

        if len(pred_fnas) == 0:
            print(
                f"Unable to find predicted viral sequences, here are the directory's content:\n"
                f"{os.listdir(os.path.join(virsorter_outdir, 'Predicted_viral_sequences'))}"
            )

        if os.path.exists(glob_signal):
            print(f'Identified the global phage signal: {glob_signal}')

            lines = -1  # Don't count header
            with open(glob_signal) as fh:
                for ln in fh:
                    lines += 1

            if lines == 0:
                print('But it is EMPTY!')
        else:
            print(
                'Unable to find the global phage signal file. Was there an error during the run?'
            )

        # Append error and out files from VIRSorter
        err_fp = os.path.join(virsorter_outdir, 'logs/err')
        # if os.path.exists(err_fp):
        #     output_files.append({
        #         'path': os.path.join(virsorter_outdir, 'logs/err'),
        #         'name': 'VIRSorter_err',
        #         'label': 'VIRSorter_err',
        #         'description': 'VIRSorter error log file, generated from the tool itself.'
        #     })
        out_fp = os.path.join(virsorter_outdir, 'logs/out')
        # if os.path.exists(out_fp):
        #     output_files.append({
        #         'path': os.path.join(virsorter_outdir, 'logs/out'),
        #         'name': 'VIRSorter_out',
        #         'label': 'VIRSorter_out',
        #         'description': 'VIRSorter output log file, generated from the tool itself.'
        #     })

        if not (os.path.exists(err_fp) or os.path.exists(out_fp)):
            print(
                'Unable to find err and/or out files in LOG directory, contents:'
            )
            print(os.listdir(os.path.join(virsorter_outdir, 'logs')))

        # Make output directory
        output_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_dir)

        # Deal with nucleotide and protein fasta
        pred_fna_tgz_fp = os.path.join(output_dir,
                                       'VIRSorter_predicted_viral_fna.tar.gz')
        with tarfile.open(
                pred_fna_tgz_fp,
                'w:gz') as pred_fna_tgz_fh:  # Compress to minimize disk usage
            for pred_fna in pred_fnas:
                pred_fna_tgz_fh.add(pred_fna,
                                    arcname=os.path.basename(pred_fna))
        output_files.append({
            'path': pred_fna_tgz_fp,
            'name': os.path.basename(pred_fna_tgz_fp),
            'label': os.path.basename(pred_fna_tgz_fp),
            'description':
            'FASTA-formatted nucleotide sequences of VIRSorter predicted viruses'
        })

        if os.path.exists(pred_fna_tgz_fp):
            print(
                f'Generated gzipped version of the predicted viral sequences in FASTA format: '
                f'{pred_fna_tgz_fp}')

        pred_gb_tgz_fp = os.path.join(output_dir,
                                      'VIRSorter_predicted_viral_gb.tar.gz')
        with tarfile.open(pred_gb_tgz_fp, 'w:gz') as pred_gb_tgz_fh:
            for pred_gb in pred_gbs:
                pred_gb_tgz_fh.add(pred_gb, arcname=os.path.basename(pred_gb))
        output_files.append({
            'path': pred_gb_tgz_fp,
            'name': os.path.basename(pred_gb_tgz_fp),
            'label': os.path.basename(pred_gb_tgz_fp),
            'description':
            'Genbank-formatted sequences of VIRSorter predicted viruses'
        })

        if os.path.exists(pred_gb_tgz_fp):
            print(
                f'Generated gzipped version of the predicted viral sequences in Genbank format: '
                f'{pred_gb_tgz_fp}')

        # To create BinnedContig, need to create another directory with each of the "bins" as separate files?
        binned_contig_output_dir = os.path.join(self.scratch,
                                                str(uuid.uuid4()))
        self._mkdir_p(binned_contig_output_dir)

        # Before creating final HTML output, need to create BinnedContig object so other tools/users can take advantage
        # of its features, but also to feed more easily into other tools (e.g. vConTACT)
        created_objects = []  # Will store the objects that go to the workspace

        # load contig ids from the assembly input
        # assembly_contig_ids = self.get_assembly_contig_ids(self.assembly_ref)
        assembly_contig_ids = self.get_assembly_contig_ids(
            params['genomes'])  # Will fail for Genome

        summary_fp = os.path.join(
            binned_contig_output_dir,
            'VIRSorter.summary')  # Anything that ends in .summary
        with open(summary_fp, 'w') as summary_fh:
            summary_writer = csv.writer(summary_fh,
                                        delimiter='\t',
                                        quoting=csv.QUOTE_MINIMAL)
            summary_writer.writerow(
                ['Bin name', 'Completeness', 'Genome size', 'GC content'])

            for category_fp in pred_fnas:
                # _get_bin_ids from MetaGenomeUtils requires files to follow the header.0xx.fasta convention
                category = os.path.basename(category_fp).split(
                    'cat-')[-1].split('.')[0]
                dest_fn = 'VirSorter.{}.fasta'.format(category.zfill(3))
                dest_fp = os.path.join(output_dir, dest_fn)
                binned_contig_fp = os.path.join(binned_contig_output_dir,
                                                dest_fn)

                genome_size = 0
                gc_content = []  # Need stats for summary file

                # Also need to adjust sequence name so binnedContig object can retrieve sequences
                adjusted_sequences = []
                # NOTE(review): 'rU' mode is deprecated (removed in Py3.11);
                # plain 'r' behaves the same here — worth updating.
                with open(category_fp, 'rU') as category_fh:
                    for record in SeqIO.parse(category_fh, 'fasta'):
                        seq = record.seq
                        gc_content.append(SeqUtils.GC(seq))
                        genome_size += len(seq)

                        # This is very dirty, but need to change name to match original contigs
                        record.id = record.id.replace('VIRSorter_',
                                                      '').replace(
                                                          '-circular',
                                                          '').split('-cat_')[0]
                        if 'gene' in record.id:  # Prophage
                            record.id = record.id.split('_gene')[0]
                            record.id = record.id.rsplit('_', 1)[0]
                        # here we make sure that the id's line up with contig ids in the input assembly object
                        if record.id not in assembly_contig_ids:
                            for assembly_contig_id in assembly_contig_ids:
                                # first check if record.id is substring of current contig id,
                                # then check if current contig id is substring of record.id
                                # NOTE: this is not a perfect way of checking and will likely
                                # fail in some circumstances.
                                # A more complete check would be to make sure there is a 1:1
                                # mapping of contig id's in the assembly object as compared to
                                # the binned contig object (the fasta files defined here).
                                if (record.id in assembly_contig_id) or (
                                        assembly_contig_id in record.id):
                                    record.id = assembly_contig_id
                                    break
                        record.description = ''
                        record.name = ''
                        adjusted_sequences.append(record)

                if genome_size != 0:  # Empty file
                    summary_writer.writerow([
                        dest_fn, '100%', genome_size,
                        (sum(gc_content) / len(gc_content))
                    ])

                    print('Copying {} to results directory'.format(
                        os.path.basename(category_fp)))
                    # Yes, need both. One is to get file_links in report. Second is for binnedContigs object
                    shutil.copyfile(category_fp, dest_fp)
                    # Write renamed sequences
                    with open(binned_contig_fp, 'w') as binned_contig_fh:
                        SeqIO.write(adjusted_sequences, binned_contig_fh,
                                    'fasta')
                    result = self.au.save_assembly_from_fasta({
                        'file': {
                            'path': dest_fp
                        },
                        'workspace_name': params['workspace_name'],
                        'assembly_name':
                        'VirSorter-Category-{}'.format(category)
                    })
                    created_objects.append({
                        "ref": result,
                        "description": "KBase Assembly object from VIRSorter"
                    })

        # Create BinnedContigs object, but 1st, a little metadata
        generate_binned_contig_param = {
            'file_directory': binned_contig_output_dir,
            'assembly_ref':
            params['genomes'],  # params.get('genomes'), self.assembly_ref
            'binned_contig_name': params['binned_contig_name'],
            'workspace_name': params['workspace_name']
        }
        binned_contig_object_ref = self.mgu.file_to_binned_contigs(
            generate_binned_contig_param).get('binned_contig_obj_ref')

        # Add binned contigs reference here, as it was already created above
        created_objects.append({
            "ref": binned_contig_object_ref,
            "description": "BinnedContigs from VIRSorter"
        })

        # Save VIRSorter_affi-contigs.tab for DRAM-v
        affi_contigs_fp = os.path.join(virsorter_outdir, 'Metric_files',
                                       'VIRSorter_affi-contigs.tab')
        affi_contigs_shock_id = self.dfu.file_to_shock(
            {'file_path': affi_contigs_fp})['shock_id']

        # Use global signal (i.e. summary) file and create HTML-formatted version
        raw_html = self._parse_summary(glob_signal, affi_contigs_shock_id)

        html_fp = os.path.join(output_dir, 'index.html')

        with open(html_fp, 'w') as html_fh:
            html_fh.write(raw_html)

        report_shock_id = self.dfu.file_to_shock({
            'file_path': output_dir,
            'pack': 'zip'
        })['shock_id']

        html_report = [{
            'shock_id': report_shock_id,
            'name': os.path.basename(html_fp),
            'label': os.path.basename(html_fp),
            'description':
            'HTML summary report for VIRSorter-predicted viral genomes.'
        }]

        report_params = {
            'message':
            'Here are the results from your VIRSorter run. Above, you\'ll find a report with '
            'all the identified (putative) viral genomes, and below, links to the report as '
            'well as files generated.',
            'workspace_name': params['workspace_name'],
            'html_links': html_report,
            'direct_html_link_index': 0,
            'report_object_name':
            'VIRSorter_report_{}'.format(str(uuid.uuid4())),
            'file_links': output_files,
            'objects_created': created_objects,
        }

        kbase_report_client = KBaseReport(params['SDK_CALLBACK_URL'],
                                          token=params['KB_AUTH_TOKEN'])
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref'],
            'result_directory': binned_contig_output_dir,
            'binned_contig_obj_ref': binned_contig_object_ref
        }

        return report_output

    def _mkdir_p(self, path):
        """Create *path* (and parents) if missing; a pre-existing directory is
        not an error.

        :param path: directory to create; falsy values are a no-op
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise
class DASToolUtil:
    """Utilities for running DAS_Tool on KBase BinnedContigs objects and
    reporting the results."""

    # number of threads passed to DAS_Tool via -t
    DASTOOL_THREADS = 2
    # directory (under scratch) where DAS_Tool writes its output
    BINNER_RESULT_DIRECTORY = 'das_tool_output_dir'
    # subdirectory DAS_Tool creates for the final bin FASTA files
    BINNER_BIN_RESULT_DIR = 'das_tool_output_dir_DASTool_bins'

    def __init__(self, config):
        """Cache endpoints from the SDK config and build the service clients."""
        self.callback_url = config['SDK_CALLBACK_URL']
        self.scratch = config['scratch']
        self.shock_url = config['shock-url']
        self.ws_url = config['workspace-url']
        self.dfu = DataFileUtil(self.callback_url)
        self.ru = ReadsUtils(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)
        self.mgu = MetagenomeUtils(self.callback_url)

    def validate_run_das_tool_params(self, params):
        """
        validate_run_concoct_params: validates params passed to run_concoct method

        :raises ValueError: if any required parameter is absent
        """
        log('Start validating run_kb_das_tool params')

        # check for required parameters
        for p in [
                'assembly_ref', 'input_binned_contig_names',
                'output_binned_contig_name', 'workspace_name'
        ]:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

    def mkdir_p(self, path):
        """
        mkdir_p: make directory for given path

        A pre-existing directory is tolerated; other OS errors propagate.
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def run_command(self, command):
        """
        run_command: run command and print result

        :param command: shell command string (executed with shell=True)
        :returns: (stdout, stderr) tuple from the process
        :raises ValueError: on a non-zero exit code
        """
        #os.chdir(self.scratch)
        log('Start executing command:\n{}'.format(command))
        log('Command is running from:\n{}'.format(self.scratch))
        pipe = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
        output, stderr = pipe.communicate()
        exitCode = pipe.returncode

        if (exitCode == 0):
            log('Executed command:\n{}\n'.format(command) +
                'Exit Code: {}\n'.format(exitCode))
        else:
            error_msg = 'Error running command:\n{}\n'.format(command)
            error_msg += 'Exit Code: {}\nOutput:\n{}\nStderr:\n{}'.format(
                exitCode, output, stderr)
            raise ValueError(error_msg)
            # NOTE(review): unreachable — the raise above always exits first
            sys.exit(1)
        return (output, stderr)

    def get_contig_file(self, assembly_ref):
        """
        get_contig_file: get contif file from GenomeAssembly object

        Downloads the assembly FASTA via AssemblyUtil; unpacking happens in
        the statements that follow (continued past this view boundary).
        """
        contig_file = self.au.get_assembly_as_fasta({
            'ref': assembly_ref
        }).get('path')

        sys.stdout.flush()
        # decompress if the download arrived gzipped
        contig_file = self.dfu.unpack_file({'file_path':
                                            contig_file})['file_path']
        return contig_file

    def retrieve_and_clean_assembly(self, task_params):
        """Ensure the assembly FASTA is on local scratch, then strip spaces
        from its headers (they break bedtools) via bbtools reformat.sh.

        :param task_params: dict with 'contig_file_path' and 'assembly_ref'
        :returns: path of the cleaned FASTA file
        """
        if os.path.exists(task_params['contig_file_path']):
            assembly = task_params['contig_file_path']
            print("FOUND ASSEMBLY ON LOCAL SCRATCH")
        else:
            # we are on njsw so lets copy it over to scratch
            assembly = self.get_contig_file(task_params['assembly_ref'])

        # remove spaces from fasta headers because that breaks bedtools
        assembly_clean = os.path.abspath(assembly).split(
            '.fa')[0] + "_clean.fa"
        command = '/bin/bash reformat.sh in={} out={} addunderscore'.format(
            assembly, assembly_clean)

        log('running reformat command: {}'.format(command))
        out, err = self.run_command(command)

        return assembly_clean

    def generate_output_file_list(self, result_directory):
        """
        generate_output_file_list: zip result files and generate file_links for report

        Alignment/index/summary files are excluded; the final bins directory
        is stored under its own base name inside the archive.
        """
        log('Start packing result files')
        output_files = list()
        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self.mkdir_p(output_directory)
        result_file = os.path.join(output_directory, 'das_tool_result.zip')
        report_file = None

        with zipfile.ZipFile(result_file, 'w', zipfile.ZIP_DEFLATED,
                             allowZip64=True) as zip_file:

            # grab all files we want to zip
            for dirname, subdirs, files in os.walk(result_directory):
                for file in files:
                    if (file.endswith('.sam') or file.endswith('.bam')
                            or file.endswith('.bai')
                            or file.endswith('.summary')):
                        continue
                    if (dirname.endswith(self.BINNER_BIN_RESULT_DIR)):
                        continue
                    zip_file.write(os.path.join(dirname, file), file)
                if (dirname.endswith(self.BINNER_BIN_RESULT_DIR)):
                    baseDir = os.path.basename(dirname)
                    for file in files:
                        full = os.path.join(dirname, file)
                        zip_file.write(full, os.path.join(baseDir, file))

        output_files.append({
            'path': result_file,
            'name': os.path.basename(result_file),
            'label': os.path.basename(result_file),
            'description': 'Files generated by kb_das_tool App'
        })
        return output_files

    def generate_html_report(self, result_directory, assembly_ref,
                             binned_contig_obj_ref):
        """
        generate_html_report: generate html summary report

        Fills report_template.html with overview counts and embeds the two
        PDFs DAS_Tool produces, then pushes the html dir to shock.
        """
        log('Start generating html report')
        #html_report = list()

        output_directory = os.path.join(self.scratch,
                                        'html_dir_' + str(uuid.uuid4()))
        self.mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, 'report.html')

        # get summary data from existing assembly object and bins_objects
        Summary_Table_Content = ''
        Overview_Content = ''
        (binned_contig_count, input_contig_count,
         total_bins_count) = self.generate_overview_info(
             assembly_ref, binned_contig_obj_ref, result_directory)

        # get pdfs — NOTE(review): relies on cwd being the result directory
        pdf_filename_l = [
            f for f in os.listdir(self.BINNER_RESULT_DIRECTORY)
            if f.endswith('.pdf')
        ]
        assert len(pdf_filename_l) == 2

        Overview_Content += '<p>Binned contigs: {}</p>'.format(
            binned_contig_count)
        Overview_Content += '<p>Input contigs: {}</p>'.format(
            input_contig_count)
        Overview_Content += '<p>Number of bins: {}</p>'.format(
            total_bins_count)
        for pdf_filename in pdf_filename_l:
            Overview_Content += '\n<embed src="{}" width="1000px" height="700px">'.format(
                pdf_filename)

        with open(result_file_path, 'w') as result_file:
            with open(
                    os.path.join(os.path.dirname(__file__),
                                 'report_template.html'),
                    'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    '<p>Overview_Content</p>', Overview_Content)
                report_template = report_template.replace(
                    'Summary_Table_Content', Summary_Table_Content)
                result_file.write(report_template)

        # copy pdfs into html dir
        for pdf_filename in pdf_filename_l:
            shutil.copyfile(
                os.path.join(self.BINNER_RESULT_DIRECTORY, pdf_filename),
                os.path.join(output_directory, pdf_filename))

        # save html dir to shock
        def dir_to_shock(dir_path, name, description):
            '''
            For regular directories or html directories

            name - for regular directories: the name of the flat (zip) file returned to ui
                   for html directories: the name of the html file
            '''
            dfu_fileToShock_ret = self.dfu.file_to_shock({
                'file_path': dir_path,
                'make_handle': 0,
                'pack': 'zip',
            })
            dir_shockInfo = {
                'shock_id': dfu_fileToShock_ret['shock_id'],
                'name': name,
                'description': description
            }
            return dir_shockInfo

        html_shockInfo = dir_to_shock(output_directory, 'report.html',
                                      'Report html for DAS tool')
        """
        html_report.append({'path': result_file_path,
                            'name': os.path.basename(result_file_path),
                            'label': os.path.basename(result_file_path),
                            'description': 'HTML summary report for kb_concoct App'})

        return html_report
        """
        return [html_shockInfo]

    def generate_overview_info(self, assembly_ref, binned_contig_obj_ref,
                               result_directory):
        """
        _generate_overview_info: generate overview information from assembly and binnedcontig

        :returns: (binned_contig_count, input_contig_count, total_bins_count)
        """
        # get assembly and binned_contig objects that already have some data populated in them
        assembly = self.dfu.get_objects({'object_refs':
                                         [assembly_ref]})['data'][0]
        binned_contig = self.dfu.get_objects(
            {'object_refs': [binned_contig_obj_ref]})['data'][0]

        input_contig_count = assembly.get('data').get('num_contigs')
        # NOTE(review): bins_directory is computed but never used here
        bins_directory = os.path.join(self.scratch, result_directory,
                                      self.BINNER_BIN_RESULT_DIR)
        binned_contig_count = 0
        total_bins_count = 0
        total_bins = binned_contig.get('data').get('bins')
        total_bins_count = len(total_bins)
        for bin in total_bins:
            binned_contig_count += len(bin.get('contigs'))

        return (binned_contig_count, input_contig_count, total_bins_count)

    def generate_report(self, binned_contig_obj_ref, params):
        """
        generate_report: generate summary report

        Packs output files, builds the HTML report, and saves a KBase
        extended report; returns its name/ref.
        """
        log('Generating report')
        params['result_directory'] = self.BINNER_RESULT_DIRECTORY

        output_files = self.generate_output_file_list(
            params['result_directory'])

        output_html_files = self.generate_html_report(
            params['result_directory'], params['assembly_ref'],
            binned_contig_obj_ref)

        report_params = {
            'message': '',
            'workspace_name': params.get('workspace_name'),
            'file_links': output_files,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 500,
            'report_object_name': 'kb_das_tool_report_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    def rename_and_standardize_bin_names(self):
        """
        generate_command: generate renamed bins

        Renames DAS_Tool's *.fa bins to bin.NNN.fasta and records the mapping
        in das_tool_name_key.tsv alongside the bins.
        """
        log("\n\nRunning rename_and_standardize_bin_names")
        i = 0
        path_to_result_bins = os.path.join(
            self.scratch, self.BINNER_RESULT_DIRECTORY,
            "das_tool_output_dir_DASTool_bins")
        path_to_das_tool_key = os.path.abspath(
            path_to_result_bins) + '/das_tool_name_key.tsv'
        with open(path_to_das_tool_key, 'w+') as f:
            f.write("Original.Bin.Name\tRenamed.Bin.Name\n")
            for dirname, subdirs, files in os.walk(path_to_result_bins):
                for file in files:
                    if file.endswith('.fa'):
                        i += 1
                        os.rename(
                            os.path.abspath(path_to_result_bins) + '/' + file,
                            os.path.abspath(path_to_result_bins) + '/bin.' +
                            str(i).zfill(3) +
                            '.fasta')  # need to change to 4 digits
                        f.write(file + '\tbin.' + str(i).zfill(3) +
                                '.fasta\n')

    def make_binned_contig_summary_file_for_binning_apps(self, task_params):
        """
        generate_command: generate binned contig summary command

        Writes binned_contig.summary (one row per bin FASTA) using stats from
        bbtools stats.sh; Completeness is hard-coded to 0 here.
        """
        log("\n\nRunning make_binned_contig_summary_file_for_binning_apps")
        result_directory = task_params['result_directory']
        path_to_result_bins = '{}/{}/'.format(
            result_directory, task_params['bin_result_directory'])
        path_to_summary_file = path_to_result_bins + 'binned_contig.summary'
        with open(path_to_summary_file, 'w+') as f:
            f.write("Bin name\tCompleteness\tGenome size\tGC content\n")
            for dirname, subdirs, files in os.walk(path_to_result_bins):
                for file in files:
                    if file.endswith('.fasta'):
                        genome_bin_fna_file = os.path.join(
                            path_to_result_bins, file)
                        bbstats_output_file = os.path.join(
                            self.scratch, self.BINNER_RESULT_DIRECTORY,
                            genome_bin_fna_file).split(
                                '.fasta')[0] + ".bbstatsout"
                        bbstats_output = self.generate_stats_for_genome_bins(
                            task_params, genome_bin_fna_file,
                            bbstats_output_file)
                        f.write('{}\t0\t{}\t{}\n'.format(
                            genome_bin_fna_file.split("/")[-1],
                            bbstats_output['contig_bp'],
                            bbstats_output['gc_avg']))
        log('Finished make_binned_contig_summary_file_for_binning_apps function'
            )

    # #
    # def make_binned_contig_summary_file_for_binning_apps(self, task_params):
    #     """
    #     generate_command: generate binned contig summary command
    #     """
    #     log("\n\nRunning make_binned_contig_summary_file_for_binning_apps")
    #     path_to_result = os.path.join(self.scratch, self.BINNER_RESULT_DIRECTORY, "das_tool_output_dir_DASTool_bins")
    #     path_to_summary_file = path_to_result + '/binned_contig.summary'
    #     with open(path_to_summary_file, 'w+') as f:
    #         f.write("Bin name\tCompleteness\tGenome size\tGC content\n")
    #         for dirname, subdirs, files in os.walk(path_to_result):
    #             for file in files:
    #                 if file.endswith('.fasta'):
    #                     genome_bin_fna_file = os.path.join(path_to_result, file)
    #                     bbstats_output_file = os.path.join(path_to_result,
    #                                                        genome_bin_fna_file).split('.fasta')[0] + ".bbstatsout"
    #                     bbstats_output = self.generate_stats_for_genome_bins(task_params,
    #                                                                          genome_bin_fna_file,
    #                                                                          bbstats_output_file)
    #                     f.write('{}\t0\t{}\t{}\n'.format(genome_bin_fna_file.split("/")[-1],
    #                                                      bbstats_output['contig_bp'],
    #                                                      bbstats_output['gc_avg']))
    #         f.close()
    #     log('Finished make_binned_contig_summary_file_for_binning_apps function')
    #

    def generate_stats_for_genome_bins(self, task_params, genome_bin_fna_file,
                                       bbstats_output_file):
        """
        generate_command: bbtools stats.sh command

        Runs stats.sh (format=3: single tab-separated data row) on one bin
        FASTA and returns the parsed fields as a dict of strings (gc_avg and
        gc_std converted to float percentages).
        """
        log("running generate_stats_for_genome_bins on {}".format(
            genome_bin_fna_file))
        genome_bin_fna_file = os.path.join(self.scratch,
                                           self.BINNER_RESULT_DIRECTORY,
                                           genome_bin_fna_file)
        command = '/bin/bash stats.sh in={} format=3 > {}'.format(
            genome_bin_fna_file, bbstats_output_file)
        self.run_command(command)
        # line 0 is the header; line 1 holds the tab-separated values
        bbstats_output = open(bbstats_output_file, 'r').readlines()[1]
        n_scaffolds = bbstats_output.split('\t')[0]
        n_contigs = bbstats_output.split('\t')[1]
        scaf_bp = bbstats_output.split('\t')[2]
        contig_bp = bbstats_output.split('\t')[3]
        gap_pct = bbstats_output.split('\t')[4]
        scaf_N50 = bbstats_output.split('\t')[5]
        scaf_L50 = bbstats_output.split('\t')[6]
        ctg_N50 = bbstats_output.split('\t')[7]
        ctg_L50 = bbstats_output.split('\t')[8]
        scaf_N90 = bbstats_output.split('\t')[9]
        scaf_L90 = bbstats_output.split('\t')[10]
        ctg_N90 = bbstats_output.split('\t')[11]
        ctg_L90 = bbstats_output.split('\t')[12]
        scaf_max = bbstats_output.split('\t')[13]
        ctg_max = bbstats_output.split('\t')[14]
        scaf_n_gt50K = bbstats_output.split('\t')[15]
        scaf_pct_gt50K = bbstats_output.split('\t')[16]
        gc_avg = float(bbstats_output.split('\t')
                       [17]) * 100  # need to figure out if correct
        gc_std = float(bbstats_output.split('\t')
                       [18]) * 100  # need to figure out if correct
        log('Generated generate_stats_for_genome_bins command: {}'.format(
            command))
        return {
            'n_scaffolds': n_scaffolds,
            'n_contigs': n_contigs,
            'scaf_bp': scaf_bp,
            'contig_bp': contig_bp,
            'gap_pct': gap_pct,
            'scaf_N50': scaf_N50,
            'scaf_L50': scaf_L50,
            'ctg_N50': ctg_N50,
            'ctg_L50': ctg_L50,
            'scaf_N90': scaf_N90,
            'scaf_L90': scaf_L90,
            'ctg_N90': ctg_N90,
            'ctg_L90': ctg_L90,
            'scaf_max': scaf_max,
            'ctg_max': ctg_max,
            'scaf_n_gt50K': scaf_n_gt50K,
            'scaf_pct_gt50K': scaf_pct_gt50K,
            'gc_avg': gc_avg,
            'gc_std': gc_std
        }

    def generate_das_tool_input_files_and_commands_from_binned_contigs(
            self, params):
        """Write one <name>_contigs_to_bins.tsv per input BinnedContigs object
        (contig id -> bin label), for DAS_Tool's -i argument.

        :returns: (trimmed_binned_contig_name_list, contig_to_bin_file_name_list)
            — continued past this view boundary.
        """
        #params['binned_contig_list_file'] = binned_contig_list_file
        binned_contig_names = params['input_binned_contig_names']
        trimmed_binned_contig_name_list = []
        contig_to_bin_file_name_list = []

        for input_ref in binned_contig_names:
            # next line needed for testing
            # binned_contig = self.dfu.get_objects({'object_refs': [input_ref['binned_contig_obj_ref']]})['data'][0]
            # next line needed in production only
            binned_contig = self.dfu.get_objects({'object_refs':
                                                  [input_ref]})['data'][0]
            binned_contig_name = binned_contig.get('info')[1]
            binned_contig_data = binned_contig.get('data')
            bins = binned_contig_data.get('bins')
            trimmed_binned_contig_name = binned_contig_name.split(
                ".BinnedContig")[0]
trimmed_binned_contig_name_list.append(trimmed_binned_contig_name) contig_to_bin_file_name = "{}_contigs_to_bins.tsv".format( trimmed_binned_contig_name) contig_to_bin_file_name_list.append(contig_to_bin_file_name) f = open(contig_to_bin_file_name, "w+") for bin in bins: bin_id = bin.get('bid') trimmed_bin_id = bin_id.split(".fasta")[0] contigs = bin.get('contigs') for contig_id, contig_value in contigs.items(): f.write("{}\t{}.{}\n".format(contig_id, trimmed_binned_contig_name, trimmed_bin_id)) f.close() #contig_to_bin_file_name_list = self.BINNER_RESULT_DIRECTORY + contig_to_bin_file_name # temp = str(self.BINNER_RESULT_DIRECTORY) + '/' # contig_to_bin_file_name_list = [temp + s for s in contig_to_bin_file_name_list] return (trimmed_binned_contig_name_list, contig_to_bin_file_name_list) def generate_das_tool_command(self, params, trimmed_binned_contig_name_list, contig_to_bin_file_name_list): """ generate_command: generate concoct params """ print("\n\nRunning generate_das_tool_command") command = 'DAS_Tool ' command += '-i {} '.format(contig_to_bin_file_name_list) command += '-l {} '.format(trimmed_binned_contig_name_list) command += '-c {} '.format(params.get('contig_file_path')) command += '-o {} '.format(self.BINNER_RESULT_DIRECTORY) command += '--search_engine {} '.format(params.get('search_engine')) command += '--score_threshold {} '.format( params.get('score_threshold')) command += '--duplicate_penalty {} '.format( params.get('duplicate_penalty')) command += '--megabin_penalty {} '.format( params.get('megabin_penalty')) command += '--write_bin_evals {} '.format( params.get('write_bin_evals')) command += '--create_plots {} '.format(params.get('create_plots')) command += '--write_bins 1 ' command += '--write_unbinned 0 ' command += '-t {}'.format(self.DASTOOL_THREADS) log('Generated das_tool command: {}'.format(command)) return command def run_das_tool(self, params): """ run_das_tool: DAS_Tool app required params: assembly_ref: Metagenome assembly object 
reference input_binned_contig_names: list of BinnedContig objects output_binned_contig_name: output BinnedContig object name workspace_name: the name of the workspace it gets saved to. optional params: search_engine; default diamond score_threshold; default 0.5 duplicate_penalty; default 0.6 megabin_penalty; default 0.5 write_bin_evals; default 1 create_plots; default 1 write_bins; default 1 write_unbinned; default 0 ref: https://github.com/cmks/DAS_Tool """ log('--->\nrunning DASToolUtil.run_das_tool\n' + 'params:\n{}'.format(json.dumps(params, indent=1))) self.validate_run_das_tool_params(params) print("\n\nFinished running validate_run_das_tool_params") # contig_file = self.get_contig_file(params.get('assembly_ref')) params['contig_file_path'] = contig_file result_directory = os.path.join(self.scratch, self.BINNER_RESULT_DIRECTORY) params['result_directory'] = result_directory self.mkdir_p(result_directory) cwd = os.getcwd() log('Changing working dir to {}'.format(result_directory)) os.chdir(result_directory) ( trimmed_binned_contig_name_list, contig_to_bin_file_name_list ) = self.generate_das_tool_input_files_and_commands_from_binned_contigs( params) comma_symbol = ',' trimmed_binned_contig_name_list = comma_symbol.join( trimmed_binned_contig_name_list) contig_to_bin_file_name_list = comma_symbol.join( contig_to_bin_file_name_list) log(os.listdir(result_directory)) log("trimmed_binned_contig_name_list {}".format( trimmed_binned_contig_name_list)) log("contig_to_bin_file_name_list {}".format( contig_to_bin_file_name_list)) # binned_contig_to_file_params = { # 'input_ref': input_ref['binned_contig_obj_ref'], # 'save_to_shock': 1, # 'bin_file_directory': '{}/bin_set_{}/'.format(result_directory, i), # 'workspace_name': params.get('workspace_name'), # } # # self.mgu.binned_contigs_to_file(binned_contig_to_file_params) # returns "binned_contig_obj_ref" of type "obj_ref" (An X/Y/Z style reference) #shutil.copytree(bin_file_directory, os.path.join(result_directory, 
bin_file_directory)) #print('\n\n\n result: {}'.format(self.mgu.binned_contigs_to_file(binned_contig_to_file_params))) #run concoct command = self.generate_das_tool_command( params, trimmed_binned_contig_name_list, contig_to_bin_file_name_list) log('\nWorking dir is {}'.format(result_directory)) log('\nWorking dir is {}'.format(os.getcwd())) log('Changing working dir to {}'.format(result_directory)) os.chdir(result_directory) self.run_command(command) os.chdir(self.scratch) task_params = {} task_params['result_directory'] = os.path.join(self.scratch) task_params['bin_result_directory'] = os.path.join( self.BINNER_RESULT_DIRECTORY, "das_tool_output_dir_DASTool_bins") # check to make sure bins were generated, otherwise no need to run the rest if not os.path.exists(task_params['bin_result_directory']): log('DAS_Tool did not succeed in generating a set of bins using the input bins and parameters - skipping the creation of a new BinnedContig object.' ) log('Note: this result is sometimes expected using the DAS-Tool workflow; it is possible that DAS-Tool cannot optimize the input binned contigs.' ) log('KBase is aware of this error!') log('Currently KBase manages this run instance as an error because KBase is expecting an output set of binned contigs.' ) raise ValueError( 'No bins generated - this is one of the expected results when DAS-Tool cannot optimize the input bins, and not necessarily an error. KBase is aware of the issue where DAS-Tool runs successfully but does not produce any output set of optimized bins.' 
) else: self.rename_and_standardize_bin_names() self.make_binned_contig_summary_file_for_binning_apps(task_params) generate_binned_contig_param = { 'file_directory': os.path.join(self.scratch, self.BINNER_RESULT_DIRECTORY, "das_tool_output_dir_DASTool_bins"), 'assembly_ref': params.get('assembly_ref'), 'binned_contig_name': params.get('output_binned_contig_name'), 'workspace_name': params.get('workspace_name') } binned_contig_obj_ref = self.mgu.file_to_binned_contigs( generate_binned_contig_param).get('binned_contig_obj_ref') reportVal = self.generate_report(binned_contig_obj_ref, params) returnVal = { 'result_directory': os.path.join(self.scratch, self.BINNER_RESULT_DIRECTORY), 'binned_contig_obj_ref': binned_contig_obj_ref } returnVal.update(reportVal) return returnVal
def __init__(self, config): self.scratch = os.path.abspath(config['scratch']) self.callback_url = os.environ['SDK_CALLBACK_URL'] self.mgu = MetagenomeUtils(self.callback_url) self.au = AssemblyUtil(self.callback_url) self.ws = Workspace(config['workspace-url'], token=config['token'])
class CocacolaUtil:
    """Run the COCACOLA binner on a KBase metagenome assembly.

    The workflow (see run_cocacola): download the assembly and reads, map
    reads to contigs, build coverage and k-mer tables with the bundled
    CONCOCT helper scripts, run COCACOLA, merge/extract the resulting bins
    and upload them as a BinnedContig object with an HTML report.
    """

    # Locations of the bundled CONCOCT helper scripts and the COCACOLA code.
    CONCOCT_BASE_PATH = '/kb/deployment/bin/CONCOCT'
    COCACOLA_BASE_PATH = '/kb/module/lib/kb_cocacola/bin/COCACOLA-python'
    # All intermediate/output files go under this directory (in scratch).
    BINNER_RESULT_DIRECTORY = 'cocacola_output_dir'
    # Subdirectory holding the final extracted bin fastas.
    BINNER_BIN_RESULT_DIR = 'final_bins'
    MAPPING_THREADS = 16
    BBMAP_MEM = '30g'

    def __init__(self, config):
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.scratch = config['scratch']
        self.shock_url = config['shock-url']
        self.ws_url = config['workspace-url']
        self.dfu = DataFileUtil(self.callback_url)
        self.ru = ReadsUtils(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)
        self.mgu = MetagenomeUtils(self.callback_url)

    def _validate_run_cocacola_params(self, task_params):
        """
        _validate_run_cocacola_params:
                validates params passed to run_cocacola method

        :raises ValueError: if any required parameter is missing
        """
        log('Start validating run_cocacola params')

        # check for required parameters
        for p in ['assembly_ref', 'binned_contig_name', 'workspace_name',
                  'reads_list', 'read_mapping_tool']:
            if p not in task_params:
                raise ValueError('"{}" parameter is required, but missing'.format(p))

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path (no-op if it already exists)
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _run_command(self, command):
        """
        _run_command: run a shell command from the scratch directory and
        return (stdout, stderr).

        :raises ValueError: if the command exits non-zero
        """
        os.chdir(self.scratch)
        log('Start executing command:\n{}'.format(command))
        log('Command is running from:\n{}'.format(self.scratch))
        pipe = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
        output, stderr = pipe.communicate()
        exitCode = pipe.returncode

        if exitCode == 0:
            log('Executed command:\n{}\n'.format(command) +
                'Exit Code: {}\n'.format(exitCode))
        else:
            error_msg = 'Error running command:\n{}\n'.format(command)
            error_msg += 'Exit Code: {}\nOutput:\n{}\nStderr:\n{}'.format(
                exitCode, output, stderr)
            # BUGFIX: a sys.exit(1) previously followed this raise and was
            # unreachable dead code; raising alone is sufficient.
            raise ValueError(error_msg)
        return (output, stderr)

    # this function has been customized to return read_type variable
    # (interleaved vs single-end library)
    def stage_reads_list_file(self, reads_list):
        """
        stage_reads_list_file: download fastq file associated to reads to
        scratch area and return result_file_path

        :param reads_list: list of reads object refs (e.g. '12804/1/1')
        :returns: (result_file_path, read_type) — parallel lists of fastq
                  paths on scratch and their library types
        """
        log('Processing reads object list: {}'.format(reads_list))
        result_file_path = []
        read_type = []

        # getting from workspace and writing to scratch. The 'reads' dictionary
        # now has file paths to scratch.
        reads = self.ru.download_reads({'read_libraries': reads_list,
                                        'interleaved': None})['files']

        # "reads" is a hash of hashes keyed by object ref; under 'files' the
        # keys 'fwd'/'rev' hold scratch file paths and 'type' the library type.
        for read_obj in reads_list:
            files = reads[read_obj]['files']
            result_file_path.append(files['fwd'])
            read_type.append(files['type'])
            if 'rev' in files and files['rev'] is not None:
                result_file_path.append(files['rev'])
        return result_file_path, read_type

    def _get_contig_file(self, assembly_ref):
        """
        _get_contig_file: get contig file from GenomeAssembly object
        """
        contig_file = self.au.get_assembly_as_fasta({'ref': assembly_ref}).get('path')
        sys.stdout.flush()
        contig_file = self.dfu.unpack_file({'file_path': contig_file})['file_path']
        return contig_file

    def retrieve_and_clean_assembly(self, task_params):
        """Fetch the assembly (if not already on scratch) and strip spaces
        from fasta headers; returns the path of the cleaned fasta."""
        if os.path.exists(task_params['contig_file_path']):
            assembly = task_params['contig_file_path']
            print("FOUND ASSEMBLY ON LOCAL SCRATCH")
        else:
            # we are on njsw so lets copy it over to scratch
            assembly = self._get_contig_file(task_params['assembly_ref'])

        # remove spaces from fasta headers because that breaks bedtools
        assembly_clean = os.path.abspath(assembly).split('.fa')[0] + "_clean.fa"
        command = '/bin/bash reformat.sh in={} out={} addunderscore overwrite=true'.format(
            assembly, assembly_clean)
        log('running reformat command: {}'.format(command))
        out, err = self._run_command(command)
        return assembly_clean

    def fasta_filter_contigs_generator(self, fasta_record_iter, min_contig_length):
        """Yield only records whose sequence is at least min_contig_length long."""
        # (previous row counters were never used and have been removed)
        for record in fasta_record_iter:
            if len(record.seq) >= min_contig_length:
                yield record

    def filter_contigs_by_length(self, fasta_file_path, min_contig_length):
        """Remove all contigs shorter than min_contig_length; returns the
        path of the filtered fasta."""
        filtered_fasta_file_path = os.path.abspath(fasta_file_path).split('.fa')[0] + "_filtered.fa"
        fasta_record_iter = SeqIO.parse(fasta_file_path, 'fasta')
        SeqIO.write(self.fasta_filter_contigs_generator(fasta_record_iter, min_contig_length),
                    filtered_fasta_file_path, 'fasta')
        return filtered_fasta_file_path

    def generate_stats_for_genome_bins(self, task_params, genome_bin_fna_file, bbstats_output_file):
        """
        generate_command: bbtools stats.sh command

        Run stats.sh (format=3: one header line + one tab-separated data
        line) on a bin fasta and return the parsed statistics as a dict.
        """
        log("running generate_stats_for_genome_bins on {}".format(genome_bin_fna_file))
        genome_bin_fna_file = os.path.join(self.scratch, self.BINNER_RESULT_DIRECTORY,
                                           genome_bin_fna_file)
        command = '/bin/bash stats.sh in={} format=3 > {}'.format(genome_bin_fna_file,
                                                                  bbstats_output_file)
        self._run_command(command)
        # BUGFIX: use a context manager instead of a leaked file handle,
        # and split the data line once rather than nineteen times.
        with open(bbstats_output_file, 'r') as stats_file:
            bbstats_output = stats_file.readlines()[1]
        fields = bbstats_output.split('\t')
        gc_avg = float(fields[17]) * 100  # need to figure out if correct
        gc_std = float(fields[18]) * 100  # need to figure out if correct
        log('Generated generate_stats_for_genome_bins command: {}'.format(command))
        return {'n_scaffolds': fields[0],
                'n_contigs': fields[1],
                'scaf_bp': fields[2],
                'contig_bp': fields[3],
                'gap_pct': fields[4],
                'scaf_N50': fields[5],
                'scaf_L50': fields[6],
                'ctg_N50': fields[7],
                'ctg_L50': fields[8],
                'scaf_N90': fields[9],
                'scaf_L90': fields[10],
                'ctg_N90': fields[11],
                'ctg_L90': fields[12],
                'scaf_max': fields[13],
                'ctg_max': fields[14],
                'scaf_n_gt50K': fields[15],
                'scaf_pct_gt50K': fields[16],
                'gc_avg': gc_avg,
                'gc_std': gc_std
                }

    def deinterlace_raw_reads(self, fastq):
        """Split an interleaved fastq into forward/reverse files with reformat.sh."""
        fastq_forward = fastq.split('.fastq')[0] + "_forward.fastq"
        fastq_reverse = fastq.split('.fastq')[0] + "_reverse.fastq"
        command = 'reformat.sh in={} out1={} out2={} overwrite=true'.format(
            fastq, fastq_forward, fastq_reverse)
        self._run_command(command)
        return (fastq_forward, fastq_reverse)

    def run_read_mapping_interleaved_pairs_mode(self, task_params, assembly_clean, fastq, sam):
        """Map an interleaved paired-end fastq against the cleaned assembly
        with the selected tool, writing alignments to *sam*."""
        read_mapping_tool = task_params['read_mapping_tool']
        log("running {} mapping in interleaved mode.".format(read_mapping_tool))
        if task_params['read_mapping_tool'] == 'bbmap':
            # bbmap handles interleaved input natively
            command = 'bbmap.sh -Xmx{} '.format(self.BBMAP_MEM)
            command += 'threads={} '.format(self.MAPPING_THREADS)
            command += 'ref={} '.format(assembly_clean)
            command += 'in={} '.format(fastq)
            command += 'out={} '.format(sam)
            command += 'fast interleaved=true mappedonly nodisk overwrite'
        elif task_params['read_mapping_tool'] == 'bwa':
            (fastq_forward, fastq_reverse) = self.deinterlace_raw_reads(fastq)
            command = 'bwa index {} && '.format(assembly_clean)
            command += 'bwa mem -t {} '.format(self.MAPPING_THREADS)
            command += '{} '.format(assembly_clean)
            command += '{} '.format(fastq_forward)
            command += '{} > '.format(fastq_reverse)
            command += '{}'.format(sam)
        elif task_params['read_mapping_tool'] == 'bowtie2_default':
            (fastq_forward, fastq_reverse) = self.deinterlace_raw_reads(fastq)
            bt2index = os.path.basename(assembly_clean) + '.bt2'
            command = 'bowtie2-build -f {} '.format(assembly_clean)
            command += '--threads {} '.format(self.MAPPING_THREADS)
            command += '{} && '.format(bt2index)
            command += 'bowtie2 -x {} '.format(bt2index)
            command += '-1 {} '.format(fastq_forward)
            command += '-2 {} '.format(fastq_reverse)
            command += '--threads {} '.format(self.MAPPING_THREADS)
            command += '-S {}'.format(sam)
        elif task_params['read_mapping_tool'] == 'bowtie2_very_sensitive':
            (fastq_forward, fastq_reverse) = self.deinterlace_raw_reads(fastq)
            bt2index = os.path.basename(assembly_clean) + '.bt2'
            command = 'bowtie2-build -f {} '.format(assembly_clean)
            command += '--threads {} '.format(self.MAPPING_THREADS)
            command += '{} && '.format(bt2index)
            command += 'bowtie2 --very-sensitive -x {} '.format(bt2index)
            command += '-1 {} '.format(fastq_forward)
            command += '-2 {} '.format(fastq_reverse)
            command += '--threads {} '.format(self.MAPPING_THREADS)
            command += '-S {}'.format(sam)
        elif task_params['read_mapping_tool'] == 'minimap2':
            (fastq_forward, fastq_reverse) = self.deinterlace_raw_reads(fastq)
            command = 'minimap2 -ax sr -t {} '.format(self.MAPPING_THREADS)
            command += '{} '.format(assembly_clean)
            command += '{} '.format(fastq_forward)
            command += '{} > '.format(fastq_reverse)
            command += '{}'.format(sam)
        elif task_params['read_mapping_tool'] == 'hisat2':
            (fastq_forward, fastq_reverse) = self.deinterlace_raw_reads(fastq)
            ht2index = os.path.basename(assembly_clean) + '.ht2'
            command = 'hisat2-build {} '.format(assembly_clean)
            command += '{} && '.format(ht2index)
            command += 'hisat2 -x {} '.format(ht2index)
            command += '-1 {} '.format(fastq_forward)
            command += '-2 {} '.format(fastq_reverse)
            command += '-S {} '.format(sam)
            command += '--threads {}'.format(self.MAPPING_THREADS)
        log('running alignment command: {}'.format(command))
        out, err = self._run_command(command)

    def run_read_mapping_unpaired_mode(self, task_params, assembly_clean, fastq, sam):
        """Map a single-end fastq against the cleaned assembly with the
        selected tool, writing alignments to *sam*."""
        read_mapping_tool = task_params['read_mapping_tool']
        log("running {} mapping in single-end (unpaired) mode.".format(read_mapping_tool))
        if task_params['read_mapping_tool'] == 'bbmap':
            command = 'bbmap.sh -Xmx{} '.format(self.BBMAP_MEM)
            command += 'threads={} '.format(self.MAPPING_THREADS)
            command += 'ref={} '.format(assembly_clean)
            command += 'in={} '.format(fastq)
            command += 'out={} '.format(sam)
            command += 'fast interleaved=false mappedonly nodisk overwrite'
            # BBMap is deterministic without the deterministic flag if
            # using single-ended reads
        elif task_params['read_mapping_tool'] == 'bwa':
            command = 'bwa index {} && '.format(assembly_clean)
            command += 'bwa mem -t {} '.format(self.MAPPING_THREADS)
            command += '{} '.format(assembly_clean)
            command += '{} > '.format(fastq)
            command += '{}'.format(sam)
        elif task_params['read_mapping_tool'] == 'bowtie2_default':
            bt2index = os.path.basename(assembly_clean) + '.bt2'
            command = 'bowtie2-build -f {} '.format(assembly_clean)
            command += '--threads {} '.format(self.MAPPING_THREADS)
            command += '{} && '.format(bt2index)
            command += 'bowtie2 -x {} '.format(bt2index)
            command += '-U {} '.format(fastq)
            command += '--threads {} '.format(self.MAPPING_THREADS)
            command += '-S {}'.format(sam)
        elif task_params['read_mapping_tool'] == 'bowtie2_very_sensitive':
            bt2index = os.path.basename(assembly_clean) + '.bt2'
            command = 'bowtie2-build -f {} '.format(assembly_clean)
            command += '--threads {} '.format(self.MAPPING_THREADS)
            command += '{} && '.format(bt2index)
            command += 'bowtie2 --very-sensitive -x {} '.format(bt2index)
            command += '-U {} '.format(fastq)
            command += '--threads {} '.format(self.MAPPING_THREADS)
            command += '-S {}'.format(sam)
        elif task_params['read_mapping_tool'] == 'minimap2':
            command = 'minimap2 -ax sr -t {} '.format(self.MAPPING_THREADS)
            command += '{} '.format(assembly_clean)
            command += '{} > '.format(fastq)
            command += '{}'.format(sam)
        elif task_params['read_mapping_tool'] == 'hisat2':
            ht2index = os.path.basename(assembly_clean) + '.ht2'
            command = 'hisat2-build {} '.format(assembly_clean)
            command += '{} && '.format(ht2index)
            command += 'hisat2 -x {} '.format(ht2index)
            command += '-U {} '.format(fastq)
            command += '-S {} '.format(sam)
            command += '--threads {}'.format(self.MAPPING_THREADS)
        log('running alignment command: {}'.format(command))
        out, err = self._run_command(command)

    def convert_sam_to_sorted_and_indexed_bam(self, sam):
        """Convert a SAM of mapped reads into a sorted, indexed BAM; returns
        the sorted BAM path. Exits the process if the BAM is missing/empty."""
        # create bam files from sam files
        sorted_bam = os.path.abspath(sam).split('.sam')[0] + "_sorted.bam"

        # -F 0x04 drops unmapped reads
        command = 'samtools view -F 0x04 -uS {} | '.format(sam)
        command += 'samtools sort - -o {}'.format(sorted_bam)

        log('running samtools command to generate sorted bam: {}'.format(command))
        self._run_command(command)

        # verify we got bams
        if not os.path.exists(sorted_bam):
            log('Failed to find bam file\n{}'.format(sorted_bam))
            sys.exit(1)
        elif os.stat(sorted_bam).st_size == 0:
            log('Bam file is empty\n{}'.format(sorted_bam))
            sys.exit(1)

        # index the bam file
        command = 'samtools index {}'.format(sorted_bam)
        log('running samtools command to index sorted bam: {}'.format(command))
        self._run_command(command)
        return sorted_bam

    def generate_alignment_bams(self, task_params, assembly_clean):
        """
        This function runs the selected read mapper and creates the
        sorted and indexed bam files from sam files using samtools.
        """
        reads_list = task_params['reads_list']
        (read_scratch_path, read_type) = self.stage_reads_list_file(reads_list)
        sorted_bam_file_list = []

        # list of reads files, can be 1 or more. assuming reads are either type
        # unpaired or interleaved; will not handle unpaired forward and reverse
        # reads input as seperate (non-interleaved) files
        for fastq, fastq_type in zip(read_scratch_path, read_type):
            sam = os.path.basename(fastq).split('.fastq')[0] + ".sam"
            sam = os.path.join(self.BINNER_RESULT_DIRECTORY, sam)

            if fastq_type == 'interleaved':  # make sure working - needs tests
                log("Running interleaved read mapping mode")
                self.run_read_mapping_interleaved_pairs_mode(task_params, assembly_clean, fastq, sam)
            else:  # running read mapping in single-end mode
                log("Running unpaired read mapping mode")
                self.run_read_mapping_unpaired_mode(task_params, assembly_clean, fastq, sam)

            sorted_bam = self.convert_sam_to_sorted_and_indexed_bam(sam)
            sorted_bam_file_list.append(sorted_bam)
        return sorted_bam_file_list

    def generate_make_coverage_table_command(self, task_params, sorted_bam_file_list):
        """Create the contig depth file for the bam named in
        task_params['sorted_bam']; returns the depth file path.

        NOTE: sorted_bam_file_list is currently unused (kept for interface
        compatibility); this method is not called by run_cocacola right now.
        """
        # BUGFIX: the assignment below had been commented out while
        # min_contig_length was still referenced, causing a NameError.
        min_contig_length = task_params['min_contig_length']
        sorted_bam = task_params['sorted_bam']

        depth_file_path = os.path.join(self.scratch, str('cocacola_depth.txt'))
        command = '/kb/module/lib/kb_cocacola/bin/jgi_summarize_bam_contig_depths '
        command += '--outputDepth {} '.format(depth_file_path)
        command += '--minContigLength {} '.format(min_contig_length)
        command += '--minContigDepth 1 {}'.format(sorted_bam)

        log('running summarize_bam_contig_depths command: {}'.format(command))
        self._run_command(command)
        return depth_file_path

    def generate_cocacola_cut_up_fasta_command(self, task_params):
        """
        generate_command: cocacola cut_up_fasta
        """
        contig_file_path = task_params['contig_file_path']
        contig_split_size = task_params['contig_split_size']
        contig_split_overlap = task_params['contig_split_overlap']

        log("\n\nRunning generate_cocacola_cut_up_fasta_command")
        command = 'python {}/scripts/cut_up_fasta.py '.format(self.CONCOCT_BASE_PATH)
        command += '{} '.format(contig_file_path)
        command += '-c {} '.format(contig_split_size)
        command += '-o {} '.format(contig_split_overlap)
        command += '--merge_last -b temp.bed > {}/split_contigs.fa'.format(
            self.BINNER_RESULT_DIRECTORY)
        log('Generated cocacola_cut_up_fasta command: {}'.format(command))
        self._run_command(command)

    def generate_cocacola_input_table_from_bam(self, task_params):
        """
        generate_command: cocacola generate input table

        Also counts the split contigs and stores the count in
        task_params['calc_contigs'] for the k-mer table step.
        """
        log("\n\nRunning generate_cocacola_input_table_from_bam")
        command = 'python {}/scripts/gen_input_table.py '.format(self.CONCOCT_BASE_PATH)
        command += '{}/split_contigs.fa '.format(self.BINNER_RESULT_DIRECTORY)
        command += '{}/*_sorted.bam > '.format(self.BINNER_RESULT_DIRECTORY)
        command += '{}/coverage_table.tsv'.format(self.BINNER_RESULT_DIRECTORY)
        log('Generated cocacola generate input table from bam command: {}'.format(command))

        # BUGFIX: open the fasta in a context manager (was a leaked handle).
        calc_contigs = 0
        with open('{}/split_contigs.fa'.format(self.BINNER_RESULT_DIRECTORY)) as split_fa:
            for line in split_fa:
                if line.startswith(">"):
                    calc_contigs += 1
        task_params['calc_contigs'] = calc_contigs
        self._run_command(command)

    def generate_cocacola_kmer_composition_table(self, task_params):
        """
        generate_command: cocacola generate kmer composition table
        """
        log("\n\nRunning generate_cocacola_kmer_composition_table")
        calc_contigs = task_params['calc_contigs']
        kmer_size = task_params['kmer_size']
        command = 'python {}/scripts/fasta_to_features.py '.format(self.CONCOCT_BASE_PATH)
        command += '{}/split_contigs.fa '.format(self.BINNER_RESULT_DIRECTORY)
        command += '{} '.format(calc_contigs)
        command += '{} '.format(kmer_size)
        command += '{}/split_contigs_kmer_{}.csv'.format(self.BINNER_RESULT_DIRECTORY,
                                                         kmer_size)
        log('Generated cocacola generate input table from bam command: {}'.format(command))
        self._run_command(command)

    def generate_cocacola_command(self, task_params):
        """
        generate_command: cocacola
        """
        min_contig_length = task_params['min_contig_length']
        kmer_size = task_params['kmer_size']

        log("\n\nRunning generate_cocacola_command")
        command = 'python {}/cocacola.py '.format(self.COCACOLA_BASE_PATH)
        command += '--contig_file {}/split_contigs.fa '.format(self.BINNER_RESULT_DIRECTORY)
        command += '--abundance_profiles {}/coverage_table.tsv '.format(
            self.BINNER_RESULT_DIRECTORY)
        command += '--composition_profiles {}/split_contigs_kmer_{}.csv '.format(
            self.BINNER_RESULT_DIRECTORY, kmer_size)
        command += '--output {}/cocacola_output_clusters_min{}.csv'.format(
            self.BINNER_RESULT_DIRECTORY, min_contig_length)
        log('Generated cocacola command: {}'.format(command))
        self._run_command(command)

    def add_header_to_post_clustering_file(self, task_params):
        """Prepend a CSV header to the raw COCACOLA clustering output,
        producing the *_headers.csv file the merge step consumes."""
        min_contig_length = task_params['min_contig_length']
        # BUGFIX: the header must be newline-terminated, otherwise it fuses
        # with the first data row and that row is lost when the downstream
        # merge script skips the "header" line.
        header = "contig_id,cluster_id\n"
        with open('{}/cocacola_output_clusters_min{}_headers.csv'.format(
                self.BINNER_RESULT_DIRECTORY, min_contig_length), 'w') as outfile:
            outfile.write(header)
            with open('{}/cocacola_output_clusters_min{}.csv'.format(
                    self.BINNER_RESULT_DIRECTORY, min_contig_length), 'r') as datafile:
                for line in datafile:
                    outfile.write(line)

    def generate_cocacola_post_clustering_merging_command(self, task_params):
        """
        generate_command: cocacola post cluster merging
        """
        min_contig_length = task_params['min_contig_length']
        log("\n\nRunning generate_cocacola_post_clustering_merging_command")

        command = 'python {}/scripts/merge_cutup_clustering.py '.format(self.CONCOCT_BASE_PATH)
        command += '{}/cocacola_output_clusters_min{}_headers.csv > '.format(
            self.BINNER_RESULT_DIRECTORY, min_contig_length)
        command += '{}/clustering_merged_min{}.csv'.format(
            self.BINNER_RESULT_DIRECTORY, min_contig_length)
        log('Generated generate_cocacola_post_clustering_merging command: {}'.format(command))
        self._run_command(command)

    def generate_cocacola_extract_fasta_bins_command(self, task_params):
        """
        generate_command: cocacola extract_fasta_bins
        """
        log("\n\nRunning generate_cocacola_extract_fasta_bins_command")

        contig_file_path = task_params['contig_file_path']
        min_contig_length = task_params['min_contig_length']

        bin_result_directory = self.BINNER_RESULT_DIRECTORY + '/' + self.BINNER_BIN_RESULT_DIR
        self._mkdir_p(bin_result_directory)
        command = 'python {}/scripts/extract_fasta_bins.py '.format(self.CONCOCT_BASE_PATH)
        command += '{} '.format(contig_file_path)
        command += '{}/clustering_merged_min{}.csv '.format(
            self.BINNER_RESULT_DIRECTORY, min_contig_length)
        command += '--output_path {}/{}'.format(self.BINNER_RESULT_DIRECTORY,
                                                self.BINNER_BIN_RESULT_DIR)
        log('Generated generate_cocacola_extract_fasta_bins_command command: {}'.format(command))
        self._run_command(command)

    def rename_and_standardize_bin_names(self, task_params):
        """
        generate_command: generate renamed bins (bin.001.fasta, bin.002.fasta, ...)
        """
        log("\n\nRunning rename_and_standardize_bin_names")
        path_to_cocacola_result_bins = os.path.abspath(self.BINNER_RESULT_DIRECTORY) + \
            '/' + self.BINNER_BIN_RESULT_DIR + '/'
        for dirname, subdirs, files in os.walk(path_to_cocacola_result_bins):
            for file in files:
                if file.endswith('.fa'):
                    os.rename(os.path.abspath(path_to_cocacola_result_bins) + '/' + file,
                              os.path.abspath(path_to_cocacola_result_bins) + '/bin.' +
                              file.split('.fa')[0].zfill(3) + '.fasta')  # need to change to 4 digits

    def make_binned_contig_summary_file_for_binning_apps(self, task_params):
        """
        generate_command: generate binned contig summary command

        Writes binned_contig.summary (one row per bin fasta) next to the
        final bins; completeness is hard-coded to 0 here.
        """
        log("\n\nRunning make_binned_contig_summary_file_for_binning_apps")
        path_to_cocacola_result = os.path.abspath(self.BINNER_RESULT_DIRECTORY)
        path_to_cocacola_result_bins = '{}/{}/'.format(path_to_cocacola_result,
                                                       self.BINNER_BIN_RESULT_DIR)
        path_to_summary_file = path_to_cocacola_result_bins + 'binned_contig.summary'
        with open(path_to_summary_file, 'w+') as f:
            f.write("Bin name\tCompleteness\tGenome size\tGC content\n")
            for dirname, subdirs, files in os.walk(path_to_cocacola_result_bins):
                for file in files:
                    if file.endswith('.fasta'):
                        genome_bin_fna_file = os.path.join(self.BINNER_BIN_RESULT_DIR, file)
                        bbstats_output_file = os.path.join(
                            self.scratch, self.BINNER_RESULT_DIRECTORY,
                            genome_bin_fna_file).split('.fasta')[0] + ".bbstatsout"
                        bbstats_output = self.generate_stats_for_genome_bins(
                            task_params, genome_bin_fna_file, bbstats_output_file)
                        f.write('{}\t0\t{}\t{}\n'.format(
                            genome_bin_fna_file.split("/")[-1],
                            bbstats_output['contig_bp'],
                            bbstats_output['gc_avg']))
        # (redundant f.close() removed; the with-block closes the file)
        log('Finished make_binned_contig_summary_file_for_binning_apps function')

    def generate_output_file_list(self, result_directory):
        """
        generate_output_file_list: zip result files and generate file_links for report
        """
        log('Start packing result files')
        output_files = list()
        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file = os.path.join(output_directory, 'cocacola_result.zip')

        with zipfile.ZipFile(result_file, 'w', zipfile.ZIP_DEFLATED,
                             allowZip64=True) as zip_file:
            for dirname, subdirs, files in os.walk(result_directory):
                for file in files:
                    # skip alignment artifacts and the summary file
                    if file.endswith(('.sam', '.bam', '.bai', '.summary')):
                        continue
                    # final bins are added below under their own directory
                    if dirname.endswith(self.BINNER_BIN_RESULT_DIR):
                        continue
                    zip_file.write(os.path.join(dirname, file), file)
                if dirname.endswith(self.BINNER_BIN_RESULT_DIR):
                    baseDir = os.path.basename(dirname)
                    for file in files:
                        full = os.path.join(dirname, file)
                        zip_file.write(full, os.path.join(baseDir, file))

        # BUGFIX: description said "CONCOCT App" (copy-paste) in this
        # cocacola module.
        output_files.append({'path': result_file,
                             'name': os.path.basename(result_file),
                             'label': os.path.basename(result_file),
                             'description': 'Files generated by COCACOLA App'})
        return output_files

    def generate_html_report(self, result_directory, assembly_ref, binned_contig_obj_ref):
        """
        generate_html_report: generate html summary report
        """
        log('Start generating html report')
        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, 'report.html')

        # get summary data from existing assembly object and bins_objects
        Summary_Table_Content = ''
        Overview_Content = ''
        (binned_contig_count, input_contig_count, total_bins_count) = \
            self.generate_overview_info(assembly_ref, binned_contig_obj_ref,
                                        result_directory)

        Overview_Content += '<p>Binned contigs: {}</p>'.format(binned_contig_count)
        Overview_Content += '<p>Input contigs: {}</p>'.format(input_contig_count)
        Overview_Content += '<p>Number of bins: {}</p>'.format(total_bins_count)

        with open(result_file_path, 'w') as result_file:
            with open(os.path.join(os.path.dirname(__file__), 'report_template.html'),
                      'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace('<p>Overview_Content</p>',
                                                          Overview_Content)
                report_template = report_template.replace('Summary_Table_Content',
                                                          Summary_Table_Content)
                result_file.write(report_template)

        html_report.append({'path': result_file_path,
                            'name': os.path.basename(result_file_path),
                            'label': os.path.basename(result_file_path),
                            'description': 'HTML summary report for kb_cocacola App'})
        return html_report

    def generate_overview_info(self, assembly_ref, binned_contig_obj_ref, result_directory):
        """
        _generate_overview_info: generate overview information from assembly and binnedcontig
        """
        # get assembly and binned_contig objects that already have some data
        # populated in them
        assembly = self.dfu.get_objects({'object_refs': [assembly_ref]})['data'][0]
        binned_contig = self.dfu.get_objects(
            {'object_refs': [binned_contig_obj_ref]})['data'][0]

        input_contig_count = assembly.get('data').get('num_contigs')
        total_bins = binned_contig.get('data').get('bins')
        total_bins_count = len(total_bins)
        binned_contig_count = 0
        for bin in total_bins:
            binned_contig_count += len(bin.get('contigs'))

        return (binned_contig_count, input_contig_count, total_bins_count)

    def generate_report(self, binned_contig_obj_ref, task_params):
        """
        generate_report: generate summary report
        """
        log('Generating report')
        # use the class constant for the result directory (same value as the
        # previously hard-coded "cocacola_output_dir")
        result_directory = os.path.join(self.scratch, self.BINNER_RESULT_DIRECTORY)
        task_params['result_directory'] = result_directory

        output_files = self.generate_output_file_list(task_params['result_directory'])
        output_html_files = self.generate_html_report(task_params['result_directory'],
                                                      task_params['assembly_ref'],
                                                      binned_contig_obj_ref)

        report_params = {
            'message': '',
            'workspace_name': task_params['workspace_name'],
            'file_links': output_files,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 266,
            'report_object_name': 'kb_cocacola_report_' + str(uuid.uuid4())
        }
        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)
        report_output = {'report_name': output['name'], 'report_ref': output['ref']}
        return report_output

    def create_dict_from_depth_file(self, depth_file_path):
        """Parse a jgi_summarize_bam_contig_depths output file into a dict.

        Keeps contig order (required by metabat2). Keys are contig IDs (the
        first whitespace-separated word of column one, to cope with fastq
        names containing spaces); values are the remaining columns. The
        column names are stored under the 'header' key.
        """
        depth_file_dict = {}
        with open(depth_file_path, 'r') as f:
            header = f.readline().rstrip().split("\t")
            for line in f:
                vals = line.rstrip().split("\t")
                # deal with cases where the name has spaces: assume the first
                # non-whitespace word is unique and use this as ID
                if ' ' in vals[0]:
                    ID = vals[0].split()[0]
                else:
                    ID = vals[0]
                depth_file_dict[ID] = vals[1:]
            depth_file_dict['header'] = header
        return depth_file_dict

    def run_cocacola(self, task_params):
        """
        run_cocacola: cocacola app

        required params:
            assembly_ref: Metagenome assembly object reference
            binned_contig_name: BinnedContig object name and output file header
            workspace_name: the name of the workspace it gets saved to.
            reads_list: list of reads object (PairedEndLibrary/SingleEndLibrary)
                upon which CONCOCT will be run

        optional params:
            min_contig_length: minimum contig length; default 1000

        ref: https://github.com/BinPro/CONCOCT/blob/develop/README.md
        """
        log('--->\nrunning CocacolaUtil.run_cocacola\n' +
            'task_params:\n{}'.format(json.dumps(task_params, indent=1)))

        self._validate_run_cocacola_params(task_params)

        # get assembly
        contig_file = self._get_contig_file(task_params['assembly_ref'])
        task_params['contig_file_path'] = contig_file

        # clean the assembly file so that there are no spaces in the fasta headers
        assembly_clean = self.retrieve_and_clean_assembly(task_params)
        assembly_clean_temp = self.filter_contigs_by_length(
            assembly_clean, task_params['min_contig_length'])
        task_params['contig_file_path'] = assembly_clean_temp
        # need to clean this up, ugly redundant variable usage
        assembly_clean = assembly_clean_temp

        # get reads
        (reads_list_file, read_type) = self.stage_reads_list_file(task_params['reads_list'])
        task_params['read_type'] = read_type
        task_params['reads_list_file'] = reads_list_file

        # prep result directory
        result_directory = os.path.join(self.scratch, self.BINNER_RESULT_DIRECTORY)
        self._mkdir_p(result_directory)

        cwd = os.getcwd()
        log('changing working dir to {}'.format(result_directory))
        os.chdir(result_directory)

        # run alignments, and update input contigs to use the clean file;
        # this function has an internal loop to generate a sorted bam file
        # for each input read file
        self.generate_alignment_bams(task_params, assembly_clean)

        # not used right now
        # depth_file_path = self.generate_make_coverage_table_command(task_params, sorted_bam_file_list)
        # depth_dict = self.create_dict_from_depth_file(depth_file_path)

        # run cocacola prep, cut up fasta input
        self.generate_cocacola_cut_up_fasta_command(task_params)

        # run cococola prep, generate coverage tables from bam
        self.generate_cocacola_input_table_from_bam(task_params)

        # run cococola prep, generate kmer table
        self.generate_cocacola_kmer_composition_table(task_params)

        # run cocacola prep and cocacola
        self.generate_cocacola_command(task_params)

        # run command to add header to output file
        self.add_header_to_post_clustering_file(task_params)

        # run cocacola post cluster merging command
        self.generate_cocacola_post_clustering_merging_command(task_params)

        # run extract bins command
        self.generate_cocacola_extract_fasta_bins_command(task_params)

        # run fasta renaming
        self.rename_and_standardize_bin_names(task_params)

        # make binned contig summary file
        self.make_binned_contig_summary_file_for_binning_apps(task_params)

        # file handling and management
        os.chdir(cwd)
        log('changing working dir to {}'.format(cwd))

        log('Saved result files to: {}'.format(result_directory))
        log('Generated files:\n{}'.format('\n'.join(os.listdir(result_directory))))

        # make new BinnedContig object and upload to KBase
        generate_binned_contig_param = {
            'file_directory': os.path.join(result_directory, self.BINNER_BIN_RESULT_DIR),
            'assembly_ref': task_params['assembly_ref'],
            'binned_contig_name': task_params['binned_contig_name'],
            'workspace_name': task_params['workspace_name']
        }
        binned_contig_obj_ref = \
            self.mgu.file_to_binned_contigs(generate_binned_contig_param).get(
                'binned_contig_obj_ref')

        # generate report
        reportVal = self.generate_report(binned_contig_obj_ref, task_params)
        returnVal = {
            'result_directory': result_directory,
            'binned_contig_obj_ref': binned_contig_obj_ref
        }
        returnVal.update(reportVal)
        return returnVal
def process_kbase_objects(host_ref, virus_ref, shared_folder, callback, workspace, token):
    """
    Convert KBase object(s) into usable files for VirMatcher

    :param host_ref: Putative host / microbial genomes with KBase '#/#/#' used to describe each object
    :param virus_ref: Viral genomes with KBase '#/#/#' used to describe each object
    :param shared_folder: KBase job node's "working" directory, where actual files exist
    :param callback: Callback URL for the KBase service utility clients
    :param workspace: Workspace name
    :param token: Job token
    :return: tuple of (host_dir, virus_fps) — the directory holding one fasta
             per putative host, and the path to the viral fasta file
    :raises ValueError: if either object's type is not supported
    """
    dfu = DataFileUtil(callback, token=token)
    ws = Workspace(workspace, token=token)
    mgu = MetagenomeUtils(callback, token=token)
    au = AssemblyUtil(callback, token=token)

    # Need to determine KBase type in order to know how to properly proceed
    host_type = ws.get_object_info3({'objects': [{
        'ref': host_ref
    }]})['infos'][0][2].split('-')[0]
    virus_type = ws.get_object_info3({'objects': [{
        'ref': virus_ref
    }]})['infos'][0][2].split('-')[0]

    logging.info(f'Potential hosts identified as: {host_type}')
    logging.info(f'Viruses identified as: {virus_type}')

    # Create new directory to house virus and host files
    host_dir = Path(shared_folder) / 'host_files'
    if not host_dir.exists():
        os.mkdir(host_dir)

    host_count = 0

    if host_type == 'KBaseGenomeAnnotations.Assembly':
        # No info about individual genomes, so treat each sequence as an organism
        host_fps = au.get_assembly_as_fasta(
            {'ref': host_ref})['path']  # Consists of dict: path + assembly_name
        logging.info(
            f'Identified {host_type}. Each sequence will be treated as a separate organism.'
        )
        records = SeqIO.parse(host_fps, 'fasta')
        for record in records:
            host_count += 1
            tmp_fp = host_dir / f'{record.id}.fasta'  # TODO Illegal filenames?
            SeqIO.write([record], tmp_fp, 'fasta')

    elif host_type == 'KBaseGenomes.Genome':
        # BUGFIX: the real type name is 'KBaseGenomes.Genome' (no trailing 's'),
        # so Genome inputs previously fell through to the unsupported-type error.
        # Resolve the genome's assembly (older objects carry contigset_ref) and
        # export it as a single host fasta, mirroring the AssemblySet branch.
        genome_data = ws.get_objects2({'objects': [{
            'ref': host_ref
        }]})['data'][0]['data']
        assembly_ref = str(genome_data.get('contigset_ref') or genome_data.get('assembly_ref'))
        host_fp = au.get_assembly_as_fasta({'ref': f'{host_ref};{assembly_ref}'})['path']
        target_fn = os.path.splitext(os.path.basename(host_fp))[0].strip('_') + '.fasta'
        shutil.copyfile(host_fp, host_dir / target_fn)
        host_count += 1

    # elif host_type == 'KBaseSets.GenomeSet'

    elif host_type == 'KBaseSets.AssemblySet':
        obj_data = dfu.get_objects({'object_refs': [host_ref]})['data'][0]
        for subobj in obj_data['data']['items']:
            host_fp = au.get_assembly_as_fasta({'ref': subobj['ref']})['path']
            # BUGFIX: os.path.splitext keeps the leading dot, so the extension is
            # '.fasta' — the old comparison against 'fasta' was always true and
            # made the else branch unreachable
            if os.path.splitext(host_fp)[-1] != '.fasta':  # Ensure extension always = fasta
                target_fn = os.path.splitext(
                    os.path.basename(host_fp))[0].strip('_') + '.fasta'
            else:
                target_fn = os.path.basename(host_fp).strip('_')
            shutil.copyfile(host_fp, host_dir / target_fn)
            host_count += 1

    elif host_type == 'KBaseMetagenomes.BinnedContigs':  # This is what we want!
        host_kbase_dir = mgu.binned_contigs_to_file({
            'input_ref': host_ref,
            'save_to_shock': 0
        })['bin_file_directory']  # Dict of bin_file_dir and shock_id
        for (dirpath, dirnames, fns) in os.walk(
                host_kbase_dir):  # Dirnames = all folders under dirpath
            for fn in fns:
                # BUGFIX: copy from the file's REAL on-disk name and rename only
                # the destination — the original renamed fn before building the
                # source path, raising FileNotFoundError for any bin whose file
                # did not already end in '.fasta' (and its splitext comparison
                # against 'fasta' without the dot was always true anyway)
                src = Path(dirpath) / fn
                if src.suffix != '.fasta':
                    dest = host_dir / (os.path.splitext(fn)[0] + '.fasta')
                else:
                    dest = host_dir / fn
                shutil.copy(src, dest)
                host_count += 1
    else:
        raise ValueError(f'{host_type} is not supported.')

    logging.info(f'{host_count} potential host genomes were identified.')

    virus_count = 0
    if virus_type == 'KBaseGenomeAnnotations.Assembly':
        virus_fps = au.get_assembly_as_fasta({'ref': virus_ref})['path']
        records = SeqIO.parse(virus_fps, 'fasta')
        virus_count = len(list(records))
        # for record in records:
        #     virus_count += 1
        #     tmp_fp = virus_dir / f'{record.id}.fasta'
        #     SeqIO.write([record], tmp_fp, 'fasta')
    else:
        raise ValueError(f'{virus_type} is not supported.')

    logging.info(f'{virus_count} potential viral genomes were identified.')

    # TODO Do we even need any of this data? We don't care about what the sequences are called
    # host_data = dfu.get_objects({'object_refs': [host_ref]})['data'][0]
    # virus_data = dfu.get_objects({'object_refs': [virus_ref]})['data'][0]

    return host_dir, virus_fps
class TypeToFasta:
    """Resolve KBase objects of assorted types to FASTA file paths.

    Supported types: GenomeSet (Sets/Search), Genome, ContigSet, Assembly,
    AssemblySet, and BinnedContigs. Unsupported types are silently skipped
    (no entries are added for them).
    """

    def __init__(self, callback_url, scratch, ws_url):
        # NOTE(review): despite the name, ws_url is used as a Workspace client
        # (get_object_info3 / get_objects2 are called on it) — confirm with callers
        self.ws_url = ws_url
        self.callback_url = callback_url
        self.scratch = scratch  # scratch directory that receives copied bin fastas
        self.dfu = DataFileUtil(callback_url)
        self.mgu = MetagenomeUtils(callback_url)

    def type_to_fasta(self, ctx, ref_lst):
        """
        For every ref in ref_lst, produce the fasta path(s) it resolves to.

        :param ctx: service call context, forwarded to AssemblyToFasta
        :param ref_lst: list of KBase '#/#/#' object references
        :return: {'FASTA': [path1, ref1, path2, ref2, ...]} — a flat list of
                 alternating fasta paths and the reference each came from
        """
        fasta_dict = dict()
        fasta_array = []
        atf = AssemblyToFasta(self.callback_url, self.scratch)

        for ref in ref_lst:
            upas = []
            obj = {"ref": ref}
            obj_info = self.ws_url.get_object_info3({"objects": [obj]})
            obj_type = obj_info["infos"][0][2]

            if 'KBaseSets.GenomeSet' in obj_type:
                obj_data = self.dfu.get_objects({"object_refs": [ref]})['data'][0]
                upas = [gsi['ref'] for gsi in obj_data['data']['items']]
            elif 'KBaseSearch.GenomeSet' in obj_type:
                obj_data = self.dfu.get_objects({"object_refs": [ref]})['data'][0]
                upas = [gse['ref'] for gse in obj_data['data']['elements'].values()]
            elif "KBaseGenomes.Genome" in obj_type:
                upas = [ref]
            elif "KBaseGenomes.ContigSet" in obj_type or "KBaseGenomeAnnotations.Assembly" in obj_type:
                faf = atf.assembly_as_fasta(ctx, obj)
                fasta_array.extend([faf['path'], ref])
            elif "KBaseSets.AssemblySet" in obj_type:
                obj_data = self.dfu.get_objects({"object_refs": [ref]})['data'][0]
                for item_upa in obj_data['data']['items']:
                    faf = atf.assembly_as_fasta(ctx, {"ref": item_upa['ref']})
                    # BUGFIX: extend the shared accumulator instead of rebinding
                    # fasta_array to a fresh list — the old code discarded all
                    # results gathered from earlier refs in ref_lst
                    fasta_array.extend([faf['path'], item_upa['ref']])
            elif 'KBaseMetagenomes.BinnedContigs' in obj_type:
                bin_file_dir = self.mgu.binned_contigs_to_file(
                    {'input_ref': ref, 'save_to_shock': 0})['bin_file_directory']
                for (dirpath, dirnames, filenames) in os.walk(bin_file_dir):
                    for fasta_file in filenames:
                        fasta_path = os.path.join(self.scratch, fasta_file)
                        copyfile(os.path.join(bin_file_dir, fasta_file), fasta_path)
                        # BUGFIX: extend, do not overwrite (same issue as the
                        # AssemblySet branch)
                        fasta_array.extend([fasta_path, ref])
                    break  # only the top level of the bin directory is wanted

            # Genome(-set) refs resolve through the genome's assembly/contigset
            # reference, chained as 'genome_upa;assembly_upa'
            for genome_upa in upas:
                genome_data = self.ws_url.get_objects2(
                    {'objects': [{"ref": genome_upa}]})['data'][0]['data']
                assembly_upa = genome_upa + ';' + str(
                    genome_data.get('contigset_ref') or genome_data.get('assembly_ref'))
                faf = atf.assembly_as_fasta(ctx, {'ref': assembly_upa})
                fasta_array.extend([faf['path'], assembly_upa])

        # return dictionary of FASTA
        fasta_dict["FASTA"] = fasta_array
        return fasta_dict