def load_fastas(config, scratch, upa):
    '''
    Returns a list of (fasta_path, upa) tuples for the given object reference.
    '''
    dfu = DataFileUtil(config['callback_url'])
    au = AssemblyUtil(config['callback_url'])
    ws = Workspace(config['workspace-url'])

    obj_data = dfu.get_objects({"object_refs": [upa]})['data'][0]
    obj_type = obj_data['info'][2]

    if 'KBaseSets.GenomeSet' in obj_type:
        upas = [gsi['ref'] for gsi in obj_data['data']['items']]
    elif 'KBaseSearch.GenomeSet' in obj_type:
        upas = [gse['ref'] for gse in obj_data['data']['elements'].values()]
    elif "KBaseGenomes.Genome" in obj_type:
        upas = [upa]
    elif "KBaseGenomes.ContigSet" in obj_type or "KBaseGenomeAnnotations.Assembly" in obj_type:
        # in this case we use the assembly file util to get the fasta file
        faf = au.get_assembly_as_fasta({"ref": upa})
        return [(faf['path'], upa)]
    else:
        raise ValueError('Input reference has unhandled type: ' + obj_type)

    fasta_paths = []
    for genome_upa in upas:
        # chain the genome ref onto the set ref so members in other workspaces resolve
        if upa != genome_upa:
            genome_upa = upa + ';' + genome_upa
        genome_data = ws.get_objects2(
            {'objects': [{"ref": genome_upa}]})['data'][0]['data']
        target_upa = genome_data.get('contigset_ref') or genome_data.get('assembly_ref')
        assembly_upa = genome_upa + ';' + target_upa
        faf = au.get_assembly_as_fasta({"ref": assembly_upa})
        fasta_paths.append((faf['path'], assembly_upa))

    return fasta_paths
def load_fastas(config, scratch: str, upa: str):
    '''
    Returns list of (fasta_path, upa)
    '''
    dfu = DataFileUtil(config['callback_url'])
    au = AssemblyUtil(config['callback_url'])
    mgu = MetagenomeUtils(config['callback_url'])
    ws = Workspace(config['workspace-url'])

    obj_data = dfu.get_objects({"object_refs": [upa]})['data'][0]
    obj_type = obj_data['info'][2]

    if 'KBaseSets.GenomeSet' in obj_type:
        upas = [gsi['ref'] for gsi in obj_data['data']['items']]
    elif 'KBaseSearch.GenomeSet' in obj_type:
        upas = [gse['ref'] for gse in obj_data['data']['elements'].values()]
    elif "KBaseGenomes.Genome" in obj_type:
        upas = [upa]
    elif "KBaseGenomes.ContigSet" in obj_type or "KBaseGenomeAnnotations.Assembly" in obj_type:
        # in this case we use the assembly file util to get the fasta file
        # file_output = os.path.join(scratch, "input_fasta.fa")
        faf = au.get_assembly_as_fasta({"ref": upa})
        return [(faf['path'], upa)]
    elif "KBaseSets.AssemblySet" in obj_type:
        fasta_paths = []
        for item_upa in obj_data['data']['items']:
            faf = au.get_assembly_as_fasta({"ref": item_upa['ref']})
            fasta_paths.append((faf['path'], item_upa['ref']))
        return fasta_paths
    elif 'KBaseMetagenomes.BinnedContigs' in obj_type:
        fasta_paths = []
        bin_file_dir = mgu.binned_contigs_to_file({
            'input_ref': upa,
            'save_to_shock': 0
        })['bin_file_directory']
        for (dirpath, dirnames, filenames) in os.walk(bin_file_dir):
            for fasta_file in filenames:
                # normalize the extension to ".fa" and copy the bin into scratch
                fasta_path = os.path.join(scratch, fasta_file)
                fasta_path = os.path.splitext(fasta_path)[0] + ".fa"
                copyfile(os.path.join(bin_file_dir, fasta_file), fasta_path)
                # Should I verify that the bins have contigs?
                # is it possible to have empty bins?
                fasta_paths.append((fasta_path, upa))
            # only the top level of the bin directory holds fasta files
            break
        return fasta_paths
    else:
        raise ValueError('Input genome/metagenome reference has unhandled type: ' + obj_type)

    fasta_paths = []
    for genome_upa in upas:
        genome_data = ws.get_objects2(
            {'objects': [{"ref": genome_upa}]})['data'][0]['data']
        assembly_upa = genome_upa + ';' + str(
            genome_data.get('contigset_ref') or genome_data.get('assembly_ref'))
        faf = au.get_assembly_as_fasta({'ref': assembly_upa})
        fasta_paths.append((faf['path'], assembly_upa))

    return fasta_paths
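A minimal usage sketch for the loader above, assuming an SDK job environment where the callback URL comes from the environment and scratch is the job's shared folder; the workspace URL and the object reference are placeholders, not real values:

import os

config = {
    'callback_url': os.environ['SDK_CALLBACK_URL'],   # assumed SDK environment variable
    'workspace-url': 'https://kbase.us/services/ws',  # assumed endpoint
}
scratch = '/kb/module/work/tmp'
# '1234/5/6' is a placeholder UPA, not a real object reference
for fasta_path, source_upa in load_fastas(config, scratch, '1234/5/6'):
    print(source_upa, '->', fasta_path)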
class DownloadUtils:
    def __init__(self, callbackURL):
        # the passed callbackURL is ignored; the URL is taken from the environment
        self.callbackURL = os.environ['SDK_CALLBACK_URL']
        self.au = AssemblyUtil(self.callbackURL)
        self.vu = VariationUtil(self.callbackURL)
        self.gfu = GenomeFileUtil(self.callbackURL)

    def download_genome(self, genomeref, output_dir):
        '''
        This function downloads the genome assembly as a fasta file.
        :param genomeref:
        :param output_dir:
        :return:
        '''
        file = self.au.get_assembly_as_fasta({
            'ref': genomeref,
            'filename': os.path.join(output_dir, "ref_genome.fa")
        })
        return file

    def get_variation(self, variation_ref):
        '''
        This function downloads variations as a VCF file.
        :param variation_ref:
        :return:
        '''
        filepath = self.vu.get_variation_as_vcf(
            {'variation_ref': variation_ref})['path']
        return filepath

    def get_gff(self, genome_ref):
        '''
        :param genome_ref:
        :return: gff file path
        '''
        file = self.gfu.genome_to_gff({'genome_ref': genome_ref})
        return file['file_path']

    def get_assembly(self, assembly_ref, output_dir):
        '''
        :param assembly_ref:
        :param output_dir:
        :return: assembly file path
        '''
        file = self.au.get_assembly_as_fasta({
            'ref': assembly_ref,
            'filename': os.path.join(output_dir, "ref_genome.fa")
        })
        return file['path']
def _build_index(self, assembly_info, validated_params):
    # get the assembly as a fasta file using AssemblyUtil
    au = AssemblyUtil(self.callback_url)
    fasta_info = au.get_assembly_as_fasta({'ref': assembly_info['ref']})

    # make the target destination folder (check again it wasn't created yet)
    if os.path.exists(validated_params['output_dir']):
        raise ValueError('Output directory name specified (' + validated_params['output_dir'] +
                         ') already exists. Will not overwrite, so aborting.')
    os.makedirs(validated_params['output_dir'])

    # configure the command line args and run it
    cli_params = self._build_cli_params(fasta_info['path'], fasta_info['assembly_name'],
                                        validated_params)
    self.bwa.run('index', cli_params)
    for file in glob.glob(r'/kb/module/work/tmp/' + fasta_info['assembly_name'] + '.*'):
        print(file)
        shutil.copy(file, validated_params['output_dir'])

    index_info = {'output_dir': validated_params['output_dir'],
                  'index_files_basename': fasta_info['assembly_name']}

    # cache the result, mark if it worked or not
    cache_success = self._put_cached_index(assembly_info,
                                           fasta_info['assembly_name'],
                                           validated_params['output_dir'],
                                           validated_params['ws_for_cache'])
    index_info['pushed_to_cache'] = 1 if cache_success else 0

    return index_info
def BuildFastaFromSequenceSet(self, ctx, params):
    """
    :param params: instance of type "BuildSeqIn" -> structure: parameter
       "workspace_name" of String, parameter "SequenceSetRef" of String,
       parameter "fasta_outpath" of String
    :returns: instance of type "BuildSeqOut" -> structure: parameter
       "fasta_outpath" of String
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN BuildFastaFromSequenceSet
    dfu = DataFileUtil(self.callback_url)

    bu = BackgroundUtils()
    TU = TestUtils()
    if params['TESTFLAG'] and params['background']:
        targetpath = '/kb/module/work/tmp/testgenome.fa'
        TU.GetGenome(targetpath)
        bu.BuildBackground(targetpath)
    elif params['background']:
        ws = Workspace('https://appdev.kbase.us/services/ws')
        subset = ws.get_object_subset([{
            'included': ['/features/[*]/location', '/features/[*]/id', '/assembly_ref'],
            'ref': params['genome_ref']}])
        aref = subset[0]['data']['assembly_ref']
        assembly_ref = {'ref': aref}
        print('Downloading Assembly data as a Fasta file.')
        assemblyUtil = AssemblyUtil(self.callback_url)
        fasta_file = assemblyUtil.get_assembly_as_fasta(assembly_ref)['path']
        bu.BuildBackground(fasta_file)

    get_objects_params = {'object_refs': [params['SequenceSetRef']]}
    SeqSet = dfu.get_objects(get_objects_params)['data'][0]['data']
    outFile = open(params['fasta_outpath'], 'w')
    for s in SeqSet['sequences']:
        sname = '>' + s['sequence_id'] + '\n'
        outFile.write(sname)
        sseq = s['sequence'] + '\n'
        outFile.write(sseq)
    outFile.close()

    fu = FastaUtils()
    if params['mask_repeats']:
        fu.RemoveRepeats(params['fasta_outpath'], params['fasta_outpath'])

    output = {'fasta_outpath': params['fasta_outpath']}
    #END BuildFastaFromSequenceSet

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method BuildFastaFromSequenceSet return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
class DownloadFastqUtils:
    def __init__(self):
        self.callbackURL = os.environ['SDK_CALLBACK_URL']
        self.au = AssemblyUtil(self.callbackURL)
        self.ru = ReadsUtils(self.callbackURL)

    def _stage_input_file(self, ref, reads_type):
        if reads_type in ('KBaseFile.PairedEndLibrary', 'KBaseAssembly.PairedEndLibrary'):
            input_file_info = self.ru.download_reads({
                'read_libraries': [ref],
                'interleaved': 'true'
            })['files'][ref]
        elif reads_type in ('KBaseFile.SingleEndLibrary', 'KBaseAssembly.SingleEndLibrary'):
            input_file_info = self.ru.download_reads({'read_libraries': [ref]})['files'][ref]
        else:
            raise ValueError("Can't download_reads() for object type: '" + str(reads_type) + "'")
        input_file_info['input_ref'] = ref
        file_location = input_file_info['files']['fwd']
        interleaved = input_file_info['files']['type'] == 'interleaved'
        return input_file_info

    def download_genome(self, genomeref):
        file = self.au.get_assembly_as_fasta({'ref': genomeref})
        return file
def _get_assembly(self, genome):
    if 'assembly_ref' in genome:
        assembly_ref = genome['assembly_ref']
    else:
        assembly_ref = genome['contigset_ref']
    log('Assembly reference = ' + assembly_ref)
    log('Downloading assembly')
    dfu = DataFileUtil(self.cfg.callbackURL)
    log('object_refs:' + self.genome_ref + ";" + assembly_ref)
    assembly_data = dfu.get_objects(
        {'object_refs': [self.genome_ref + ";" + assembly_ref]})['data'][0]['data']
    if isinstance(assembly_data['contigs'], dict):
        # is an assembly
        circular_contigs = set(
            x['contig_id'] for x in assembly_data['contigs'].values() if x.get('is_circ'))
    else:
        # is a contig set
        circular_contigs = set(
            x['id'] for x in assembly_data['contigs']
            if x.get('replicon_geometry') == 'circular')

    au = AssemblyUtil(self.cfg.callbackURL)
    assembly_file_path = au.get_assembly_as_fasta(
        {'ref': self.genome_ref + ";" + assembly_ref})['path']
    return assembly_file_path, circular_contigs
class DownloadUtils:
    def __init__(self, callback_url):
        # the passed callback_url is ignored; the URL is taken from the environment
        self.callbackURL = os.environ['SDK_CALLBACK_URL']
        self.au = AssemblyUtil(self.callbackURL)
        self.vu = VariationUtil(self.callbackURL)

    def download_genome(self, genomeref, output_dir):
        '''
        This function downloads the genome assembly as a fasta file.
        :param genomeref:
        :param output_dir:
        :return:
        '''
        file = self.au.get_assembly_as_fasta({
            'ref': genomeref,
            'filename': os.path.join(output_dir, "ref_genome.fa")
        })
        return file

    def download_variations(self, variation_ref, filename):
        '''
        This function downloads variations as a VCF file.
        :param variation_ref:
        :param filename:
        :return:
        '''
        filepath = self.vu.get_variation_as_vcf({
            'variation_ref': variation_ref,
            'filename': filename
        })['path']
        return filepath
def jayrbolton_contig_filter(self, ctx, params):
    """
    This example function accepts any number of parameters and returns results in a KBaseReport
    :param params: instance of mapping from String to unspecified object
    :returns: instance of type "ReportResults" -> structure: parameter
       "report_name" of String, parameter "report_ref" of String
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN jayrbolton_contig_filter
    if not params.get('assembly_input_ref'):
        raise TypeError("`assembly_input_ref` is required")
    if not params.get('min_length') or not isinstance(params['min_length'], int):
        raise TypeError("`min_length` is required and needs to be an int")
    min_length = params['min_length']
    # Initialize the assembly util client
    assembly_util = AssemblyUtil(self.callback_url)
    # download the fasta file to local disk
    fasta_file = assembly_util.get_assembly_as_fasta({'ref': params['assembly_input_ref']})
    filtered_path = os.path.join(self.shared_folder, 'filtered.fasta')
    report_client = KBaseReport(self.callback_url)
    result = contig_filter(fasta_file['path'], filtered_path, min_length)
    assembly_obj = assembly_util.save_assembly_from_fasta({
        'workspace_name': params['workspace_name'],
        'file': {
            'path': filtered_path,
            'assembly_name': 'filtered_contigs'
        },
        'assembly_name': 'filtered_assembly'
    })
    report = report_client.create_extended_report({
        'workspace_name': params['workspace_name'],
        'objects_created': [{
            'ref': assembly_obj,
            'description': 'filtered_assembly'
        }],
        'message': (f"Filtered out {result['n_total'] - result['n_remaining']} "
                    f"records out of {result['n_total']} records.")
    })
    output = {'report_ref': report['ref'], 'report_name': report['name']}
    #END jayrbolton_contig_filter

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method jayrbolton_contig_filter return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
def run_mash_sketch(self, ctx, params):
    """
    Generate a sketch file from a fasta/fastq file
    :param params: instance of type "MashSketchParams" (* * Pass in **one of**
       input_path, assembly_ref, or reads_ref * input_path - string - local
       file path to an input fasta/fastq * assembly_ref - string - workspace
       reference to an Assembly type * reads_ref - string - workspace
       reference to a Reads type * Optionally, pass in a boolean indicating
       whether you are using paired-end reads. * paired_ends - boolean -
       whether you are passing in paired ends) -> structure: parameter
       "input_path" of String, parameter "assembly_ref" of String, parameter
       "reads_ref" of String, parameter "paired_ends" of type "boolean"
       (params: input_upa: workspace reference to an assembly object
       workspace_name: name of current workspace search_db: database to
       search n_max_results: number of results to return, integer between 1
       and 100)
    :returns: instance of type "MashSketchResults" (* * Returns the local
       scratch file path of the generated sketch file. * Will have the
       extension '.msh') -> structure: parameter "sketch_path" of String
    """
    # ctx is the context object
    # return variables are: results
    #BEGIN run_mash_sketch
    if 'reads_ref' in params:
        reads_utils = ReadsUtils(self.callbackURL)
        result = reads_utils.download_reads({
            'read_libraries': [params['reads_ref']],
            'interleaved': 'true'
        })
        input_path = result['files'][params['reads_ref']]['files']['fwd']
    elif 'assembly_ref' in params:
        assembly_util = AssemblyUtil(self.callbackURL)
        result = assembly_util.get_assembly_as_fasta({'ref': params['assembly_ref']})
        input_path = result['path']
    elif 'input_path' in params:
        input_path = params['input_path']
    else:
        raise ValueError(
            'Invalid params; must provide one of `reads_ref`, `assembly_ref`, or `input_path`.')
    mash_utils = MashUtils(self.config, self.auth_token)
    output_file_path = mash_utils.mash_sketch(input_path, paired_ends=params.get('paired_ends'))
    results = {'sketch_path': output_file_path}
    #END run_mash_sketch

    # At some point might do deeper type checking...
    if not isinstance(results, dict):
        raise ValueError('Method run_mash_sketch return value ' +
                         'results is not type dict as required.')
    # return the results
    return [results]
def download_assembly(self, token, assembly_ref):
    try:
        auClient = AUClient(self.callback_url, token=token, service_ver=self.SERVICE_VER)
    except Exception as e:
        raise ValueError('Unable to instantiate auClient with callback_url: ' +
                         self.callback_url + ' ERROR: ' + str(e))
    try:
        dfuClient = DFUClient(self.callback_url, token=token, service_ver=self.SERVICE_VER)
    except Exception as e:
        raise ValueError('Unable to instantiate dfuClient with callback_url: ' +
                         self.callback_url + ' ERROR: ' + str(e))

    contig_file = auClient.get_assembly_as_fasta({'ref': assembly_ref}).get('path')
    sys.stdout.flush()  # don't remember why this matters
    contig_file_path = dfuClient.unpack_file({'file_path': contig_file})['file_path']
    return contig_file_path
def fetch_fasta_from_assembly(assembly_ref, ws_url, callback_url):
    """
    From an assembly or contigset, this uses AssemblyUtil to build a FASTA file
    and returns its result dict, which includes the path to the file.
    """
    allowed_types = [
        'KBaseFile.Assembly',
        'KBaseGenomeAnnotations.Assembly',
        'KBaseGenomes.ContigSet'
    ]
    if not check_ref_type(assembly_ref, allowed_types, ws_url):
        raise ValueError(
            "The reference {} cannot be used to fetch a FASTA file".format(assembly_ref))
    au = AssemblyUtil(callback_url)
    return au.get_assembly_as_fasta({'ref': assembly_ref})
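A minimal sketch of calling the helper above from inside an SDK method; the workspace URL and the object reference are placeholders, and the callback URL is assumed to come from the standard SDK environment variable:

import os

ws_url = 'https://kbase.us/services/ws'           # assumed endpoint
callback_url = os.environ['SDK_CALLBACK_URL']     # assumed SDK environment variable
fasta = fetch_fasta_from_assembly('1234/5/6', ws_url, callback_url)  # placeholder ref
print(fasta['path'])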
class downloaddatautils:
    def __init__(self):
        self.callbackURL = os.environ['SDK_CALLBACK_URL']
        self.au = AssemblyUtil(self.callbackURL)
        self.vu = VariationUtil(self.callbackURL)

    def download_genome(self, params):
        file = self.au.get_assembly_as_fasta(
            {'ref': params['genome_or_assembly_ref']})
        return file

    def download_vcf(self, params):
        params['input_var_ref'] = params['vcf_ref']
        self.vu.export_variation_as_vcf(params)
def get_assembly_sequence(self, assembly_input_ref):
    # Download the input data as a Fasta file.
    # We can use the AssemblyUtil module to download a FASTA file from our Assembly data object.
    # The return object gives us the path to the file that was created.
    print('Downloading Assembly data as a Fasta file.')
    assemblyUtil = AssemblyUtil(self.callback_url)
    fasta_file = assemblyUtil.get_assembly_as_fasta(
        {'ref': assembly_input_ref})
    cf = CreateFasta(self.config)
    string = ''
    for seq_record in SeqIO.parse(fasta_file['path'], 'fasta'):
        string += ">" + seq_record.id + "\n"
        string += cf.splitSequence(str(seq_record.seq))
        string += "\n"
    return string
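CreateFasta.splitSequence is not shown in this snippet; a plausible stand-in, assuming it simply wraps a sequence string into fixed-width FASTA lines (60 columns is a common convention, but the real width is unknown), would look like this hypothetical helper:

def split_sequence(seq, width=60):
    # hypothetical stand-in for CreateFasta.splitSequence: wrap a sequence
    # string into fixed-width FASTA lines, with no trailing newline
    return "\n".join(seq[i:i + width] for i in range(0, len(seq), width))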
class DownloadAlignmentUtils:
    def __init__(self, callback_url):
        self.callbackURL = os.environ['SDK_CALLBACK_URL']
        self.au = AssemblyUtil(self.callbackURL)

    def downloadreadalignment(self, source_ref, params, callback_url):
        '''
        downloadreadalignment: download alignment file
        :param source_ref:
        :param params:
        :param callback_url:
        :return:
        '''
        self.callback_url = callback_url
        self.ru = ReadsAlignmentUtils(self.callback_url)
        params['source_ref'] = source_ref
        params['downloadSAM'] = 1
        params['destination_dir'] = '/kb/module/work/tmp'
        params['stats'] = {
            "properly_paired": 1,
            "multiple_alignments": 1,
            "singletons": 1,
            "alignment_rate": 1,
            "unmapped_reads": 1,
            "mapped_reads": 1,
            "total_reads": 1
        }
        return self.ru.download_alignment(params)

    def download_genome(self, genomeref, output_dir):
        '''
        download_genome: download the genome assembly as a fasta file
        :param genomeref:
        :param output_dir:
        :return:
        '''
        file = self.au.get_assembly_as_fasta({
            'ref': genomeref,
            'filename': os.path.join(output_dir, "ref_genome.fa")
        })
        return file
def test_import_fasta_as_assembly_from_staging(self, download_staging_file,
                                               update_staging_service):
    fasta_file = 'small_fasta.fna'
    ws_obj_name = 'MyAssembly'

    params = {
        'staging_file_subdir_path': fasta_file,
        'workspace_name': self.getWsName(),
        'assembly_name': ws_obj_name
    }

    ref = self.getImpl().import_fasta_as_assembly_from_staging(
        self.getContext(), params)
    self.assertTrue('obj_ref' in ref[0])
    self.assertTrue('report_ref' in ref[0])
    self.assertTrue('report_name' in ref[0])

    fasta_file_path = os.path.join('/kb/module/work/tmp', fasta_file)
    assemblyUtil = AssemblyUtil(os.environ['SDK_CALLBACK_URL'])
    fasta_assembly = assemblyUtil.get_assembly_as_fasta(
        {'ref': self.getWsName() + "/{}".format(ws_obj_name)})

    expected_data = None
    with open(fasta_file_path, 'r') as f:
        expected_data = f.read()
    actual_data = None
    with open(fasta_assembly['path'], 'r') as f:
        actual_data = f.read()
    self.assertEqual(actual_data, expected_data)

    get_objects_params = {
        'object_refs': [ref[0].get('obj_ref')],
        'ignore_errors': False
    }
    object_data = self.dfu.get_objects(get_objects_params)
    base_count = object_data.get('data')[0].get('data').get('base_counts')
    dna_size = object_data.get('data')[0].get('data').get('dna_size')

    self.assertEqual(dna_size, 2520)
    expected_base_count = {'A': 700, 'C': 558, 'T': 671, 'G': 591}
    # checking the subset relation in both directions asserts dict equality
    self.assertDictContainsSubset(base_count, expected_base_count)
    self.assertDictContainsSubset(expected_base_count, base_count)
def stage_assembly_files(self, object_list):
    """
    _stage_assembly_files: download the fasta files to the scratch area
    return list of file names
    """
    log('Processing assembly object list: {}'.format(object_list))

    auc = AssemblyUtil(self.callbackURL)
    staged_file_list = []

    for assembly_upa in object_list:
        try:
            filename = auc.get_assembly_as_fasta({'ref': assembly_upa})['path']
        except ServerError as assembly_error:
            print(str(assembly_error))
            raise
        staged_file_list.append(filename)

    log('Created file list: {}'.format(staged_file_list))
    return staged_file_list
class DownloadUtils:
    def __init__(self):
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.au = AssemblyUtil(self.callback_url)
        self.gfu = GenomeFileUtil(self.callback_url)

    def get_gff(self, genome_ref, output_dir):
        '''
        function for downloading the gff file
        :param genome_ref:
        :param output_dir:
        :return:
        '''
        gff_filename = os.path.join(output_dir + "/snp_eff/data/kbase_v1", "gene.gff")
        file = self.gfu.genome_to_gff({
            'genome_ref': genome_ref,
            'filename': gff_filename
        })
        return file['file_path']

    def get_assembly(self, assembly_ref, output_dir):
        '''
        function for downloading the assembly file
        :param assembly_ref:
        :param output_dir:
        :return:
        '''
        assembly_filename = os.path.join(output_dir + "/snp_eff/data/kbase_v1", "sequences.fa")
        file = self.au.get_assembly_as_fasta({
            'ref': assembly_ref,
            'filename': assembly_filename
        })
        return file['path']
class ProkkaUtils:

    def __init__(self, config):
        self.scratch = config["scratch"]
        self.ctx = config['ctx']
        self.callback_url = config["SDK_CALLBACK_URL"]

        self.ws_client = workspaceService(config["workspace-url"])
        self.gfu = GenomeFileUtil(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)
        self.kbr = KBaseReport(self.callback_url)
        self.dfu = DataFileUtil(self.callback_url)
        self.genome_api = GenomeAnnotationAPI(self.callback_url)

        self.sso_ref = None
        self.sso_event = None
        self.ec_to_sso = {}
        self.output_workspace = None

    @staticmethod
    def _get_input_value(params, key):
        """Get value of key after checking for its existence

        :param params: Params dictionary haystack
        :param key: Key to search in Params
        :return: Parameter Value
        :raises ValueError: raises an exception if the key doesn't exist
        """
        if key not in params:
            raise ValueError("Parameter " + key + " should be set in input parameters")
        return params[key]

    @staticmethod
    def _get_qualifier_value(qualifier):
        """Get first qualifier from the list of qualifiers

        :param qualifier: list contents of the qualifier from BCBio GFF Tools
        :return: first element in the list
        """
        return qualifier[0] if qualifier else None

    def download_seed_data(self):
        """Download Seed Data Ontology, set the gene_ontology reference (sso_ref)
        and create a table from EC numbers to SSO (ec_to_sso)

        :return: None
        """
        # Download Seed Reference Data
        sso_ret = self.ws_client.get_objects(
            [{"ref": "KBaseOntology/seed_subsystem_ontology"}])[0]
        sso = sso_ret["data"]
        for sso_id in sso["term_hash"]:
            sso_name = sso["term_hash"][sso_id]["name"]
            if "(EC " in sso_name and sso_name.endswith(")"):
                ec = sso_name[sso_name.index("(EC ") + 4:-1].strip()
                sso_list = self.ec_to_sso.get(ec, None)
                if not sso_list:
                    sso_list = []
                    self.ec_to_sso[ec] = sso_list
                sso_list.append(sso["term_hash"][sso_id])
        print("EC found in SSO: " + str(len(self.ec_to_sso)))
        sso_info = sso_ret["info"]
        sso_ref = str(sso_info[6]) + "/" + str(sso_info[0]) + "/" + str(sso_info[4])
        with open("/kb/module/work/seed_so.json", "w") as outfile:
            json.dump(sso, outfile, sort_keys=True, indent=4)
        self.sso_ref = sso_ref

    def inspect_assembly(self, assembly_meta, assembly_ref):
        """Check to see if the assembly has too many contigs and might be a metagenome
        or non-prokaryotic dataset

        :param assembly_meta: information about the assembly reference
        :param assembly_ref: the assembly reference number
        :return: a tuple containing gc_content and dna_size
        """
        gc_content = float(assembly_meta.get("GC content"))
        dna_size = int(assembly_meta.get("Size"))
        n_contigs = 0
        if "N Contigs" in assembly_meta:
            n_contigs = int(assembly_meta.get("N Contigs"))
        else:
            contig = self.ws_client.get_objects([{"ref": assembly_ref}])[0]
            n_contigs = len(contig["data"]["contigs"])
        if n_contigs >= 30000:
            message = """
             Hmmm.  There are over 30,000 contigs in this Assembly.
             It looks like you are trying to run Prokka on a metagenome or non-prokaryotic data set.
             If this is a metagenome data set we recommend using an App like MaxBin to first bin the
             contigs into genome-like bins. These bins can then be individually annotated as a single
             genome using Prokka. If this data comes from a Eukaryotic sample, KBase does not
             currently have an annotation app designed for Eukaryotes. Alternatively, you can try
             reducing the number of contigs using a filter app.
             """
            print(message)
            raise ValueError("Too many contigs for Prokka.  See logs for details and suggestions")

        assembly_info = namedtuple("assembly_info", "gc_content dna_size")
        return assembly_info(gc_content, dna_size)

    @staticmethod
    def create_renamed_assembly(assembly_fasta_filepath):
        """Rename records to be in the format of contig_N and output a new fasta file

        :param assembly_fasta_filepath: the fasta file to rename
        :return: A tuple with the path to the fasta file with renamed contigs, the number
        of contigs, the mapping from old ids to new ids, and the contigs as SeqRecords
        """
        records = []
        new_ids_to_old = {}
        contig_counter = 0
        for record in SeqIO.parse(assembly_fasta_filepath, "fasta"):
            contig_counter += 1
            old_id = record.id
            new_id = "contig_" + str(contig_counter)
            sequence = record.seq  # it has type "Seq"
            record = SeqRecord(sequence, id=new_id, description="(" + old_id + ")")
            records.append(record)
            new_ids_to_old[new_id] = old_id

        renamed_assembly_fasta_filepath = assembly_fasta_filepath + "_renamed.fna"
        SeqIO.write(records, renamed_assembly_fasta_filepath, "fasta")

        renamed_assembly = namedtuple("renamed_assembly",
                                      "filepath contig_counter new_ids_to_old records")
        return renamed_assembly(renamed_assembly_fasta_filepath, contig_counter,
                                new_ids_to_old, records)

    def run_prokka(self, params, subject_fasta_filepath):
        """Run Prokka

        :param params: Prokka parameters
        :param subject_fasta_filepath: The contigs or genes to run prokka against
        :return: The directory with all of the prokka output files
        """
        output_dir = "/kb/module/work/tmp/temp_" + str(uuid.uuid4())

        # --kingdom [X]  Annotation mode: Archaea|Bacteria|Mitochondria|Viruses (default "Bacteria")
        kingdom = "Bacteria"
        if "kingdom" in params and params["kingdom"]:
            kingdom = params["kingdom"]

        prokka_cmd_list = ["perl", "/kb/prokka/bin/prokka", "--metagenome", "--outdir",
                           output_dir, "--prefix", "mygenome", "--kingdom", kingdom]

        # --genus [X]  Genus name (triggers to use --usegenus)
        if "genus" in params and params["genus"]:
            prokka_cmd_list.extend(["--genus", str(params["genus"]), "--usegenus"])
        # --gcode [N]  Genetic code / Translation table (set if --kingdom is set) (default "0")
        if "gcode" in params and params["gcode"]:
            prokka_cmd_list.extend(["--gcode", str(params["gcode"])])
        else:
            prokka_cmd_list.extend(["--gcode", "0"])
        # --gram [X]  Gram: -/neg +/pos (default "")
        if "gram" in params and params["gram"]:
            raise ValueError("gram parameter is not supported in current Prokka installation")
        # --metagenome  Improve gene predictions for highly fragmented genomes (default OFF)
        if "metagenome" in params and params["metagenome"] == 1:
            prokka_cmd_list.append("--metagenome")
        # --rawproduct  Do not clean up /product annotation (default OFF)
        if "rawproduct" in params and params["rawproduct"] == 1:
            prokka_cmd_list.append("--rawproduct")
        # --fast  Fast mode - skip CDS /product searching (default OFF)
        if "fast" in params and params["fast"] == 1:
            prokka_cmd_list.append("--fast")
        # --mincontiglen [N]  Minimum contig size [NCBI needs 200] (default "1")
        if "mincontiglen" in params and params["mincontiglen"]:
            prokka_cmd_list.extend(["--mincontiglen", str(params["mincontiglen"])])
        # --evalue [n.n]  Similarity e-value cut-off (default "1e-06")
        if "evalue" in params and params["evalue"]:
            prokka_cmd_list.extend(["--evalue", str(params["evalue"])])
        # --rfam  Enable searching for ncRNAs with Infernal+Rfam (SLOW!) (default "0")
        if "rfam" in params and params["rfam"] == 1:
            prokka_cmd_list.append("--rfam")
        # --norrna  Don't run rRNA search (default OFF)
        if "norrna" in params and params["norrna"] == 1:
            prokka_cmd_list.append("--norrna")
        # --notrna  Don't run tRNA search (default OFF)
        if "notrna" in params and params["notrna"] == 1:
            prokka_cmd_list.append("--notrna")
        prokka_cmd_list.append(subject_fasta_filepath)
        print("Prokka command line: " + str(prokka_cmd_list))

        # tbl2asn or some other non-essential prokka binary will fail, so suppress that
        try:
            check_output(prokka_cmd_list, cwd=self.scratch)
        except CalledProcessError as e:
            pprint(e)
        return output_dir

    @staticmethod
    def retrieve_prokka_results(output_dir):
        """Gather up the relevant prokka results, load the records from the results files

        :param output_dir: directory containing the prokka output
        :return: A tuple containing sequences from the .faa and .ffn files and the gff_filepath
        """
        faa_file = output_dir + "/mygenome.faa"
        cds_to_prot = {}
        for record in SeqIO.parse(faa_file, "fasta"):
            cds_to_prot[record.id] = str(record.seq)
        ffn_file = output_dir + "/mygenome.ffn"
        cds_to_dna = {}
        for record in SeqIO.parse(ffn_file, "fasta"):
            cds_to_dna[record.id] = str(record.seq)
        gff_file = output_dir + "/mygenome.gff"
        if not os.path.isfile(gff_file):
            raise ValueError("PROKKA output GFF file is not found")

        prokka_results = namedtuple("prokka_results", "cds_to_prot cds_to_dna gff_filepath")
        return prokka_results(cds_to_prot, cds_to_dna, gff_file)

    def parse_prokka_results(self, **prokka_parse_parameters):
        """Go through the prokka results from the input contigs and then create the
        features, mrnas and cdss components of the KBaseGenomes.Genome object for
        genome annotation only.

        :param prokka_parse_parameters: gff_filepath, mappings
        :return: A tuple with Genome:features Genome:cdss Genome:mrnas report_message
        of genes discovered
        """
        gff_filepath = prokka_parse_parameters["gff_filepath"]
        cds_to_dna = prokka_parse_parameters["cds_to_dna"]
        cds_to_prot = prokka_parse_parameters["cds_to_prot"]
        new_ids_to_old = prokka_parse_parameters["new_ids_to_old"]

        evidence = self.make_annotation_evidence()

        cdss = []
        mrnas = []
        features = []
        non_hypothetical = 0
        genes_with_ec = 0
        genes_with_sso = 0
        prot_lengths = []
        with open(gff_filepath, "r") as f1:
            for rec in GFF.parse(f1):
                contig_id = new_ids_to_old[str(rec.id)]
                for ft in rec.features:
                    loc = ft.location
                    min_pos = int(loc.start) + 1
                    max_pos = int(loc.end)
                    strand = "+" if loc.strand == 1 else "-"
                    flen = max_pos - min_pos + 1
                    start = min_pos if strand == "+" else max_pos
                    location = [[contig_id, start, strand, flen]]
                    qualifiers = ft.qualifiers
                    generated_id = self._get_qualifier_value(qualifiers.get("ID"))
                    if not generated_id:
                        # Skipping feature with no ID (mostly repeat regions)
                        continue
                    dna = cds_to_dna.get(generated_id)
                    if not dna:
                        # Skipping feature with no DNA (mostly repeat regions)
                        continue
                    name = self._get_qualifier_value(qualifiers.get("Name"))
                    ec = self._get_qualifier_value(qualifiers.get("eC_number"))
                    gene = self._get_qualifier_value(qualifiers.get("gene"))
                    product = self._get_qualifier_value(qualifiers.get("product"))
                    fid = generated_id
                    aliases = []
                    if name:
                        aliases.append(name)
                    if gene:
                        aliases.append(gene)
                    if ec:
                        aliases.append(ec)
                        genes_with_ec += 1
                    md5 = hashlib.md5(dna.encode()).hexdigest()
                    feature = {"id": fid,
                               "location": location,
                               "type": "gene",
                               "aliases": aliases,
                               "md5": md5,
                               "dna_sequence": dna,
                               "dna_sequence_length": len(dna)}
                    if product:
                        feature["function"] = product
                        if product != "hypothetical protein":
                            non_hypothetical += 1
                    if ec and ec in self.ec_to_sso:
                        sso_list = self.ec_to_sso[ec]
                        sso_terms = {}
                        for sso_item in sso_list:
                            sso_terms[sso_item["id"]] = {"id": sso_item["id"],
                                                         "evidence": [evidence],
                                                         "term_name": sso_item["name"],
                                                         "ontology_ref": self.sso_ref,
                                                         "term_lineage": []}
                        feature["ontology_terms"] = {"SSO": sso_terms}
                        genes_with_sso += 1
                    cds = None
                    mrna = None
                    prot = cds_to_prot.get(generated_id)
                    if prot:
                        cds_id = fid + "_CDS"
                        mrna_id = fid + "_mRNA"
                        prot_len = len(prot)
                        prot_lengths.append(prot_len)
                        feature["protein_translation"] = prot
                        feature["protein_translation_length"] = prot_len
                        feature["cdss"] = [cds_id]
                        feature["mrnas"] = [mrna_id]
                        cds = {"id": cds_id,
                               "location": location,
                               "md5": md5,
                               "parent_gene": fid,
                               "parent_mrna": mrna_id,
                               "function": (product if product else ""),
                               "ontology_terms": {},
                               "protein_translation": prot,
                               "protein_translation_length": prot_len,
                               "aliases": aliases}
                        mrna = {"id": mrna_id,
                                "location": location,
                                "md5": md5,
                                "parent_gene": fid,
                                "cds": cds_id}
                    features.append(feature)
                    if cds:
                        cdss.append(cds)
                    if mrna:
                        mrnas.append(mrna)

        # Prepare report
        report = ""
        report += "Number of genes predicted: " + str(len(features)) + "\n"
        report += "Number of protein coding genes: " + str(len(prot_lengths)) + "\n"
        report += "Number of genes with non-hypothetical function: " + str(non_hypothetical) + "\n"
        report += "Number of genes with EC-number: " + str(genes_with_ec) + "\n"
        report += "Number of genes with Seed Subsystem Ontology: " + str(genes_with_sso) + "\n"
        report += "Average protein length: " + str(
            int(sum(prot_lengths) / float(len(prot_lengths)))) + " aa.\n"

        annotated_assembly = namedtuple("annotated_assembly",
                                        "features cdss mrnas report_message")
        return annotated_assembly(features, cdss, mrnas, report)

    def get_new_annotations(self, gff_filepath):
        """Parse the prokka gff file into a dictionary of feature ids with products
        and ec numbers

        :param gff_filepath: the gff file from prokka
        :return: a dictionary of feature ids to new annotations
        """
        evidence = self.make_annotation_evidence()
        genome = {}
        with open(gff_filepath, "r") as f:
            for rec in GFF.parse(f):
                gid = rec.id
                gene_features = {"id": gid}

                for feature in rec.features:
                    qualifiers = feature.qualifiers
                    if "product" in qualifiers:
                        gene_features["function"] = " ".join(qualifiers["product"])

                    if "eC_number" in qualifiers:
                        ec_numbers = qualifiers["eC_number"]
                        sso_terms = dict()
                        for ec in ec_numbers:
                            sso_list = self.ec_to_sso.get(ec, [])
                            for sso_item in sso_list:
                                sso_terms[sso_item["id"]] = {"id": sso_item["id"],
                                                             "evidence": [evidence],
                                                             "term_name": sso_item["name"],
                                                             "ontology_ref": self.sso_ref,
                                                             "term_lineage": []}
                        gene_features["ontology_terms"] = sso_terms
                genome[gid] = gene_features
        return genome

    def write_genome_to_fasta(self, genome_data):
        """Write the dna sequences of a genome's features out as a fasta file

        :param genome_data: genome object data from the workspace
        :return: the fasta file path
        """
        fasta_for_prokka_filepath = os.path.join(
            self.scratch, "features_" + str(uuid.uuid4()) + ".fasta")
        count = 0
        with open(fasta_for_prokka_filepath, "w") as f:
            for item in genome_data["data"]["features"]:
                if "id" not in item or "dna_sequence" not in item:
                    print("This feature does not have a valid dna sequence.")
                else:
                    f.write(">" + item["id"] + "\n" + item["dna_sequence"] + "\n")
                    count += 1

        print("Finished printing to " + fasta_for_prokka_filepath)
        if os.stat(fasta_for_prokka_filepath).st_size == 0:
            raise Exception(
                "This genome does not contain features with DNA_SEQUENCES. Fasta file is empty.")

        return fasta_for_prokka_filepath

    def make_sso_ontology_event(self):
        """Create an ontology event for the genome object, using self.sso_ref

        :return: Ontology_event to be appended to the list of genome ontology events
        """
        time_string = str(
            datetime.datetime.fromtimestamp(time.time()).strftime('%Y_%m_%d_%H_%M_%S'))
        yml_text = open('/kb/module/kbase.yml').read()
        version = re.search("module-version:\n\W+(.+)\n", yml_text).group(1)

        return {
            "method": "Prokka Annotation",
            "method_version": version,
            "timestamp": time_string,
            "id": "SSO",
            "ontology_ref": self.sso_ref
        }

    def make_annotation_evidence(self):
        """Create a dict for the evidence field for the genome

        :return: evidence dict to be attached to ontology terms
        """
        time_string = str(
            datetime.datetime.fromtimestamp(time.time()).strftime('%Y_%m_%d_%H_%M_%S'))
        yml_text = open('/kb/module/kbase.yml').read()
        version = re.search("module-version:\n\W+(.+)\n", yml_text).group(1)

        return {
            "method": "Prokka Annotation (Evidence)",
            "method_version": version,
            "timestamp": time_string,
        }

    def create_genome_ontology_fields(self, genome_data):
        """Create ontology event fields for a genome object

        :param genome_data: A genome object's data field
        :return: a named tuple containing the modified genome object and a new
        ontology event index
        """
        # Make sure ontology_events exist
        sso_event = self.make_sso_ontology_event()
        ontology_event_index = 0

        if 'ontology_events' in genome_data['data']:
            genome_data['data']['ontology_events'].append(sso_event)
            ontology_event_index += len(genome_data['data']['ontology_events']) - 1
        else:
            genome_data['data']['ontology_events'] = [sso_event]

        genome_obj_modified = namedtuple('genome_obj_modified',
                                         'genome_data ontology_event_index')
        return genome_obj_modified(genome_data, ontology_event_index)

    @staticmethod
    def old_genome_ontologies(feature, new_ontology):
        """Update the feature's ontologies for an old genome

        :param feature: Feature to update
        :param new_ontology: New Ontology to update with
        :return: The feature with the ontology updated, in the old style
        """
        if "ontology_terms" not in feature:
            feature["ontology_terms"] = {"SSO": {}}
        if "SSO" not in feature["ontology_terms"]:
            feature["ontology_terms"]["SSO"] = {}
        for key in new_ontology.keys():
            feature["ontology_terms"]["SSO"][key] = new_ontology[key]
        return feature

    @staticmethod
    def new_genome_ontologies(feature, new_ontology, ontology_event_index):
        """Update the feature's ontologies for a new genome

        :param feature: Feature to update
        :param new_ontology: New Ontology to update with
        :param ontology_event_index: Ontology index to update the feature with
        :return: the updated feature
        """
        if "ontology_terms" not in feature:
            feature["ontology_terms"] = {"SSO": {}}
        if "SSO" not in feature["ontology_terms"]:
            feature["ontology_terms"]["SSO"] = {}

        for key in new_ontology.keys():
            id = new_ontology[key]["id"]
            if id in feature["ontology_terms"]["SSO"]:
                feature["ontology_terms"]["SSO"][id].append(ontology_event_index)
            else:
                feature["ontology_terms"]["SSO"][id] = [ontology_event_index]
        return feature

    def annotate_genome_with_new_annotations(self, **annotation_args):
        """Annotate the genome with new annotations for Genome ReAnnotation

        :param annotation_args: genome_data from the genome obj, new_annotations
        from prokka, and the output_genome_name
        :return: A tuple containing the genome_ref, filepaths for the function and
        ontology summary, and stats about the annotations
        """
        genome_data = annotation_args["genome_data"]
        new_annotations = annotation_args["new_annotations"]

        new_genome = False
        if 'feature_counts' in genome_data['data']:
            new_genome = True
            genome_obj_modified = self.create_genome_ontology_fields(genome_data)
            genome_data = genome_obj_modified.genome_data
            ontology_event_index = genome_obj_modified.ontology_event_index

        stats = {"current_functions": len(genome_data["data"]["features"]),
                 "new_functions": 0,
                 "found_functions": 0,
                 "new_ontologies": 0}

        function_summary_fp = os.path.join(self.scratch, "ontology_report")
        ontology_summary_fp = os.path.join(self.scratch, "function_report")
        onto_r = open(function_summary_fp, "w")
        func_r = open(ontology_summary_fp, "w")
        func_r.write("function_id current_function new_function\n")
        onto_r.write("function_id current_ontology new_ontology\n")

        ontologies_present = {"SSO": {}}

        for i, feature in enumerate(genome_data["data"]["features"]):
            fid = feature["id"]
            current_function = feature.get("function", "")
            current_functions = feature.get("functions", [])
            current_ontology = feature.get("ontology_terms", None)
            new_function = ""
            new_ontology = dict()

            if fid in new_annotations:
                # Set Function
                new_function = new_annotations[fid].get("function", "")
                if new_function and "hypothetical protein" not in new_function:
                    if (new_function != current_function
                            and new_function not in current_functions):
                        stats['new_functions'] += 1
                    genome_data["data"]["features"][i]["function"] = new_function
                    genome_data["data"]["features"][i]["functions"] = [new_function]
                    stats['found_functions'] += 1

                # Set Ontologies
                new_ontology = new_annotations[fid].get("ontology_terms", None)
                if new_ontology:
                    stats['new_ontologies'] += 1
                    if new_genome:
                        # New style
                        genome_data["data"]["features"][i] = self. \
                            new_genome_ontologies(feature, new_ontology, ontology_event_index)

                        # Add to ontologies Present
                        for key in new_ontology.keys():
                            oid = new_ontology[key]["id"]
                            name = new_ontology[key].get("name", "Unknown")
                            ontologies_present["SSO"][oid] = name
                    else:
                        genome_data["data"]["features"][i] = self. \
                            old_genome_ontologies(feature, new_ontology)

            if current_function:
                func_r.write(json.dumps([fid, [current_function], [new_function]]) + "\n")
            else:
                func_r.write(json.dumps([fid, current_functions, [new_function]]) + "\n")

            onto_r.write(json.dumps([fid, current_ontology, new_ontology]) + "\n")

        func_r.close()
        onto_r.close()

        if ontologies_present:
            if "ontologies_present" in genome_data["data"]:
                if "SSO" in genome_data["data"]["ontologies_present"]:
                    for key, value in ontologies_present["SSO"].items():
                        genome_data["data"]["ontologies_present"]["SSO"][key] = value
                else:
                    genome_data["data"]["ontologies_present"] = ontologies_present["SSO"]
            else:
                genome_data["data"]["ontologies_present"] = ontologies_present

        info = self.gfu.save_one_genome({"workspace": self.output_workspace,
                                         "name": annotation_args["output_genome_name"],
                                         "data": genome_data["data"],
                                         "provenance": self.ctx.provenance()})["info"]

        genome_ref = str(info[6]) + "/" + str(info[0]) + "/" + str(info[4])

        annotated_genome = namedtuple("annotated_genome",
                                      "genome_ref function_summary_filepath "
                                      "ontology_summary_filepath stats")

        return annotated_genome(genome_ref, function_summary_fp,
                                ontology_summary_fp, stats)

    def upload_file(self, filepath, message="Annotation report generated by kb_prokka"):
        """Upload a file to shock

        :param filepath: File to upload
        :param message: Optional Upload Message
        :return: file_link dict suitable for a KBaseReport
        """
        output_file_shock_id = self.dfu.file_to_shock({"file_path": filepath})["shock_id"]
        print("Uploaded filepath " + filepath + " to shock and got id " + output_file_shock_id)
        return {"shock_id": output_file_shock_id,
                "name": os.path.basename(filepath),
                "label": os.path.basename(filepath),
                "description": message}

    def report_annotated_genome(self, genome):
        """Create report output with newly reannotated genome, and some stats

        :param genome: Reannotated Genome Reference, Report Files and Stats
        :return: Reference to Report Object
        """
        genome_ref = genome.genome_ref
        stats = genome.stats

        file_links = [self.upload_file(genome.ontology_summary_filepath),
                      self.upload_file(genome.function_summary_filepath)]

        report_message = ("Genome Ref:{0}\n"
                          "Number of features sent into prokka:{1}\n"
                          "New functions found:{2}\n"
                          "Ontology terms found:{3}\n").format(genome_ref,
                                                               stats["current_functions"],
                                                               stats["new_functions"],
                                                               stats["new_ontologies"])

        report_info = self.kbr.create_extended_report(
            {"message": report_message,
             "objects_created": [{"ref": genome_ref, "description": "Annotated genome"}],
             "file_links": file_links,
             "report_object_name": "kb_prokka_report_" + str(uuid.uuid4()),
             "workspace_name": self.output_workspace})

        return {"output_genome_ref": genome_ref,
                "report_name": report_info["name"],
                "report_ref": report_info["ref"]}

    def annotate_genome(self, params):
        """User inputs an existing genome to re-annotate.

        :param params: Reference to the genome, Output File Name, UI Parameters
        :return: Report with Reannotated Genome and Stats about it
        """
        self.download_seed_data()
        self.output_workspace = params["output_workspace"]

        genome_ref = self._get_input_value(params, "object_ref")
        output_name = self._get_input_value(params, "output_genome_name")
        # genome_data = self.dfu.get_objects({"object_refs": [genome_ref]})["data"][0]

        genome_data = \
            self.genome_api.get_genome_v1({"genomes": [{"ref": genome_ref}],
                                           'downgrade': 0})["genomes"][0]

        fasta_for_prokka_filepath = self.write_genome_to_fasta(genome_data)
        output_dir = self.run_prokka(params, fasta_for_prokka_filepath)
        prokka_results = self.retrieve_prokka_results(output_dir)
        new_annotations = self.get_new_annotations(prokka_results.gff_filepath)
        annotated_genome = self.annotate_genome_with_new_annotations(
            genome_data=genome_data,
            new_annotations=new_annotations,
            output_genome_name=output_name)
        return self.report_annotated_genome(annotated_genome)

    def annotate_assembly(self, params, assembly_info):
        """Annotate an assembly with Prokka. The steps are: download the assembly as a
        fasta file, rename the contigs, run prokka against the contigs, parse the
        results, and finally, create and upload a genome object.

        :param params: object reference, output_genome_name and output_workspace
        :param assembly_info: Information used to determine if the assembly is too big
        :return: Report with newly annotated assembly as a genome, and stats about it
        """
        self.download_seed_data()
        output_workspace = params["output_workspace"]

        assembly_ref = self._get_input_value(params, "object_ref")
        output_genome_name = self._get_input_value(params, "output_genome_name")
        output_workspace = self._get_input_value(params, "output_workspace")
        assembly_info = self.inspect_assembly(assembly_info[10], assembly_ref)
        orig_fasta_file = self.au.get_assembly_as_fasta({"ref": assembly_ref})["path"]

        # Rename Assembly and Keep Track of Old Contigs
        renamed_assembly = self.create_renamed_assembly(orig_fasta_file)
        # Run Prokka with the modified, renamed fasta file
        output_dir = self.run_prokka(params, renamed_assembly.filepath)
        # Prokka_results
        prokka_results = self.retrieve_prokka_results(output_dir)
        # Parse Results
        annotated_assembly = self.parse_prokka_results(
            gff_filepath=prokka_results.gff_filepath,
            cds_to_dna=prokka_results.cds_to_dna,
            cds_to_prot=prokka_results.cds_to_prot,
            new_ids_to_old=renamed_assembly.new_ids_to_old)

        # Force defaults for optional parameters that may be set to None
        scientific_name = 'Unknown'
        if 'scientific_name' in params and params['scientific_name']:
            scientific_name = params['scientific_name']
        domain = "Bacteria"
        if 'kingdom' in params and params['kingdom']:
            domain = params['kingdom']
        gcode = 0
        if 'gcode' in params and params['gcode']:
            gcode = params['gcode']

        genome = {"id": "Unknown",
                  "features": annotated_assembly.features,
                  "scientific_name": scientific_name,
                  "domain": domain,
                  "genetic_code": gcode,
                  "assembly_ref": assembly_ref,
                  "cdss": annotated_assembly.cdss,
                  "mrnas": annotated_assembly.mrnas,
                  "source": "PROKKA annotation pipeline",
                  "gc_content": assembly_info.gc_content,
                  "dna_size": assembly_info.dna_size,
                  "reference_annotation": 0}

        info = self.gfu.save_one_genome({"workspace": output_workspace,
                                         "name": output_genome_name,
                                         "data": genome,
                                         "provenance": self.ctx.provenance()})["info"]

        genome_ref = str(info[6]) + "/" + str(info[0]) + "/" + str(info[4])

        report_message = "Genome saved to: " + output_workspace + "/" + \
                         output_genome_name + "\n" + annotated_assembly.report_message

        report_info = self.kbr.create_extended_report(
            {"message": report_message,
             "objects_created": [{"ref": genome_ref, "description": "Annotated genome"}],
             "report_object_name": "kb_prokka_report_" + str(uuid.uuid4()),
             "workspace_name": output_workspace})

        return {"output_genome_ref": genome_ref,
                "report_name": report_info["name"],
                "report_ref": report_info["ref"]}
class VCFToVariation: def __init__(self, config, scratch, callback_url ): self.scratch = config['scratch'] self.ws_url = config['workspace-url'] self.callback_url = os.environ['SDK_CALLBACK_URL'] self.dfu = DataFileUtil(self.callback_url) self.wsc = Workspace(self.ws_url) self.scratch = scratch self.callback_url = callback_url self.au = AssemblyUtil(self.callback_url) self.gapi = GenericsAPI(self.callback_url) def _parse_vcf_data(self, params): vcf_filepath = self._stage_input(params) # file is validated by this point, can assume vcf_filepath is valid reader = vcf.Reader(open(vcf_filepath, 'r')) version = float(reader.metadata['fileformat'][4:6]) genotypes = reader.samples chromosomes = [] contigs = {} totalvars = 0 for record in reader: totalvars += 1 if record.CHROM not in chromosomes: chromosomes.append(record.CHROM) if record.CHROM not in contigs.keys(): passvar = 1 if not record.FILTER else 0 contigs[record.CHROM] = { 'contig_id': record.CHROM, 'totalvariants': 1, 'passvariants': passvar, 'length': int(record.affected_end-record.affected_start), } else: contigs[record.CHROM]['totalvariants'] += 1 if not record.FILTER: contigs[record.CHROM]['passvariants'] += 1 vcf_info = { 'version': version, 'contigs': contigs, 'total_variants': totalvars, 'genotype_ids': genotypes, 'chromosome_ids': chromosomes, 'file_ref': vcf_filepath } return vcf_info def _validate_vcf_to_sample(self, vcf_genotypes, sample_ids): genos_not_found = [] vgenotypes = [x.upper().strip() for x in vcf_genotypes] sids = [x.upper().strip() for x in sample_ids] for geno in vgenotypes: if geno not in sids: genos_not_found.append(geno) if not genos_not_found: return True else: return genos_not_found def _chk_if_vcf_ids_in_assembly(self, vcf_chromosomes, assembly_chromosomes): chromos_not_in_assembly = [] pp(assembly_chromosomes) for chromo in vcf_chromosomes: if chromo not in assembly_chromosomes: chromos_not_in_assembly.append(chromo) if not chromos_not_in_assembly: return True else: return chromos_not_in_assembly def _get_vcf_version(self, vcf_filepath): with(gzip.open if is_gz_file(vcf_filepath) else open)(vcf_filepath, 'rt') as vcf: line = vcf.readline() tokens = line.split('=') if not (tokens[0].startswith('##fileformat')): log("Invalid VCF. ##fileformat line in meta is improperly formatted.") raise ValueError("Invalid VCF. ##fileformat line in meta is improperly formatted. 
" "Check VCF file specifications: https://samtools.github.io/hts-specs/") vcf_version = float(tokens[1][-4:].rstrip()) return vcf_version def validate_vcf(self, params): if 'genome_or_assembly_ref' not in params: raise ValueError('Genome or Assembly reference not in input parameters: \n\n'+params) if 'vcf_staging_file_path' not in params: raise ValueError('VCF staging file path not in input parameters: \n\n' + params) vcf_filepath = self._stage_input(params) vcf_version = self._get_vcf_version(vcf_filepath) # setup directorys for validation output validation_output_dir = os.path.join(self.scratch, 'validation_' + str(uuid.uuid4())) os.mkdir(validation_output_dir) # vcftools (vcf-validator) supports VCF v4.0-4.2 # https://github.com/vcftools/vcftools # EBIvariation/vcf-validator (vcf_validator_linux) supports VCF v4.1-4.3 # https://github.com/EBIvariation/vcf-validator # vcftools is only to validate VCF v4.0 if vcf_version >= 4.1: print("Using vcf_validator_linux...") validator_cmd = ["vcf_validator_linux"] validator_cmd.append("-i") validator_cmd.append(vcf_filepath) validator_cmd.append("-l") validator_cmd.append('error') print("VCF version "+str(vcf_version)+".") elif vcf_version >= 4.0: print("Using vcftools to validate...") validator_cmd = ["vcf-validator"] validator_cmd.append(vcf_filepath) print("VCF version 4.0.") else: raise ValueError('VCF Version not in file, or fileformat line malformatted, or not version >=4.0. file format line must be the ' 'first line of vcf file and in appropriate syntax. Check VCF file specifications: ' 'https://samtools.github.io/hts-specs/') print("Validator command: {}".format(validator_cmd)) p = subprocess.Popen(validator_cmd, cwd=self.scratch, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=False) validator_output = [] while True: line = p.stdout.readline() if not line: break if line.decode("utf-8").strip().startswith('[info]'): validator_output.append(line.decode("utf-8")) out, err = p.communicate() validation_output_filename = os.path.join(validation_output_dir, 'vcf_validation.txt') file_output_chk = [] try: if validator_output[0][:6] == '[info]': # validation by vcf_validator_linux validation_output_filename = validator_output[1].split(' ')[6].strip('\n') vo = validator_output[2].split(' ') file_output_chk = ''.join(vo[9:]).strip('\n') if not os.path.exists(validation_output_filename): raise ValueError(validation_output_filename+' does not exist!') if not file_output_chk == 'isvalid': print('\n'.join(validator_output)) raise ValueError('\n'.join(validator_output)) #TODO: more detailed validation parsing for vcf_validator_linux else: if validator_output: with open(validation_output_filename, 'w') as f: for line in validator_output: f.write(str(line)) f.close() print('\n'.join(validator_output)) raise ValueError('\n'.join(validator_output)) else: with open(validation_output_filename, 'w') as f: f.write("vcftools used to validate vcf file:\n"+vcf_filepath+"\n\File is validate as of vcf spec v4.0") f.close() # TODO: more detailed validation parsing for vcftools except IndexError: # if vcf file < v4.1, and valid it will produce index error on line 132 if validator_output: with open(validation_output_filename, 'w') as f: for line in validator_output: f.write(str(line)) f.close() print('\n'.join(validator_output)) raise ValueError('\n'.join(validator_output)) else: with open(validation_output_filename, 'w') as f: f.write("vcftools used to validate vcf file:\n" + vcf_filepath + "\n\File is validate as of vcf spec v4.0") f.close() if not 
os.path.exists(validation_output_filename):
            print('Validator did not generate log file!')
            raise SystemError("Validator did not generate a log file.")

        log("Validator output filepath: {}".format(validation_output_filename))
        log("Return code from validator {}".format(p.returncode))

        return validation_output_filename

    def _stage_input(self, params):
        # extract file location from input ui parameters
        if params['vcf_staging_file_path'].startswith('/kb/module/test/'):
            # variation utils unit test
            vcf_local_file_path = params['vcf_staging_file_path']
            if vcf_local_file_path.endswith('.gz'):
                with gzip.open(vcf_local_file_path, 'rb') as f_in:
                    with open(vcf_local_file_path[:-3], 'wb') as f_out:
                        shutil.copyfileobj(f_in, f_out)
                vcf_local_file_path = vcf_local_file_path[:-3]
        else:
            staging_dir = '/staging'
            vcf_local_file_path = os.path.join(staging_dir, params['vcf_staging_file_path'])

        if not os.path.exists(vcf_local_file_path):
            raise OSError('VCF input path does not exist, or is not readable')

        orig_file_path = os.path.join(self.scratch, 'original_' + os.path.basename(vcf_local_file_path))
        print(f'VCF: {vcf_local_file_path} Orig: {orig_file_path}')
        self.original_file = shutil.copy(vcf_local_file_path, orig_file_path)

        # TODO: use data file utils here, upload vcf to shock, use dfu.
        if is_gz_file(vcf_local_file_path):
            # /staging is read only, therefore have to copy before uncompressing
            if vcf_local_file_path != os.path.join(self.scratch, params['vcf_staging_file_path']):
                copy = shutil.copy(vcf_local_file_path,
                                   os.path.join(self.scratch, params['vcf_staging_file_path']))
                unpack = self.dfu.unpack_file({'file_path': copy})
            else:
                unpack = {}
                unpack['file_path'] = os.path.join(self.scratch, params['vcf_staging_file_path'])
            params['vcf_local_file_path'] = unpack['file_path']
            return unpack['file_path']
        else:
            params['vcf_local_file_path'] = vcf_local_file_path
            return vcf_local_file_path

    def _create_sample_attribute_file(self, vcf_file, sample_attribute_mapping_file):
        """
        function for creating a sample attribute mapping file from the VCF #CHROM header
        """
        try:
            with open(vcf_file, 'r') as vcf_handle:
                lines = vcf_handle.readlines()
            for line in lines:
                if line.startswith("#CHROM"):
                    # strip the trailing newline so the last sample id stays clean
                    header = line.strip().split("\t")
                    try:
                        with open(sample_attribute_mapping_file, 'w') as attribute_mapping_handle:
                            attribute_mapping_handle.write("Attribute\tAttribute ontology ID\tUnit\tUnit ontology ID")
                            for i in range(9, len(header)):
                                attribute_mapping_handle.write("\t" + header[i])
                            attribute_mapping_handle.write("\n")
                            attribute_mapping_handle.write("label\t\t\t")
                            for j in range(9, len(header)):
                                attribute_mapping_handle.write("\t" + header[j])
                            attribute_mapping_handle.write("\n")
                    except IOError:
                        print("Could not write to file:", sample_attribute_mapping_file)
        except IOError:
            print("Could not read file:", vcf_file)

    def _validate_assembly_ids(self, params):
        # All chromosome ids from the vcf should be in the assembly,
        # but not all assembly chromosome ids need to be in the vcf
        if 'genome_ref' in params:
            subset = self.wsc.get_object_subset([{
                'included': ['/assembly_ref'],
                'ref': params['genome_or_assembly_ref']
            }])
            self.vcf_info['assembly_ref'] = subset[0]['data']['assembly_ref']
        if 'assembly_ref' in params:
            self.vcf_info['assembly_ref'] = params['assembly_ref']

        assembly_chromosome_ids_call = self.wsc.get_object_subset([{
            'included': ['/contigs'],
            'ref': self.vcf_info['assembly_ref']
        }])
        assembly_chromosomes = assembly_chromosome_ids_call[0]['data']['contigs'].keys()
        vcf_chromosomes = self.vcf_info['chromosome_ids']

        chk_assembly_ids = self._chk_if_vcf_ids_in_assembly(vcf_chromosomes, assembly_chromosomes)
        if isinstance(chk_assembly_ids, list):
            failed_ids = ' '.join(chk_assembly_ids)
            print(f'VCF contig ids: {failed_ids} are not present in assembly.')
            raise ValueError(f'VCF contig ids: {failed_ids} are not present in assembly.')

        return assembly_chromosomes

    def _validate_sample_ids(self, params):
        # All samples within the VCF file need to be in the sample attribute list
        vcf_genotypes = self.vcf_info['genotype_ids']

        sample_ids_subset = self.wsc.get_object_subset([{
            'included': ['/instances'],
            'ref': params['sample_attribute_ref']
        }])
        sample_ids = sample_ids_subset[0]['data']['instances'].keys()

        validate_genotypes = self._validate_vcf_to_sample(vcf_genotypes, sample_ids)
        if isinstance(validate_genotypes, list):
            failed_genos = ' '.join(validate_genotypes)
            print(f'VCF genotypes: {failed_genos} are not present in sample attribute mapping.')
            raise ValueError(f'VCF genotypes: {failed_genos} are not present in sample attribute mapping.')

        return sample_ids

    def _construct_contig_info(self, params):
        """
        KBaseGwasData.Variations type spec

        /*
          Contig variation data
            contig_id - contig identifier
            totalvariants - total number of variants in each contig
            passvariants - total number of variants that pass quality variation filter in contig
            length - length of contig from assembly data
        */
        typedef structure {
            string contig_id;
            int totalvariants;
            int passvariants;
            int length;  // from assembly
        } contig_info;
        """
        assembly_chromosome_dict = self.wsc.get_object_subset([{
            'included': ['/contigs'],
            'ref': self.vcf_info['assembly_ref']
        }])[0]['data']['contigs']

        contigs = []
        contig_infos = self.vcf_info['contigs']

        for contig_id in contig_infos:
            length_contig = assembly_chromosome_dict[contig_id].get("length")
            contig_infos[contig_id]["length"] = length_contig
            contigs.append(contig_infos[contig_id])

        return contigs

    def _bgzip_vcf(self, vcf_filepath):
        if not os.path.exists(vcf_filepath):
            print(vcf_filepath + " does not exist")

        zip_cmd = ["bgzip", vcf_filepath]
        p = subprocess.Popen(zip_cmd,
                             cwd=self.scratch,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)
        out, err = p.communicate()

        bgzip_file_path = vcf_filepath + ".gz"
        print(bgzip_file_path)
        return bgzip_file_path

    def _index_vcf(self, bgzip_file):
        output_dir = self.scratch

        bgzip_filepath = os.path.join(self.scratch, bgzip_file)
        if not os.path.exists(bgzip_filepath):
            print(bgzip_filepath + " does not exist")

        index_cmd = ["tabix", "-p", "vcf", bgzip_filepath]
        p = subprocess.Popen(index_cmd,
                             cwd=self.scratch,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)
        out, err = p.communicate()

        index_file_path = bgzip_filepath + ".tbi"
        return index_file_path

    def _index_assembly(self, assembly_file):
        if not os.path.exists(assembly_file):
            print(assembly_file + " does not exist")

        logging.info("indexing assembly file")
        assembly_index_cmd = ["samtools", "faidx", assembly_file]
        print(assembly_index_cmd)
        p = subprocess.Popen(assembly_index_cmd,
                             cwd=self.scratch,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)
        out, err = p.communicate()
        logging.info("indexing of assembly file done!")

        return assembly_file + ".fai"

    def _download_assembly(self, assembly_ref):
        file = self.au.get_assembly_as_fasta({'ref': assembly_ref})
        return file

    def _construct_variation(self, params, contigs_info):
        """
        KBaseGwasData.Variations type spec

        /*
          Variation object data structure
            num_genotypes - number of total genotypes within variant file
            num_variants - number of total variants within variant file
            contigs - list of contig ids and variant information
            attribute_ref - KBase reference to attribute mapping workspace object
            genome_ref - KBase reference to genome workspace object
            assembly_ref - KBase reference to assembly workspace object
            vcf_handle_ref - VCF handle reference to VCF file

            @optional genome_ref
        */
        typedef structure {
            int numgenotypes;
            int numvariants;
            list<contig_info> contigs;
            attribute_ref population;  // KBaseExperiments.AttributeMapping
            genome_ref genome_ref;     // KBaseGenomes.Genome
            assembly_ref assemby_ref;  // KBaseGenomeAnnotations.Assembly
            vcf_handle_ref vcf_handle_ref;
        } Variations;

        :param params: KBase ui input parameters
        :param contigs_info: previously constructed contig information
        :return: constructed variation object (dictionary)
        """
        if not self.vcf_info['file_ref'].startswith(self.scratch):
            new_vcf_file = os.path.join(self.scratch, os.path.basename(self.vcf_info['file_ref']))
            self.vcf_info['file_ref'] = shutil.copy(self.vcf_info['file_ref'], new_vcf_file)

        vcf_staged_file = self.original_file

        bgzip_file_path = self._bgzip_vcf(vcf_staged_file)
        vcf_shock_file_ref = self.dfu.file_to_shock(
            {'file_path': bgzip_file_path, 'make_handle': 1}
        )
        compare_md5_local_with_shock(bgzip_file_path, vcf_shock_file_ref)

        index_file_path = self._index_vcf(bgzip_file_path)
        vcf_index_shock_file_ref = self.dfu.file_to_shock(
            {'file_path': index_file_path, 'make_handle': 1}
        )
        compare_md5_local_with_shock(index_file_path, vcf_index_shock_file_ref)

        assembly_file_path = self._download_assembly(self.vcf_info['assembly_ref'])['path']

        assembly_index_file_path = self._index_assembly(assembly_file_path)
        assembly_index_shock_file_ref = self.dfu.file_to_shock(
            {'file_path': assembly_index_file_path, 'make_handle': 1}
        )
        compare_md5_local_with_shock(assembly_index_file_path, assembly_index_shock_file_ref)

        variation_obj = {
            'numgenotypes': int(len(self.vcf_info['genotype_ids'])),
            'numvariants': int(self.vcf_info['total_variants']),
            'contigs': contigs_info,
            'population': params['sample_attribute_ref'],
            # TYPE SPEC CHANGE: need to change type spec to assembly_ref instead of assemby_ref
            'assemby_ref': self.vcf_info['assembly_ref'],
            'vcf_handle_ref': vcf_shock_file_ref['handle']['hid'],
            'vcf_handle': vcf_shock_file_ref['handle'],
            'vcf_index_handle_ref': vcf_index_shock_file_ref['handle']['hid'],
            'vcf_index_handle': vcf_index_shock_file_ref['handle'],
            'assembly_index_handle_ref': assembly_index_shock_file_ref['handle']['hid'],
            'assembly_index_handle': assembly_index_shock_file_ref['handle']
        }

        if 'genome_ref' in params:
            variation_obj['genome_ref'] = params['genome_ref']

        return variation_obj

    def _save_var_obj(self, params, var):
        """
        :param params: KBase ui input parameters
        :param var: constructed variation object
        :return: DataFileUtils object_info:

            objid - the numerical id of the object.
            name - the name of the object.
            type - the type of the object.
            save_date - the save date of the object.
            ver - the version of the object.
            saved_by - the user that saved or copied the object.
            wsid - the id of the workspace containing the object.
            workspace - the name of the workspace containing the object.
            chsum - the md5 checksum of the object.
            size - the size of the object in bytes.
            meta - arbitrary user-supplied metadata about the object.
        """
        print('Saving Variation to workspace...\n')

        if var:
            if 'variation_object_name' not in params:
                var_obj_name = 'variation_' + str(uuid.uuid4())
            else:
                var_obj_name = params['variation_object_name']

            var_obj_info = self.dfu.save_objects({
                'id': self.dfu.ws_name_to_id(params['workspace_name']),
                'objects': [{
                    'type': 'KBaseGwasData.Variations',
                    'data': var,
                    'name': var_obj_name
                }]
            })[0]

            return var_obj_info
        else:
            raise ValueError('Variation object blank, cannot save to workspace!')

    def _validate_sample_attribute_ref(self, params):
        #params["sample_attribute_ref"] = ''  # just for testing
        if not params['sample_attribute_ref']:
            sample_attribute_mapping_file = os.path.join(self.scratch, "sample_attribute.tsv")  # hardcoded for testing
            self._create_sample_attribute_file(params['vcf_local_file_path'],
                                               sample_attribute_mapping_file)

            logging.info("Uploading sample attribute file to ref")
            vcf_sample_attribute_shock_file_ref = self.dfu.file_to_shock(
                {'file_path': sample_attribute_mapping_file, 'make_handle': 1}
            )
            shock_id = vcf_sample_attribute_shock_file_ref['shock_id']
            ws_id = self.dfu.ws_name_to_id(params['workspace_name'])
            import_params = {
                'input_shock_id': shock_id,
                'output_ws_id': ws_id,
                'output_obj_name': 'Sample_attribute'
            }
            ret = self.gapi.file_to_attribute_mapping(import_params)
            params['sample_attribute_ref'] = ret['attribute_mapping_ref']

    def import_vcf(self, params):
        # VCF file validation
        file_valid_result = self.validate_vcf(params)
        self._validate_sample_attribute_ref(params)

        # VCF file parsing
        self.vcf_info = self._parse_vcf_data(params)

        # Validate vcf chromosome ids against assembly chromosome ids
        self._validate_assembly_ids(params)
        # Validate vcf genotypes against sample meta data ids
        self._validate_sample_ids(params)

        # Variation object construction: contigs first, then the variation itself
        contigs_info = self._construct_contig_info(params)
        var = self._construct_variation(params, contigs_info)

        # Save variation object to workspace
        var_wksp_obj = self._save_var_obj(params, var)

        return [var_wksp_obj, var]
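    # Usage sketch for the importer above (hypothetical values). import_vcf
    # drives the whole pipeline: validate, parse, cross-check contig and sample
    # ids, build the Variations object, and save it. The ref and names below
    # are placeholders, not real workspace objects.
    #
    #   params = {
    #       'vcf_staging_file_path': 'variants.vcf.gz',
    #       'assembly_ref': '123/4/5',
    #       'sample_attribute_ref': '',  # empty -> generated from the #CHROM header
    #       'variation_object_name': 'my_variation',
    #       'workspace_name': 'my_workspace',
    #   }
    #   var_obj_info, var = importer.import_vcf(params)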
def run_kraken2(self, ctx, params): """ This example function accepts any number of parameters and returns results in a KBaseReport :param params: instance of mapping from String to unspecified object :returns: instance of type "ReportResults" -> structure: parameter "report_name" of String, parameter "report_ref" of String """ # ctx is the context object # return variables are: output #BEGIN run_kraken2 # Download input data as FASTA or FASTQ logging.info('Calling run_kraken2') logging.info(f'params {params}') # Check for presence of input file types in params input_genomes = 'input_genomes' in params and len( params['input_genomes'] ) > 0 and None not in params['input_genomes'] input_refs = 'input_refs' in params and len( params['input_refs']) > 0 and None not in params['input_refs'] input_paired_refs = 'input_paired_refs' in params and len( params['input_paired_refs'] ) > 0 and None not in params['input_paired_refs'] for name in ['workspace_name', 'db_type']: if name not in params: raise ValueError('Parameter "' + name + '" is required but missing') if not input_genomes and not input_refs and not input_paired_refs: raise ValueError( 'You must enter either an input genome or input reads') if input_refs and input_paired_refs: raise ValueError( 'You must enter either single-end or paired-end reads, ' 'but not both') if input_genomes and (input_refs or input_paired_refs): raise ValueError( 'You must enter either an input genome or input reads, ' 'but not both') if input_genomes and (not isinstance(params['input_genomes'][0], str)): raise ValueError('Pass in a valid input genome string') if input_refs and (not isinstance(params['input_refs'], list)): raise ValueError('Pass in a list of input references') if input_paired_refs and (not isinstance(params['input_paired_refs'], list)): raise ValueError('Pass in a list of input references') logging.info(params['db_type']) logging.info( f'input_genomes {input_genomes} input_refs {input_refs} input_paired_refs {input_paired_refs}' ) input_string = [] if input_genomes: assembly_util = AssemblyUtil(self.callback_url) fasta_file_obj = assembly_util.get_assembly_as_fasta( {'ref': params['input_genomes'][0]}) logging.info(fasta_file_obj) fasta_file = fasta_file_obj['path'] input_string.append(fasta_file) if input_refs: logging.info('Downloading Reads data as a Fastq file.') logging.info(f"input_refs {params['input_refs']}") readsUtil = ReadsUtils(self.callback_url) download_reads_output = readsUtil.download_reads( {'read_libraries': params['input_refs']}) print( f"Input parameters {params['input_refs']}, {params['db_type']}" f"download_reads_output {download_reads_output}") fastq_files = [] fastq_files_name = [] for key, val in download_reads_output['files'].items(): if 'fwd' in val['files'] and val['files']['fwd']: fastq_files.append(val['files']['fwd']) fastq_files_name.append(val['files']['fwd_name']) if 'rev' in val['files'] and val['files']['rev']: fastq_files.append(val['files']['rev']) fastq_files_name.append(val['files']['rev_name']) logging.info(f"fastq files {fastq_files}") input_string.append(' '.join(fastq_files)) if input_paired_refs: logging.info('Downloading Reads data as a Fastq file.') logging.info(f"input_refs {params['input_paired_refs']}") readsUtil = ReadsUtils(self.callback_url) download_reads_output = readsUtil.download_reads( {'read_libraries': params['input_paired_refs']}) print( f"Input parameters {params['input_paired_refs']}, {params['db_type']}" f"download_reads_output {download_reads_output}") fastq_files = [] 
fastq_files_name = [] # input_string.append('--paired') for key, val in download_reads_output['files'].items(): if 'fwd' in val['files'] and val['files']['fwd']: fastq_files.append(val['files']['fwd']) fastq_files_name.append(val['files']['fwd_name']) if 'rev' in val['files'] and val['files']['rev']: fastq_files.append(val['files']['rev']) fastq_files_name.append(val['files']['rev_name']) # if len(fastq_files) % 2 != 0: # raise ValueError('There must be an even number of Paired-end reads files') logging.info(f"fastq files {fastq_files}") input_string.extend(fastq_files) logging.info(f'input_string {input_string}') output_dir = os.path.join(self.shared_folder, 'kraken2_output') report_file_name = 'report.txt' report_file = os.path.join(output_dir, report_file_name) if not os.path.exists(output_dir): os.makedirs(output_dir) outprefix = "kraken2" cmd = [ '/kb/module/lib/kraken2/src/kraken2.sh', '-d', '/data/kraken2/' + params['db_type'], '-o', output_dir, '-p', outprefix, '-t', '1', '-i' ] cmd.extend(input_string) # cmd = ['kraken2', '--db', '/data/kraken2/' + params['db_type'], # '--output', output_dir, '--report', report_file, # '--threads', '1'] # cmd.extend(['--confidence', str(params['confidence'])]) if 'confidence' in params else cmd logging.info(f'cmd {cmd}') p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) logging.info(f'subprocess {p.communicate()}') summary_file = os.path.join(output_dir, outprefix + '.report.csv') report_dir = os.path.join(output_dir, 'html_report') if not os.path.exists(report_dir): os.makedirs(report_dir) summary_file_dt = os.path.join(report_dir, 'kraken2.datatable.html') self._generate_DataTable(summary_file, summary_file_dt) shutil.copy2('/kb/module/lib/kraken2/src/index.html', os.path.join(report_dir, 'index.html')) shutil.copy2(os.path.join(output_dir, outprefix + '.krona.html'), os.path.join(report_dir, 'kraken2.krona.html')) shutil.move(os.path.join(output_dir, outprefix + '.tree.svg'), os.path.join(report_dir, 'kraken2.tree.svg')) html_zipped = self.package_folder(report_dir, 'index.html', 'index.html') # columns = [ # 'Percentage of fragments covered by the clade rooted at this taxon', # 'Number of fragments covered by the clade rooted at this taxon', # 'Number of fragments assigned directly to this taxon', 'rank code', # 'taxid', 'name'] # report_df = pd.read_csv(report_file, sep='\t', # header=None, names=columns) # code_dict = {'U': 'Unclassified', 'R': 'Root', 'D': 'Domain', # 'K': 'Kingdom', 'P': 'Phylum', 'C': 'Class', 'O': 'Order', # 'F': 'Family', 'G': 'Genus', 'S': 'Species'} # report_df['rank code'] = report_df['rank code'].apply( # lambda x: code_dict[x[0]] + x[1] if len(x) > 1 else code_dict[x]) # self._generate_report_table(report_df, report_html_file, output_dir) # report_df.to_html(report_html_file, classes='Kraken2_report', index=False) # html_zipped = self.package_folder(output_dir, 'report.html', # 'report') # Step 5 - Build a Report and return objects_created = [] output_files = os.listdir(output_dir) output_files_list = [] for output in output_files: if not os.path.isdir(output): output_files_list.append({ 'path': os.path.join(output_dir, output), 'name': output }) message = f"Kraken2 run finished on {input_string} against {params['db_type']}." 
report_params = { 'message': message, 'workspace_name': params.get('workspace_name'), 'objects_created': objects_created, 'file_links': output_files_list, 'html_links': [html_zipped], 'direct_html_link_index': 0, 'html_window_height': 460 } # STEP 6: construct the output to send back kbase_report_client = KBaseReport(self.callback_url) report_output = kbase_report_client.create_extended_report( report_params) report_output['report_params'] = report_params logging.info(report_output) # Return references which will allow inline display of # the report in the Narrative output = { 'report_name': report_output['name'], 'report_ref': report_output['ref'], 'report_params': report_output['report_params'] } #END run_kraken2 # At some point might do deeper type checking... if not isinstance(output, dict): raise ValueError('Method run_kraken2 return value ' + 'output is not type dict as required.') # return the results return [output]
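    # Usage sketch (hypothetical values): run_kraken2 takes exactly one kind of
    # input -- an assembly ref (input_genomes), single-end reads refs
    # (input_refs), or paired-end reads refs (input_paired_refs) -- plus the
    # mandatory workspace_name and db_type. The refs below are placeholder
    # UPAs, and 'standard' is an assumed db_type label; check the module's
    # /data/kraken2 directory for the databases actually installed.
    #
    #   params = {
    #       'workspace_name': 'my_workspace',
    #       'db_type': 'standard',
    #       'input_paired_refs': ['123/4/5', '123/6/7'],
    #   }
    #   output = impl.run_kraken2(ctx, params)[0]
    #   print(output['report_name'], output['report_ref'])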
def run_cnelsonAppDemo(self, ctx, params):
        """
        This example function accepts any number of parameters and returns results in a KBaseReport
        :param params: instance of mapping from String to unspecified object
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_cnelsonAppDemo

        # Print statements to stdout/stderr are captured and available as the App log
        logging.info('Starting run_cnelsonAppDemo function. Params=' + pformat(params))

        # Step 1 - Parse/examine the parameters and catch any errors
        # It is important to check that parameters exist and are defined, and that nice error
        # messages are returned to users. Parameter values go through basic validation when
        # defined in a Narrative App, but advanced users or other SDK developers can call
        # this function directly, so validation is still important.
        logging.info('Validating parameters.')
        if 'workspace_name' not in params:
            raise ValueError('Parameter workspace_name is not set in input arguments')
        workspace_name = params['workspace_name']
        if 'assembly_input_ref' not in params:
            raise ValueError('Parameter assembly_input_ref is not set in input arguments')
        assembly_input_ref = params['assembly_input_ref']
        if 'min_length' not in params:
            raise ValueError('Parameter min_length is not set in input arguments')
        min_length_orig = params['min_length']
        min_length = None
        try:
            min_length = int(min_length_orig)
        except ValueError:
            raise ValueError('Cannot parse integer from min_length parameter (' +
                             str(min_length_orig) + ')')
        if min_length < 0:
            raise ValueError('min_length parameter cannot be negative (' + str(min_length) + ')')

        # Step 2 - Download the input data as a Fasta file.
        # We can use the AssemblyUtil module to download a FASTA file from our Assembly data
        # object. The return object gives us the path to the file that was created.
        logging.info('Downloading Assembly data as a Fasta file.')
        assemblyUtil = AssemblyUtil(self.callback_url)
        fasta_file = assemblyUtil.get_assembly_as_fasta({'ref': assembly_input_ref})

        # Step 3 - Actually perform the filter operation, saving the good contigs to a new
        # fasta file. We can use BioPython to parse the Fasta file and save the output.
        good_contigs = []
        n_total = 0
        n_remaining = 0
        for record in SeqIO.parse(fasta_file['path'], 'fasta'):
            n_total += 1
            if len(record.seq) >= min_length:
                good_contigs.append(record)
                n_remaining += 1

        logging.info('Filtered Assembly to ' + str(n_remaining) + ' contigs out of ' + str(n_total))
        filtered_fasta_file = os.path.join(self.shared_folder, 'filtered.fasta')
        SeqIO.write(good_contigs, filtered_fasta_file, 'fasta')

        # Step 4 - Save the new Assembly back to the system
        logging.info('Uploading filtered Assembly data.')
        new_assembly = assemblyUtil.save_assembly_from_fasta({
            'file': {'path': filtered_fasta_file},
            'workspace_name': workspace_name,
            'assembly_name': fasta_file['assembly_name']
        })

        # Step 5 - Build a Report and return
        reportObj = {
            'objects_created': [{'ref': new_assembly, 'description': 'Filtered contigs'}],
            'text_message': 'Filtered Assembly to ' + str(n_remaining) +
                            ' contigs out of ' + str(n_total)
        }
        report = KBaseReport(self.callback_url)
        report_info = report.create({'report': reportObj, 'workspace_name': params['workspace_name']})

        # Step 6 - Construct the output to send back
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref'],
            'assembly_output': new_assembly,
            'n_initial_contigs': n_total,
            'n_contigs_removed': n_total - n_remaining,
            'n_contigs_remaining': n_remaining
        }
        logging.info('returning:' + pformat(output))
        #END run_cnelsonAppDemo

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_cnelsonAppDemo return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
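    # Usage sketch (placeholder values): the demo keeps contigs of length
    # >= min_length and saves the filtered Assembly under the original name.
    #
    #   params = {
    #       'workspace_name': 'my_workspace',
    #       'assembly_input_ref': '123/4/5',  # placeholder UPA
    #       'min_length': 1000,
    #   }
    #   output = impl.run_cnelsonAppDemo(ctx, params)[0]
    #   print(output['n_contigs_remaining'], 'of', output['n_initial_contigs'], 'contigs kept')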
def run_ContigFilter_max(self, ctx, params):
        """
        New app which filters contigs in an assembly using both a minimum and a maximum contig length
        :param params: instance of mapping from String to unspecified object
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_ContigFilter_max
        # Check that the parameters are valid
        for name in ['min_length', 'max_length', 'assembly_ref', 'workspace_name']:
            if name not in params:
                raise ValueError('Parameter "' + name + '" is required but missing')
        if not isinstance(params['min_length'], int) or (params['min_length'] < 0):
            raise ValueError('Min length must be a non-negative integer')
        if not isinstance(params['max_length'], int) or (params['max_length'] < 0):
            raise ValueError('Max length must be a non-negative integer')
        if not isinstance(params['assembly_ref'], str) or not len(params['assembly_ref']):
            raise ValueError('Pass in a valid assembly reference string')
        print(params['min_length'], params['max_length'], params['assembly_ref'])

        output = {}
        assembly_util = AssemblyUtil(self.callback_url)
        fasta_file = assembly_util.get_assembly_as_fasta({'ref': params['assembly_ref']})
        print(fasta_file)

        # Parse the downloaded file in FASTA format
        parsed_assembly = SeqIO.parse(fasta_file['path'], 'fasta')
        min_length = params['min_length']
        max_length = params['max_length']

        # Keep a list of contigs whose length is within [min_length, max_length]
        good_contigs = []
        # total contigs regardless of length
        n_total = 0
        # total contigs within the length bounds
        n_remaining = 0
        for record in parsed_assembly:
            n_total += 1
            if min_length <= len(record.seq) <= max_length:
                good_contigs.append(record)
                n_remaining += 1

        # Create a file to hold the filtered data
        workspace_name = params['workspace_name']
        filtered_path = os.path.join(self.shared_folder, 'filtered.fasta')
        SeqIO.write(good_contigs, filtered_path, 'fasta')

        # Upload the filtered data to the workspace
        new_ref = assembly_util.save_assembly_from_fasta({
            'file': {'path': filtered_path},
            'workspace_name': workspace_name,
            'assembly_name': fasta_file['assembly_name']
        })

        # Create an output summary message for the report
        text_message = "".join(['Filtered assembly to ', str(n_remaining),
                                ' contigs out of ', str(n_total)])

        # Data for creating the report, referencing the assembly we uploaded
        report_data = {
            'objects_created': [{'ref': new_ref, 'description': 'Filtered contigs'}],
            'text_message': text_message
        }

        # Initialize the report
        kbase_report = KBaseReport(self.callback_url)
        report = kbase_report.create({
            'report': report_data,
            'workspace_name': workspace_name
        })

        # Return the report reference and name in our results
        output = {
            'report_ref': report['ref'],
            'report_name': report['name'],
            'n_total': n_total,
            'n_remaining': n_remaining,
            'filtered_assembly_ref': new_ref
        }
        #END run_ContigFilter_max

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_ContigFilter_max return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
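    # Usage sketch (placeholder values): both bounds are required non-negative
    # ints, and contigs are kept when min_length <= len(seq) <= max_length.
    #
    #   params = {
    #       'workspace_name': 'my_workspace',
    #       'assembly_ref': '123/4/5',  # placeholder UPA
    #       'min_length': 500,
    #       'max_length': 100000,
    #   }
    #   output = impl.run_ContigFilter_max(ctx, params)[0]
    #   print(output['n_remaining'], 'of', output['n_total'], 'contigs kept')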
def run_metaphlan2(self, ctx, params): """ This example function accepts any number of parameters and returns results in a KBaseReport :param params: instance of mapping from String to unspecified object :returns: instance of type "ReportResults" -> structure: parameter "report_name" of String, parameter "report_ref" of String """ # ctx is the context object # return variables are: output #BEGIN run_metaphlan2 # Check parameters logging.info(f'params {params}') # Check for presence of input file types in params input_genomes = 'input_genomes' in params and len( params['input_genomes'] ) > 0 and None not in params['input_genomes'] input_refs = 'input_ref' in params and len( params['input_ref']) > 0 and None not in params['input_ref'] # for name in ['workspace_name', 'db_type']: # if name not in params: # raise ValueError( # 'Parameter "' + name + '" is required but missing') if not input_genomes and not input_refs: raise ValueError( 'You must enter either an input genome or input reads') if input_refs and input_genomes: raise ValueError( 'You must enter either an input genome or input reads, ' 'but not both') if input_genomes and (not isinstance(params['input_genomes'][0], str)): raise ValueError('Pass in a valid input genome string') if input_refs and (not isinstance(params['input_ref'], list) or not len(params['input_ref'])): raise ValueError('Pass in a list of input references') # Start with base cmd and add parameters based on user input cmd = [ 'metaphlan2.py', '--bowtie2db', '/data/metaphlan2/mpa_v20_m200', '--mpa_pkl', '/data/metaphlan2/mpa_v20_m200.pkl' ] if input_genomes: assembly_util = AssemblyUtil(self.callback_url) fasta_file_obj = assembly_util.get_assembly_as_fasta( {'ref': params['input_genomes'][0]}) logging.info(fasta_file_obj) fasta_file = fasta_file_obj['path'] cmd.extend(['--input_type', 'fasta', fasta_file]) if input_refs: logging.info('Downloading Reads data as a Fastq file.') logging.info(f"Input parameters {params.items()}") readsUtil = ReadsUtils(self.callback_url) download_reads_output = readsUtil.download_reads( {'read_libraries': params['input_ref']}) print( f"Input refs {params['input_ref']} download_reads_output {download_reads_output}" ) fastq_files = [] fastq_files_name = [] for key, val in download_reads_output['files'].items(): if 'fwd' in val['files'] and val['files']['fwd']: fastq_files.append(val['files']['fwd']) fastq_files_name.append(val['files']['fwd_name']) if 'rev' in val['files'] and val['files']['rev']: fastq_files.append(val['files']['rev']) fastq_files_name.append(val['files']['rev_name']) logging.info(f"fastq files {fastq_files}") fastq_files_string = ' '.join(fastq_files) cmd.extend(['--input_type', 'fastq', fastq_files_string]) output_dir = os.path.join(self.scratch, 'metaphlan2_output') if not os.path.exists(output_dir): os.makedirs(output_dir) # insert into second to last position, before input file(s) cmd.insert( -1, '--min_alignment_len') if params['min_alignment_len'] > 0 else cmd cmd.insert(-1, str(params['min_alignment_len']) ) if params['min_alignment_len'] > 0 else cmd cmd.insert( -1, '--ignore_viruses') if params['ignore_viruses'] == 1 else cmd cmd.insert( -1, '--ignore_bacteria') if params['ignore_bacteria'] == 1 else cmd cmd.insert( -1, '--ignore_eukaryotes') if params['ignore_eukaryotes'] == 1 else cmd cmd.insert( -1, '--ignore_archaea') if params['ignore_archaea'] == 1 else cmd cmd.insert(-1, '--stat_q') cmd.insert(-1, str(params['stat_q'])) cmd.insert(-1, '--min_cu_len') cmd.insert(-1, str(params['min_cu_len'])) # append 
output file cmd.extend(['--bowtie2out', os.path.join(output_dir, 'report.txt')]) cmd00 = ["ls", '-la', '/data/metaphlan2/'] logging.info(f'cmd00 {cmd00}') pls = subprocess.Popen(cmd00, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) logging.info(f'subprocess {pls.communicate()}') # run pipeline logging.info(f'cmd {" ".join(cmd)}') p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) logging.info(f'subprocess {p.communicate()}') cmd = [ '/kb/module/lib/metaphlan2/src/accessories.sh', os.path.join(output_dir, 'report.txt'), output_dir, 'metaphlan2' ] p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) logging.info(f'subprocess {p.communicate()}') # get output file and convert to format for report # logging.info(f"params['input_ref'] {params['input_ref']}") report_df = pd.read_csv(os.path.join(output_dir, 'report.txt'), sep='\t') taxa_list = [ 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'strain', 'unclassified' ] abbrev_list = ['k', 'p', 'c', 'o', 'f', 'g', 's', 't', 'unclassified'] for taxa in taxa_list: report_df[taxa] = None tax_dict = dict(zip(abbrev_list, taxa_list)) # split dunderscores to get tax level and name report_df['taxonomy'] = report_df['#SampleID'].apply( lambda x: x.split('|')).apply(lambda x: [y.split('__') for y in x]) for idx, row in report_df.iterrows(): for col in row['taxonomy']: try: report_df.loc[idx, tax_dict[col[0]]] = col[1] except IndexError: report_df.loc[idx, tax_dict[col[0]]] = col[0] report_df.drop(['taxonomy', '#SampleID'], axis=1, inplace=True) report_html_file = os.path.join(output_dir, 'report.html') self._generate_report_table(report_df, report_html_file, output_dir) # report_df.to_html(report_html_file, classes='Metaphlan2_report', # index=False) html_zipped = self.package_folder(output_dir, 'report.html', 'report') # Step 5 - Build a Report and return objects_created = [] output_files = os.listdir(output_dir) output_files_list = [] for output in output_files: if not os.path.isdir(output): output_files_list.append({ 'path': os.path.join(output_dir, output), 'name': output }) message = f"MetaPhlAn2 run finished." report_params = { 'message': message, 'workspace_name': params.get('workspace_name'), 'objects_created': objects_created, 'file_links': output_files_list, 'html_links': [html_zipped], 'direct_html_link_index': 0, 'html_window_height': 460 } kbase_report_client = KBaseReport(self.callback_url) report_output = kbase_report_client.create_extended_report( report_params) report_output['report_params'] = report_params logging.info(report_output) # Return references which will allow inline display of # the report in the Narrative output = { 'report_name': report_output['name'], 'report_ref': report_output['ref'], 'report_params': report_output['report_params'] } #END run_metaphlan2 # At some point might do deeper type checking... if not isinstance(output, dict): raise ValueError('Method run_metaphlan2 return value ' + 'output is not type dict as required.') # return the results return [output]
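    # Usage sketch (hypothetical values): run_metaphlan2 accepts either one
    # assembly ref (input_genomes) or a list of reads refs (input_ref), plus
    # the tunables consumed above. The numbers below are illustrative, not
    # recommendations, and the ref is a placeholder UPA.
    #
    #   params = {
    #       'workspace_name': 'my_workspace',
    #       'input_ref': ['123/4/5'],
    #       'min_alignment_len': 0,
    #       'min_cu_len': 2000,
    #       'stat_q': 0.1,
    #       'ignore_viruses': 0, 'ignore_bacteria': 0,
    #       'ignore_eukaryotes': 0, 'ignore_archaea': 0,
    #   }
    #   output = impl.run_metaphlan2(ctx, params)[0]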
class VirSorterUtils: def __init__(self, config): self.scratch = os.path.abspath(config['scratch']) self.callback_url = os.environ['SDK_CALLBACK_URL'] self.mgu = MetagenomeUtils(self.callback_url) self.au = AssemblyUtil(self.callback_url) self.ws = Workspace(config['workspace-url'], token=config['token']) def VirSorter_help(self): command = 'wrapper_phage_contigs_sorter_iPlant.pl --help' self._run_command(command) def get_fasta(self, ref): # check type of object, i.e KBaseGenomeAnnotations.Assembly-3.0 obj_type = self.ws.get_object_info3({'objects': [{ 'ref': ref }]})['infos'][0][2] if 'assembly' in obj_type.lower(): genome_ref = ref elif 'kbasegenomes' in obj_type.lower(): data = self.ws.get_objects2({ 'objects': [{ 'ref': ref, 'included': ['assembly_ref'], 'strict_maps': 1 }] })['data'][0]['data'] genome_ref = data['assembly_ref'] else: raise ValueError( f"Input reference {ref} is of type {obj_type}. Type KBaseGenomes.Genome or " f"KBaseGenomeAnnotations.Assembly required.") return self.au.get_assembly_as_fasta({'ref': genome_ref})['path'] def run_VirSorter(self, params): params['SDK_CALLBACK_URL'] = self.callback_url params['KB_AUTH_TOKEN'] = os.environ['KB_AUTH_TOKEN'] # Get contigs from 'assembly' genome_fp = self.get_fasta(params['genomes']) command = 'wrapper_phage_contigs_sorter_iPlant.pl --data-dir /data/virsorter-data' # Add in first args command += f' -f {genome_fp} --db {params["database"]}' # Check if additional genomes were submitted if params.get('add_genomes'): add_genomes_fp = self.get_fasta(params['add_genomes']) print(f'Added genomes DETECTED: {add_genomes_fp}') command += f' --cp {add_genomes_fp}' bool_args = ['virome', 'diamond', 'keep_db', 'no_c'] # keep_db = keep-db for bool_arg in bool_args: if params[ bool_arg] == 1: # 0 is true and therefore run... though for some reason it's reversed on json if bool_arg == 'keep_db': bool_arg = 'keep-db' command += f' --{bool_arg}' self._run_command(command) report = self._generate_report( params) # Basically, do everything that's after the tool runs return report def _run_command(self, command): """ :param command: :return: """ log('Start executing command:\n{}'.format(command)) pipe = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True) output, err = pipe.communicate() exitCode = pipe.returncode if exitCode == 0: log('Executed command:\n{}\n'.format(command) + 'Exit Code: {}\nOutput:\n{}'.format(exitCode, output)) else: error_msg = 'Error running command:\n{}\n'.format(command) error_msg += 'Exit Code: {}\nOutput:\n{}\nError: {}'.format( exitCode, output, err) raise RuntimeError(error_msg) def _parse_summary(self, virsorter_global_fp, affi_contigs_shock_id): columns = [ 'Contig_id', 'Nb genes contigs', 'Fragment', 'Nb genes', 'Category', 'Nb phage hallmark genes', 'Phage gene enrichment sig', 'Non-Caudovirales phage gene enrichment sig', 'Pfam depletion sig', 'Uncharacterized enrichment sig', 'Strand switch depletion sig', 'Short genes enrichment sig', ] try: with open(virsorter_global_fp, 'r') as vir_fh: data = {} category = '' for line in vir_fh: if line.startswith('## Contig_id'): continue elif line.startswith( '## ' ): # If 'header' lines are consumed by 1st if, then remaining should be good category = line.split('## ')[-1].split(' -')[0] else: values = line.strip().split(',') data[values[0]] = dict(zip(columns[1:], values[1:])) except: vir_path = os.path.join(os.getcwd(), 'virsorter-out') files = os.listdir(vir_path) raise RuntimeError( f"{virsorter_global_fp} is not a file. existing files {files}." 
) df = pd.DataFrame().from_dict(data, orient='index') df.index.name = columns[0] df.reset_index(inplace=True) html = df.to_html(index=False, classes='my_class table-striped" id = "my_id') # Need to file write below direct_html = html_template.substitute( html_table=html, affi_contigs_shock_id=affi_contigs_shock_id) # Find header so it can be copied to footer, as dataframe.to_html doesn't include footer start_header = Literal("<thead>") end_header = Literal("</thead>") text = start_header + SkipTo(end_header) new_text = '' for data, start_pos, end_pos in text.scanString(direct_html): new_text = ''.join(data).replace( ' style="text-align: right;"', '').replace( 'thead>', 'tfoot>\n ') + '\n</tfoot>' # Get start and end positions to insert new text end_tbody = Literal("</tbody>") end_table = Literal("</table>") insertion_pos = end_tbody + SkipTo(end_table) final_html = '' for data, start_pos, end_pos in insertion_pos.scanString(direct_html): final_html = direct_html[:start_pos + 8] + '\n' + new_text + direct_html[ start_pos + 8:] return final_html def get_assembly_contig_ids(self, assembly_ref): """get contig ids from assembly_ref""" contigs = self.ws.get_objects2( {'objects': [{ 'ref': assembly_ref, 'included': ['contigs'] }]})['data'][0]['data']['contigs'] return contigs.keys() def _generate_report(self, params): """ :param params: :return: """ # Get URL self.dfu = dfu(params['SDK_CALLBACK_URL']) # Output directory should be $PWD/virsorter-out - ASSUMES that's the output location virsorter_outdir = os.path.join(os.getcwd(), 'virsorter-out') print( f'VIRSorter output directory contents: {os.listdir(virsorter_outdir)}' ) # Replacing individual download files with BinnedContigs # kb_deseq adds output files, then builds report files and sends all of them to the workspace output_files = [] # Appended list of dicts containing attributes # Collect all the files needed to report to end-user # Get all predicted viral sequences pred_fnas = glob.glob( os.path.join(virsorter_outdir, 'Predicted_viral_sequences/VIRSorter_*.fasta')) pred_gbs = glob.glob( os.path.join(virsorter_outdir, 'Predicted_viral_sequences/VIRSorter_*.gb')) # Summary 'table' glob_signal = os.path.join(virsorter_outdir, 'VIRSorter_global-phage-signal.csv') print('Identified the following predicted viral sequences:\n{}'.format( '\n\t'.join(pred_fnas))) if len(pred_fnas) == 0: print( f"Unable to find predicted viral sequences, here are the directory's content:\n" f"{os.listdir(os.path.join(virsorter_outdir, 'Predicted_viral_sequences'))}" ) if os.path.exists(glob_signal): print(f'Identified the global phage signal: {glob_signal}') lines = -1 # Don't count header with open(glob_signal) as fh: for ln in fh: lines += 1 if lines == 0: print('But it is EMPTY!') else: print( 'Unable to find the global phage signal file. Was there an error during the run?' ) # Append error and out files from VIRSorter err_fp = os.path.join(virsorter_outdir, 'logs/err') # if os.path.exists(err_fp): # output_files.append({ # 'path': os.path.join(virsorter_outdir, 'logs/err'), # 'name': 'VIRSorter_err', # 'label': 'VIRSorter_err', # 'description': 'VIRSorter error log file, generated from the tool itself.' # }) out_fp = os.path.join(virsorter_outdir, 'logs/out') # if os.path.exists(out_fp): # output_files.append({ # 'path': os.path.join(virsorter_outdir, 'logs/out'), # 'name': 'VIRSorter_out', # 'label': 'VIRSorter_out', # 'description': 'VIRSorter output log file, generated from the tool itself.' 
# }) if not (os.path.exists(err_fp) or os.path.exists(out_fp)): print( 'Unable to find err and/or out files in LOG directory, contents:' ) print(os.listdir(os.path.join(virsorter_outdir, 'logs'))) # Make output directory output_dir = os.path.join(self.scratch, str(uuid.uuid4())) self._mkdir_p(output_dir) # Deal with nucleotide and protein fasta pred_fna_tgz_fp = os.path.join(output_dir, 'VIRSorter_predicted_viral_fna.tar.gz') with tarfile.open( pred_fna_tgz_fp, 'w:gz') as pred_fna_tgz_fh: # Compress to minimize disk usage for pred_fna in pred_fnas: pred_fna_tgz_fh.add(pred_fna, arcname=os.path.basename(pred_fna)) output_files.append({ 'path': pred_fna_tgz_fp, 'name': os.path.basename(pred_fna_tgz_fp), 'label': os.path.basename(pred_fna_tgz_fp), 'description': 'FASTA-formatted nucleotide sequences of VIRSorter predicted viruses' }) if os.path.exists(pred_fna_tgz_fp): print( f'Generated gzipped version of the predicted viral sequences in FASTA format: ' f'{pred_fna_tgz_fp}') pred_gb_tgz_fp = os.path.join(output_dir, 'VIRSorter_predicted_viral_gb.tar.gz') with tarfile.open(pred_gb_tgz_fp, 'w:gz') as pred_gb_tgz_fh: for pred_gb in pred_gbs: pred_gb_tgz_fh.add(pred_gb, arcname=os.path.basename(pred_gb)) output_files.append({ 'path': pred_gb_tgz_fp, 'name': os.path.basename(pred_gb_tgz_fp), 'label': os.path.basename(pred_gb_tgz_fp), 'description': 'Genbank-formatted sequences of VIRSorter predicted viruses' }) if os.path.exists(pred_gb_tgz_fp): print( f'Generated gzipped version of the predicted viral sequences in Genbank format: ' f'{pred_gb_tgz_fp}') # To create BinnedContig, need to create another directory with each of the "bins" as separate files? binned_contig_output_dir = os.path.join(self.scratch, str(uuid.uuid4())) self._mkdir_p(binned_contig_output_dir) # Before creating final HTML output, need to create BinnedContig object so other tools/users can take advantage # of its features, but also to feed more easily into other tools (e.g. 
vConTACT) created_objects = [] # Will store the objects that go to the workspace # load contig ids from the assembly input # assembly_contig_ids = self.get_assembly_contig_ids(self.assembly_ref) assembly_contig_ids = self.get_assembly_contig_ids( params['genomes']) # Will fail for Genome summary_fp = os.path.join( binned_contig_output_dir, 'VIRSorter.summary') # Anything that ends in .summary with open(summary_fp, 'w') as summary_fh: summary_writer = csv.writer(summary_fh, delimiter='\t', quoting=csv.QUOTE_MINIMAL) summary_writer.writerow( ['Bin name', 'Completeness', 'Genome size', 'GC content']) for category_fp in pred_fnas: # _get_bin_ids from MetaGenomeUtils requires files to follow the header.0xx.fasta convention category = os.path.basename(category_fp).split( 'cat-')[-1].split('.')[0] dest_fn = 'VirSorter.{}.fasta'.format(category.zfill(3)) dest_fp = os.path.join(output_dir, dest_fn) binned_contig_fp = os.path.join(binned_contig_output_dir, dest_fn) genome_size = 0 gc_content = [] # Need stats for summary file # Also need to adjust sequence name so binnedContig object can retrieve sequences adjusted_sequences = [] with open(category_fp, 'rU') as category_fh: for record in SeqIO.parse(category_fh, 'fasta'): seq = record.seq gc_content.append(SeqUtils.GC(seq)) genome_size += len(seq) # This is very dirty, but need to change name to match original contigs record.id = record.id.replace('VIRSorter_', '').replace( '-circular', '').split('-cat_')[0] if 'gene' in record.id: # Prophage record.id = record.id.split('_gene')[0] record.id = record.id.rsplit('_', 1)[0] # here we make sure that the id's line up with contig ids in the input assembly object if record.id not in assembly_contig_ids: for assembly_contig_id in assembly_contig_ids: # first check if record.id is substring of current contig id, # then check if current contig id is substring of record.id # NOTE: this is not a perfect way of checking and will likely # fail in some circumstances. # A more complete check would be to make sure there is a 1:1 # mapping of contig id's in the assembly object as compared to # the binned contig object (the fasta files defined here). if (record.id in assembly_contig_id) or ( assembly_contig_id in record.id): record.id = assembly_contig_id break record.description = '' record.name = '' adjusted_sequences.append(record) if genome_size != 0: # Empty file summary_writer.writerow([ dest_fn, '100%', genome_size, (sum(gc_content) / len(gc_content)) ]) print('Copying {} to results directory'.format( os.path.basename(category_fp))) # Yes, need both. One is to get file_links in report. 
Second is for binnedContigs object shutil.copyfile(category_fp, dest_fp) # Write renamed sequences with open(binned_contig_fp, 'w') as binned_contig_fh: SeqIO.write(adjusted_sequences, binned_contig_fh, 'fasta') result = self.au.save_assembly_from_fasta({ 'file': { 'path': dest_fp }, 'workspace_name': params['workspace_name'], 'assembly_name': 'VirSorter-Category-{}'.format(category) }) created_objects.append({ "ref": result, "description": "KBase Assembly object from VIRSorter" }) # Create BinnedContigs object, but 1st, a little metadata generate_binned_contig_param = { 'file_directory': binned_contig_output_dir, 'assembly_ref': params['genomes'], # params.get('genomes'), self.assembly_ref 'binned_contig_name': params['binned_contig_name'], 'workspace_name': params['workspace_name'] } binned_contig_object_ref = self.mgu.file_to_binned_contigs( generate_binned_contig_param).get('binned_contig_obj_ref') # Add binned contigs reference here, as it was already created above created_objects.append({ "ref": binned_contig_object_ref, "description": "BinnedContigs from VIRSorter" }) # Save VIRSorter_affi-contigs.tab for DRAM-v affi_contigs_fp = os.path.join(virsorter_outdir, 'Metric_files', 'VIRSorter_affi-contigs.tab') affi_contigs_shock_id = self.dfu.file_to_shock( {'file_path': affi_contigs_fp})['shock_id'] # Use global signal (i.e. summary) file and create HTML-formatted version raw_html = self._parse_summary(glob_signal, affi_contigs_shock_id) html_fp = os.path.join(output_dir, 'index.html') with open(html_fp, 'w') as html_fh: html_fh.write(raw_html) report_shock_id = self.dfu.file_to_shock({ 'file_path': output_dir, 'pack': 'zip' })['shock_id'] html_report = [{ 'shock_id': report_shock_id, 'name': os.path.basename(html_fp), 'label': os.path.basename(html_fp), 'description': 'HTML summary report for VIRSorter-predicted viral genomes.' }] report_params = { 'message': 'Here are the results from your VIRSorter run. Above, you\'ll find a report with ' 'all the identified (putative) viral genomes, and below, links to the report as ' 'well as files generated.', 'workspace_name': params['workspace_name'], 'html_links': html_report, 'direct_html_link_index': 0, 'report_object_name': 'VIRSorter_report_{}'.format(str(uuid.uuid4())), 'file_links': output_files, 'objects_created': created_objects, } kbase_report_client = KBaseReport(params['SDK_CALLBACK_URL'], token=params['KB_AUTH_TOKEN']) output = kbase_report_client.create_extended_report(report_params) report_output = { 'report_name': output['name'], 'report_ref': output['ref'], 'result_directory': binned_contig_output_dir, 'binned_contig_obj_ref': binned_contig_object_ref } return report_output def _mkdir_p(self, path): """ :param path: :return: """ if not path: return try: os.makedirs(path) except OSError as exc: if exc.errno == errno.EEXIST and os.path.isdir(path): pass else: raise
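# Usage sketch for VirSorterUtils (hypothetical values). 'database' selects
# VirSorter's --db and the 0/1 flags mirror the tool's boolean switches; every
# ref and name below is a placeholder. The class also expects SDK_CALLBACK_URL
# and KB_AUTH_TOKEN in the environment.
#
#   vs = VirSorterUtils({'scratch': '/kb/module/work/tmp',
#                        'workspace-url': 'https://.../ws',
#                        'token': '<token>'})
#   report = vs.run_VirSorter({
#       'genomes': '123/4/5',  # Assembly (or Genome) placeholder UPA
#       'database': 1,
#       'add_genomes': None,
#       'virome': 0, 'diamond': 0, 'keep_db': 0, 'no_c': 0,
#       'binned_contig_name': 'VirSorter_bins',
#       'workspace_name': 'my_workspace',
#   })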
class vConTACTUtils:

    def __init__(self, config):
        self.scratch = os.path.abspath(config['scratch'])
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.token = os.environ['KB_AUTH_TOKEN']
        self.ws = Workspace(config['workspace-url'], token=self.token)
        self.genome_api = GenomeAnnotationAPI(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)

    def vcontact_help(self):
        command = "vcontact --help"
        self._run_command(command)

    def execute(self, command: list):
        """
        :param command: Command suitable for running in subprocess, must use a ['ls', '-l'] format
        :return: Response from command
        """
        print('Running command: {}'.format(' '.join(command)))
        res = subprocess.run(command, shell=False, encoding='utf-8', check=True)
        return res

    def run_vcontact(self, params):
        # Determine KBase "inputs" for vConTACT2
        genome = params['genome']
        obj_type = self.ws.get_object_info3({'objects': [{'ref': genome}]})['infos'][0][2]

        if 'assembly' in obj_type.lower():  # If KBaseGenomeAnnotations.Assembly
            # Assembly requires annotation
            genome_fp = self.au.get_assembly_as_fasta({'ref': genome})['path']

            proteins_fp = os.path.join(self.scratch, 'proteins.faa')
            proteins_gbk = os.path.join(self.scratch, 'proteins.gbk')
            gene2genome_fp = os.path.join(self.scratch, 'gene2genome.csv')

            prodigal_cmd = ['prodigal', '-a', proteins_fp, '-o', proteins_gbk,
                            '-f', 'gbk', '-i', genome_fp, '-p', 'meta']
            res = self.execute(prodigal_cmd)

            records = {}
            with open(proteins_fp, 'r') as proteins_fh:
                for record in SeqIO.parse(proteins_fh, 'fasta'):
                    records[len(records)] = {
                        'protein_id': record.id,
                        'contig_id': record.id.rsplit('_', 1)[0],
                        'keywords': 'None'
                    }
            g2g_df = pd.DataFrame.from_dict(records, orient='index')
            g2g_df.to_csv(gene2genome_fp, index=False)

            # Pass filepaths to the app and run
            params['gene2genome'] = gene2genome_fp
            params['sequences'] = proteins_fp

        elif 'kbasegenomes' in obj_type.lower():  # If KBaseGenomes.Genome
            genome_data = self.genome_api.get_genome_v1({"genomes": [{"ref": genome}]})

            # Convert genome data into "reasonable" parse form and write to scratch filesystem
            gene2genome, sequences = self.genome_to_inputs(genome_data)
            gene2genome_fp, sequences_fp = self.write_inputs(gene2genome, sequences)

            # Pass filepaths to the app and run
            params['gene2genome'] = gene2genome_fp
            params['sequences'] = sequences_fp

        elif 'binnedcontigs' in obj_type.lower():  # If KBaseMetagenomes.BinnedContigs
            raise NotImplementedError(
                "KBaseMetagenomes.BinnedContigs hasn't been enabled. Check back later.")
        else:
            raise ValueError('Unable to identify input object type: {}'.format(obj_type))

        print('Available database files')
        print(os.listdir('/miniconda/lib/python3.7/site-packages/vcontact2/data/'))

        # Just iterate through all parameters
        mappings = {
            'gene2genome': '--proteins-fp',
            'sequences': '--raw-proteins',
            'db': '--db',
            'pcs_mode': '--pcs-mode',
            'vcs_mode': '--vcs-mode',
            'blast_evalue': '--blast-evalue',
            'pc_max_overlap': '--max-overlap',
            'pc_penalty': '--penalty',
            'pc_haircut': '--haircut',
            'pc_inflation': '--pc-inflation',
            'vc_inflation': '--vc-inflation',
            'vc_density': '--min-density',
            'vc_min_size': '--min-size',
            'vc_max_overlap': '--vc-overlap',
            'vc_penalty': '--vc-penalty',
            'vc_haircut': '--vc-haircut',
            'merge_method': '--merge-method',
            'similarity': '--similarity',
            'seed_method': '--seed-method',
            'min_significance': '--sig',
            'max_significance': '--max-sig',
            'module_inflation': '--mod-inflation',
            'mod_significance': '--mod-sig',
            'module_min_shared': '--mod-shared-min',
            'link_significance': '--link-sig',
            'link_proportion': '--link-prop'
        }
        bool_args = ['optimize', 'permissive']  # Should create build_command?

        command = 'vcontact2 --output-dir outdir'
        # Binaries
        command += ' --diamond-bin /usr/local/bin/diamond --c1-bin /usr/local/bin/cluster_one-1.0.jar'

        for param, cmd in mappings.items():
            command += ' {} {}'.format(cmd, params[param])

        self._run_command(command)

        report = self._generate_report(params)

        return report

    def _run_command(self, command):
        """
        _run_command: run command and print result
        """
        log('Start executing command:\n{}'.format(command))

        pipe = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
        output = pipe.communicate()[0]
        exitCode = pipe.returncode

        if exitCode == 0:
            log('Executed command:\n{}\n'.format(command) +
                'Exit Code: {}\nOutput:\n{}'.format(exitCode, output))
        else:
            error_msg = 'Error running command:\n{}\n'.format(command)
            error_msg += 'Exit Code: {}\nOutput:\n{}'.format(exitCode, output)
            raise ValueError(error_msg)

    def genome_to_inputs(self, genome):
        """
        genome_to_inputs: convert genome annotation data (~json) to file inputs required by vConTACT

        :param genome:
        :return:
        """
        records = []
        gene2genome = OrderedDict()

        genome_data = genome['genomes'][0]
        for item in genome_data['data']['features']:
            if 'id' not in item:
                print('This feature does not have a valid id')
                continue
            elif 'dna_sequence' not in item or 'protein_translation' not in item:
                print('This feature {} does not have a valid DNA sequence.'.format(item['id']))
                continue
            else:
                # Create FASTA file
                if item['type'] == 'gene':
                    desc = (item['functions'] if item.get('functions', None)
                            else item.get('function', ''))
                    gene_record = SeqRecord(Seq(item['protein_translation']),
                                            id=item['id'], description=desc)
                    records.append(gene_record)
                    # Build gene2genome
                    gene2genome.update({
                        item['id']: {
                            # 'contig_id': genome_data['data']['contig_ids'][0],
                            'contig_id': item['location'][0][0],
                            'protein_id': item['id'],
                            'keywords': item.get('function', '')
                        }
                    })

        return gene2genome, records

    def write_inputs(self, mapping, sequences):
        fasta_for_proteins_fp = os.path.join(self.scratch, 'vConTACT_proteins.fasta')
        with open(fasta_for_proteins_fp, 'w') as fasta_for_proteins_fh:
            SeqIO.write(sequences, fasta_for_proteins_fh, 'fasta')

        genes_to_genomes_mapping_fp = os.path.join(self.scratch, 'vConTACT_gene2genome.csv')
        with open(genes_to_genomes_mapping_fp, 'w') as genes_to_genomes_mapping_fh:
            fields = ['contig_id', 'protein_id', 'keywords']
            writer = csv.DictWriter(genes_to_genomes_mapping_fh, fieldnames=fields)
            writer.writeheader()
            for gene in mapping.keys():
                writer.writerow(mapping[gene])

        return genes_to_genomes_mapping_fp, fasta_for_proteins_fp

    def _generate_report(self, params):
        """
        _generate_report: generate summary report

        This will contain ALL the logic to generate the report, including areas that
        should/will be re-factored later
        """
        # Get DataFileUtil client
        self.dfu = dfu(self.callback_url)

        # Get filepath of summary file
        summary_fp = os.path.join(os.getcwd(), 'outdir', 'genome_by_genome_overview.csv')
        summary_df = pd.read_csv(summary_fp, header=0, index_col=0)

        html = summary_df.to_html(index=False, classes='my_class table-striped" id = "my_id')

        # Need to file write below
        direct_html = html_template.substitute(html_table=html)

        # Find header so it can be copied to footer, as dataframe.to_html doesn't include footer
        start_header = Literal("<thead>")
        end_header = Literal("</thead>")
        text = start_header + SkipTo(end_header)

        new_text = ''
        for data, start_pos, end_pos in text.scanString(direct_html):
            new_text = ''.join(data).replace(' style="text-align: right;"', '').replace(
                'thead>', 'tfoot>\n  ') + '\n</tfoot>'

        # Get start and end positions to insert new text
        end_tbody = Literal("</tbody>")
        end_table = Literal("</table>")
        insertion_pos = end_tbody + SkipTo(end_table)

        final_html = ''
        for data, start_pos, end_pos in insertion_pos.scanString(direct_html):
            final_html = direct_html[:start_pos + 8] + '\n' + new_text + direct_html[start_pos + 8:]

        output_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_dir)

        result_fp = os.path.join(output_dir, 'index.html')
        with open(result_fp, 'w') as result_fh:
            result_fh.write(final_html)

        report_shock_id = self.dfu.file_to_shock({
            'file_path': output_dir,
            'pack': 'zip'
        })['shock_id']

        html_report = [{
            'shock_id': report_shock_id,
            'name': os.path.basename(result_fp),
            'label': os.path.basename(result_fp),
            'description': 'HTML summary report for vConTACT2'
        }]

        report_params = {
            'message': 'Basic message to show in the report',
            'workspace_name': params['workspace_name'],
            'html_links': html_report,
            'direct_html_link_index': 0,
            'report_object_name': 'vConTACT_report_{}'.format(str(uuid.uuid4())),
            # Don't use until have files to attach to report
            # 'file_links': [{}],
            # Don't use until data objects that are created as result of running app
            # 'objects_created': [{'ref': matrix_obj_ref,
            #                      'description': 'Imported Matrix'}],
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'], 'report_ref': output['ref']}
        return report_output

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        # https://stackoverflow.com/a/600612/643675
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise
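# Usage sketch for vConTACTUtils (hypothetical values). run_vcontact expects a
# params dict containing 'genome' plus a value for every key in `mappings`
# above; the numbers below are illustrative stand-ins for the vConTACT2 CLI
# defaults, not recommendations, and the ref is a placeholder UPA.
#
#   vc = vConTACTUtils({'scratch': '/kb/module/work/tmp',
#                       'workspace-url': 'https://.../ws'})
#   report = vc.run_vcontact({
#       'genome': '123/4/5',
#       'db': 'ProkaryoticViralRefSeq85-Merged',
#       'pcs_mode': 'MCL', 'vcs_mode': 'ClusterONE',
#       'blast_evalue': 0.0001, 'pc_max_overlap': 0.8, 'pc_penalty': 2.0,
#       'pc_haircut': 0.1, 'pc_inflation': 2.0, 'vc_inflation': 2.0,
#       'vc_density': 0.3, 'vc_min_size': 2, 'vc_max_overlap': 0.9,
#       'vc_penalty': 2.0, 'vc_haircut': 0.55, 'merge_method': 'single',
#       'similarity': 'match', 'seed_method': 'nodes',
#       'min_significance': 1.0, 'max_significance': 300,
#       'module_inflation': 5.0, 'mod_significance': 1.0,
#       'module_min_shared': 3, 'link_significance': 1.0,
#       'link_proportion': 0.5,
#       'workspace_name': 'my_workspace',
#   })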
def stage_input(self, input_ref, fasta_file_extension):
    '''
    Stage input based on an input data reference for CheckM

    input_ref can be a reference to an Assembly, ContigSet, AssemblySet,
    BinnedContigs, Genome, or GenomeSet object

    This method creates a directory in the scratch area with the set of Fasta files;
    names will have the fasta_file_extension parameter tacked on.

        ex:
            staged_input = stage_input('124/15/1', 'fna')
            staged_input
            {"input_dir": '...'}
    '''
    # config
    #SERVICE_VER = 'dev'
    SERVICE_VER = 'release'
    [OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I, WSID_I,
     WORKSPACE_I, CHSUM_I, SIZE_I, META_I] = range(11)  # object_info tuple
    ws = Workspace(self.ws_url)

    # 1) generate a folder in scratch to hold the input
    suffix = str(int(time.time() * 1000))
    input_dir = os.path.join(self.scratch, 'bins_' + suffix)
    all_seq_fasta = os.path.join(self.scratch, 'all_sequences_' + suffix + '.' + fasta_file_extension)
    if not os.path.exists(input_dir):
        os.makedirs(input_dir)

    # 2) based on type, download the files
    obj_name = self.get_data_obj_name(input_ref)
    type_name = self.get_data_obj_type(input_ref)

    # auClient
    try:
        auClient = AssemblyUtil(self.callbackURL, token=self.ctx['token'], service_ver=SERVICE_VER)
    except Exception as e:
        raise ValueError('Unable to instantiate auClient with callbackURL: ' + self.callbackURL + ' ERROR: ' + str(e))

    # setAPI_Client
    try:
        #setAPI_Client = SetAPI(url=self.callbackURL, token=self.ctx['token'])  # for SDK local; local doesn't work for SetAPI
        setAPI_Client = SetAPI(url=self.serviceWizardURL, token=self.ctx['token'])  # for dynamic service
    except Exception as e:
        raise ValueError('Unable to instantiate setAPI_Client with serviceWizardURL: ' + self.serviceWizardURL + ' ERROR: ' + str(e))

    # mguClient
    try:
        mguClient = MetagenomeUtils(self.callbackURL, token=self.ctx['token'], service_ver=SERVICE_VER)
    except Exception as e:
        raise ValueError('Unable to instantiate mguClient with callbackURL: ' + self.callbackURL + ' ERROR: ' + str(e))

    # Standard Single Assembly
    #
    if type_name in ['KBaseGenomeAnnotations.Assembly', 'KBaseGenomes.ContigSet']:
        # create file data
        filename = os.path.join(input_dir, obj_name + '.' + fasta_file_extension)
        auClient.get_assembly_as_fasta({'ref': input_ref, 'filename': filename})
        if not os.path.isfile(filename):
            raise ValueError('Error generating fasta file from an Assembly or ContigSet with AssemblyUtil')
        # make sure fasta file isn't empty
        min_fasta_len = 1
        if not self.fasta_seq_len_at_least(filename, min_fasta_len):
            raise ValueError('Assembly or ContigSet is empty in filename: ' + str(filename))

    # AssemblySet
    #
    elif type_name == 'KBaseSets.AssemblySet':

        # read assemblySet
        try:
            assemblySet_obj = setAPI_Client.get_assembly_set_v1({'ref': input_ref, 'include_item_info': 1})
        except Exception as e:
            raise ValueError('Unable to get object from workspace: (' + input_ref + '): ' + str(e))
        assembly_refs = []
        assembly_names = []
        for assembly_item in assemblySet_obj['data']['items']:
            this_assembly_ref = assembly_item['ref']
            # assembly obj info
            try:
                this_assembly_info = ws.get_object_info_new({'objects': [{'ref': this_assembly_ref}]})[0]
                this_assembly_name = this_assembly_info[NAME_I]
            except Exception as e:
                raise ValueError('Unable to get object from workspace: (' + this_assembly_ref + '): ' + str(e))
            assembly_refs.append(this_assembly_ref)
            assembly_names.append(this_assembly_name)

        # create file data (name for file is what's reported in results)
        for ass_i, assembly_ref in enumerate(assembly_refs):
            this_name = assembly_names[ass_i]
            filename = os.path.join(input_dir, this_name + '.' + fasta_file_extension)
            auClient.get_assembly_as_fasta({'ref': assembly_ref, 'filename': filename})
            if not os.path.isfile(filename):
                raise ValueError('Error generating fasta file from an Assembly or ContigSet with AssemblyUtil')
            # make sure fasta file isn't empty
            min_fasta_len = 1
            if not self.fasta_seq_len_at_least(filename, min_fasta_len):
                raise ValueError('Assembly or ContigSet is empty in filename: ' + str(filename))

    # Binned Contigs
    #
    elif type_name == 'KBaseMetagenomes.BinnedContigs':

        # download the bins as fasta and set the input folder name
        bin_file_dir = mguClient.binned_contigs_to_file({'input_ref': input_ref, 'save_to_shock': 0})['bin_file_directory']
        os.rename(bin_file_dir, input_dir)
        self.set_fasta_file_extensions(input_dir, fasta_file_extension)
        # make sure each bin's fasta file isn't empty
        for (dirpath, dirnames, filenames) in os.walk(input_dir):
            for fasta_file in filenames:
                fasta_path = os.path.join(input_dir, fasta_file)
                min_fasta_len = 1
                if not self.fasta_seq_len_at_least(fasta_path, min_fasta_len):
                    raise ValueError('Binned Assembly is empty for fasta_path: ' + str(fasta_path))
            break

    # Genome and GenomeSet
    #
    elif type_name == 'KBaseGenomes.Genome' or type_name == 'KBaseSearch.GenomeSet':
        genome_obj_names = []
        genome_sci_names = []
        genome_assembly_refs = []

        if type_name == 'KBaseGenomes.Genome':
            genomeSet_refs = [input_ref]
        else:  # get genomeSet_refs from GenomeSet object
            genomeSet_refs = []
            try:
                genomeSet_object = ws.get_objects2({'objects': [{'ref': input_ref}]})['data'][0]['data']
            except Exception as e:
                # to get the full stack trace: traceback.format_exc()
                raise ValueError('Unable to fetch ' + str(input_ref) + ' object from workspace: ' + str(e))

            # iterate through genomeSet members
            for genome_id in genomeSet_object['elements'].keys():
                if 'ref' not in genomeSet_object['elements'][genome_id] or \
                        genomeSet_object['elements'][genome_id]['ref'] is None or \
                        genomeSet_object['elements'][genome_id]['ref'] == '':
                    raise ValueError('genome_ref not found for genome_id: ' + str(genome_id) + ' in genomeSet: ' + str(input_ref))
                else:
                    genomeSet_refs.append(genomeSet_object['elements'][genome_id]['ref'])

        # genome obj data
        for i, this_input_ref in enumerate(genomeSet_refs):
            try:
                objects = ws.get_objects2({'objects': [{'ref': this_input_ref}]})['data']
                genome_obj = objects[0]['data']
                genome_obj_info = objects[0]['info']
                genome_obj_names.append(genome_obj_info[NAME_I])
                genome_sci_names.append(genome_obj['scientific_name'])
            except Exception as e:
                raise ValueError('unable to fetch genome: ' + this_input_ref + ': ' + str(e))

            # Get genome_assembly_ref
            if ('contigset_ref' not in genome_obj or genome_obj['contigset_ref'] is None) \
                    and ('assembly_ref' not in genome_obj or genome_obj['assembly_ref'] is None):
                msg = "Genome " + genome_obj_names[i] + " (ref:" + input_ref + ") " + genome_sci_names[i] + \
                      " MISSING BOTH contigset_ref AND assembly_ref.  Cannot process.  Exiting."
                raise ValueError(msg)
            elif 'assembly_ref' in genome_obj and genome_obj['assembly_ref'] is not None:
                msg = "Genome " + genome_obj_names[i] + " (ref:" + input_ref + ") " + genome_sci_names[i] + \
                      " USING assembly_ref: " + str(genome_obj['assembly_ref'])
                print(msg)
                genome_assembly_refs.append(genome_obj['assembly_ref'])
            elif 'contigset_ref' in genome_obj and genome_obj['contigset_ref'] is not None:
                msg = "Genome " + genome_obj_names[i] + " (ref:" + input_ref + ") " + genome_sci_names[i] + \
                      " USING contigset_ref: " + str(genome_obj['contigset_ref'])
                print(msg)
                genome_assembly_refs.append(genome_obj['contigset_ref'])

        # create file data (name for file is what's reported in results)
        for ass_i, assembly_ref in enumerate(genome_assembly_refs):
            this_name = genome_obj_names[ass_i]
            filename = os.path.join(input_dir, this_name + '.' + fasta_file_extension)
            auClient.get_assembly_as_fasta({'ref': assembly_ref, 'filename': filename})
            if not os.path.isfile(filename):
                raise ValueError('Error generating fasta file from an Assembly or ContigSet with AssemblyUtil')
            # make sure fasta file isn't empty
            min_fasta_len = 1
            if not self.fasta_seq_len_at_least(filename, min_fasta_len):
                raise ValueError('Assembly or ContigSet is empty in filename: ' + str(filename))

    # Unknown type slipped through
    #
    else:
        raise ValueError('Cannot stage fasta file input directory from type: ' + type_name)

    # create summary fasta file with all bins
    self.cat_fasta_files(input_dir, fasta_file_extension, all_seq_fasta)

    return {'input_dir': input_dir, 'folder_suffix': suffix, 'all_seq_fasta': all_seq_fasta}
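
# stage_input leans on a fasta_seq_len_at_least helper that is defined elsewhere
# in this class.  A minimal sketch of it, under the assumption that the helper
# only needs to confirm the file holds at least min_fasta_len bases of sequence:

def fasta_seq_len_at_least(self, fasta_path, min_fasta_len=1):
    '''Return True as soon as min_fasta_len sequence characters are seen (sketch).'''
    seq_len = 0
    with open(fasta_path, 'r') as fasta_handle:
        for line in fasta_handle:
            line = line.strip()
            if line and not line.startswith('>'):  # skip header lines
                seq_len += len(line)
                if seq_len >= min_fasta_len:
                    return True  # stop early; no need to read the whole file
    return False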
def download_long(self, console, warnings, token, wsname, lib, min_long_read_length):
    try:
        # object info
        try:
            wsClient = Workspace(self.workspaceURL, token=token)
        except Exception as e:
            raise ValueError("unable to instantiate wsClient. " + str(e))

        [OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I, WSID_I,
         WORKSPACE_I, CHSUM_I, SIZE_I, META_I] = range(11)  # object_info tuple

        obj_id = {'ref': lib if '/' in lib else (wsname + '/' + lib)}
        lib_obj_info = wsClient.get_object_info_new({'objects': [obj_id]})[0]
        lib_obj_type = lib_obj_info[TYPE_I]
        lib_obj_type = re.sub(r'-[0-9]+\.[0-9]+$', "", lib_obj_type)  # remove trailing version
        lib_ref = str(lib_obj_info[WSID_I]) + '/' + \
            str(lib_obj_info[OBJID_I]) + '/' + str(lib_obj_info[VERSION_I])

        if lib_obj_type == 'KBaseGenomes.ContigSet' or lib_obj_type == 'KBaseGenomeAnnotations.Assembly':
            # download using assembly util / data file util
            self.log(console, "Getting long reads (from contigs object).\n")
            auClient = AssemblyUtil(url=self.callbackURL, token=token)
            dfuClient = DataFileUtil(url=self.callbackURL, token=token)
            contig_file = auClient.get_assembly_as_fasta({'ref': lib_ref}).get('path')
            long_reads_path = dfuClient.unpack_file({'file_path': contig_file})['file_path']
            self.log(warnings,
                     "Warning: Long reads are in FASTA format, so short read check was not performed.")
        else:
            ruClient = ReadsUtils(url=self.callbackURL, token=token)
            self.log(console, "Getting long reads (from reads library object).\n")
            result = ruClient.download_reads({'read_libraries': [lib_ref], 'interleaved': 'false'})
            long_reads_path = result['files'][lib_ref]['files']['fwd']
            [n_reads, n_reads_short] = self.filter_short_fastq(console, long_reads_path, min_long_read_length)
            if n_reads_short > 0:
                self.log(warnings,
                         "Warning: Of " + str(n_reads) + " long reads, " + str(n_reads_short) +
                         " are shorter than " + str(min_long_read_length) +
                         "; consider using the filtlong app to filter out shorter reads.")
    except Exception as e:
        raise ValueError('Unable to download long reads\n' + str(e))

    return long_reads_path
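
# download_long calls a filter_short_fastq helper that is not shown in this
# module.  A minimal sketch of it, assuming (per the return value unpacked
# above) it only counts records rather than rewriting the file, and that the
# input is plain 4-line-per-record FASTQ:

def filter_short_fastq(self, console, fastq_path, min_length):
    '''Count FASTQ records and how many fall below min_length (sketch).'''
    n_reads = 0
    n_reads_short = 0
    with open(fastq_path, 'r') as fastq_handle:
        for i, line in enumerate(fastq_handle):
            if i % 4 == 1:  # line 1 of each 4-line FASTQ record is the sequence
                n_reads += 1
                if len(line.strip()) < min_length:
                    n_reads_short += 1
    return [n_reads, n_reads_short]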
def run_rmrContigFilter(self, ctx, params):
    """
    Example app which filters contigs in an assembly using a minimum contig length

    :param params: instance of mapping from String to unspecified object
    :returns: instance of type "ReportResults" -> structure: parameter
       "report_name" of String, parameter "report_ref" of String
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN run_rmrContigFilter

    # Print statements to stdout/stderr are captured and available as the App log
    logging.info('Starting run_rmrContigFilter function. Params=' + pformat(params))

    # Step 1 - Parse/examine the parameters and catch any errors
    # It is important to check that parameters exist and are defined, and that nice error
    # messages are returned to users.  Parameter values go through basic validation when
    # defined in a Narrative App, but advanced users or other SDK developers can call
    # this function directly, so validation is still important.
    logging.info('Validating parameters.')
    if 'workspace_name' not in params:
        raise ValueError('Parameter workspace_name is not set in input arguments')
    workspace_name = params['workspace_name']
    if 'assembly_input_ref' not in params:
        raise ValueError('Parameter assembly_input_ref is not set in input arguments')
    assembly_input_ref = params['assembly_input_ref']
    if 'min_length' not in params:
        raise ValueError('Parameter min_length is not set in input arguments')
    min_length_orig = params['min_length']
    min_length = None
    try:
        min_length = int(min_length_orig)
    except ValueError:
        raise ValueError('Cannot parse integer from min_length parameter (' + str(min_length_orig) + ')')
    if min_length < 0:
        raise ValueError('min_length parameter cannot be negative (' + str(min_length) + ')')

    # Step 2 - Download the input data as a Fasta file.
    # We can use the AssemblyUtil module to download a FASTA file from our Assembly data object.
    # The return object gives us the path to the file that was created.
    logging.info('Downloading Assembly data as a Fasta file.')
    assemblyUtil = AssemblyUtil(self.callback_url)
    fasta_file = assemblyUtil.get_assembly_as_fasta({'ref': assembly_input_ref})

    # Step 3 - Actually perform the filter operation, saving the good contigs to a new fasta file.
    # We can use BioPython to parse the Fasta file and build and save the output to a file.
    good_contigs = []
    n_total = 0
    n_remaining = 0
    for record in SeqIO.parse(fasta_file['path'], 'fasta'):
        n_total += 1
        if len(record.seq) >= min_length:
            good_contigs.append(record)
            n_remaining += 1
    logging.info('Filtered Assembly to ' + str(n_remaining) + ' contigs out of ' + str(n_total))
    filtered_fasta_file = os.path.join(self.shared_folder, 'filtered.fasta')
    SeqIO.write(good_contigs, filtered_fasta_file, 'fasta')

    # Step 4 - Save the new Assembly back to the system
    logging.info('Uploading filtered Assembly data.')
    new_assembly = assemblyUtil.save_assembly_from_fasta({
        'file': {'path': filtered_fasta_file},
        'workspace_name': workspace_name,
        'assembly_name': fasta_file['assembly_name']
    })

    # Step 4b - Build html report:
    #   create html string
    #   write string to file in self.shared_folder
    #   send to report
    html_header = "<!DOCTYPE html><html><head><meta charset=\"UTF-8\"><title>title</title></head><body><table>"
    html_footer = "</table></body></html>"
    tableentries = "<tr><th>ID</th><th>A %</th><th>C %</th><th>T %</th><th>G %</th></tr>"
    for contig in good_contigs:
        Acount = contig.seq.upper().count('A')
        Ccount = contig.seq.upper().count('C')
        Tcount = contig.seq.upper().count('T')
        Gcount = contig.seq.upper().count('G')
        total = Acount + Ccount + Tcount + Gcount
        if total == 0:
            continue  # skip contigs with no A/C/G/T (e.g. all-N records) to avoid ZeroDivisionError
        Aper = 100 * (Acount / total)
        Cper = 100 * (Ccount / total)
        Gper = 100 * (Gcount / total)
        Tper = 100 * (Tcount / total)
        tmprow = ("<tr><td>" + contig.id + "</td><td>" + str(round(Aper, 2)) +
                  "</td><td>" + str(round(Cper, 2)) + "</td><td>" + str(round(Tper, 2)) +
                  "</td><td>" + str(round(Gper, 2)) + "</td></tr>")
        tableentries += tmprow

    # Create the html string
    html_str = html_header + tableentries + html_footer

    # Write the html string to a file in the shared folder
    html_file_dir = os.path.join(self.shared_folder, 'html')
    if not os.path.isdir(html_file_dir):
        os.mkdir(html_file_dir)
    html_file_path = os.path.join(html_file_dir, 'output_table.html')
    with open(html_file_path, "w") as html_file:
        html_file.write(html_str)

    """ Will try to not use shock first
    # Upload the html file to shock
    dfu = DataFileUtil(self.callback_url)
    try:
        shock_html_upload = dfu.file_to_shock({'file_path': html_file_dir, 'make_handle': 0, 'pack': 'zip'})
    except:
        raise ValueError('Unable to upload html file to shock with DataFileUtil')
    """

    # Step 5 - Build a Report and return
    """ Old Report .create method:
    https://github.com/kbaseapps/KBaseReportPy/blob/master/lib/KBaseReportPy/KBaseReportPyImpl.py
    reportObj = {
        'objects_created': [{'ref': new_assembly, 'description': 'Filtered contigs'}],
        'text_message': 'Filtered Assembly to ' + str(n_remaining) + ' contigs out of ' + str(n_total)
    }
    report = KBaseReport(self.callback_url)
    report_info = report.create({'report': reportObj, 'workspace_name': params['workspace_name']})
    """

    # New report .create_extended_report
    reportObj = {
        'objects_created': [{'ref': new_assembly, 'description': 'Filtered contigs'}],
        'message': 'Filtered Assembly to ' + str(n_remaining) + ' contigs out of ' + str(n_total),
        'direct_html': None,
        'direct_html_link_index': 0,
        'file_links': [],
        #'html_links': [{'shock-id': shock_html_upload['shock_id'], 'name': 'output-table.html', 'label': 'contig table'}],
        'html_links': [{
            'path': html_file_dir,
            'name': 'output_table.html',
            'description': 'HTML report for contig filtering'
        }],
        'workspace_name': params['workspace_name'],
    }
    report = KBaseReport(self.callback_url)
    report_info = report.create_extended_report(reportObj)

    # Step 6 - construct the output to send back
    output = {
        'report_name': report_info['name'],
        'report_ref': report_info['ref'],
        'assembly_output': new_assembly,
        'n_initial_contigs': n_total,
        'n_contigs_removed': n_total - n_remaining,
        'n_contigs_remaining': n_remaining
    }
    logging.info('returning:' + pformat(output))
    #END run_rmrContigFilter

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method run_rmrContigFilter return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
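
# A minimal sketch of how this method is typically exercised from an SDK unit
# test.  Hedged: 'self.serviceImpl', 'self.ctx', and 'self.wsName' are the
# conventional kb-sdk test-harness names, and the ref below is hypothetical:
#
#   result = self.serviceImpl.run_rmrContigFilter(self.ctx, {
#       'workspace_name': self.wsName,
#       'assembly_input_ref': '124/15/1',   # hypothetical Assembly ref
#       'min_length': 1000,
#   })
#   print(result[0]['report_name'], result[0]['report_ref'])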
def run_rmrContigFilter_max(self, ctx, params):
    """
    New app which filters contigs in an assembly using both a minimum and a maximum contig length

    :param params: instance of type "rmrContigFiltermaxinput" -> structure:
       parameter "output_workspace" of String, parameter "assembly_input_ref"
       of type "data_obj_ref", parameter "output_assembly_name" of String,
       parameter "min_length" of Long, parameter "max_length" of Long,
       parameter "report_ref" of String, parameter "report_name" of String
    :returns: instance of type "ReportResultsmax" -> structure: parameter
       "objNameOrId" of type "assembly_ref", parameter "report_name" of
       String, parameter "report_ref" of String
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN run_rmrContigFilter_max

    # Print statements to stdout/stderr are captured and available as the App log
    logging.info('Starting run_rmrContigFilter_max function. Params=' + pformat(params))

    # Step 1 - Parse/examine the parameters and catch any errors
    # It is important to check that parameters exist and are defined, and that nice error
    # messages are returned to users.  Parameter values go through basic validation when
    # defined in a Narrative App, but advanced users or other SDK developers can call
    # this function directly, so validation is still important.
    logging.info('Validating parameters.')
    if 'output_workspace' not in params:
        raise ValueError('Parameter output_workspace is not set in input arguments')
    workspace_name = params['output_workspace']
    if 'assembly_input_ref' not in params:
        raise ValueError('Parameter assembly_input_ref is not set in input arguments')
    assembly_input_ref = params['assembly_input_ref']
    if 'min_length' not in params:
        raise ValueError('Parameter min_length is not set in input arguments')
    min_length_orig = params['min_length']
    min_length = None
    try:
        min_length = int(min_length_orig)
    except ValueError:
        raise ValueError('Cannot parse integer from min_length parameter (' + str(min_length_orig) + ')')
    if min_length < 0:
        raise ValueError('min_length parameter cannot be negative (' + str(min_length) + ')')
    if 'max_length' not in params:
        raise ValueError('Parameter max_length is not set in input arguments')
    max_length_orig = params['max_length']
    max_length = None
    try:
        max_length = int(max_length_orig)
    except ValueError:
        raise ValueError('Cannot parse integer from max_length parameter (' + str(max_length_orig) + ')')
    if max_length < 0:
        raise ValueError('max_length parameter cannot be negative (' + str(max_length) + ')')
    if min_length >= max_length:
        raise ValueError('max_length cannot be less than or equal to min_length')

    # Step 2 - Download the input data as a Fasta file.
    # We can use the AssemblyUtil module to download a FASTA file from our Assembly data object.
    # The return object gives us the path to the file that was created.
    logging.info('Downloading Assembly data as a Fasta file.')
    assemblyUtil = AssemblyUtil(self.callback_url)
    fasta_file = assemblyUtil.get_assembly_as_fasta({'ref': assembly_input_ref})

    # Step 3 - Actually perform the filter operation, saving the good contigs to a new fasta file.
    # We can use BioPython to parse the Fasta file and build and save the output to a file.
    good_contigs = []
    n_total = 0
    n_remaining = 0
    for record in SeqIO.parse(fasta_file['path'], 'fasta'):
        n_total += 1
        if min_length <= len(record.seq) <= max_length:
            good_contigs.append(record)
            n_remaining += 1
    logging.info('Filtered Assembly to ' + str(n_remaining) + ' contigs out of ' + str(n_total))
    filtered_fasta_file = os.path.join(self.shared_folder, 'filtered.fasta')
    SeqIO.write(good_contigs, filtered_fasta_file, 'fasta')

    # Step 4 - Save the new Assembly back to the system
    logging.info('Uploading filtered Assembly data.')
    new_assembly = assemblyUtil.save_assembly_from_fasta({
        'file': {'path': filtered_fasta_file},
        'workspace_name': workspace_name,
        #'assembly_name': fasta_file['assembly_name'],
        'assembly_name': params['output_assembly_name']
    })

    # Step 5 - Build a Report and return
    report = KBaseReport(self.callback_url)
    # The old plain text report from the SDK tutorial used report.create();
    # we need report.create_extended_report for our new output:
    #
    # reportObj = {
    #     'objects_created': [{'ref': new_assembly, 'description': 'Filtered contigs'}],
    #     'text_message': 'Filtered Assembly to ' + str(n_remaining) + ' contigs out of ' + str(n_total)
    # }
    # report_info = report.create({'report': reportObj, 'workspace_name': params['workspace_name']})

    # Step 6 - construct the output to send back.
    # We want to output the new assembly in an assembly viewer, to show the dynamic table
    # associated with the new assembly.  We also want to keep our report text.
    report_info = report.create_extended_report({
        'message': 'Filtered Assembly to ' + str(n_remaining) + ' contigs out of ' + str(n_total),
        'objects_created': [{'ref': new_assembly, 'description': 'Filtered contigs'}],
        #'workspace_id': params['workspace_id'],
        'workspace_name': params['output_workspace']
    })
    output = {
        'report_name': report_info['name'],
        'report_ref': report_info['ref'],
        'objNameOrId': params['output_assembly_name'],
        #'n_initial_contigs': n_total,
        #'n_contigs_removed': n_total - n_remaining,
        #'n_contigs_remaining': n_remaining,
        'wsNameOrId': params['output_workspace'],
        #'workspace_id': report_info['ws_id']
    }
    logging.info('returning:' + pformat(output))
    # This will print the ref # to the new assembly created from the filter
    # print("\n\nNEW ASSEMBLY: " + new_assembly + "\n\n")
    #END run_rmrContigFilter_max

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method run_rmrContigFilter_max return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
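
# Both filter apps repeat the same parse-and-validate pattern for their length
# parameters.  A minimal refactoring sketch; the helper name
# 'require_nonneg_int' is illustrative, not part of this module:

def require_nonneg_int(params, key):
    '''Return params[key] as a non-negative int, raising ValueError otherwise.'''
    if key not in params:
        raise ValueError('Parameter ' + key + ' is not set in input arguments')
    try:
        value = int(params[key])
    except ValueError:
        raise ValueError('Cannot parse integer from ' + key + ' parameter (' + str(params[key]) + ')')
    if value < 0:
        raise ValueError(key + ' parameter cannot be negative (' + str(value) + ')')
    return value

# Step 1 of run_rmrContigFilter_max would then reduce to:
#   min_length = require_nonneg_int(params, 'min_length')
#   max_length = require_nonneg_int(params, 'max_length')
#   if min_length >= max_length:
#       raise ValueError('max_length cannot be less than or equal to min_length')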