def run_PFLOTRAN(self, ctx, params):
    """
    This function enables users to run a PFLOTRAN simulation from an input
    PFLOTRAN model and FBA model chemistry.

    As written, the body only echoes ``params['parameter_1']`` back to the
    caller inside a KBaseReport; no simulation step is visible here.

    :param params: instance of mapping from String to unspecified object
    :returns: instance of type "ReportResults" -> structure: parameter
       "report_name" of String, parameter "report_ref" of String
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN run_PFLOTRAN
    report_client = KBaseReport(self.callback_url)
    report_payload = {
        'report': {
            'objects_created': [],
            'text_message': params['parameter_1'],
        },
        'workspace_name': params['workspace_name'],
    }
    created = report_client.create(report_payload)
    output = {
        'report_name': created['name'],
        'report_ref': created['ref'],
    }
    #END run_PFLOTRAN

    # At some point might do deeper type checking...
    if isinstance(output, dict):
        # return the results
        return [output]
    raise ValueError('Method run_PFLOTRAN return value '
                     'output is not type dict as required.')
def run_cjneely101MetaSanity(self, ctx, params):
    """
    Create a KBaseReport whose text is ``params['parameter_1']`` and hand
    back the name and reference of that report.

    :param params: instance of mapping from String to unspecified object
    :returns: instance of type "ReportResults" -> structure: parameter
       "report_name" of String, parameter "report_ref" of String
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN run_cjneely101MetaSanity
    reporter = KBaseReport(self.callback_url)
    message = params['parameter_1']
    target_workspace = params['workspace_name']
    info = reporter.create({
        'report': {'objects_created': [], 'text_message': message},
        'workspace_name': target_workspace,
    })
    output = dict(report_name=info['name'], report_ref=info['ref'])
    #END run_cjneely101MetaSanity

    # At some point might do deeper type checking...
    if isinstance(output, dict):
        # return the results
        return [output]
    raise ValueError('Method run_cjneely101MetaSanity return value '
                     'output is not type dict as required.')
def run_kb_Bwa(self, ctx, params):
    """
    Create a fixed-text KBaseReport in the workspace named by
    ``params['output_workspace']`` and return its name and reference.

    :param params: instance of mapping from String to unspecified object
    :returns: instance of type "ReportResults" -> structure: parameter
       "report_name" of String, parameter "report_ref" of String
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN run_kb_Bwa
    report_client = KBaseReport(self.callback_url)
    report_section = {
        'objects_created': [],
        'text_message': "report submitted",
    }
    created = report_client.create({
        'report': report_section,
        # NB: this method reads 'output_workspace', not 'workspace_name'.
        'workspace_name': params['output_workspace'],
    })
    output = {}
    output['report_name'] = created['name']
    output['report_ref'] = created['ref']
    #END run_kb_Bwa

    # At some point might do deeper type checking...
    if isinstance(output, dict):
        # return the results
        return [output]
    raise ValueError('Method run_kb_Bwa return value '
                     'output is not type dict as required.')
def run_akExMod(self, ctx, params):
    """
    Fetch the object referenced by ``params['parameter_1']`` through
    DataFileUtil, report on stdout whether that reference string contains
    the substring 'biomass', and return a KBaseReport echoing the reference.

    :param params: instance of mapping from String to unspecified object
    :returns: instance of type "ReportResults" -> structure: parameter
       "report_name" of String, parameter "report_ref" of String
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN run_akExMod
    model_ref = params['parameter_1']

    # Initialize DataFileUtil client and get an object by reference.
    self.callback_url = os.environ['SDK_CALLBACK_URL']
    self.dfu = DataFileUtil(self.callback_url)
    # The fetched object is not inspected further; the call is kept so an
    # invalid reference still fails here, as it did before.
    # NOTE(review): the messages below say "in file", but only the reference
    # string is searched -- confirm whether the fetched object was meant.
    self.dfu.get_objects({'object_refs': [model_ref]})['data'][0]

    # Use `in` for the membership test: str.find() returns -1 (truthy) when
    # the substring is absent and 0 (falsy) when it matches at the start, so
    # the previous `if ref.find('biomass'):` reported the wrong branch.
    if 'biomass' in model_ref:
        print('Yes, string found in file')
    else:
        print('String not found in file')

    report = KBaseReport(self.callback_url)
    report_info = report.create({
        'report': {
            'objects_created': [],
            'text_message': params['parameter_1'],
        },
        'workspace_name': params['workspace_name'],
    })
    output = {
        'report_name': report_info['name'],
        'report_ref': report_info['ref'],
    }
    #END run_akExMod

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method run_akExMod return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
def run_kb_ldannotate(self, ctx, params):
    """
    Validate the parameters, create a fresh uniquely named output directory
    under the shared folder, delegate to ``self.cld.create_output_file`` and
    return a fixed-text KBaseReport.

    :param params: instance of mapping from String to unspecified object
    :returns: instance of type "ReportResults" -> structure: parameter
       "report_name" of String, parameter "report_ref" of String
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN run_kb_ldannotate
    logging.info("validating input parameters")
    self.cld.validate_params(params)

    run_id = str(uuid.uuid4())
    output_dir = os.path.join(self.shared_folder, run_id)
    os.mkdir(output_dir)

    # Parameters forwarded positionally, in this exact order; missing keys
    # are passed through as None.
    forwarded_keys = (
        "vcf_ref",
        "gff_ref",
        "candidate_snps",
        "feature_type",
        "threshold",
        "output_file",
    )
    forwarded = [params.get(key) for key in forwarded_keys]

    # The command-line path (self.lau.build_ldannotate_command /
    # run_ldannotate_command) is currently disabled in favour of this call.
    self.cld.create_output_file(*forwarded, output_dir)

    reporter = KBaseReport(self.callback_url)
    created = reporter.create({
        'report': {'objects_created': [], 'text_message': 'Nice Report'},
        'workspace_name': params['workspace_name'],
    })
    output = {
        'report_name': created['name'],
        'report_ref': created['ref'],
    }
    #END run_kb_ldannotate

    # At some point might do deeper type checking...
    if isinstance(output, dict):
        # return the results
        return [output]
    raise ValueError('Method run_kb_ldannotate return value '
                     'output is not type dict as required.')
def run_alans_job(self, ctx, params):
    """
    Test job: print a banner, sleep for ten seconds, read and print every
    line of the reference-data file ``/data/kmer``, then return a
    fixed-text KBaseReport.

    :param params: instance of mapping from String to unspecified object
    :returns: instance of type "ReportResults" -> structure: parameter
       "report_name" of String, parameter "report_ref" of String
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN run_alans_job
    import time

    print("i'm so cool i'm so fashionble")
    time.sleep(10)
    print("i'm so cool i'm so fashionble")

    # TODO ADD ,service_ver='fake'
    report = KBaseReport(self.callback_url)

    print("About to open refdata")
    with open("/data/kmer") as f:
        data = f.readlines()
    print("All done!")
    print(data)

    report_info = report.create({
        'report': {
            'objects_created': [],
            # Plain literal: the former f-string had no placeholders.
            'text_message': 'The app is done. We didnt do anything',
        },
        'workspace_name': params['workspace_name'],
    })
    output = {
        'report_name': report_info['name'],
        'report_ref': report_info['ref'],
    }
    #END run_alans_job

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method run_alans_job return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
def run_nkk_compHelloWorld(self, ctx, params):
    """
    Read compound ids and InChI structures from the tab-separated file
    ``params['Input_File']`` (columns ``id`` and ``structure``), hand them to
    ``inchi_to_submission.inchi_to_dft``, post-process each compound's
    ``<id>/dft/nwchem.out`` with the Mulliken-charge helper module, and
    return a KBaseReport whose text is ``params['calculation_type']``.

    :param params: instance of mapping from String to unspecified object
    :returns: instance of type "ReportResults" -> structure: parameter
       "report_name" of String, parameter "report_ref" of String
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN run_nkk_compHelloWorld
    os.system('ls')

    import pandas as pd

    # Read inputs from .tsv file
    df = pd.read_csv(params['Input_File'], sep='\t')
    ids = df['id']
    InChIes = df['structure']

    import inchi_to_submission as its
    import extract_properties_mulliken_charges_mol2 as mul
    # Only needed by the mol2 parsing step, which is currently disabled.
    import compound_parsing as com  # noqa: F401

    its.inchi_to_dft(ids, InChIes)

    # Every per-compound directory is resolved against the directory we
    # started in. The previous relative os.chdir() calls were never undone,
    # so each iteration looked for '<id>/dft' inside the previous
    # compound's dft directory.
    # NOTE(review): assumes inchi_to_dft creates '<id>/dft' under the
    # current working directory -- confirm.
    start_dir = os.getcwd()

    for compound_id in ids:
        os.chdir(os.path.join(start_dir, compound_id, 'dft'))
        try:
            with open('nwchem.out', 'r') as nwchem_out:
                n_atoms = mul.getNumberOfAtoms(nwchem_out)
                energy = mul.getInternalEnergy0K(nwchem_out)
                # Result is unused, but the call is kept in case the helper
                # relies on having consumed this part of the file.
                mul.getMullikenCharge(nwchem_out, n_atoms)
            mul.nAtoms = n_atoms
            mul.E0K = energy
            mul.calculate(compound_id)
        finally:
            os.chdir(start_dir)

    # List each dft directory so its contents show up in the app log.
    for compound_id in ids:
        os.chdir(os.path.join(start_dir, compound_id, 'dft'))
        try:
            os.system('ls')
            # TODO: parse '<id>_Mulliken.mol2' with com._make_compound_info
        finally:
            os.chdir(start_dir)

    report = KBaseReport(self.callback_url)
    report_info = report.create({
        'report': {
            'objects_created': [],
            # The dict used to carry 'text_message' twice; only the last
            # value (calculation_type) ever took effect, so it is the one
            # kept.
            'text_message': params['calculation_type'],
        },
        'workspace_name': params['workspace_name'],
    })
    output = {
        'report_name': report_info['name'],
        'report_ref': report_info['ref'],
    }
    #END run_nkk_compHelloWorld

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method run_nkk_compHelloWorld return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
import logging import os from installed_clients.KBaseReportClient import KBaseReport #END_HEADER #BEGIN_CLASS_HEADER #END_CLASS_HEADER #BEGIN_CONSTRUCTOR self.callback_url = os.environ['SDK_CALLBACK_URL'] self.shared_folder = config['scratch'] logging.basicConfig(format='%(created)s %(levelname)s: %(message)s', level=logging.INFO) #END_CONSTRUCTOR #BEGIN run_imramboVAMB report = KBaseReport(self.callback_url) report_info = report.create({'report': {'objects_created':[], 'text_message': params['parameter_1']}, 'workspace_name': params['workspace_name']}) output = { 'report_name': report_info['name'], 'report_ref': report_info['ref'], } #END run_imramboVAMB #BEGIN_STATUS returnVal = {'state': "OK", 'message': "", 'version': self.VERSION, 'git_url': self.GIT_URL, 'git_commit_hash': self.GIT_COMMIT_HASH} #END_STATUS
def run_VariationMerge(self, ctx, params):
    """
    Download every variation object in ``params['vcflist']`` as an indexed
    VCF, check that they share one sample set and one assembly (or genome)
    reference, merge them into a single VCF and save the result as a new
    variation object.

    :param params: instance of type "inparams" (This example function
       accepts any number of parameters and returns results in a
       KBaseReport) -> structure: parameter "obj_name" of String,
       parameter "workspace_name" of String, parameter "vcflist" of list
       of String
    :returns: instance of type "OutResults" -> structure: parameter
       "output_obj_ref" of String, parameter "report_name" of String,
       parameter "report_ref" of String
    :raises Exception: if the inputs disagree on assembly, genome or
       sample set reference
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN run_VariationMerge
    self.ws = Workspace(url=self.ws_url, token=ctx['token'])
    print(params)

    vcf_flist = []
    assembly_ref_set = set()
    sampleset_ref_set = set()
    genome_set_ref_set = set()

    for i, variation_ref in enumerate(params['vcflist']):
        variation_obj = self.ws.get_objects2(
            {'objects': [{'ref': variation_ref}]})['data'][0]
        variation_data = variation_obj['data']

        # Test for the key before reading it: printing
        # variation_data['assembly_ref'] unconditionally raised KeyError for
        # genome-based variation objects, so the genome branch never ran.
        if 'assembly_ref' in variation_data:
            print(variation_data['assembly_ref'])
            assembly_ref_set.add(variation_data['assembly_ref'])
        elif 'genome_ref' in variation_data:
            genome_set_ref_set.add(variation_data['genome_ref'])

        print(variation_ref)
        vcf_filename = "/kb/module/work/tmp/variation" + str(i) + ".vcf.gz"
        vcf_flist.append(vcf_filename)

        self.vu.get_variation_as_vcf({
            'variation_ref': variation_ref,
            'filename': vcf_filename,
        })
        # The download is moved from this fixed path to the per-input name
        # so successive inputs do not overwrite each other.
        os.rename("/kb/module/work/tmp/variation.vcf.gz", vcf_filename)
        self.mu.index_vcf(vcf_filename)

        data = self.ws.get_objects2({
            'objects': [{
                "ref": variation_ref,
                'included': ['/sample_set_ref']
            }]
        })['data'][0]['data']
        sampleset_ref_set.add(data['sample_set_ref'])

    # Raising exception
    if not genome_set_ref_set and len(assembly_ref_set) != 1:
        raise Exception(
            "variation objects are from different assembly refs")
    elif len(sampleset_ref_set) != 1:
        raise Exception(
            "variation objects are from different sample set refs")
    elif not assembly_ref_set and len(genome_set_ref_set) != 1:
        raise Exception(
            "variation objects are from different genome set refs")

    merged_file = os.path.join(self.shared_folder,
                               "merged_gatk_variation_jmc2_test.vcf")
    self.mu.merge_vcf(vcf_flist, merged_file)

    # Prefer the assembly reference (as before) but fall back to the genome
    # reference; popping the empty assembly set raised KeyError whenever the
    # inputs were genome-based.
    genome_or_assembly_ref = (assembly_ref_set or genome_set_ref_set).pop()

    save_variation_params = {
        'workspace_name': params['workspace_name'],
        'genome_or_assembly_ref': genome_or_assembly_ref,
        'sample_set_ref': sampleset_ref_set.pop(),
        'sample_attribute_name': 'sample_attr',
        'vcf_staging_file_path': merged_file,
        'variation_object_name': params['variation_object_name']
    }
    self.vu.save_variation_from_vcf(save_variation_params)

    report = KBaseReport(self.callback_url)
    report_info = report.create({
        'report': {
            'objects_created': [],
            'text_message': 'success'
        },
        'workspace_name': params['workspace_name']
    })
    output = {
        'report_name': report_info['name'],
        'report_ref': report_info['ref'],
    }
    #END run_VariationMerge

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method run_VariationMerge return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
def filter_contigs(self, ctx, params):
    """
    Filter an Assembly down to the contigs that are at least
    ``min_length`` bases long, save the survivors as a new Assembly and
    return a report together with the contig counts.

    :param params: instance of type "FilterContigsParams" -> structure:
       parameter "assembly_input_ref" of type "assembly_ref" (a string set
       to a KBase ID reference to an Assembly data object), parameter
       "workspace_name" of String, parameter "min_length" of Long
    :returns: instance of type "FilterContigsResults" -> structure:
       parameter "report_name" of String, parameter "report_ref" of
       String, parameter "assembly_output" of type "assembly_ref",
       parameter "n_initial_contigs" of Long, parameter
       "n_contigs_removed" of Long, parameter "n_contigs_remaining" of
       Long. 'report_name' and 'report_ref' are special output fields - if
       defined, the Narrative can automatically render your Report.
    :raises ValueError: if a required parameter is missing, or min_length
       is not an integer or is negative
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN filter_contigs

    # Print statements to stdout/stderr are captured and available as the App log
    print('Starting Filter Contigs function. Params=')
    pprint(params)

    # Step 1 - Parse/examine the parameters and catch any errors
    # Parameter values go through basic validation when defined in a
    # Narrative App, but advanced users or other SDK developers can call
    # this function directly, so validation is still important.
    print('Validating parameters.')
    if 'workspace_name' not in params:
        raise ValueError('Parameter workspace_name is not set in input arguments')
    workspace_name = params['workspace_name']
    if 'assembly_input_ref' not in params:
        raise ValueError('Parameter assembly_input_ref is not set in input arguments')
    assembly_input_ref = params['assembly_input_ref']
    if 'min_length' not in params:
        raise ValueError('Parameter min_length is not set in input arguments')
    min_length_orig = params['min_length']
    try:
        min_length = int(min_length_orig)
    except (TypeError, ValueError) as err:
        # int() raises TypeError (not ValueError) for None, lists, dicts...;
        # report those with the same friendly message as unparsable strings.
        raise ValueError('Cannot parse integer from min_length parameter (' +
                         str(min_length_orig) + ')') from err
    if min_length < 0:
        raise ValueError('min_length parameter cannot be negative (' +
                         str(min_length) + ')')

    # Step 2 - Download the input data as a Fasta file.
    # The return object gives us the path to the file that was created.
    print('Downloading Assembly data as a Fasta file.')
    assemblyUtil = AssemblyUtil(self.callback_url)
    fasta_file = assemblyUtil.get_assembly_as_fasta({'ref': assembly_input_ref})

    # Step 3 - Actually perform the filter operation, saving the good
    # contigs to a new fasta file (parsed and written with BioPython).
    good_contigs = []
    n_total = 0
    n_remaining = 0
    for record in SeqIO.parse(fasta_file['path'], 'fasta'):
        n_total += 1
        if len(record.seq) >= min_length:
            good_contigs.append(record)
            n_remaining += 1

    print('Filtered Assembly to ' + str(n_remaining) + ' contigs out of ' + str(n_total))
    filtered_fasta_file = os.path.join(self.shared_folder, 'filtered.fasta')
    SeqIO.write(good_contigs, filtered_fasta_file, 'fasta')

    # Step 4 - Save the new Assembly back to the system
    print('Uploading filtered Assembly data.')
    new_assembly = assemblyUtil.save_assembly_from_fasta({
        'file': {'path': filtered_fasta_file},
        'workspace_name': workspace_name,
        'assembly_name': fasta_file['assembly_name']
    })

    # Step 5 - Build a Report and return
    reportObj = {
        'objects_created': [{'ref': new_assembly, 'description': 'Filtered contigs'}],
        'text_message': 'Filtered Assembly to ' + str(n_remaining) +
                        ' contigs out of ' + str(n_total)
    }
    report = KBaseReport(self.callback_url)
    report_info = report.create({'report': reportObj,
                                 'workspace_name': params['workspace_name']})

    # STEP 6: construct the output to send back
    output = {'report_name': report_info['name'],
              'report_ref': report_info['ref'],
              'assembly_output': new_assembly,
              'n_initial_contigs': n_total,
              'n_contigs_removed': n_total - n_remaining,
              'n_contigs_remaining': n_remaining
              }
    print('returning:' + pformat(output))

    #END filter_contigs

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method filter_contigs return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
def run_barseqR(self, ctx, params):
    """
    Set up the BarSeqR working directories under the shared folder,
    validate the parameters, download the input files, run ``RunBarSeq``
    and return a KBaseReport.

    Args:
        :param params: instance of mapping from String to unspecified object
        ctx:
            client_ip: None or 'str',
            user_id: str,
            'authenticated': 1,
            'token': str,
            'module': None,
            'method': None,
            'call_id': None,
            'rpc_context': None,
            'provenance': list<prov_d>
                prov_d: (d)
                    service: (str)
                    'method': 'please_never_use_it_in_production',
                    'method_params': []
    :returns: instance of type "ReportResults" -> structure: parameter
       "report_name" of String, parameter "report_ref" of String
    :raises Exception: if the fixed metadata directory is missing
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN run_barseqR

    # SETUP - Unrelated to inputs --------

    logging.basicConfig(level=logging.DEBUG)

    logging.info("Call back url: " + str(self.callback_url))
    # We create important classes
    dfu = DataFileUtil(self.callback_url)
    logging.info("DFU VARS-- " * 8)
    logging.info(vars(dfu))
    gfu = GenomeFileUtil(self.callback_url)
    smpl_s = SampleService(self.callback_url)
    myToken = os.environ.get('KB_AUTH_TOKEN', None)
    ws = Workspace(self.ws_url, token=myToken)
    ws_id = ws.get_workspace_info({'workspace': params['workspace_name']})[0]

    # The environment holds KB_AUTH_TOKEN and ctx holds 'token'; mask them so
    # credentials never end up in the app log.
    logging.info({
        name: ('<redacted>' if 'TOKEN' in name.upper() else value)
        for name, value in os.environ.items()
    })
    logging.info('ws-url')
    logging.info(self.ws_url)
    logging.info('ctx')
    logging.info({key: value for key, value in dict(ctx).items()
                  if key != 'token'})

    # We create indir, outdir, sets_dir (Input, Output, Sets)
    indir = os.path.join(self.shared_folder, "indir")
    os.mkdir(indir)

    outdir = os.path.join(self.shared_folder, "outdir")
    os.mkdir(outdir)

    sets_dir = os.path.join(indir, "sets_dir")
    os.mkdir(sets_dir)

    metadir = '/kb/module/lib/RunDir/metadata'
    if not (os.path.isdir(metadir)):
        raise Exception(
            "metadata directory not found at: {}".format(metadir))

    # We prepare locations of input files
    poolfile_path = os.path.join(indir, "pool.n10")
    gene_table_fp = os.path.join(indir, "genes.GC")
    exps_file = os.path.join(indir, "FEBA_Barseq.tsv")

    # END SETUP

    # VALIDATE PARAMS:
    logging.info("PARAMS:")
    logging.info(params)
    # From Util.validate python file
    val_par = validate_params(params)
    # val_par contains keys:
    #     genome_ref
    #     poolfile_ref
    #     exps_ref
    #     sets_ref
    #     output_name
    #     workspace_name
    val_par['username'] = ctx['user_id']

    # DOWNLOAD FILES
    download_dict = {
        "dfu": dfu,
        "gfu": gfu,
        "ws": ws,
        "smpl_s": smpl_s,
        "sets_dir": sets_dir,
        "poolfile_path": poolfile_path,
        "gene_table_fp": gene_table_fp,
        "exps_file": exps_file,
        "scratch_dir": self.shared_folder
    }
    # We copy input files to proper directories.
    # vp must contain genome_ref, poolfile_ref, exps_ref, sets_refs (list)
    # DownloadResults must contain keys 'org', 'set_names_list', 'set_fps_list'
    # set_names_list value contains the names of the sets without extensions
    DownloadResults = download_files(val_par, download_dict)

    logging.debug(json.dumps(DownloadResults, indent=2))

    # Get args in this format:
    # [-org, org_name, -indir, Scratch_Dir_Input, -metadir, Fixed meta dir,
    # -outdir, scratch_dir_output, -sets_dir, within scratch_dir_input,
    # -sets, set1 (sets_dir), set2 (sets_dir), set3 (sets_dir), ... ]
    # Note meta dir is called metadata and is in RunDir

    # Running the entire program:
    arg_list = [
        "-org", DownloadResults['org'], '-indir', indir, '-metadir',
        metadir, '-outdir', outdir, '-sets_dir', sets_dir, '-sets'
    ]
    arg_list += DownloadResults['set_names_list']

    RunBarSeq(arg_list)

    # Returning files to user

    report = KBaseReport(self.callback_url)
    report_info = report.create({
        'report': {
            'objects_created': [],
            # NOTE(review): 'parameter_1' is not among the validated keys
            # listed above -- possibly an SDK template leftover; confirm.
            'text_message': params['parameter_1']
        },
        'workspace_name': params['workspace_name']
    })
    output = {
        'report_name': report_info['name'],
        'report_ref': report_info['ref'],
    }
    #END run_barseqR

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method run_barseqR return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
def run_michael_shafferContigFilter_max(self, ctx, params):
    """
    Keep only the contigs of an Assembly that are shorter than
    ``params['max_length']``, save them as a new Assembly and return a
    report plus the contig counts.

    :param params: instance of mapping from String to unspecified object
    :returns: instance of type "ReportResults" -> structure: parameter
       "report_name" of String, parameter "report_ref" of String
    :raises ValueError: if a required parameter is missing or malformed
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN run_michael_shafferContigFilter_max
    # sanitize inputs
    required = ('max_length', 'assembly_input_ref', 'workspace_name')
    first_missing = next((key for key in required if key not in params), None)
    if first_missing is not None:
        raise ValueError('Parameter %s is required but missing' % first_missing)

    length_limit = params['max_length']
    if not (isinstance(length_limit, int) and length_limit >= 0):
        raise ValueError('Max length must be a non-negative integer')

    assembly_ref = params['assembly_input_ref']
    if not (isinstance(assembly_ref, str) and assembly_ref):
        raise ValueError('Pass in a valid assembly reference string')

    # get files
    assembly_client = AssemblyUtil(self.callback_url)
    fasta_info = assembly_client.get_assembly_as_fasta({'ref': assembly_ref})

    # filter fasta
    # NOTE(review): contigs of exactly max_length are dropped (strict '<'),
    # while the report text below says "greater than" -- confirm intent.
    kept = []
    n_total = 0
    for n_total, contig in enumerate(SeqIO.parse(fasta_info['path'], 'fasta'), start=1):
        if len(contig.seq) < length_limit:
            kept.append(contig)
    n_kept = len(kept)

    # Upload the filtered data to the workspace
    target_workspace = params['workspace_name']
    filtered_fasta = os.path.join(self.shared_folder, 'filtered.fasta')
    SeqIO.write(kept, filtered_fasta, 'fasta')
    upload_request = {
        'file': {'path': filtered_fasta},
        'workspace_name': target_workspace,
        'assembly_name': fasta_info['assembly_name'],
    }
    filtered_ref = assembly_client.save_assembly_from_fasta(upload_request)

    # generate report
    summary = ("Filtering assembly remove contigs greater than %s bp removed %s out of %s contigs (%s remaining)"
               % (length_limit, n_total - n_kept, n_total, n_kept))
    report_client = KBaseReport(self.callback_url)
    created = report_client.create({
        'report': {
            'objects_created': [{'ref': filtered_ref, 'description': 'Filtered contigs'}],
            'text_message': summary,
        },
        'workspace_name': target_workspace,
    })

    # set output
    output = {
        'report_ref': created['ref'],
        'report_name': created['name'],
        'n_total': n_total,
        'n_kept': n_kept,
        'filtered_assembly_ref': filtered_ref,
    }
    #END run_michael_shafferContigFilter_max

    # At some point might do deeper type checking...
    if isinstance(output, dict):
        # return the results
        return [output]
    raise ValueError('Method run_michael_shafferContigFilter_max return value '
                     'output is not type dict as required.')
def run_cnelsonAppDemo(self, ctx, params):
    """
    Filter an Assembly down to the contigs that are at least
    ``min_length`` bases long, save the survivors as a new Assembly and
    return a report together with the contig counts.

    Required keys in ``params``: 'workspace_name', 'assembly_input_ref'
    and 'min_length'.

    :param params: instance of mapping from String to unspecified object
    :returns: instance of type "ReportResults" -> structure: parameter
       "report_name" of String, parameter "report_ref" of String
    :raises ValueError: if a required parameter is missing, or min_length
       is not an integer or is negative
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN run_cnelsonAppDemo

    # Log messages are captured and available as the App log
    logging.info('Starting run_cnelsonAppDemo function. Params=' +
                 pformat(params))

    # Step 1 - Parse/examine the parameters and catch any errors
    # Parameter values go through basic validation when defined in a
    # Narrative App, but advanced users or other SDK developers can call
    # this function directly, so validation is still important.
    logging.info('Validating parameters.')
    if 'workspace_name' not in params:
        raise ValueError(
            'Parameter workspace_name is not set in input arguments')
    workspace_name = params['workspace_name']
    if 'assembly_input_ref' not in params:
        raise ValueError(
            'Parameter assembly_input_ref is not set in input arguments')
    assembly_input_ref = params['assembly_input_ref']
    if 'min_length' not in params:
        raise ValueError(
            'Parameter min_length is not set in input arguments')
    min_length_orig = params['min_length']
    try:
        min_length = int(min_length_orig)
    except (TypeError, ValueError) as err:
        # int() raises TypeError (not ValueError) for None, lists, dicts...;
        # report those with the same friendly message as unparsable strings.
        raise ValueError(
            'Cannot parse integer from min_length parameter (' +
            str(min_length_orig) + ')') from err
    if min_length < 0:
        raise ValueError('min_length parameter cannot be negative (' +
                         str(min_length) + ')')

    # Step 2 - Download the input data as a Fasta file.
    # The return object gives us the path to the file that was created.
    logging.info('Downloading Assembly data as a Fasta file.')
    assemblyUtil = AssemblyUtil(self.callback_url)
    fasta_file = assemblyUtil.get_assembly_as_fasta(
        {'ref': assembly_input_ref})

    # Step 3 - Actually perform the filter operation, saving the good
    # contigs to a new fasta file (parsed and written with BioPython).
    good_contigs = []
    n_total = 0
    n_remaining = 0
    for record in SeqIO.parse(fasta_file['path'], 'fasta'):
        n_total += 1
        if len(record.seq) >= min_length:
            good_contigs.append(record)
            n_remaining += 1

    logging.info('Filtered Assembly to ' + str(n_remaining) +
                 ' contigs out of ' + str(n_total))
    filtered_fasta_file = os.path.join(self.shared_folder,
                                       'filtered.fasta')
    SeqIO.write(good_contigs, filtered_fasta_file, 'fasta')

    # Step 4 - Save the new Assembly back to the system
    logging.info('Uploading filtered Assembly data.')
    new_assembly = assemblyUtil.save_assembly_from_fasta({
        'file': {
            'path': filtered_fasta_file
        },
        'workspace_name': workspace_name,
        'assembly_name': fasta_file['assembly_name']
    })

    # Step 5 - Build a Report and return
    reportObj = {
        'objects_created': [{
            'ref': new_assembly,
            'description': 'Filtered contigs'
        }],
        'text_message':
        'Filtered Assembly to ' + str(n_remaining) + ' contigs out of ' +
        str(n_total)
    }
    report = KBaseReport(self.callback_url)
    report_info = report.create({
        'report': reportObj,
        'workspace_name': params['workspace_name']
    })

    # STEP 6: construct the output to send back
    output = {
        'report_name': report_info['name'],
        'report_ref': report_info['ref'],
        'assembly_output': new_assembly,
        'n_initial_contigs': n_total,
        'n_contigs_removed': n_total - n_remaining,
        'n_contigs_remaining': n_remaining
    }
    logging.info('returning:' + pformat(output))

    #END run_cnelsonAppDemo

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method run_cnelsonAppDemo return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
def run_kb_GATK(self, ctx, params):
    """
    run_kb_GATK: Run the GATK utility pipeline on a read alignment, save
    the final filtered SNP calls as a variation object and return a
    KBaseReport.
    :param params: instance of mapping from String to unspecified object.
       Keys read by this method: "alignment_ref", "strain_info",
       "assembly_or_genome_ref", "workspace_name", "input_sample_set" and
       "variation_object_name"
    :returns: instance of type "ReportResults" -> structure: parameter
       "report_name" of String, parameter "report_ref" of String
    :raises ValueError: if "assembly_or_genome_ref" is neither a
       KBaseGenomes.Genome nor a KBaseGenomeAnnotations.Assembly
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN run_kb_GATK
    self.gu.validate_params(params)
    print(params)

    # Download the alignment; the pipeline works on the BAM file.
    source_ref = params['alignment_ref']
    alignment_out = self.du.downloadreadalignment(source_ref, params,
                                                  self.callback_url)
    bam_file = os.path.join(alignment_out['destination_dir'],
                            "reads_alignment.bam")

    # TODO: read sample set and sample strains information
    strain_info = params['strain_info']

    # Each run gets its own working directory under the shared folder.
    output_dir = os.path.join(self.shared_folder, str(uuid.uuid4()))
    os.mkdir(output_dir)

    # TODO: get genome_or_assembly_ref from the alignment ref.
    genome_or_assembly_ref = params['assembly_or_genome_ref']
    obj_type = self.wsc.get_object_info3({
        'objects': [{
            'ref': genome_or_assembly_ref
        }]
    })['infos'][0][2]
    if 'KBaseGenomes.Genome' in obj_type:
        # A genome only points at its assembly; resolve that reference.
        genome_ref = genome_or_assembly_ref
        subset = self.wsc.get_object_subset([{
            'included': ['/assembly_ref'],
            'ref': genome_ref
        }])
        assembly_ref = subset[0]['data']['assembly_ref']
    elif 'KBaseGenomeAnnotations.Assembly' in obj_type:
        assembly_ref = genome_or_assembly_ref
    else:
        raise ValueError(obj_type +
                         ' is not the right input for this method. ' +
                         'Valid input include KBaseGenomes.Genome or ' +
                         'KBaseGenomeAnnotations.Assembly ')

    assembly_file = self.du.download_genome(assembly_ref,
                                            output_dir)['path']

    # TODO: check time for building index file or download from cache.
    # TODO: discuss which cache_id to use.
    # TODO: when a genome was copied, find the original genome (ref id)
    #       so the original cache id can be reused.
    self.gu.build_genome(assembly_file)
    self.gu.index_assembly(assembly_file)
    self.gu.generate_sequence_dictionary(assembly_file)
    self.gu.duplicate_marking(output_dir, bam_file)
    self.gu.collect_alignment_and_insert_size_metrics(assembly_file,
                                                      output_dir)

    # TODO: avoid writing intermediate files to save space and I/O time.
    self.gu.variant_calling(assembly_file, output_dir)
    self.gu.extract_variants(assembly_file, output_dir)

    # First filtering pass, then two rounds of recalibration/apply.
    self.gu.filter_SNPs(assembly_file, "filtered_snps.vcf", output_dir,
                        params)
    self.gu.filter_Indels(assembly_file, "filtered_indels.vcf", output_dir,
                          params)
    self.gu.exclude_filtered_variants(output_dir)
    self.gu.base_quality_score_recalibration(assembly_file,
                                             "recal_data.table", output_dir)
    self.gu.apply_BQSR(assembly_file, "recal_data.table", output_dir)
    self.gu.base_quality_score_recalibration(assembly_file,
                                             "post_recal_data.table",
                                             output_dir)
    self.gu.apply_BQSR(assembly_file, "post_recal_data.table", output_dir)

    # Final filtering pass; only the SNP file is saved below.
    self.gu.filter_SNPs(assembly_file, "filtered_snps_final.vcf",
                        output_dir, params)
    # TODO: also save indels via VariationUtils, or merge them with the
    #       SNPs (sorted by chromosome and position) before saving.
    # TODO: get an example for saving structural variants (especially
    #       CNV) and compare with standard VCF output.
    self.gu.filter_Indels(assembly_file, "filtered_indels_final.vcf",
                          output_dir, params)

    final_snps = os.path.join(output_dir, "filtered_snps_final.vcf")
    vcf_filepath = self.gu.index_vcf_file(final_snps)
    reheader_vcf_file = self.gu.reheader(vcf_filepath, strain_info)

    # TODO: check existence of the final filtered SNPs file.
    # TODO: rename assembly_or_genome_ref to genome_or_assembly_ref.
    # TODO: derive sample_attribute_name from the sample set ref by
    #       prefixing/suffixing. Attribute mapping should have one sample.
    save_variation_params = {
        'workspace_name': params['workspace_name'],
        'genome_or_assembly_ref': params['assembly_or_genome_ref'],
        'sample_set_ref': params['input_sample_set'],
        'sample_attribute_name': 'sample_attr',
        'vcf_staging_file_path': reheader_vcf_file,
        'variation_object_name': params['variation_object_name']
    }
    self.vu.save_variation_from_vcf(save_variation_params)

    report = KBaseReport(self.callback_url)
    report_info = report.create({
        'report': {
            'objects_created': [],
            'text_message': 'Success'
        },
        'workspace_name': params['workspace_name']
    })
    output = {
        'report_name': report_info['name'],
        'report_ref': report_info['ref'],
    }
    #END run_kb_GATK

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method run_kb_GATK return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
def run_omreegalozpathway_completeness(self, ctx, params):
    """
    Build a pathway-completeness HTML table for an FBA model or a domain
    annotation object and return it in a KBaseReport.
    :param params: instance of mapping from String to unspecified object.
       The key "main_input_ref" (reference of the object to analyse) is
       required
    :returns: instance of type "ReportResults" -> structure: parameter
       "report_name" of String, parameter "report_ref" of String
    :raises Exception: if "main_input_ref" is missing, if the workspace
       lookup returns no "infos", or if the object type is neither
       KBaseFBA.FBAModel nor KBaseGeneFamilies.DomainAnnotation
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN run_omreegalozpathway_completeness
    # Validate the input before anything reads it, so a missing reference
    # produces the intended error instead of a KeyError.
    if "main_input_ref" not in params:
        logging.info(
            'the reference number is not in the params, program must end.')
        raise Exception("main_input_ref not in params")
    main_input_ref = params['main_input_ref']

    # Preparing report client
    report_client = KBaseReport(self.callback_url)

    # Creating the workspace client object
    token = os.environ.get('KB_AUTH_TOKEN', None)
    ws = Workspace(self.ws_url, token=token)

    # Getting information about the main input ref
    obj_info = ws.get_object_info3({'objects': [{'ref': main_input_ref}]})
    if "infos" not in obj_info:
        logging.info(
            "The function ws.get_object_info3 failed to download the right information. The program must abort."
        )
        raise Exception("Could not find infos in obj_info")

    # Positions 1, 2 and 7 of the info tuple are read as object name,
    # object type and workspace name respectively.
    info = obj_info["infos"][0]
    object_name = info[1]
    object_type = info[2]
    ws_name = info[7]
    logging.debug("Object Type: " + object_type)
    logging.debug("Object Name: " + object_name)
    logging.debug("Workspace Name: " + ws_name)

    # We create the output file name and add information to it later.
    output_file_name = 'pathways_measurements'

    # This part is a hack, need to check type of data more accurately.
    if object_type.startswith('KBaseFBA.FBAModel'):
        logging.info("Succesfully recognized type as FBA Model")
        output_file_name += '_fba_model'

        # Export the model as TSV files through fba_tools.
        fba_t = fba_tools(self.callback_url)
        X = fba_t.export_model_as_tsv_file({"input_ref": main_input_ref})
        logging.info(
            "the object output from fba tools export model as tsv file:")
        logging.info(X)

        # Locating where the reactions tsv was placed
        # (Not well done - replace this with a robust form)
        reactions_file_path = os.path.join(
            self.shared_folder, object_name,
            object_name + '-reactions.tsv')
        output_path = os.path.join(self.shared_folder,
                                   output_file_name + '.tsv')

        # Percentage calculation for FBAModel object types.
        html_path = reactions_file_to_pathway_reactions_and_percentages(
            reactions_file_path, output_path, object_name)

    # Using KBase Gene Families - Domain Annotation
    elif object_type.startswith("KBaseGeneFamilies.DomainAnnotation"):
        logging.info("Succesfully recognized type as Domain Annotation")
        output_file_name += '_domain_annotation'

        # The annotation payload sits under data[0]['data']['data'] of the
        # get_objects2 response.
        obj = ws.get_objects2({'objects': [{'ref': main_input_ref}]})
        Y = obj['data'][0]['data']['data']
        output_file_path = os.path.join(self.shared_folder,
                                        output_file_name + '.tsv')

        # Percentage calculation for DomainAnnotation object types.
        html_path = TIGRFAM_file_to_pathway_reactions_and_percentages(
            Y, output_file_path, object_name)

    else:
        logging.info("Object type unknown")
        raise Exception(
            "Could not recognize ref to object- Check if object is FBA Model or Domain Annotation type. If so, the error is in the program, not the input - contact [email protected]."
        )

    html_dict = [{"path": html_path, "name": 'Completeness_Table'}]

    # Preparing final report:
    report = report_client.create_extended_report({
        'direct_html_link_index': 0,
        'message': 'Here are the pathway completeness results',
        'workspace_name': ws_name,
        'html_links': html_dict
    })
    output = {
        'report_name': report['name'],
        'report_ref': report['ref'],
    }
    #END run_omreegalozpathway_completeness

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError(
            'Method run_omreegalozpathway_completeness return value ' +
            'output is not type dict as required.')
    # return the results
    return [output]
def run_kb_ReadSim(self, ctx, params):
    """
    Run the read simulator against a reference genome or assembly, upload
    the resulting paired reads, save the reported variants as a variation
    object and return a KBaseReport.
    :param params: instance of type "Inparams" -> structure: parameter
       "workspace_name" of String, parameter "input_sample_set" of
       String, parameter "strain_info" of String, parameter
       "assembly_or_genome_ref" of String, parameter "base_error_rate" of
       String, parameter "outer_distance" of String, parameter
       "standard_deviation" of String, parameter "num_read_pairs" of
       String, parameter "len_first_read" of String, parameter
       "len_second_read" of String, parameter "mutation_rate" of String,
       parameter "frac_indels" of String, parameter
       "variation_object_name" of String, parameter "output_read_object"
       of String
    :returns: instance of type "ReportResults" -> structure: parameter
       "report_name" of String, parameter "report_ref" of String
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN run_kb_ReadSim
    work_dir = self.shared_folder
    print(params)
    self.su.validate_simreads_params(params)

    # Resolve the input reference to an assembly reference.
    input_ref = params['assembly_or_genome_ref']
    info_query = {'objects': [{'ref': input_ref}]}
    obj_type = self.wsc.get_object_info3(info_query)['infos'][0][2]
    if 'KBaseGenomes.Genome' in obj_type:
        subset = self.wsc.get_object_subset([{
            'included': ['/assembly_ref'],
            'ref': input_ref
        }])
        assembly_ref = subset[0]['data']['assembly_ref']
    elif 'KBaseGenomeAnnotations.Assembly' in obj_type:
        assembly_ref = input_ref
    else:
        raise ValueError(obj_type +
                         ' is not the right input for this method. '
                         'Valid input include KBaseGenomes.Genome or '
                         'KBaseGenomeAnnotations.Assembly ')

    self.du.download_genome(assembly_ref, work_dir)

    # File names the rest of this method expects inside the shared folder.
    ref_genome = os.path.join(work_dir, "ref_genome.fa")
    fwd_reads = os.path.join(work_dir, "raed1.fq")
    rev_reads = os.path.join(work_dir, "raed2.fq")

    self.eu.check_path_exists(ref_genome)
    self.su.simreads(ref_genome, fwd_reads, rev_reads, params)
    for reads_path in (fwd_reads, rev_reads):
        self.eu.check_path_exists(reads_path)

    # The upload result is not used further.
    self.ru.upload_reads({
        'wsname': params['workspace_name'],
        'name': params['output_read_object'],
        'sequencing_tech': 'illumina',
        'fwd_file': fwd_reads,
        'rev_file': rev_reads
    })

    # Convert the variant log into a VCF file and save it.
    variant_log = os.path.join(work_dir, "variant.txt")
    self.eu.check_path_exists(variant_log)
    vcf_file = self.su.format_vcf(variant_log)
    self.eu.check_path_exists(vcf_file)

    self.vu.save_variation_from_vcf({
        'workspace_name': params['workspace_name'],
        'genome_or_assembly_ref': params['assembly_or_genome_ref'],
        'sample_set_ref': params['input_sample_set'],
        'sample_attribute_name': 'sample_attr',
        'vcf_staging_file_path': vcf_file,
        'variation_object_name': params['variation_object_name']
    })

    report_body = {'objects_created': [], 'text_message': 'Success'}
    report_info = KBaseReport(self.callback_url).create({
        'report': report_body,
        'workspace_name': params['workspace_name']
    })
    output = {
        'report_name': report_info['name'],
        'report_ref': report_info['ref'],
    }
    #END run_kb_ReadSim

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method run_kb_ReadSim return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
def run_VariationAnalyzer(self, ctx, params):
    """
    Stage the input reads and reference, run the snippy command built by
    the helper utilities, save the resulting VCF as a variation object and
    return a KBaseReport.
    :param params: instance of type "InputParams" -> structure: parameter
       "obj_name" of String, parameter "workspace_name" of String,
       parameter "fastq_ref" of String, parameter "map_qual" of Long,
       parameter "base_qual" of Long, parameter "min_cov" of Long,
       parameter "min_qual" of Long. The method also reads
       "genome_or_assembly_ref" and adds "vcf_staging_file_path" to the
       mapping before handing it to the variation saver.
    :returns: instance of type "ReportResults" -> structure: parameter
       "report_name" of String, parameter "report_ref" of String
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN run_VariationAnalyzer
    self.su.validate_params(params)

    logging.info("Downloading Fastq File")
    staged_reads = self.dfu._stage_input_file(params['fastq_ref'],
                                              "paired_end")

    logging.info("Downloading assembly file")
    reference = self.dfu.download_genome(params['genome_or_assembly_ref'])

    self.su.deinterleave(staged_reads['files']['fwd'], self.shared_folder)

    # hardcoded to match with attribute mapping file
    sample_name = "snippy_output"
    snippy_dir = "/".join([self.shared_folder, sample_name])
    snippy_cmd = self.su.build_snippy_command(reference['path'], snippy_dir,
                                              self.shared_folder)
    self.su.run_snippy_command(snippy_cmd)

    # The caller's params mapping itself is extended and passed on.
    params['vcf_staging_file_path'] = snippy_dir + "/snps.vcf"
    self.vu.save_variation_from_vcf(params)

    report_body = {
        'objects_created': [],
        'text_message': params['fastq_ref']
    }
    report_info = KBaseReport(self.callback_url).create({
        'report': report_body,
        'workspace_name': params['workspace_name']
    })
    output = {
        'report_name': report_info['name'],
        'report_ref': report_info['ref'],
    }
    #END run_VariationAnalyzer

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method run_VariationAnalyzer return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
def run_MotifSuite(self, ctx, params):
    """
    Run MFMD motif discovery on the given parameters, print the result and
    return a KBaseReport whose text is the workspace name.
    :param params: instance of mapping from String to unspecified object.
       The whole mapping is forwarded to the MFMD client; the key
       "workspace_name" is read here
    :returns: instance of type "ReportResults" -> structure: parameter
       "report_name" of String, parameter "report_ref" of String
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN run_MotifSuite
    report = KBaseReport(self.callback_url)
    mfmd_obj = MotifFindermfmd(self.callback_url)

    # NOTE(review): the clients below are constructed but never called;
    # only MFMD is run at present. They are kept because their
    # constructors are not visible here - confirm they have no required
    # side effects before removing them.
    homer_obj = MotifFinderHomer(self.callback_url)
    meme_obj = MotifFinderMEME(self.callback_url)
    gibbs_obj = MotifFinderGibbs(self.callback_url)
    ensemble_obj = MotifEnsemble(self.callback_url)

    result = mfmd_obj.DiscoverMotifsFromSequenceSet(params)
    print('MFMD RESULT:')
    pprint(result)

    report_info = report.create({
        'report': {
            'objects_created': [],
            'text_message': params['workspace_name']
        },
        'workspace_name': params['workspace_name']
    })
    output = {
        'report_name': report_info['name'],
        'report_ref': report_info['ref'],
    }
    #END run_MotifSuite

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method run_MotifSuite return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
def save_variation_from_vcf(self, ctx, params):
    """
    Save a variation (and trait?) object to Kbase given a reference
    genome, object output name, Variant Call Format (VCF) file, and sample
    attribute file.
    :param params: instance of type "save_variation_input" (## funcdef
       save_variation_from_vcf ## required input params:
       genome_or_assembly_ref: KBaseGenomes.Genome or
       KBaseGenomeAnnotations.Assembly object reference *** variation
       input data *** vcf_staging_file_path: path to location data
       associated with samples variation_object_name: output name for
       KBase variation object *** sample input data ***
       sample_attribute_ref: x/y/z reference to kbase sample attribute
       optional params: NA output report: report_name report_ref HTML
       visualization: Manhattan plot *** Visualization *** plot_maf:
       generate histogram of minor allele frequencies plot_hwe: generate
       histogram of Hardy-Weinberg Equilibrium p-values) -> structure:
       parameter "workspace_name" of String, parameter
       "genome_or_assembly_ref" of type "obj_ref" (An X/Y/Z style
       reference), parameter "vcf_staging_file_path" of type "filepath"
       (KBase file path to staging files), parameter
       "variation_object_name" of String, parameter
       "sample_attribute_ref" of type "obj_ref" (An X/Y/Z style reference)
    :returns: instance of type "save_variation_output" -> structure:
       parameter "report_name" of String, parameter "report_ref" of String
    """
    # ctx is the context object
    # return variables are: report
    #BEGIN save_variation_from_vcf
    input_ref = params['genome_or_assembly_ref']
    workspace_client = Workspace(self.config['workspace-url'])
    info_query = {'objects': [{'ref': input_ref}]}
    obj_type = workspace_client.get_object_info3(info_query)['infos'][0][2]

    # Record the reference under the key matching its object type; the
    # params mapping is extended in place and handed to the importer.
    if 'KBaseGenomes.Genome' in obj_type:
        params['genome_ref'] = input_ref
    elif 'KBaseGenomeAnnotations.Assembly' in obj_type:
        params['assembly_ref'] = input_ref
    else:
        raise ValueError(
            obj_type +
            ' is not the right input for this method. Valid input include KBaseGenomes.Genome or KBaseGenomeAnnotations.Assembly '
        )

    importer = VCFToVariation(self.config, self.shared_folder,
                              self.callback_url)
    var_obj = importer.import_vcf(params)

    # The reference is assembled from positions 6, 0 and 4 of the first
    # element of the import result.
    var_obj_ref = "/".join(str(var_obj[0][idx]) for idx in (6, 0, 4))

    upload_message = "\n".join([
        "Variation object created.",
        "Object #" + str(var_obj[0][0]),
        "Object name: " + str(var_obj[0][1]),
        "Genotypes in variation: " + str(var_obj[1]['numgenotypes']),
        "Variants in VCF file: " + str(var_obj[1]['numvariants']),
    ])

    created = {
        'ref': var_obj_ref,
        'description': 'Variation object from VCF file.'
    }
    report_create = KBaseReport(self.callback_url).create({
        'report': {
            'objects_created': [created],
            'text_message': upload_message
        },
        'workspace_name': params['workspace_name']
    })

    report = {
        "report_name": report_create['name'],
        "report_ref": report_create['ref'],
        "workspace_name": params["workspace_name"]
    }
    #END save_variation_from_vcf

    # At some point might do deeper type checking...
    if not isinstance(report, dict):
        raise ValueError('Method save_variation_from_vcf return value ' +
                         'report is not type dict as required.')
    # return the results
    return [report]
def run_poolfileupload(self, ctx, params):
    """
    Upload a pool file, pool count file or experiments file (selected by
    "pool_file_type") and return a KBaseReport describing what was saved.
    :param params: instance of mapping from String to unspecified object.
       The mapping is extended in place with "shared_folder",
       "workspace_id", "ws_obj" and "username", and "output_name" is
       replaced by its checked form, before it is given to the uploader
    :returns: instance of type "ReportResults" -> structure: parameter
       "report_name" of String, parameter "report_ref" of String
    :raises Exception: if "pool_file_type" is missing or not one of
       "poolfile", "poolcount", "experiments"
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN run_poolfileupload
    params['shared_folder'] = self.shared_folder

    workspace_client = Workspace(self.ws_url,
                                 token=os.environ.get('KB_AUTH_TOKEN', None))
    ws_info = workspace_client.get_workspace_info(
        {'workspace': params['workspace_name']})
    params['workspace_id'] = ws_info[0]
    params['ws_obj'] = workspace_client
    params['username'] = ctx['user_id']
    params['output_name'] = check_output_name(params['output_name'])

    if 'pool_file_type' not in params:
        raise Exception("Did not get param pool_file_type")

    # Pick the uploader matching the requested file type.
    file_type = params['pool_file_type']
    if file_type == 'poolfile':
        result = poolfileuploadUtil(params).upload_poolfile()
    elif file_type == 'poolcount':
        result = poolcountfileuploadUtil(params).upload_poolcountfile()
    elif file_type == 'experiments':
        result = expsfileuploadUtil(params).upload_expsfile()
    else:
        raise Exception("Did not recognize pool_file_type for upload")

    text_message = "Finished uploading file \n" + \
        "{} saved as {} on {}\n".format(result['Name'], result['Type'],
                                        result['Date'])

    report_body = {'objects_created': [], 'text_message': text_message}
    report_info = KBaseReport(self.callback_url).create({
        'report': report_body,
        'workspace_name': params['workspace_name']
    })
    output = {
        'report_name': report_info['name'],
        'report_ref': report_info['ref'],
    }
    #END run_poolfileupload

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method run_poolfileupload return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
def run_ContigFilter_max(self, ctx, params):
    """
    Filter the contigs of an assembly, keeping those whose length lies
    between a minimum and a maximum (both inclusive), save the result as
    a new assembly and report on it.
    :param params: instance of mapping from String to unspecified object.
       Required keys: "min_length" and "max_length" (non-negative
       integers), "assembly_ref" (non-empty string) and "workspace_name"
    :returns: instance of type "ReportResults" -> structure: parameter
       "report_name" of String, parameter "report_ref" of String
    :raises ValueError: if a required key is missing or fails the checks
       described above
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN run_ContigFilter_max
    # Check that the parameters are valid
    required_keys = ('min_length', 'max_length', 'assembly_ref',
                     'workspace_name')
    for key in required_keys:
        if key not in params:
            raise ValueError('Parameter "' + key +
                             '" is required but missing')

    min_length = params['min_length']
    if not isinstance(min_length, int) or min_length < 0:
        raise ValueError('Min length must be a non-negative integer')

    max_length = params['max_length']
    if not isinstance(max_length, int) or max_length < 0:
        raise ValueError('Max length must be a non-negative integer')

    assembly_ref = params['assembly_ref']
    if not (isinstance(assembly_ref, str) and assembly_ref):
        raise ValueError('Pass in a valid assembly reference string')

    print(min_length, max_length, assembly_ref)

    # Download the assembly as a FASTA file.
    assembly_util = AssemblyUtil(self.callback_url)
    fasta_file = assembly_util.get_assembly_as_fasta({'ref': assembly_ref})
    print(fasta_file)

    # Walk the records once, counting all of them and keeping the ones
    # whose length falls inside the inclusive [min_length, max_length] range.
    good_contigs = []
    n_total = 0
    for n_total, record in enumerate(
            SeqIO.parse(fasta_file['path'], 'fasta'), start=1):
        if min_length <= len(record.seq) <= max_length:
            good_contigs.append(record)
    n_remaining = len(good_contigs)

    # Write the kept contigs to a file in the shared folder.
    workspace_name = params['workspace_name']
    filtered_path = os.path.join(self.shared_folder, 'filtered.fasta')
    SeqIO.write(good_contigs, filtered_path, 'fasta')

    # Upload the filtered data to the workspace
    new_ref = assembly_util.save_assembly_from_fasta({
        'file': {
            'path': filtered_path
        },
        'workspace_name': workspace_name,
        'assembly_name': fasta_file['assembly_name']
    })

    # Summary message plus a pointer to the assembly that was uploaded.
    text_message = ('Filtered assembly to ' + str(n_remaining) +
                    ' contigs out of ' + str(n_total))
    report_data = {
        'objects_created': [{
            'ref': new_ref,
            'description': 'Filtered contigs'
        }],
        'text_message': text_message
    }
    report = KBaseReport(self.callback_url).create({
        'report': report_data,
        'workspace_name': workspace_name
    })

    # Return the report reference and name in our results
    output = {
        'report_ref': report['ref'],
        'report_name': report['name'],
        'n_total': n_total,
        'n_remaining': n_remaining,
        'filtered_assembly_ref': new_ref
    }
    #END run_ContigFilter_max

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method run_ContigFilter_max return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
class nmdc_mg_assembly:
    """
    Driver for the metagenome assembly workflow: fetches reads, runs the
    WDL workflow through the `special` client and uploads the resulting
    contigs as an Assembly object.
    """

    def __init__(self, callbaack_url, scratch, wdl='../../metaAssembly/'):
        """
        :param callbaack_url: callback URL handed to every service client
            (the misspelled name is kept so keyword callers keep working)
        :param scratch: scratch directory used for workflow inputs/outputs
        :param wdl: directory prefix the WDL files are copied from
        """
        self.callback_url = callbaack_url
        self.scratch = scratch
        self.special = special(self.callback_url)
        self.ru = ReadsUtils(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)
        self.report = KBaseReport(self.callback_url)
        self.wdl_base = wdl

    def validate_params(self, params):
        """Placeholder: no parameter validation is performed yet."""
        pass

    def fetch_reads_files(self, reads_upas):
        """
        From a list of reads UPAs, uses ReadsUtils to fetch the reads as files.
        Returns them as a dictionary from reads_upa -> filename

        :raises ValueError: if reads_upas is None or empty
        """
        if reads_upas is None:
            raise ValueError("reads_upas must be a list of UPAs")
        if len(reads_upas) == 0:
            raise ValueError("reads_upas must contain at least one UPA")

        reads_info = self.ru.download_reads({
            'read_libraries': reads_upas,
            'interleaved': 'true',
            'gzipped': None
        })['files']
        # Only the forward file of each library is used.
        return {upa: info['files']['fwd'] for upa, info in reads_info.items()}

    def run_wdl(self, rf):
        """
        Copy the workflow file into the current directory, write the
        workflow inputs to `<scratch>/inputs.json` and run the workflow
        through the `special` client.

        :param rf: path of the reads file; the scratch prefix is replaced
            by './' in the workflow inputs
        """
        print(os.getcwd())
        wdl_files = ['jgi_assembly.wdl']

        for wdl_name in wdl_files:
            shutil.copy(self.wdl_base + wdl_name, './' + wdl_name)

        ins = {
            "jgi_metaASM.input_file": [rf.replace(self.scratch, './')],
            "jgi_metaASM.rename_contig_prefix": "contig",
            "jgi_metaASM.outdir": "/out/"
        }
        input_file = os.path.join(self.scratch, 'inputs.json')
        with open(input_file, 'w') as handle:
            json.dump(ins, handle)

        p = {'workflow': wdl_files[0], 'inputs': 'inputs.json'}
        res = self.special.wdl(p)
        print('wdl: ' + str(res))

    def _fix_path(self, orig):
        """
        Re-root a workflow output path under the scratch directory.

        Everything from 'cromwell-executions' onwards is kept and joined
        onto `self.scratch`. A path that does not contain that marker is
        returned unchanged (slicing with the -1 that `find` returns would
        otherwise keep only the last character of the path).
        """
        ind = orig.find('cromwell-executions')
        if ind < 0:
            return orig
        return os.path.join(self.scratch, orig[ind:])

    def upload_assembly(self, file_path_orig, workspace_name, assembly_name):
        """
        Upload one FASTA file to KBase as an Assembly object and return
        what AssemblyUtil.save_assembly_from_fasta returns.

        :param file_path_orig: path reported by the workflow; it is
            re-rooted under the scratch directory before use
        :raises ValueError: if the path is empty, the re-rooted file does
            not exist, or workspace_name / assembly_name is empty
        """
        # Validate the raw input first: once re-rooted, an empty path would
        # turn into the scratch directory itself and slip through the checks.
        if not file_path_orig:
            raise ValueError("file_path must be defined")
        file_path = self._fix_path(file_path_orig)
        if not os.path.exists(file_path):
            raise ValueError(
                "The given assembly file '{}' does not exist".format(
                    file_path))
        if not workspace_name:
            raise ValueError("workspace_name must be defined")
        if not assembly_name:
            raise ValueError("assembly_name must be defined")

        assembly_upa = self.au.save_assembly_from_fasta({
            "file": {
                "path": file_path
            },
            "workspace_name": workspace_name,
            "assembly_name": assembly_name
        })
        return assembly_upa

    def _upload_pipeline_result(self,
                                pipeline_result,
                                workspace_name,
                                assembly_name,
                                filtered_reads_name=None,
                                cleaned_reads_name=None,
                                skip_rqcfilter=False,
                                input_reads=None):
        """
        This is very tricky and uploads (optionally!) a few things under different cases.
        1. Uploads assembly
            - this always happens after a successful run.
        2. Cleaned reads - passed RQCFilter / BFC / SeqTK
            - optional, if cleaned_reads_name isn't None
        3. Filtered reads - passed RQCFilter
            - optional, if filtered_reads_name isn't None AND skip_rqcfilter is False
        returns a dict of UPAs with the following keys:
        - assembly_upa - the assembly (always)
        - filtered_reads_upa - the RQCFiltered reads (optionally)
        - cleaned_reads_upa - the RQCFiltered -> BFC -> SeqTK cleaned reads (optional)

        NOTE(review): this method reads self.file_util, self.output_dir and
        self.scratch_dir, none of which __init__ of this class sets, and a
        name PIGZ that is not defined in this class - confirm where these
        come from before calling it.
        """
        # upload the assembly
        uploaded_assy_upa = self.file_util.upload_assembly(
            pipeline_result["spades"]["contigs_file"], workspace_name,
            assembly_name)
        upload_result = {"assembly_upa": uploaded_assy_upa}

        # upload filtered reads if we didn't skip RQCFilter (otherwise it's just a copy)
        if filtered_reads_name and not skip_rqcfilter:
            # unzip the filtered reads because ReadsUtils won't do it for us.
            decompressed_reads = os.path.join(self.output_dir,
                                              "filtered_reads.fastq")
            # NOTE(review): the command is formatted into a single string and
            # run through the shell, so the paths must be shell-safe.
            pigz_command = "{} -d -c {} > {}".format(
                PIGZ, pipeline_result["rqcfilter"]["filtered_fastq_file"],
                decompressed_reads)
            p = subprocess.Popen(pigz_command,
                                 cwd=self.scratch_dir,
                                 shell=True)
            exit_code = p.wait()
            if exit_code != 0:
                raise RuntimeError(
                    "Unable to decompress filtered reads for validation! Can't upload them, either!"
                )
            filtered_reads_upa = self.file_util.upload_reads(
                decompressed_reads, workspace_name, filtered_reads_name,
                input_reads)
            upload_result["filtered_reads_upa"] = filtered_reads_upa

        # upload the cleaned reads
        if cleaned_reads_name:
            # unzip the cleaned reads because ReadsUtils won't do it for us.
            decompressed_reads = os.path.join(self.output_dir,
                                              "cleaned_reads.fastq")
            pigz_command = "{} -d -c {} > {}".format(
                PIGZ, pipeline_result["seqtk"]["cleaned_reads"],
                decompressed_reads)
            p = subprocess.Popen(pigz_command,
                                 cwd=self.scratch_dir,
                                 shell=True)
            exit_code = p.wait()
            if exit_code != 0:
                raise RuntimeError(
                    "Unable to decompress cleaned reads for validation! Can't upload them, either!"
                )
            cleaned_reads_upa = self.file_util.upload_reads(
                decompressed_reads, workspace_name, cleaned_reads_name,
                input_reads)
            upload_result["cleaned_reads_upa"] = cleaned_reads_upa
        return upload_result

    def assemble(self, params):
        """
        Fetch the reads, run the assembly workflow, upload the contigs and
        create a report.

        :param params: mapping with "workspace_name",
            "output_assembly_name" and "reads_upa"
        :returns: dict with "report_name" and "report_ref"
        :raises OSError: if the workflow did not leave a meta.json file in
            the scratch directory
        """
        self.validate_params(params)
        workspace_name = params['workspace_name']
        assembly_name = params['output_assembly_name']

        # Stage Data
        files = self.fetch_reads_files([params["reads_upa"]])
        reads_files = list(files.values())

        # Run WDL
        self.run_wdl(reads_files[0])

        # Check if things ran
        mfile = os.path.join(self.scratch, 'meta.json')
        print(mfile)
        if not os.path.exists(mfile):
            raise OSError("Failed to run workflow")

        with open(mfile) as handle:
            pipeline_output = json.load(handle)
        out = pipeline_output["calls"]["jgi_metaASM.create_agp"][0]["outputs"]
        print(out)

        # Generate Output Objects
        contigs_fn = out['outcontigs']
        upa = self.upload_assembly(contigs_fn, workspace_name, assembly_name)
        print("upload complete")

        # Do report; list the uploaded assembly so the report links to it.
        report_info = self.report.create({
            'report': {
                'objects_created': [{
                    'ref': upa,
                    'description': 'Assembled contigs'
                }],
                'text_message': "Assemble metagenomic reads"
            },
            'workspace_name': workspace_name
        })
        return {
            'report_name': report_info['name'],
            'report_ref': report_info['ref'],
        }
def run_simplebatch(self, ctx, params):
    """
    Submit one job per entry of the batch to the execution engine and
    return a KBaseReport summarising the submissions.
    :param params: instance of type "SimpleBatchParams" -> structure:
       parameter "batch_inputs" of type "batch_params" -> list of type
       "app_params" -> mapping from String to unspecified object,
       parameter "method_name" of String. The key "workspace_name" is
       also read for the report.
    :returns: instance of type "ReportResults" -> structure: parameter
       "report_name" of String, parameter "report_ref" of String
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN run_simplebatch
    report = KBaseReport(self.callback_url)

    # TODO Always request WSID?
    # The method is hard-coded for now instead of params['method_name'].
    method_name = "simpleapp.simple_add"
    wsid = "TODO"
    # TODO Get Service_Ver
    service_ver = "dev"

    # The declared input key is "batch_inputs"; "app_params" is still
    # honoured first so existing callers keep working.
    if 'app_params' in params:
        batched_app_params = params['app_params']
    else:
        batched_app_params = params['batch_inputs']

    job_ids = []
    statuses = []
    for app_param in batched_app_params:
        print(f"About to submit job with params {app_param}")
        rjp = {
            "method": method_name,
            "params": [app_param],
            "service_ver": service_ver,
            "wsid": wsid,
            "app_id": "RanWithBatch",
        }
        try:
            job_id = self.ee2.run_job(params=rjp)
            status = "queued"
        except Exception as exc:
            # Best effort: one failed submission must not abort the rest of
            # the batch, but the reason is printed instead of being dropped.
            print(f"Failed to submit job with params {app_param}: {exc}")
            job_id = "failed to submit"
            status = "failure"
        job_ids.append(job_id)
        statuses.append(status)

    # TODO Create table with refresh buttons or autorefresh, which uses
    #      cookie or environment
    # Send the submission results as a report
    summary_lines = [
        f"Submitted {len(job_ids)} job(s) for method {method_name}:"
    ]
    summary_lines.extend(f"{job_id}: {status}"
                         for job_id, status in zip(job_ids, statuses))

    report_info = report.create({
        'report': {
            'objects_created': [],
            'text_message': "\n".join(summary_lines)
        },
        'workspace_name': params['workspace_name']
    })
    output = {
        'report_name': report_info['name'],
        'report_ref': report_info['ref'],
    }
    #END run_simplebatch

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method run_simplebatch return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
def run_kb_GATK(self, ctx, params):
    """
    This example function accepts any number of parameters and returns results in a KBaseReport
    :param params: instance of mapping from String to unspecified object
    :returns: instance of type "ReportResults" -> structure: parameter
       "report_name" of String, parameter "report_ref" of String
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN run_kb_GATK
    # Keys read from params in this body: 'alignment_ref', 'strain_info',
    # 'assembly_or_genome_ref', 'input_sample_set', 'variation_object_name'
    # and 'workspace_name'. params is also passed whole to the download and
    # filter helpers.
    source_ref = params['alignment_ref']
    alignment_out = self.du.downloadreadalignment(source_ref, params,
                                                  self.callback_url)
    sam_file = os.path.join(alignment_out['destination_dir'],
                            "reads_alignment.sam")

    #Todo Reading sample set and sample strains information

    # Disabled filter-argument construction, kept for reference:
    # command.extend(["-filter-name", "\"QD_filter\"", "-filter", "\"QD", "<", params['snp_filter']['snp_qd_filter'] + "\""])
    # command.extend(["-filter-name", "\"FS_filter\"", "-filter", "\"FS", "<", params['snp_filter']['snp_fs_filter'] + "\""])
    # command.extend(["-filter-name", "\"MQ_filter\"", "-filter", "\"MQ", "<", params['snp_filter']['snp_mq_filter'] + "\""])
    # command.extend(["-filter-name", "\"SOR_filter\"", "-filter", "\"SOR", "<", params['snp_filter']['snp_sor_filter'] + "\""])
    # command.extend(["-filter-name", "\"MQRankSum_filter\"", "-filter", "\"MQRankSum", "<", params['snp_filter']['snp_mqrankSum_filter'] + "\""])
    # command.extend(["-filter-name", "\"ReadPosRankSum_filter\"", "-filter", "\"ReadPosRankSum", "<", params['snp_filter']['snp_readposranksum_filter'] + "\""])

    print(params)
    strain_info = params['strain_info']

    # Each run works in its own uniquely named directory under the shared
    # folder.
    output_dir = os.path.join(self.shared_folder, str(uuid.uuid4()))
    os.mkdir(output_dir)

    # The input may be a Genome or an Assembly; either way the pipeline
    # needs an assembly reference.
    genome_or_assembly_ref = params['assembly_or_genome_ref']
    obj_type = self.wsc.get_object_info3(
        {'objects': [{'ref': genome_or_assembly_ref}]})['infos'][0][2]
    if 'KBaseGenomes.Genome' in obj_type:
        # Fetch only the assembly_ref field of the genome object.
        subset = self.wsc.get_object_subset([{
            'included': ['/assembly_ref'],
            'ref': genome_or_assembly_ref
        }])
        assembly_ref = subset[0]['data']['assembly_ref']
    elif 'KBaseGenomeAnnotations.Assembly' in obj_type:
        assembly_ref = genome_or_assembly_ref
    else:
        raise ValueError(obj_type + ' is not the right input for this method. '
                         + 'Valid input include KBaseGenomes.Genome or '
                         + 'KBaseGenomeAnnotations.Assembly ')

    assembly_file = self.du.download_genome(assembly_ref, output_dir)['path']

    #output_dir = output_dir + "/"

    #Todo: check time for building index file or donwload from cache.
    #Todo: To discuss about cache_id to be used.
    #Todo: In case of copying genome, find the way of finding original genome (ref id) for getting original cache id.

    # Reference preparation.
    self.gu.build_genome(assembly_file)
    self.gu.index_assembly(assembly_file)
    self.gu.generate_sequence_dictionary(assembly_file)

    # Alignment processing and metrics.
    self.gu.duplicate_marking(output_dir, sam_file)
    #self.gu.sort_bam_index(output_dir)
    self.gu.collect_alignment_and_insert_size_metrics(assembly_file,
                                                      output_dir)
    #self.gu.analyze_covariates(output_dir)

    #Todo: avoid writing intermediate fies to save space and time I/O.
    # First calling/filtering pass.
    self.gu.variant_calling(assembly_file, output_dir)
    self.gu.extract_variants(assembly_file, output_dir)
    self.gu.filter_SNPs(assembly_file, "filtered_snps.vcf", output_dir,
                        params)
    self.gu.filter_Indels(assembly_file, "filtered_indels.vcf", output_dir,
                          params)
    self.gu.exclude_filtered_variants(output_dir)

    # Recalibration is run twice, writing a pre- and a post-recalibration
    # table.
    self.gu.base_quality_score_recalibration(assembly_file,
                                             "recal_data.table", output_dir)
    self.gu.apply_BQSR(assembly_file, "recal_data.table", output_dir)
    self.gu.base_quality_score_recalibration(assembly_file,
                                             "post_recal_data.table",
                                             output_dir)
    self.gu.apply_BQSR(assembly_file, "post_recal_data.table", output_dir)

    # Final filtering pass.
    self.gu.filter_SNPs(assembly_file, "filtered_snps_final.vcf", output_dir,
                        params)

    #Todo: To save indels also using VariationUtils or merge with snps and sort them with chr & pos and save using variaiotiontuils.
    #Todo: To get an example for saving structural variants(specially CNV) and compare with standard vcf output.

    self.gu.filter_Indels(assembly_file, "filtered_indels_final.vcf",
                          output_dir, params)

    # Disabled shell-based VCF header rewrite, kept for reference:
    # os.system("grep '##fileformat' " + output_dir + "/filtered_snps_final.vcf > " + output_dir + "/sample.vcf")
    # cmd = "grep -v '##' " + output_dir + "/filtered_snps_final.vcf >> " + output_dir + "/sample.vcf"
    # os.system(cmd)  # TODO : need to remove system command after fixing variationUtils.

    # Only the final SNP file is indexed, re-headered and saved.
    vcf_filepath = self.gu.index_vcf_file(
        os.path.join(output_dir, "filtered_snps_final.vcf"))
    reheader_vcf_file = self.gu.reheader(vcf_filepath, strain_info)

    #Todo : check existence of final filtered finals snps.
    #Todo : chnage assembly_or_genome_ref to genome_or_assembly_ref
    #Todo: to derive name of sample_attribute_name from sample set ref by prefixing/suffixing. Attribute mapping should have one sample.

    save_variation_params = {
        'workspace_name': params['workspace_name'],
        'genome_or_assembly_ref': genome_or_assembly_ref,
        'sample_set_ref': params['input_sample_set'],
        'sample_attribute_name': 'sample_attr',
        'vcf_staging_file_path': reheader_vcf_file,
        'variation_object_name': params['variation_object_name']
    }
    self.vu.save_variation_from_vcf(save_variation_params)

    report = KBaseReport(self.callback_url)
    report_info = report.create({
        'report': {
            'objects_created': [],
            'text_message': 'Success'
        },
        'workspace_name': params['workspace_name']
    })
    output = {
        'report_name': report_info['name'],
        'report_ref': report_info['ref'],
    }
    #END run_kb_GATK

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method run_kb_GATK return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]