class VariationMerge:
    '''
    Module Name:
    VariationMerge

    Module Description:
    A KBase module: VariationMerge
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.1"
    GIT_URL = "https://github.com/kbasecollaborations/VariationMerge.git"
    GIT_COMMIT_HASH = "918495236305bcae5e2ded0be6ed18d71defd678"

    #BEGIN_CLASS_HEADER
    @staticmethod
    def _select_references(assembly_refs, genome_refs, sampleset_refs):
        """
        Validate the references collected from the input variation objects
        and pick the ones to save the merged object against.

        :param assembly_refs: set of assembly refs seen on the inputs
        :param genome_refs: set of genome refs seen on the inputs
        :param sampleset_refs: set of sample set refs seen on the inputs
        :returns: tuple (genome_or_assembly_ref, sample_set_ref); the
            assembly ref is preferred when one is present, otherwise the
            genome ref is used
        :raises Exception: when the inputs carry no reference at all, more
            than one assembly ref, more than one genome ref, or not exactly
            one sample set ref
        """
        # More than one assembly is rejected even when genome refs are also
        # present; "no reference at all" keeps the original error message.
        if len(assembly_refs) > 1 or not (assembly_refs or genome_refs):
            raise Exception(
                "variation objects are from different assembly refs")
        if len(sampleset_refs) != 1:
            raise Exception(
                "variation objects are from different sample set refs")
        if len(genome_refs) > 1:
            raise Exception(
                "variation objects are from different genome set refs")

        reference_pool = assembly_refs if assembly_refs else genome_refs
        return next(iter(reference_pool)), next(iter(sampleset_refs))
    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.shared_folder = config['scratch']
        logging.basicConfig(format='%(created)s %(levelname)s: %(message)s',
                            level=logging.INFO)
        self.ws_url = config['workspace-url']
        self.vu = VariationUtil(self.callback_url)
        self.mu = MergeVcfUtils()
        #END_CONSTRUCTOR
        pass

    def run_VariationMerge(self, ctx, params):
        """
        Download every variation object in params['vcflist'] as VCF, merge
        the VCFs, save the merged file as a new variation object and create
        a report.

        :param params: instance of type "inparams" (This example function
           accepts any number of parameters and returns results in a
           KBaseReport) -> structure: parameter "obj_name" of String,
           parameter "workspace_name" of String, parameter "vcflist" of list
           of String
        :returns: instance of type "OutResults" -> structure: parameter
           "output_obj_ref" of String, parameter "report_name" of String,
           parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_VariationMerge
        # NOTE(review): the spec above names the output parameter "obj_name",
        # but this method reads params['variation_object_name'] - confirm
        # which one the UI actually sends.
        self.ws = Workspace(url=self.ws_url, token=ctx['token'])
        logging.info("run_VariationMerge params: %s", params)

        vcf_flist = []
        assembly_ref_set = set()
        sampleset_ref_set = set()
        genome_set_ref_set = set()

        for i, variation_ref in enumerate(params['vcflist']):
            variation_data = self.ws.get_objects2(
                {'objects': [{'ref': variation_ref}]})['data'][0]['data']

            # A variation object may reference an assembly or a genome; only
            # read a key after checking that it exists.
            if 'assembly_ref' in variation_data:
                assembly_ref_set.add(variation_data['assembly_ref'])
            elif 'genome_ref' in variation_data:
                genome_set_ref_set.add(variation_data['genome_ref'])
            logging.info("Downloading variation %s", variation_ref)

            vcf_filename = "/kb/module/work/tmp/variation" + str(i) + ".vcf.gz"
            vcf_flist.append(vcf_filename)
            self.vu.get_variation_as_vcf({
                'variation_ref': variation_ref,
                'filename': vcf_filename
            })
            # NOTE(review): the download apparently lands at a fixed path
            # regardless of 'filename', hence the rename - confirm against
            # VariationUtil.
            os.rename("/kb/module/work/tmp/variation.vcf.gz", vcf_filename)
            self.mu.index_vcf(vcf_filename)

            data = self.ws.get_objects2({
                'objects': [{
                    "ref": variation_ref,
                    'included': ['/sample_set_ref']
                }]
            })['data'][0]['data']
            sampleset_ref_set.add(data['sample_set_ref'])

        # Raises if the inputs do not share a single reference / sample set.
        genome_or_assembly_ref, sample_set_ref = self._select_references(
            assembly_ref_set, genome_set_ref_set, sampleset_ref_set)

        merged_file = os.path.join(self.shared_folder,
                                   "merged_gatk_variation_jmc2_test.vcf")
        self.mu.merge_vcf(vcf_flist, merged_file)

        save_variation_params = {
            'workspace_name': params['workspace_name'],
            'genome_or_assembly_ref': genome_or_assembly_ref,
            'sample_set_ref': sample_set_ref,
            'sample_attribute_name': 'sample_attr',
            'vcf_staging_file_path': merged_file,
            'variation_object_name': params['variation_object_name']
        }
        self.vu.save_variation_from_vcf(save_variation_params)

        report = KBaseReport(self.callback_url)
        report_info = report.create({
            'report': {
                'objects_created': [],
                'text_message': 'success'
            },
            'workspace_name': params['workspace_name']
        })
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref'],
        }
        #END run_VariationMerge

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_VariationMerge return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def status(self, ctx):
        """Return a one-element list with the module state and version info."""
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]
class VariationAnnotation:
    '''
    Module Name:
    VariationAnnotation

    Module Description:
    A KBase module: VariationAnnotation
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.1"
    GIT_URL = "https://github.com/man4ish/VariationAnnotation.git"
    GIT_COMMIT_HASH = "233ab11cd942b99c960f7b83aaee2b3800685bb4"

    #BEGIN_CLASS_HEADER
    def build_genome_index(self, genome_ref):
        # Placeholder (not implemented). Intended to download the gff and
        # fasta, put them in the right directory and return the genome_index
        # name that can be used by snpeff.jar.
        # TODO: READ GENOME TAXONOMY from genome_ref and
        # TODO: Get genome taxonomy/classification from user so that there
        # is no confusion.
        pass

    @staticmethod
    def _write_gff_without_exons(gff_path, filtered_gff_path):
        """
        Copy gff_path to filtered_gff_path, dropping every line that
        contains the text "exon".

        Pure-Python replacement for the former shell call
        `grep -v "exon" <gff> > <out>`: no shell string is built from a
        path, and a missing/unreadable input raises instead of silently
        producing an empty file.

        :param gff_path: path of the GFF file to read
        :param filtered_gff_path: path of the filtered GFF file to write
        :raises OSError: if either file cannot be opened
        """
        # Binary mode: the filter is a plain substring match, so no decoding
        # is needed and odd encodings cannot break it.
        with open(gff_path, 'rb') as source, \
                open(filtered_gff_path, 'wb') as target:
            target.writelines(line for line in source if b"exon" not in line)
    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.scratch = config['scratch']
        logging.basicConfig(format='%(created)s %(levelname)s: %(message)s',
                            level=logging.INFO)
        self.VU = VariationUtil(self.callback_url)
        self.SU = SnpEffUtils()
        self.DU = DownloadUtils()
        self.HU = htmlreportutils()
        self.config = config
        #self.snpeff=<path_to_snpeff>
        #END_CONSTRUCTOR
        pass

    def annotate_variants(self, ctx, params):
        """
        This method extracts VCF from variation object, runs SNPEFF workflow
        (http://snpeff.sourceforge.net/SnpEff_manual.html) and annotate and
        predict the effects of genetic variants (such as amino acid changes)
        :param params: instance of type "input_params" (variation_ref:
           Reference to Variation object out_variation_name: Name by which
           the output object will be saved) -> structure: parameter
           "variation_ref" of String, parameter "out_variation_name" of String
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN annotate_variants
        # NOTE(review): besides 'variation_ref' this method also reads
        # params['workspace_name'], params['genome_ref'] and
        # params['output_object_name'] - the spec text above looks out of
        # date; confirm.
        self.SU.validate_params(params)

        # TODO: Need to think through how to get the genome from the USERS
        # because variation_ref may or may not have a genome_ref field filled
        # in; our spec.json may require some work. There is a chance that the
        # user may provide the wrong genome as input if we don't deal with
        # this properly.
        workspace = params['workspace_name']
        self.ws_url = self.config['workspace-url']
        self.ws = Workspace(self.ws_url, token=ctx['token'])

        # TODO current file name is hard coded but that need to be changed later.
        filename = "/kb/module/work/variation.vcf"
        output_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        os.mkdir(output_dir)
        # Work on a private copy of the bundled snpEff directory.
        shutil.copytree("/kb/module/deps/snp_eff", output_dir + "/snp_eff")

        variation_ref = params['variation_ref']
        variation_obj = self.ws.get_objects2(
            {'objects': [{'ref': variation_ref}]})['data'][0]
        data = self.ws.get_objects2(
            {'objects': [{"ref": variation_ref,
                          'included': ['/sample_set_ref']}]})['data'][0]['data']
        sample_set_ref = data['sample_set_ref']

        assembly_ref = variation_obj['data']['assembly_ref']
        assembly_path = self.DU.get_assembly(assembly_ref, output_dir)
        gff_ref = params['genome_ref']
        gff_path = self.DU.get_gff(gff_ref, output_dir)

        # Todo: It is temporary fix but need to find logical removal of exons
        # based on coordinates.
        new_gff_path = "/kb/module/work/tmp/output.gff"
        self._write_gff_without_exons(gff_path, new_gff_path)

        vcf_path = self.VU.get_variation_as_vcf({
            'variation_ref': params['variation_ref'],
            'filename': filename
        })

        genome_index_name = self.SU.build_genome(new_gff_path, assembly_path,
                                                 output_dir)
        annotated_vcf_path = self.SU.annotate_variants(
            genome_index_name, vcf_path['path'], params, output_dir)

        save_variation_params = {
            'workspace_name': params['workspace_name'],
            'genome_or_assembly_ref': params['genome_ref'],
            'sample_set_ref': sample_set_ref,
            'sample_attribute_name': 'sample_attr',
            'vcf_staging_file_path': annotated_vcf_path,
            'variation_object_name': params['output_object_name']
        }
        saved_variation_ref = self.VU.save_variation_from_vcf(
            save_variation_params)['variation_ref']

        # Describes the newly saved object (not the input object).
        # TODO: hand this list to the report once create_html_report takes it.
        created_objects = [{
            "ref": saved_variation_ref,
            "description": "Variation Object"
        }]

        # TODO: Add parameters for snpeff in parameters.
        # Parse the snpeff parameters from params and build snpeff command.
        # TODO: We are hardcoding this for now.
        logging.info("snpEff working directory %s contains: %s",
                     output_dir, os.listdir(output_dir + "/snp_eff"))

        snp_eff_resultdir = os.path.join(output_dir, "snp_eff_results")
        os.mkdir(snp_eff_resultdir)
        shutil.copyfile(os.path.join(output_dir, "snp_eff/snpEff_genes.txt"),
                        os.path.join(snp_eff_resultdir, "snpEff_genes.txt"))

        logging.info("creating html report ...")
        output = self.HU.create_html_report(self.callback_url,
                                            snp_eff_resultdir, workspace)
        #END annotate_variants

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method annotate_variants return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def status(self, ctx):
        """Return a one-element list with the module state and version info."""
        #BEGIN_STATUS
        returnVal = {'state': "OK",
                     'message': "",
                     'version': self.VERSION,
                     'git_url': self.GIT_URL,
                     'git_commit_hash': self.GIT_COMMIT_HASH}
        #END_STATUS
        return [returnVal]
class VariationAnalyzer:
    '''
    Module Name:
    VariationAnalyzer

    Module Description:
    A KBase module: VariationAnalyzer
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.1"
    GIT_URL = ""
    GIT_COMMIT_HASH = ""

    #BEGIN_CLASS_HEADER
    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.shared_folder = config['scratch']
        logging.basicConfig(format='%(created)s %(levelname)s: %(message)s',
                            level=logging.INFO)
        self.dfu = DownloadFastqUtils()
        self.su = SnippyUtils()
        self.vu = VariationUtil(self.callback_url)
        #END_CONSTRUCTOR
        pass

    def run_VariationAnalyzer(self, ctx, params):
        """
        Stage the reads and the reference, run the snippy command built by
        SnippyUtils, save the resulting snps.vcf through VariationUtil and
        create a report.

        :param params: instance of type "InputParams" -> structure: parameter
           "obj_name" of String, parameter "workspace_name" of String,
           parameter "fastq_ref" of String, parameter "map_qual" of Long,
           parameter "base_qual" of Long, parameter "min_cov" of Long,
           parameter "min_qual" of Long
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_VariationAnalyzer
        self.su.validate_params(params)

        logging.info("Downloading Fastq File")
        staged_reads = self.dfu._stage_input_file(params['fastq_ref'],
                                                  "paired_end")

        logging.info("Downloading assembly file")
        reference = self.dfu.download_genome(params['genome_or_assembly_ref'])

        forward_reads = staged_reads['files']['fwd']
        self.su.deinterleave(forward_reads, self.shared_folder)

        # hardcoded to match with attribute mapping file
        sample_name = "snippy_output"
        snippy_output = "/".join((self.shared_folder, sample_name))

        snippy_cmd = self.su.build_snippy_command(reference['path'],
                                                  snippy_output,
                                                  self.shared_folder)
        self.su.run_snippy_command(snippy_cmd)

        # The caller's params dict is extended in place and passed on whole.
        params['vcf_staging_file_path'] = "/".join(
            (self.shared_folder, sample_name, "snps.vcf"))
        self.vu.save_variation_from_vcf(params)

        report_request = {
            'report': {
                'objects_created': [],
                'text_message': params['fastq_ref']
            },
            'workspace_name': params['workspace_name']
        }
        report_info = KBaseReport(self.callback_url).create(report_request)
        output = dict(report_name=report_info['name'],
                      report_ref=report_info['ref'])
        #END run_VariationAnalyzer

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_VariationAnalyzer return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def status(self, ctx):
        """Return a one-element list with the module state and version info."""
        #BEGIN_STATUS
        returnVal = dict(state="OK",
                         message="",
                         version=self.VERSION,
                         git_url=self.GIT_URL,
                         git_commit_hash=self.GIT_COMMIT_HASH)
        #END_STATUS
        return [returnVal]
class kb_ReadSim:
    '''
    Module Name:
    kb_ReadSim

    Module Description:
    A KBase module: kb_ReadSim
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.1"
    GIT_URL = "https://github.com/kbasecollaborations/kb_ReadSim.git"
    GIT_COMMIT_HASH = "c9c0185e34d25be57cc6e1c901d8801fbc0f4784"

    #BEGIN_CLASS_HEADER
    def _resolve_assembly_ref(self, genome_or_assembly_ref):
        """
        Return the assembly ref behind a Genome or Assembly object ref.

        :raises ValueError: if the object is neither a KBaseGenomes.Genome
            nor a KBaseGenomeAnnotations.Assembly
        """
        obj_type = self.wsc.get_object_info3(
            {'objects': [{'ref': genome_or_assembly_ref}]})['infos'][0][2]
        if 'KBaseGenomes.Genome' in obj_type:
            subset = self.wsc.get_object_subset([{
                'included': ['/assembly_ref'],
                'ref': genome_or_assembly_ref
            }])
            return subset[0]['data']['assembly_ref']
        if 'KBaseGenomeAnnotations.Assembly' in obj_type:
            return genome_or_assembly_ref
        raise ValueError(obj_type +
                         ' is not the right input for this method. ' +
                         'Valid input include KBaseGenomes.Genome or ' +
                         'KBaseGenomeAnnotations.Assembly ')

    def _fetch_sample_set_ref(self, variation_ref):
        """Return the sample_set_ref stored on a variation object."""
        return self.ws.get_objects2({
            'objects': [{
                "ref": variation_ref,
                'included': ['/sample_set_ref']
            }]
        })['data'][0]['data']['sample_set_ref']

    def _fetch_variation_data(self, variation_ref):
        """Return the full data dict of a variation object."""
        return self.ws.get_objects2(
            {'objects': [{'ref': variation_ref}]})['data'][0]['data']

    @staticmethod
    def _record_reference(variation_data, assembly_refs, genome_refs):
        """
        Add the reference of one variation object to the matching set:
        'assembly_ref' goes into assembly_refs when present, otherwise
        'genome_ref' goes into genome_refs; objects with neither key leave
        both sets untouched.
        """
        if 'assembly_ref' in variation_data:
            assembly_refs.add(variation_data['assembly_ref'])
        elif 'genome_ref' in variation_data:
            genome_refs.add(variation_data['genome_ref'])

    def _download_and_index(self, variation_ref, vcf_file):
        """Download a variation object as VCF to vcf_file and index it."""
        downloaded_path = self.du.download_variations(variation_ref, vcf_file)
        os.rename(downloaded_path, vcf_file)
        self.eu.index_vcf(vcf_file)
    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.shared_folder = config['scratch']
        self.du = DownloadUtils(self.callback_url)
        self.su = SimUtils()
        self.ru = ReadsUtils(self.callback_url)
        self.vu = VariationUtil(self.callback_url)
        self.eu = VcfEvalUtils()
        self.hu = htmlreportutils()
        self.ws_url = config['workspace-url']
        self.wsc = Workspace(self.ws_url)
        logging.basicConfig(format='%(created)s %(levelname)s: %(message)s',
                            level=logging.INFO)
        #END_CONSTRUCTOR
        pass

    def run_kb_ReadSim(self, ctx, params):
        """
        Simulate paired-end reads from a reference, upload them as a reads
        object, save the simulated variants as a variation object and create
        a report.

        :param params: instance of type "Inparams" -> structure: parameter
           "workspace_name" of String, parameter "input_sample_set" of
           String, parameter "strain_info" of String, parameter
           "assembly_or_genome_ref" of String, parameter "base_error_rate" of
           String, parameter "outer_distance" of String, parameter
           "standard_deviation" of String, parameter "num_read_pairs" of
           String, parameter "len_first_read" of String, parameter
           "len_second_read" of String, parameter "mutation_rate" of String,
           parameter "frac_indels" of String, parameter
           "variation_object_name" of String, parameter "output_read_object"
           of String
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_kb_ReadSim
        output_dir = self.shared_folder
        logging.info("run_kb_ReadSim params: %s", params)
        self.su.validate_simreads_params(params)

        assembly_ref = self._resolve_assembly_ref(
            params['assembly_or_genome_ref'])
        self.du.download_genome(assembly_ref, output_dir)

        # NOTE(review): the fixed file names below are presumably what
        # DownloadUtils / SimUtils write and read - confirm against them.
        ref_genome = os.path.join(self.shared_folder, "ref_genome.fa")
        output_fwd_paired_file_path = os.path.join(self.shared_folder,
                                                   "raed1.fq")
        output_rev_paired_file_path = os.path.join(self.shared_folder,
                                                   "raed2.fq")

        self.eu.check_path_exists(ref_genome)
        self.su.simreads(ref_genome, output_fwd_paired_file_path,
                         output_rev_paired_file_path, params)
        self.eu.check_path_exists(output_fwd_paired_file_path)
        self.eu.check_path_exists(output_rev_paired_file_path)

        self.ru.upload_reads({
            'wsname': params['workspace_name'],
            'name': params['output_read_object'],
            'sequencing_tech': 'illumina',
            'fwd_file': output_fwd_paired_file_path,
            'rev_file': output_rev_paired_file_path
        })

        logfile = os.path.join(self.shared_folder, "variant.txt")
        self.eu.check_path_exists(logfile)
        vcf_file = self.su.format_vcf(logfile)
        self.eu.check_path_exists(vcf_file)

        save_variation_params = {
            'workspace_name': params['workspace_name'],
            'genome_or_assembly_ref': params['assembly_or_genome_ref'],
            'sample_set_ref': params['input_sample_set'],
            'sample_attribute_name': 'sample_attr',
            'vcf_staging_file_path': vcf_file,
            'variation_object_name': params['variation_object_name']
        }
        self.vu.save_variation_from_vcf(save_variation_params)

        report = KBaseReport(self.callback_url)
        report_info = report.create({
            'report': {
                'objects_created': [],
                'text_message': 'Success'
            },
            'workspace_name': params['workspace_name']
        })
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref'],
        }
        #END run_kb_ReadSim

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_kb_ReadSim return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def run_eval_variantcalling(self, ctx, params):
        """
        Compare two variation objects that share a sample set and a
        reference, and build an HTML report from the comparison.

        :param params: instance of type "Evalparams" -> structure: parameter
           "workspace_name" of String, parameter "sim_varobject_name" of
           String, parameter "calling_varobject_name" of String, parameter
           "output_var_object" of String
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_eval_variantcalling
        # NOTE(review): the two inputs are read from params['varobject_ref1']
        # and params['varobject_ref2'], which differ from the names in the
        # spec text above - confirm.
        logging.info("run_eval_variantcalling params: %s", params)
        self.eu.validate_eval_params(params)

        report_dir = os.path.join(self.shared_folder, str(uuid.uuid4()))
        os.mkdir(report_dir)

        self.ws = Workspace(url=self.ws_url, token=ctx['token'])

        var_object_ref1 = params['varobject_ref1']
        var_object_ref2 = params['varobject_ref2']

        sampleset_ref1 = self._fetch_sample_set_ref(var_object_ref1)
        sampleset_ref2 = self._fetch_sample_set_ref(var_object_ref2)
        if sampleset_ref1 != sampleset_ref2:
            raise Exception(
                "Variation objects are from different sample set\n")

        assembly_ref_set = set()
        genomeset_ref_set = set()
        for variation_ref in (var_object_ref1, var_object_ref2):
            self._record_reference(self._fetch_variation_data(variation_ref),
                                   assembly_ref_set, genomeset_ref_set)

        if not genomeset_ref_set and len(assembly_ref_set) != 1:
            raise Exception(
                "variation objects are from different assembly refs")
        elif not assembly_ref_set and len(genomeset_ref_set) != 1:
            raise Exception("variation objects are from different genome refs")

        simvarfile = os.path.join(report_dir, "simvarinat.vcf.gz")
        self._download_and_index(var_object_ref1, simvarfile)

        callingvarfile = os.path.join(report_dir, "callingvarinat.vcf.gz")
        self._download_and_index(var_object_ref2, callingvarfile)

        eval_results = self.eu.variant_evalation(simvarfile, callingvarfile,
                                                 report_dir)

        unique_vcf1 = eval_results['unique1']
        self.eu.check_path_exists(unique_vcf1)

        unique_vcf2 = eval_results['unique2']
        self.eu.check_path_exists(unique_vcf2)

        common_vcf = eval_results['common']
        self.eu.check_path_exists(common_vcf)

        image_path = self.eu.plot_venn_diagram(report_dir, unique_vcf1,
                                               unique_vcf2, common_vcf)
        self.eu.check_path_exists(image_path)

        # TODO: saving the unique1 / unique2 / common VCFs as variation
        # objects (named params['output_variant_object'] plus a suffix) is
        # currently disabled; only the HTML report is produced.

        workspace = params['workspace_name']
        output = self.hu.create_html_report(self.callback_url, report_dir,
                                            workspace)
        #END run_eval_variantcalling

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_eval_variantcalling return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def status(self, ctx):
        """Return a one-element list with the module state and version info."""
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]
class kb_GATK:
    '''
    Module Name:
    kb_GATK

    Module Description:
    A KBase module: kb_GATK
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.1"
    GIT_URL = "https://github.com/kbasecollaborations/kb_GATK.git"
    GIT_COMMIT_HASH = "5e6e4bdca9a7749bba0abab081736c56007212ed"

    #BEGIN_CLASS_HEADER
    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.shared_folder = config['scratch']
        self.ws_url = config['workspace-url']
        self.wsc = Workspace(self.ws_url)
        self.gu = GATKUtils()
        logging.basicConfig(format='%(created)s %(levelname)s: %(message)s',
                            level=logging.INFO)
        self.vu = VariationUtil(self.callback_url)
        self.du = DownloadAlignmentUtils(self.callback_url)
        #END_CONSTRUCTOR
        pass

    def run_kb_GATK(self, ctx, params):
        """
        Download a read alignment and its reference, run the GATKUtils
        variant-calling steps in sequence, save the final filtered SNP VCF
        as a variation object and create a report.

        :param params: instance of mapping from String to unspecified object
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_kb_GATK
        alignment_ref = params['alignment_ref']
        alignment_out = self.du.downloadreadalignment(alignment_ref, params,
                                                      self.callback_url)
        sam_file = os.path.join(alignment_out['destination_dir'],
                                "reads_alignment.sam")

        # Todo: Reading sample set and sample strains information.
        # Todo: an explicit "-filter-name"/"-filter" argument list built from
        # params['snp_filter'] (QD, FS, MQ, SOR, MQRankSum, ReadPosRankSum)
        # was drafted here and is currently disabled.

        print(params)

        strain_info = params['strain_info']

        output_dir = os.path.join(self.shared_folder, str(uuid.uuid4()))
        os.mkdir(output_dir)

        reference_ref = params['assembly_or_genome_ref']
        info_query = {'objects': [{'ref': reference_ref}]}
        obj_type = self.wsc.get_object_info3(info_query)['infos'][0][2]

        is_genome = 'KBaseGenomes.Genome' in obj_type
        if not is_genome and 'KBaseGenomeAnnotations.Assembly' not in obj_type:
            raise ValueError(obj_type +
                             ' is not the right input for this method. ' +
                             'Valid input include KBaseGenomes.Genome or ' +
                             'KBaseGenomeAnnotations.Assembly ')
        if is_genome:
            # A Genome only points at its assembly; look the ref up.
            subset = self.wsc.get_object_subset([{
                'included': ['/assembly_ref'],
                'ref': reference_ref
            }])
            assembly_ref = subset[0]['data']['assembly_ref']
        else:
            assembly_ref = reference_ref

        downloaded = self.du.download_genome(assembly_ref, output_dir)
        assembly_file = downloaded['path']

        #Todo: check time for building index file or donwload from cache.
        #Todo: To discuss about cache_id to be used.
        #Todo: In case of copying genome, find the way of finding original genome (ref id) for getting original cache id.

        gatk = self.gu

        # Reference preparation.
        gatk.build_genome(assembly_file)
        gatk.index_assembly(assembly_file)
        gatk.generate_sequence_dictionary(assembly_file)

        # Alignment processing.
        gatk.duplicate_marking(output_dir, sam_file)
        #self.gu.sort_bam_index(output_dir)
        gatk.collect_alignment_and_insert_size_metrics(assembly_file,
                                                       output_dir)
        #self.gu.analyze_covariates(output_dir)

        #Todo: avoid writing intermediate fies to save space and time I/O.
        # First calling / filtering pass.
        gatk.variant_calling(assembly_file, output_dir)
        gatk.extract_variants(assembly_file, output_dir)
        gatk.filter_SNPs(assembly_file, "filtered_snps.vcf", output_dir,
                         params)
        gatk.filter_Indels(assembly_file, "filtered_indels.vcf", output_dir,
                           params)
        gatk.exclude_filtered_variants(output_dir)

        # Two rounds of recalibration, in this exact order.
        gatk.base_quality_score_recalibration(assembly_file,
                                              "recal_data.table", output_dir)
        gatk.apply_BQSR(assembly_file, "recal_data.table", output_dir)
        gatk.base_quality_score_recalibration(assembly_file,
                                              "post_recal_data.table",
                                              output_dir)
        gatk.apply_BQSR(assembly_file, "post_recal_data.table", output_dir)

        # Final filtering pass.
        gatk.filter_SNPs(assembly_file, "filtered_snps_final.vcf", output_dir,
                         params)
        #Todo: To save indels also using VariationUtils or merge with snps and sort them with chr & pos and save using variaiotiontuils.
        #Todo: To get an example for saving structural variants(specially CNV) and compare with standard vcf output.
        gatk.filter_Indels(assembly_file, "filtered_indels_final.vcf",
                           output_dir, params)

        # TODO : a grep-based rewrite of filtered_snps_final.vcf into
        # sample.vcf was drafted here as a workaround and is disabled; remove
        # once variationUtils is fixed.

        final_snps = output_dir + "/filtered_snps_final.vcf"
        vcf_filepath = gatk.index_vcf_file(final_snps)
        reheader_vcf_file = gatk.reheader(vcf_filepath, strain_info)

        #Todo : check existence of final filtered finals snps.
        #Todo : chnage assembly_or_genome_ref to genome_or_assembly_ref
        #Todo: to derive name of sample_attribute_name from sample set ref by prefixing/suffixing. Attribute mapping should have one sample.

        save_variation_params = dict(
            workspace_name=params['workspace_name'],
            genome_or_assembly_ref=params['assembly_or_genome_ref'],
            sample_set_ref=params['input_sample_set'],
            sample_attribute_name='sample_attr',
            vcf_staging_file_path=reheader_vcf_file,
            variation_object_name=params['variation_object_name'],
        )
        self.vu.save_variation_from_vcf(save_variation_params)

        report_request = {
            'report': {
                'objects_created': [],
                'text_message': 'Success'
            },
            'workspace_name': params['workspace_name']
        }
        report_info = KBaseReport(self.callback_url).create(report_request)
        output = dict(report_name=report_info['name'],
                      report_ref=report_info['ref'])
        #END run_kb_GATK

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_kb_GATK return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def status(self, ctx):
        """Return a one-element list with the module state and version info."""
        #BEGIN_STATUS
        returnVal = dict(state="OK",
                         message="",
                         version=self.VERSION,
                         git_url=self.GIT_URL,
                         git_commit_hash=self.GIT_COMMIT_HASH)
        #END_STATUS
        return [returnVal]