示例#1
0
class VariationMerge:
    '''
    Module Name:
    VariationMerge

    Module Description:
    A KBase module: VariationMerge
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.1"
    GIT_URL = "https://github.com/kbasecollaborations/VariationMerge.git"
    GIT_COMMIT_HASH = "918495236305bcae5e2ded0be6ed18d71defd678"

    #BEGIN_CLASS_HEADER
    @staticmethod
    def _pick_shared_refs(assembly_refs, genome_refs, sampleset_refs):
        """
        Validate the references collected from all input variation objects
        and return the single reference / sample set they share.

        :param assembly_refs: set of 'assembly_ref' values seen
        :param genome_refs: set of 'genome_ref' values seen (only collected
           for objects that carry no 'assembly_ref')
        :param sampleset_refs: set of 'sample_set_ref' values seen
        :returns: tuple (genome_or_assembly_ref, sample_set_ref); the
           assembly ref is preferred when one is available
        :raises Exception: if no reference was found, or if more than one
           distinct assembly ref, sample set ref or genome ref was found
        """
        # More than one assembly is always an error, even when some inputs
        # only carry a genome ref; no reference at all is reported the same
        # way as before.
        if len(assembly_refs) > 1 or not (assembly_refs or genome_refs):
            raise Exception(
                "variation objects are from different assembly refs")
        if len(sampleset_refs) != 1:
            raise Exception(
                "variation objects are from different sample set refs")
        if len(genome_refs) > 1:
            raise Exception(
                "variation objects are from different genome set refs")

        # Fall back to the genome ref when no input carried an assembly ref.
        reference_ref = next(iter(assembly_refs or genome_refs))
        return reference_ref, next(iter(sampleset_refs))
    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        """
        Read the callback URL from the environment and the scratch folder /
        workspace URL from config, then build the helper clients.

        :param config: mapping with at least 'scratch' and 'workspace-url'
        """
        #BEGIN_CONSTRUCTOR
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.shared_folder = config['scratch']
        logging.basicConfig(format='%(created)s %(levelname)s: %(message)s',
                            level=logging.INFO)
        self.ws_url = config['workspace-url']

        self.vu = VariationUtil(self.callback_url)
        self.mu = MergeVcfUtils()
        #END_CONSTRUCTOR
        pass

    def run_VariationMerge(self, ctx, params):
        """
        Merge the VCFs of several variation objects into one new variation
        object and create a report.

        The code reads 'vcflist', 'workspace_name' and
        'variation_object_name' from params and 'token' from ctx.

        :param params: instance of type "inparams" (This example function
           accepts any number of parameters and returns results in a
           KBaseReport) -> structure: parameter "obj_name" of String,
           parameter "workspace_name" of String, parameter "vcflist" of list
           of String
        :returns: instance of type "OutResults" -> structure: parameter
           "output_obj_ref" of String, parameter "report_name" of String,
           parameter "report_ref" of String
        :raises Exception: if the variation objects do not share a single
           reference (assembly or genome) and a single sample set
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_VariationMerge

        self.ws = Workspace(url=self.ws_url, token=ctx['token'])

        logging.info("run_VariationMerge called with params: %s", params)

        variation_refs = params['vcflist']

        # Pass 1: collect reference metadata with one fetch per object, so
        # incompatible inputs are rejected before anything is downloaded.
        assembly_ref_set = set()
        genome_set_ref_set = set()
        sampleset_ref_set = set()
        for variation_ref in variation_refs:
            variation_data = self.ws.get_objects2(
                {'objects': [{
                    'ref': variation_ref
                }]})['data'][0]['data']

            if 'assembly_ref' in variation_data:
                assembly_ref_set.add(variation_data['assembly_ref'])
            elif 'genome_ref' in variation_data:
                genome_set_ref_set.add(variation_data['genome_ref'])
            sampleset_ref_set.add(variation_data['sample_set_ref'])

        reference_ref, sample_set_ref = self._pick_shared_refs(
            assembly_ref_set, genome_set_ref_set, sampleset_ref_set)

        # Pass 2: export every variation object as an indexed VCF.
        vcf_flist = []
        for i, variation_ref in enumerate(variation_refs):
            vcf_filename = "/kb/module/work/tmp/variation" + str(i) + ".vcf.gz"

            self.vu.get_variation_as_vcf({
                'variation_ref': variation_ref,
                'filename': vcf_filename
            })
            # NOTE(review): the export is picked up from this fixed location
            # rather than from the call's return value - presumably
            # get_variation_as_vcf always writes here; confirm.
            os.rename("/kb/module/work/tmp/variation.vcf.gz", vcf_filename)
            self.mu.index_vcf(vcf_filename)
            vcf_flist.append(vcf_filename)

        merged_file = os.path.join(self.shared_folder,
                                   "merged_gatk_variation_jmc2_test.vcf")
        self.mu.merge_vcf(vcf_flist, merged_file)

        save_variation_params = {
            'workspace_name': params['workspace_name'],
            'genome_or_assembly_ref': reference_ref,
            'sample_set_ref': sample_set_ref,
            'sample_attribute_name': 'sample_attr',
            'vcf_staging_file_path': merged_file,
            'variation_object_name': params['variation_object_name']
        }
        self.vu.save_variation_from_vcf(save_variation_params)

        report = KBaseReport(self.callback_url)
        report_info = report.create({
            'report': {
                'objects_created': [],
                'text_message': 'success'
            },
            'workspace_name': params['workspace_name']
        })
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref'],
        }
        #END run_VariationMerge

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_VariationMerge return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def status(self, ctx):
        """Return a one-element list with the module state and version info."""
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]
class VariationAnnotation:
    '''
    Module Name:
    VariationAnnotation

    Module Description:
    A KBase module: VariationAnnotation
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.1"
    GIT_URL = "https://github.com/man4ish/VariationAnnotation.git"
    GIT_COMMIT_HASH = "233ab11cd942b99c960f7b83aaee2b3800685bb4"

    #BEGIN_CLASS_HEADER
    def build_genome_index(self, genome_ref):
        """
        Placeholder; currently does nothing and returns None.

        Intended (per the original notes) to download gff and fasta, put
        them in the right directory and return the genome index name that
        can be used by snpeff.jar.
        """
        #TODO: READ GENOME TAXONOMY from genome_ref and
        # TODO: Get genome taxonomy/classification from user so that There
        # is no confusion.
        pass

    @staticmethod
    def _write_gff_without_exon_lines(gff_path, filtered_gff_path):
        """
        Copy gff_path to filtered_gff_path, dropping every line that
        contains the text "exon".

        In-process replacement for `grep -v "exon" <gff> > <out>`: no shell
        is involved, so unusual characters in the path cannot break or
        alter the command, and I/O errors raise instead of being ignored.

        :param gff_path: path of the GFF file to read
        :param filtered_gff_path: path of the filtered file to (over)write
        """
        # Binary mode: the filter is a plain substring test, so no text
        # decoding is needed and undecodable bytes cannot cause a failure.
        with open(gff_path, 'rb') as source, \
                open(filtered_gff_path, 'wb') as target:
            target.writelines(
                line for line in source if b"exon" not in line)
    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        """
        Read the callback URL from the environment and the scratch folder
        from config, then build the helper objects.

        :param config: mapping with at least 'scratch'; 'workspace-url' is
           read later by annotate_variants
        """
        #BEGIN_CONSTRUCTOR
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.scratch = config['scratch']
        logging.basicConfig(format='%(created)s %(levelname)s: %(message)s',
                            level=logging.INFO)
        self.VU = VariationUtil(self.callback_url)
        self.SU = SnpEffUtils()
        self.DU = DownloadUtils()
        self.HU = htmlreportutils()
        self.config = config
        #self.snpeff=<path_to_snpeff>
        #END_CONSTRUCTOR
        pass

    def annotate_variants(self, ctx, params):
        """
        This method extracts VCF from variation object,
        runs SNPEFF workflow (http://snpeff.sourceforge.net/SnpEff_manual.html)
        and annotate and predict the effects of genetic variants
        (such as amino acid changes)

        The code reads 'workspace_name', 'variation_ref', 'genome_ref' and
        'output_object_name' from params and 'token' from ctx.

        :param params: instance of type "input_params" (variation_ref:
           Reference to Variation object out_variation_name: Name by which
           the output object will be saved) -> structure: parameter
           "variation_ref" of String, parameter "out_variation_name" of String
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN annotate_variants
        self.SU.validate_params(params)

        # TODO: Need to think through how to get this from the USERS
        # because variation_ref may or may not have a genome_ref field filled in
        # our spec.json may require some work
        # There is a chance that user may provide wrong genome as input if we don't deal with this properly

        workspace = params['workspace_name']
        self.ws_url = self.config['workspace-url']
        self.ws = Workspace(self.ws_url, token=ctx['token'])

        # TODO current file name is hard coded but that need to be changed later.
        filename = "/kb/module/work/variation.vcf"
        output_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        os.mkdir(output_dir)

        shutil.copytree("/kb/module/deps/snp_eff", output_dir + "/snp_eff")

        # One fetch provides both the sample set and the assembly reference.
        variation_ref = params['variation_ref']
        variation_data = self.ws.get_objects2(
            {'objects': [{'ref': variation_ref}]})['data'][0]['data']
        sample_set_ref = variation_data['sample_set_ref']

        assembly_ref = variation_data['assembly_ref']
        assembly_path = self.DU.get_assembly(assembly_ref, output_dir)

        gff_ref = params['genome_ref']
        gff_path = self.DU.get_gff(gff_ref, output_dir)

        # Todo: It is temporary fix but need to find logical removal of exons based on coordinates.
        new_gff_path = "/kb/module/work/tmp/output.gff"
        logging.info("writing %s without exon lines to %s",
                     gff_path, new_gff_path)
        self._write_gff_without_exon_lines(gff_path, new_gff_path)

        vcf_path = self.VU.get_variation_as_vcf({
            'variation_ref': variation_ref,
            'filename': filename
        })

        genome_index_name = self.SU.build_genome(new_gff_path, assembly_path, output_dir)
        annotated_vcf_path = self.SU.annotate_variants(genome_index_name, vcf_path['path'], params, output_dir)

        save_variation_params = {
            'workspace_name': workspace,
            'genome_or_assembly_ref': params['genome_ref'],
            'sample_set_ref': sample_set_ref,
            'sample_attribute_name': 'sample_attr',
            'vcf_staging_file_path': annotated_vcf_path,
            'variation_object_name': params['output_object_name']
        }

        saved_variation_ref = self.VU.save_variation_from_vcf(save_variation_params)['variation_ref']

        # Describes the newly saved object (not the input one).
        # NOTE: not passed to create_html_report yet; kept ready for the
        # four-argument variant of that call.
        created_objects = [{
            "ref": saved_variation_ref,
            "description": "Variation Object"
        }]

        # TODO: Add parameters for snpeff in parameters
        # Parse the snpeff parameters from params and build snpeff command
        # TODO: We are hardcoding this for now

        logging.info("files in %s: %s", output_dir + "/snp_eff",
                     os.listdir(output_dir + "/snp_eff"))

        # Only the gene table is copied into the directory used for the report.
        snp_eff_resultdir = os.path.join(output_dir, "snp_eff_results")
        os.mkdir(snp_eff_resultdir)
        shutil.copyfile(os.path.join(output_dir, "snp_eff/snpEff_genes.txt"), os.path.join(snp_eff_resultdir, "snpEff_genes.txt"))

        logging.info("creating html report ...")
        output = self.HU.create_html_report(self.callback_url, snp_eff_resultdir, workspace)
        #END annotate_variants

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method annotate_variants return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def status(self, ctx):
        """Return a one-element list with the module state and version info."""
        #BEGIN_STATUS
        returnVal = {'state': "OK",
                     'message': "",
                     'version': self.VERSION,
                     'git_url': self.GIT_URL,
                     'git_commit_hash': self.GIT_COMMIT_HASH}
        #END_STATUS
        return [returnVal]
示例#3
0
class VariationAnalyzer:
    '''
    Module Name:
    VariationAnalyzer

    Module Description:
    A KBase module: VariationAnalyzer
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.1"
    GIT_URL = ""
    GIT_COMMIT_HASH = ""

    #BEGIN_CLASS_HEADER
    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        """
        Read the callback URL from the environment and the scratch folder
        from config, then build the helper objects.

        :param config: mapping with at least 'scratch'
        """
        #BEGIN_CONSTRUCTOR
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.shared_folder = config['scratch']
        logging.basicConfig(format='%(created)s %(levelname)s: %(message)s',
                            level=logging.INFO)
        self.dfu = DownloadFastqUtils()
        self.su = SnippyUtils()
        self.vu = VariationUtil(self.callback_url)
        #END_CONSTRUCTOR
        pass

    def run_VariationAnalyzer(self, ctx, params):
        """
        Download the reads and the reference, run snippy on them, save the
        resulting snps.vcf as a variation object and create a report.

        The code reads 'fastq_ref', 'genome_or_assembly_ref' and
        'workspace_name' from params. The caller's params dict is not
        modified; a copy extended with 'vcf_staging_file_path' is handed to
        save_variation_from_vcf.

        :param params: instance of type "InputParams" -> structure: parameter
           "obj_name" of String, parameter "workspace_name" of String,
           parameter "fastq_ref" of String, parameter "map_qual" of Long,
           parameter "base_qual" of Long, parameter "min_cov" of Long,
           parameter "min_qual" of Long
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_VariationAnalyzer

        self.su.validate_params(params)

        logging.info("Downloading Fastq File")
        fastq_file = self.dfu._stage_input_file(params['fastq_ref'],
                                                "paired_end")

        logging.info("Downloading assembly file")
        genome_assembly = self.dfu.download_genome(
            params['genome_or_assembly_ref'])

        self.su.deinterleave(fastq_file['files']['fwd'], self.shared_folder)

        sample_name = "snippy_output"  #hardcoded to match with attribute mapping file

        snippy_output = os.path.join(self.shared_folder, sample_name)

        cmd = self.su.build_snippy_command(genome_assembly['path'],
                                           snippy_output, self.shared_folder)

        self.su.run_snippy_command(cmd)

        # Work on a copy so the caller's params dict is left untouched.
        save_variation_params = dict(params)
        save_variation_params['vcf_staging_file_path'] = os.path.join(
            snippy_output, "snps.vcf")

        self.vu.save_variation_from_vcf(save_variation_params)

        report = KBaseReport(self.callback_url)
        report_info = report.create({
            'report': {
                'objects_created': [],
                'text_message': params['fastq_ref']
            },
            'workspace_name': params['workspace_name']
        })
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref'],
        }
        #END run_VariationAnalyzer

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_VariationAnalyzer return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def status(self, ctx):
        """Return a one-element list with the module state and version info."""
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]
示例#4
0
class kb_ReadSim:
    '''
    Module Name:
    kb_ReadSim

    Module Description:
    A KBase module: kb_ReadSim
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.1"
    GIT_URL = "https://github.com/kbasecollaborations/kb_ReadSim.git"
    GIT_COMMIT_HASH = "c9c0185e34d25be57cc6e1c901d8801fbc0f4784"

    #BEGIN_CLASS_HEADER
    @staticmethod
    def _collect_reference_refs(variation_data_list):
        """
        Split the references of several variation objects into assembly
        refs and genome refs.

        :param variation_data_list: iterable of variation object 'data'
           mappings
        :returns: tuple (assembly_refs, genome_refs) of sets; an object
           contributes its 'assembly_ref' if present, otherwise its
           'genome_ref' if present, otherwise nothing
        """
        assembly_refs = set()
        genome_refs = set()
        for variation_data in variation_data_list:
            if 'assembly_ref' in variation_data:
                assembly_refs.add(variation_data['assembly_ref'])
            elif 'genome_ref' in variation_data:
                genome_refs.add(variation_data['genome_ref'])
        return assembly_refs, genome_refs

    @staticmethod
    def _require_single_reference(assembly_refs, genome_refs):
        """
        Raise unless the collected references point at one assembly (when
        only assembly refs were found) or one genome (when only genome refs
        were found). A mix of assembly and genome refs is not rejected.

        :raises Exception: on zero or several assembly refs without genome
           refs, or on several genome refs without assembly refs
        """
        if not genome_refs and len(assembly_refs) != 1:
            raise Exception(
                "variation objects are from different assembly refs")
        if not assembly_refs and len(genome_refs) != 1:
            raise Exception("variation objects are from different genome refs")
    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        """
        Read the callback URL from the environment and the scratch folder /
        workspace URL from config, then build the helper clients.

        :param config: mapping with at least 'scratch' and 'workspace-url'
        """
        #BEGIN_CONSTRUCTOR
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.shared_folder = config['scratch']
        self.du = DownloadUtils(self.callback_url)
        self.su = SimUtils()
        self.ru = ReadsUtils(self.callback_url)
        self.vu = VariationUtil(self.callback_url)
        self.eu = VcfEvalUtils()
        self.hu = htmlreportutils()
        self.ws_url = config['workspace-url']
        self.wsc = Workspace(self.ws_url)
        logging.basicConfig(format='%(created)s %(levelname)s: %(message)s',
                            level=logging.INFO)
        #END_CONSTRUCTOR
        pass

    def run_kb_ReadSim(self, ctx, params):
        """
        Simulate paired-end reads from a genome/assembly, upload them as a
        reads object, save the simulated variants as a variation object and
        create a report.

        :param params: instance of type "Inparams" -> structure: parameter
           "workspace_name" of String, parameter "input_sample_set" of
           String, parameter "strain_info" of String, parameter
           "assembly_or_genome_ref" of String, parameter "base_error_rate" of
           String, parameter "outer_distance" of String, parameter
           "standard_deviation" of String, parameter "num_read_pairs" of
           String, parameter "len_first_read" of String, parameter
           "len_second_read" of String, parameter "mutation_rate" of String,
           parameter "frac_indels" of String, parameter
           "variation_object_name" of String, parameter "output_read_object"
           of String
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        :raises ValueError: if the input object is neither a
           KBaseGenomes.Genome nor a KBaseGenomeAnnotations.Assembly
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_kb_ReadSim
        output_dir = self.shared_folder
        logging.info("run_kb_ReadSim called with params: %s", params)
        self.su.validate_simreads_params(params)

        # Resolve the input to an assembly reference: a Genome is followed
        # to its assembly, an Assembly is used directly.
        genome_or_assembly_ref = params['assembly_or_genome_ref']
        obj_type = self.wsc.get_object_info3(
            {'objects': [{
                'ref': genome_or_assembly_ref
            }]})['infos'][0][2]
        if 'KBaseGenomes.Genome' in obj_type:
            genome_ref = genome_or_assembly_ref
            subset = self.wsc.get_object_subset([{
                'included': ['/assembly_ref'],
                'ref': genome_ref
            }])
            assembly_ref = subset[0]['data']['assembly_ref']
        elif 'KBaseGenomeAnnotations.Assembly' in obj_type:
            assembly_ref = genome_or_assembly_ref
        else:
            raise ValueError(obj_type +
                             ' is not the right input for this method. ' +
                             'Valid input include KBaseGenomes.Genome or ' +
                             'KBaseGenomeAnnotations.Assembly ')

        self.du.download_genome(assembly_ref, output_dir)

        ref_genome = os.path.join(self.shared_folder, "ref_genome.fa")
        output_fwd_paired_file_path = os.path.join(self.shared_folder,
                                                   "raed1.fq")
        output_rev_paired_file_path = os.path.join(self.shared_folder,
                                                   "raed2.fq")

        self.eu.check_path_exists(ref_genome)

        self.su.simreads(ref_genome, output_fwd_paired_file_path,
                         output_rev_paired_file_path, params)
        self.eu.check_path_exists(output_fwd_paired_file_path)
        self.eu.check_path_exists(output_rev_paired_file_path)

        self.ru.upload_reads({
            'wsname': params['workspace_name'],
            'name': params['output_read_object'],
            'sequencing_tech': 'illumina',
            'fwd_file': output_fwd_paired_file_path,
            'rev_file': output_rev_paired_file_path
        })

        # NOTE(review): variant.txt is expected in the scratch folder at
        # this point - presumably written by the simulation step; confirm.
        logfile = os.path.join(self.shared_folder, "variant.txt")
        self.eu.check_path_exists(logfile)

        vcf_file = self.su.format_vcf(logfile)
        self.eu.check_path_exists(vcf_file)

        save_variation_params = {
            'workspace_name': params['workspace_name'],
            'genome_or_assembly_ref': params['assembly_or_genome_ref'],
            'sample_set_ref': params['input_sample_set'],
            'sample_attribute_name': 'sample_attr',
            'vcf_staging_file_path': vcf_file,
            'variation_object_name': params['variation_object_name']
        }
        self.vu.save_variation_from_vcf(save_variation_params)

        report = KBaseReport(self.callback_url)
        report_info = report.create({
            'report': {
                'objects_created': [],
                'text_message': 'Success'
            },
            'workspace_name': params['workspace_name']
        })
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref'],
        }
        #END run_kb_ReadSim

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_kb_ReadSim return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def run_eval_variantcalling(self, ctx, params):
        """
        Compare two variation objects (read from 'varobject_ref1' and
        'varobject_ref2' in params) and create an HTML report from the
        evaluation directory.

        :param params: instance of type "Evalparams" -> structure: parameter
           "workspace_name" of String, parameter "sim_varobject_name" of
           String, parameter "calling_varobject_name" of String, parameter
           "output_var_object" of String
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        :raises Exception: if the two objects have different sample sets or
           do not share a single assembly / genome reference
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_eval_variantcalling

        logging.info("run_eval_variantcalling called with params: %s", params)
        self.eu.validate_eval_params(params)

        report_dir = os.path.join(self.shared_folder, str(uuid.uuid4()))
        os.mkdir(report_dir)

        self.ws = Workspace(url=self.ws_url, token=ctx['token'])

        # Each object is fetched once; sample set and reference are both
        # read from that single result.
        var_object_ref1 = params['varobject_ref1']
        variation_data1 = self.ws.get_objects2(
            {'objects': [{
                'ref': var_object_ref1
            }]})['data'][0]['data']

        var_object_ref2 = params['varobject_ref2']
        variation_data2 = self.ws.get_objects2(
            {'objects': [{
                'ref': var_object_ref2
            }]})['data'][0]['data']

        if variation_data1['sample_set_ref'] != variation_data2['sample_set_ref']:
            raise Exception(
                "Variation objects are from different sample set\n")

        assembly_ref_set, genomeset_ref_set = self._collect_reference_refs(
            [variation_data1, variation_data2])
        self._require_single_reference(assembly_ref_set, genomeset_ref_set)

        simvarfile = os.path.join(report_dir, "simvarinat.vcf.gz")
        simvarpath = self.du.download_variations(var_object_ref1, simvarfile)

        os.rename(simvarpath, simvarfile)
        self.eu.index_vcf(simvarfile)

        callingvarfile = os.path.join(report_dir, "callingvarinat.vcf.gz")
        callingvarpath = self.du.download_variations(var_object_ref2,
                                                     callingvarfile)

        os.rename(callingvarpath, callingvarfile)
        self.eu.index_vcf(callingvarfile)

        eval_results = self.eu.variant_evalation(simvarfile, callingvarfile,
                                                 report_dir)

        unique_vcf1 = eval_results['unique1']
        self.eu.check_path_exists(unique_vcf1)

        unique_vcf2 = eval_results['unique2']
        self.eu.check_path_exists(unique_vcf2)

        common_vcf = eval_results['common']
        self.eu.check_path_exists(common_vcf)

        image_path = self.eu.plot_venn_diagram(report_dir, unique_vcf1,
                                               unique_vcf2, common_vcf)
        self.eu.check_path_exists(image_path)

        # NOTE: the unique/common VCFs are not saved back as variation
        # objects; only the HTML report below is produced.

        workspace = params['workspace_name']
        output = self.hu.create_html_report(self.callback_url, report_dir,
                                            workspace)
        #END run_eval_variantcalling

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_eval_variantcalling return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def status(self, ctx):
        """Return a one-element list with the module state and version info."""
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]
示例#5
0
class kb_GATK:
    '''
    Module Name:
    kb_GATK

    Module Description:
    A KBase module: kb_GATK
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.1"
    GIT_URL = "https://github.com/kbasecollaborations/kb_GATK.git"
    GIT_COMMIT_HASH = "5e6e4bdca9a7749bba0abab081736c56007212ed"

    #BEGIN_CLASS_HEADER
    # Keys that run_kb_GATK reads directly from ``params``. Keys consumed
    # only by the helper objects (self.gu / self.du) are not listed here.
    _REQUIRED_PARAMS = (
        'alignment_ref',
        'strain_info',
        'assembly_or_genome_ref',
        'workspace_name',
        'input_sample_set',
        'variation_object_name',
    )

    def _check_required_params(self, params):
        """
        Verify that every key in _REQUIRED_PARAMS is present in params.

        :param params: the mapping passed to run_kb_GATK.
        :raises KeyError: if one or more required keys are absent; the
           message names all of them.
        """
        missing = [key for key in self._REQUIRED_PARAMS if key not in params]
        if missing:
            raise KeyError('run_kb_GATK is missing required parameter(s): ' +
                           ', '.join(missing))

    def _resolve_assembly_ref(self, genome_or_assembly_ref):
        """
        Map a Genome or Assembly object reference to an assembly reference.

        :param genome_or_assembly_ref: workspace reference whose type string
           contains either 'KBaseGenomes.Genome' or
           'KBaseGenomeAnnotations.Assembly'.
        :returns: the genome's 'assembly_ref' field for a Genome, or the
           input reference itself for an Assembly.
        :raises ValueError: if the object type matches neither.
        """
        obj_type = self.wsc.get_object_info3(
            {'objects': [{'ref': genome_or_assembly_ref}]})['infos'][0][2]
        if 'KBaseGenomes.Genome' in obj_type:
            # Fetch only the assembly_ref field instead of the whole genome.
            subset = self.wsc.get_object_subset([{
                'included': ['/assembly_ref'],
                'ref': genome_or_assembly_ref
            }])
            return subset[0]['data']['assembly_ref']
        if 'KBaseGenomeAnnotations.Assembly' in obj_type:
            return genome_or_assembly_ref
        raise ValueError(obj_type +
                         ' is not the right input for this method. ' +
                         'Valid input include KBaseGenomes.Genome or ' +
                         'KBaseGenomeAnnotations.Assembly ')
    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        """
        Set up the service clients and utility helpers used by the module.

        :param config: mapping read from the deployment config; the keys
           'scratch' and 'workspace-url' are required. The callback URL is
           taken from the SDK_CALLBACK_URL environment variable.
        """
        #BEGIN_CONSTRUCTOR
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.shared_folder = config['scratch']
        self.ws_url = config['workspace-url']
        self.wsc = Workspace(self.ws_url)
        self.gu = GATKUtils()
        logging.basicConfig(format='%(created)s %(levelname)s: %(message)s',
                            level=logging.INFO)
        self.vu = VariationUtil(self.callback_url)
        self.du = DownloadAlignmentUtils(self.callback_url)
        #END_CONSTRUCTOR
        pass

    def run_kb_GATK(self, ctx, params):
        """
        Run the variant-calling workflow on a reads alignment and save the
        final filtered SNP VCF as a variation object.

        :param ctx: the call context object; not used by this method.
        :param params: instance of mapping from String to unspecified object.
           This method reads the keys "alignment_ref", "strain_info",
           "assembly_or_genome_ref", "workspace_name", "input_sample_set"
           and "variation_object_name"; the whole mapping is also handed to
           the alignment downloader and to the SNP/indel filter steps.
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        :raises KeyError: if any of the keys listed above is missing.
        :raises ValueError: if "assembly_or_genome_ref" points at an object
           that is neither a Genome nor an Assembly.
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_kb_GATK
        # Fail before any download or pipeline step runs: several of these
        # keys are otherwise first read only after all the work is done.
        self._check_required_params(params)
        logging.info("run_kb_GATK params: %s", params)

        source_ref = params['alignment_ref']
        alignment_out = self.du.downloadreadalignment(source_ref, params,
                                                      self.callback_url)
        sam_file = os.path.join(alignment_out['destination_dir'],
                                "reads_alignment.sam")
        # TODO: read sample set and sample strain information.

        strain_info = params['strain_info']
        # Each run works in its own uniquely named directory under scratch.
        output_dir = os.path.join(self.shared_folder, str(uuid.uuid4()))
        os.mkdir(output_dir)

        genome_or_assembly_ref = params['assembly_or_genome_ref']
        assembly_ref = self._resolve_assembly_ref(genome_or_assembly_ref)
        assembly_file = self.du.download_genome(assembly_ref,
                                                output_dir)['path']

        # TODO: check time for building index file or download from cache.
        # TODO: discuss which cache_id to use.
        # TODO: when a genome was copied, find the original genome (ref id)
        #       so the original cache id can be looked up.

        self.gu.build_genome(assembly_file)
        self.gu.index_assembly(assembly_file)
        self.gu.generate_sequence_dictionary(assembly_file)
        self.gu.duplicate_marking(output_dir, sam_file)
        #self.gu.sort_bam_index(output_dir)
        self.gu.collect_alignment_and_insert_size_metrics(
            assembly_file, output_dir)
        #self.gu.analyze_covariates(output_dir)

        # TODO: avoid writing intermediate files to save space and I/O time.
        self.gu.variant_calling(assembly_file, output_dir)
        self.gu.extract_variants(assembly_file, output_dir)
        self.gu.filter_SNPs(assembly_file, "filtered_snps.vcf", output_dir,
                            params)
        self.gu.filter_Indels(assembly_file, "filtered_indels.vcf", output_dir,
                              params)
        self.gu.exclude_filtered_variants(output_dir)

        # Recalibration and BQSR are each invoked twice, first with
        # "recal_data.table" and then with "post_recal_data.table".
        self.gu.base_quality_score_recalibration(assembly_file,
                                                 "recal_data.table",
                                                 output_dir)
        self.gu.apply_BQSR(assembly_file, "recal_data.table", output_dir)
        self.gu.base_quality_score_recalibration(assembly_file,
                                                 "post_recal_data.table",
                                                 output_dir)
        self.gu.apply_BQSR(assembly_file, "post_recal_data.table", output_dir)
        self.gu.filter_SNPs(assembly_file, "filtered_snps_final.vcf",
                            output_dir, params)

        # TODO: also save indels using VariationUtil, or merge them with the
        #       SNPs, sort by chr & pos and save using VariationUtil.
        # TODO: get an example for saving structural variants (especially
        #       CNV) and compare with standard VCF output.

        self.gu.filter_Indels(assembly_file, "filtered_indels_final.vcf",
                              output_dir, params)

        vcf_filepath = self.gu.index_vcf_file(
            os.path.join(output_dir, "filtered_snps_final.vcf"))
        reheader_vcf_file = self.gu.reheader(vcf_filepath, strain_info)
        # TODO: check existence of the final filtered SNPs file.
        # TODO: change assembly_or_genome_ref to genome_or_assembly_ref.
        # TODO: derive sample_attribute_name from the sample set ref by
        #       prefixing/suffixing. Attribute mapping should have one sample.

        save_variation_params = {
            'workspace_name': params['workspace_name'],
            'genome_or_assembly_ref': genome_or_assembly_ref,
            'sample_set_ref': params['input_sample_set'],
            'sample_attribute_name': 'sample_attr',
            'vcf_staging_file_path': reheader_vcf_file,
            'variation_object_name': params['variation_object_name']
        }

        self.vu.save_variation_from_vcf(save_variation_params)

        report = KBaseReport(self.callback_url)
        report_info = report.create({
            'report': {
                'objects_created': [],
                'text_message': 'Success'
            },
            'workspace_name': params['workspace_name']
        })
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref'],
        }
        #END run_kb_GATK

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_kb_GATK return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def status(self, ctx):
        """
        Report the module's health and build metadata.

        :param ctx: the call context object; not used by this method.
        :returns: a one-element list holding a dict with the keys "state",
           "message", "version", "git_url" and "git_commit_hash".
        """
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]