Пример #1
0
    def ligate_shapeitchunks(self,
                             vcf_f,
                             scaffolded_samples,
                             chunk_str,
                             output_prefix,
                             verbose=False):
        '''
        Ligate the haplotype chunks produced by SHAPEIT into a single set of
        haplotypes by running ligateHAPLOTYPES
        (see https://mathgen.stats.ox.ac.uk/genetics_software/shapeit/shapeit.html#haplegsample)

        Parameters
        ----------
        vcf_f : filename
                VCF file with the genotype likelihoods (passed to --vcf)
        scaffolded_samples : filename
                             File with the list of samples (separated by '\\n')
                             that have been scaffolded (passed to --scaffold)
        chunk_str : str
                    Space-separated paths to the per-chunk files generated by
                    SHAPEIT (passed to --chunks),
                    i.e. 's2.chunk1.hap.gz s2.chunk2.hap.gz s2.chunk3.hap.gz'
        output_prefix : str
                        Prefix for the 2 output files. The suffixes
                        '.ligated.haps.gz' and '.ligated.haps.sample' are
                        appended to it by this method
        verbose : bool, optional
                  If True, print the command line before running it.
                  Default=False

        Returns
        -------
        dict
            A dict with the path to the 2 output files, under the keys
            'hap_gz' (*.ligated.haps.gz) and 'hap_sample' (*.ligated.haps.sample)

        Raises
        ------
        Exception
            If self.ligateHAPLOTYPES_folder is None
        '''

        if self.ligateHAPLOTYPES_folder is None:
            raise Exception("ligateHAPLOTYPES_folder must be defined")

        # Both output paths are derived once and reused for the command line
        # and for the returned dict
        haps_gz = '{0}.ligated.haps.gz'.format(output_prefix)
        haps_sample = '{0}.ligated.haps.sample'.format(output_prefix)

        Arg = namedtuple('Argument', 'option value')

        # --output receives the 2 file names as a single space-separated value
        cmd_args = [
            Arg('--vcf', vcf_f),
            Arg('--scaffold', scaffolded_samples),
            Arg('--chunks', chunk_str),
            Arg('--output', '{0} {1}'.format(haps_gz, haps_sample))
        ]

        runner = RunProgram(path=self.ligateHAPLOTYPES_folder,
                            program='ligateHAPLOTYPES',
                            args=cmd_args)

        if verbose is True:
            print("Command line is: {0}".format(runner.cmd_line))

        runner.run_checkoutput()

        return {'hap_gz': haps_gz, 'hap_sample': haps_sample}
Пример #2
0
    def drop_info(self, outfile, verbose=False):
        '''
        Strip the INFO annotation from self.vcf by running
        'bcftools annotate --remove INFO' and write a compressed VCF.

        Parameters
        ----------
        outfile : filename
                  File where the output VCF will be written (passed to -o)
        verbose : bool, optional
                  If True, print the command line before running it.
                  Default=False

        Returns
        -------
        filename
                 Path to the vcf.gz file without the INFO annotation
                 (the same value received in 'outfile')
        '''

        Arg = namedtuple('Argument', 'option value')

        # '-O z' asks bcftools for compressed VCF output
        option_pairs = (('-o', outfile), ('-O', 'z'))

        runner = RunProgram(path=self.bcftools_folder,
                            program='bcftools annotate --remove INFO',
                            args=[Arg(opt, val) for opt, val in option_pairs],
                            parameters=[self.vcf])

        if verbose is True:
            print("Command line is: {0}".format(runner.cmd_line))

        runner.run_checkoutput()

        return outfile
Пример #3
0
    def run_vtnormalize(self, outprefix, reference, compress=False,
                        verbose=False, outdir=None, n=False):
        '''
        Normalize self.vcf by running 'vt normalize'

        Parameters
        ----------
        outprefix : str, required
              prefix for the output file. '.norm.vcf' (plus '.gz' when
              compress is True) is appended to it
        reference : str, required
              path to Fasta file with reference (passed to -r)
        compress : boolean, optional
              If True, the vt output is piped into 'bgzip -c' and redirected
              to the .gz file. If False or None, vt writes the file itself
              through its -o option. Default=False
        verbose : bool, optional
                  if True, print the command line before running it
        outdir : str, optional
            If provided, then put output files in this folder
        n : bool, optional
            warns but does not exit when REF is inconsistent
            with reference sequence for non SNPs. Default=False

        Returns
        -------
        A string with path to normalized file

        Raises
        ------
        Exception
            If self.vt_folder is None
        '''

        if self.vt_folder is None:
            raise Exception("Provide a vt_folder containing the vt binary")

        Arg = namedtuple('Argument', 'option value')

        normalized = "{0}/{1}".format(outdir, outprefix) if outdir else outprefix
        normalized = normalized + ".norm.vcf"

        vt_args = [Arg('-r', reference)]

        vt_params = [self.vcf]
        if n is True:
            vt_params.append('-n')

        downstream = None
        if compress is True:
            # vt writes to stdout and bgzip (downstream in the pipe) creates the file
            normalized += ".gz"
            downstream = [
                RunProgram(path=self.bgzip_folder,
                           program='bgzip',
                           parameters=['-c', '>', normalized])
            ]
        elif compress is None or compress is False:
            # no pipe: vt writes the output file directly
            vt_args.append(Arg('-o', normalized))

        runner = RunProgram(path=self.vt_folder,
                            program='vt normalize',
                            args=vt_args,
                            parameters=vt_params,
                            downpipe=downstream)

        if verbose is True:
            print("Command line for running vt normalize is: {0}".format(runner.cmd_line))

        runner.run_checkoutput()

        return normalized
Пример #4
0
    def run_CollectVariantCallingMetrics(self,
                                         outprefix,
                                         truth_vcf,
                                         intervals=None,
                                         verbose=None):
        '''
        Run Picard's CollectVariantCallingMetrics on self.vcf.

        Parameters
        ----------
        outprefix : str
                    Prefix for outfiles: prefix.variant_calling_detail_metrics
                    and prefix.variant_calling_summary_metrics.
        truth_vcf : str
                    Reference VCF file (passed to Picard as DBSNP).
        intervals : str, optional
                    Target intervals to restrict analysis to (passed as TI).
        verbose : bool, optional
                  If True, print the command line before running it.

        Returns
        -------
        CollectVCallingMetrics object
            Built from the 2 metrics file paths derived from 'outprefix'

        Raises
        ------
        Exception
            If self.picard_folder is None
        '''

        if self.picard_folder is None:
            raise Exception("Provide a picard folder")

        Arg = namedtuple('Argument', 'option value')

        picard_args = [
            Arg('I', self.vcf),
            Arg('O', outprefix),
            Arg('DBSNP', truth_vcf)
        ]
        if intervals:
            picard_args.append(Arg('TI', intervals))

        picard_cmd = 'java -jar {0}/picard.jar CollectVariantCallingMetrics'.format(
            self.picard_folder)

        # Picard takes its arguments as OPTION=value, hence arg_sep
        runner = RunProgram(program=picard_cmd, args=picard_args, arg_sep="=")

        if verbose is True:
            print("Command line is: {0}".format(runner.cmd_line))

        runner.run_checkoutput()

        # wrap the 2 output files in a CollectVCallingMetrics object
        detail_metrics = outprefix + ".variant_calling_detail_metrics"
        summary_metrics = outprefix + ".variant_calling_summary_metrics"

        return CollectVCallingMetrics(vc_detail_metrics_file=detail_metrics,
                                      vc_summary_metrics_file=summary_metrics)
Пример #5
0
    def run_bcftoolsnorm(self, outprefix, reference, multiallelics=None,
                         type=None, outdir=None, verbose=False):
        '''
        Normalize self.vcf by running 'bcftools norm'

        Parameters
        ----------
        outprefix : str, required
              prefix for the output file. '.norm.vcf.gz' is appended to it
        reference : str, required
              path to Fasta file with reference (passed to -f)
        multiallelics : str, optional
              Operate on multiallelic variants and either split or merge them.
              Possible values are: 'split'/'merge'
        type: str, optional
              Required when 'multiallelics' is defined: operate on this type
              of variant. Possible values are: snps|indels|both|any
        outdir : str, optional
            If provided, then put output files in this folder
        verbose : bool, optional
                  if True, print the command line before running it

        Returns
        -------
        A string with path to normalized file

        Raises
        ------
        Exception
            If 'multiallelics' is neither None, 'split' nor 'merge', or if it
            is 'split'/'merge' and no 'type' is given
        '''

        target = "{0}/{1}".format(outdir, outprefix) if outdir else outprefix
        target = target + ".norm.vcf.gz"

        Arg = namedtuple('Argument', 'option value')

        norm_args = [Arg('-f', reference), Arg('-o', target)]

        if multiallelics is not None:
            # bcftools norm -m takes '-<type>' to split and '+<type>' to merge
            if multiallelics == "split":
                sign = "-"
            elif multiallelics == "merge":
                sign = "+"
            else:
                raise Exception("'multiallelics' value is not recognized: {0}".format(multiallelics))

            if type is None:
                raise Exception("'multiallelics' option is defined, so please provide a 'type' value")

            # the value is wrapped in single quotes on the command line
            norm_args.append(Arg('-m', "'{0}{1}'".format(sign, type)))

        runner = RunProgram(path=self.bcftools_folder,
                            program='bcftools norm',
                            args=norm_args,
                            parameters=[self.vcf, '-Oz'])

        if verbose is True:
            print("Command line for running bcftools norm is: {0}".format(runner.cmd_line))

        runner.run_checkoutput()

        return target
Пример #6
0
    def prepare_Gen_From_Beagle4(self, prefix_in, outprefix, threshold=0.995, verbose=False):
        '''
        Method that uses prepareGenFromBeagle4 in order to convert the different Beagle chunks
        generated by 'self.make_beagle_chunks' into a single concatenated output that can be used
        with SHAPEIT.
        see https://mathgen.stats.ox.ac.uk/genetics_software/shapeit/shapeit.html#gettingstarted

        Parameters
        ----------
        prefix_in : str
                    prefix used in the output of the different Beagle chunks after running method 'self.run_beagle'.
                    i.e. output.beagle4.22.*. The posteriors are looked up as '<prefix_in>*.vcf.gz'
        outprefix : str
                    Prefix used for output files. i.e. If prefix 'input.shapeit.chr22' is used. Then it will generate the following files:
                    input.shapeit.chr22.gen.gz
                    input.shapeit.chr22.gen.sample
                    input.shapeit.chr22.hap.gz
                    input.shapeit.chr22.hap.sample
        threshold : float, optional
                    Passed to prepareGenFromBeagle4 through --threshold: all genotypes with a posterior above
                    this value are directly fixed and will only need phasing in the SHAPEIT step.
                    If None, the option is not added to the command line.
                    Default: 0.995
        verbose : bool, optional
                  if true, then print the command line used for running this tool. Default=False

        Returns
        -------
        dict
            A dict with the path to the 4 output files (*.gen.* and *.hap.*) that can be used with SHAPEIT.
            Keys are 'gen_gz', 'gen_sample', 'hap_gz' and 'hap_sample'

        Raises
        ------
        Exception
            If self.prepareGenFromBeagle4_folder is None
        '''

        if self.prepareGenFromBeagle4_folder is None:
            raise Exception("Provide the folder for the prepareGenFromBeagle4 binary")

        # glob pattern matching the posteriors of every Beagle chunk
        posteriors = "{0}*.vcf.gz".format(prefix_in)

        Arg = namedtuple('Argument', 'option value')

        args = [
            Arg('--likelihoods', self.vcf),
            Arg('--posteriors', posteriors),
            Arg('--output', outprefix)
        ]

        # 'threshold' used to be accepted and documented but never reached the
        # tool; forward it so that a non-default value actually takes effect
        if threshold is not None:
            args.append(Arg('--threshold', threshold))

        runner = RunProgram(path="{0}/".format(self.prepareGenFromBeagle4_folder),
                            program='prepareGenFromBeagle4',
                            args=args)

        if verbose is True:
            print("Command line for running prepareGenFromBeagle4 is: {0}".format(runner.cmd_line))

        runner.run_checkoutput()

        outdict = {
            'gen_gz': '{0}.gen.gz'.format(outprefix),
            'gen_sample': '{0}.gen.sample'.format(outprefix),
            'hap_gz': '{0}.hap.gz'.format(outprefix),
            'hap_sample': '{0}.hap.sample'.format(outprefix)
        }

        return outdict
Пример #7
0
    def select_variants(self,
                        outprefix,
                        uncalled=None,
                        threads=1,
                        verbose=None):
        '''
        Run bcftools view to select only the variants (exclude the 0|0 genotypes)

        Parameters
        ----------
        outprefix : str
                    Prefix used for the output file. '.onlyvariants.vcf.gz'
                    is appended to it
        uncalled : {'exclude','include'}, optional.
                   Select/Exclude sites with an uncalled genotype.
                   'exclude' adds the -U flag and 'include' adds the -u flag;
                   any other value adds no flag
        threads: int, optional
                 Number of output compression threads to use in addition to main thread. Default=1
        verbose : Boolean, optional
                  If True, print the command line before running it

        Returns
        -------
        filename
                Returns the path for the VCF with the selected variants
        '''
        selected_vcf = outprefix + ".onlyvariants.vcf.gz"

        Arg = namedtuple('Argument', 'option value')

        view_args = [
            Arg('-o', selected_vcf),
            Arg('-O', 'z'),
            Arg('--threads', threads)
        ]

        # the optional uncalled-genotype flag goes before the input VCF
        if uncalled == 'exclude':
            view_params = ['-U', self.vcf]
        elif uncalled == 'include':
            view_params = ['-u', self.vcf]
        else:
            view_params = [self.vcf]

        runner = RunProgram(path=self.bcftools_folder,
                            program='bcftools view',
                            args=view_args,
                            parameters=view_params)

        if verbose is True:
            print("Command line is: {0}".format(runner.cmd_line))

        runner.run_checkoutput()

        return selected_vcf
Пример #8
0
    def make_beagle_chunks(self, window, overlap, outfile, verbose=True):
        '''
        Method to generate the chromosome chunks for Beagle
        see https://mathgen.stats.ox.ac.uk/genetics_software/shapeit/shapeit.html#gettingstarted

        Parameters
        ----------
        window : int
                 The chunk size (--window) in number of variant sites
        overlap : int
                  The overlap size (--overlap) in number of variant sites
        outfile : filename
                  Output file name. i.e. 'chunk.coordinates'
        verbose : bool, optional
                  If true, then print the command line used for running this tool. Default=True

        Returns
        -------
        filename
                Path to file with the coordinates of the chunk
                (the same value received in 'outfile')

        Raises
        ------
        Exception
            If self.makeBGLCHUNKS_folder is None
        '''

        if self.makeBGLCHUNKS_folder is None:
            raise Exception("Provide the folder for the makeBGLCHUNKS binary")

        Arg = namedtuple('Argument', 'option value')

        args = [
            Arg('--vcf', self.vcf),
            Arg('--window', window),
            Arg('--overlap', overlap),
            Arg('--output', outfile)
        ]

        runner = RunProgram(path="{0}/".format(self.makeBGLCHUNKS_folder),
                            program='makeBGLCHUNKS',
                            args=args)

        # The command line is only echoed on request: a leftover unconditional
        # print used to emit it even with verbose=False (and twice otherwise)
        if verbose is True:
            print("Command line for running makeBGLCHUNKS is: {0}".format(
                runner.cmd_line))

        runner.run_checkoutput()

        return outfile
Пример #9
0
    def run_gatk_VariantsToAllelicPrimitives(self, outprefix, reference,
                                             outdir=None, compress=None, verbose=None):
        '''
        Run GATK VariantsToAllelicPrimitives in order to decompose MNPs
         into more basic/primitive alleles

        Parameters
        ----------

        outprefix : str, required
                    prefix for outputfiles. '.aprimitives.vcf' (plus '.gz'
                    when compress is True) is appended to it
        reference : str, Required
                     Path to fasta file containing the reference
        outdir : str, optional
                   If provided, then put output files in this folder
        compress : boolean
                   bgzip compress the decomposed VCF. It must be explicitly
                   True or False: any other value (including the default
                   None) is rejected before GATK is run
        verbose : bool, optional
                  if true, then print the command line before running it

        Returns
        -------
        A string with path to decomposed file

        Raises
        ------
        Exception
            If self.gatk_folder is None or if 'compress' is not True/False
        '''

        if self.gatk_folder is None:
            raise Exception("Error. I need that the folder containing the GATK "
                            "jar file is defined!")

        # Validate before launching GATK: this check used to run only after
        # the (long) GATK job had finished, so an invalid value wasted the
        # whole run and left the uncompressed output behind
        if compress is not True and compress is not False:
            raise Exception("'compress' parameter is not valid")

        if outdir:
            outprefix = "{0}/{1}".format(outdir, outprefix)

        outprefix = outprefix + ".aprimitives.vcf"

        Arg = namedtuple('Argument', 'option value')

        args = [
            Arg('-T', 'VariantsToAllelicPrimitives'),
            Arg('-R', reference),
            Arg('-V', self.vcf),
            Arg('-o', outprefix)
        ]

        runner = RunProgram(
            program='java -jar {0}/GenomeAnalysisTK.jar'.format(self.gatk_folder),
            args=args)

        if verbose is True:
            print("Command line is: {0}".format(runner.cmd_line))

        # The return value is not used, so it is not unpacked; this keeps the
        # call independent of how many items run_popen returns
        runner.run_popen()

        if compress is False:
            return outprefix

        compressed = outprefix + ".gz"
        compressRunner = RunProgram(path=self.bgzip_folder,
                                    program='bgzip',
                                    parameters=['-c', outprefix, '>', compressed])
        compressRunner.run_checkoutput()

        # delete tmp files; the index is only removed when it is there
        os.remove(outprefix)
        index_file = outprefix + ".idx"
        if os.path.exists(index_file):
            os.remove(index_file)

        return compressed
Пример #10
0
    def reheader(self, newheader, outprefix, samplefile=None, verbose=False):
        '''
        Replace the header of self.vcf with the one in 'newheader' by running
        'bcftools reheader'

        Parameters
        ----------
        newheader : string
                    Path to the file containing the new header (passed to -h)
        outprefix : string
                    Prefix for output files. '.reheaded.vcf.gz' is appended to it
        samplefile : string, optional
                     Path to the file with the sample names that will be
                     included in the new header (passed to -s)
        verbose : bool, optional
                  If True, print the command line before running it.
                  Default=False

        Returns
        -------
        filename
                 Path to the VCF with the modified header
        '''

        Arg = namedtuple('Argument', 'option value')

        reheaded_vcf = outprefix + ".reheaded.vcf.gz"

        # -s is only added when a sample file was given
        sample_args = [] if samplefile is None else [Arg('-s', samplefile)]
        reheader_args = [Arg('-h', newheader), Arg('-o', reheaded_vcf)] + sample_args

        runner = RunProgram(path=self.bcftools_folder,
                            program='bcftools reheader',
                            args=reheader_args,
                            parameters=[self.vcf])

        if verbose is True:
            print("Command line is: {0}".format(runner.cmd_line))

        runner.run_checkoutput()

        return reheaded_vcf
Пример #11
0
    def filter(self, name, expression, verbose=None):
        '''
        Run bcftools filter on self.vcf

        Parameters
        ----------
        name : str
                 annotate FILTER column with <str> (passed to -s)
        expression : str
                   exclude sites for which expression is true. i.e. 'INFO/DP>24304 | MQ<34'.
                   It is wrapped in single quotes and passed to -e
        verbose : bool, optional
                  If True, print the command line before running it

        Returns
        -------
        filename
                Path to the filtered VCF file, which is the path in self.vcf
                followed by '.filtered.vcf.gz'
        '''

        Arg = namedtuple('Argument', 'option value')

        filtered_vcf = self.vcf + ".filtered.vcf.gz"

        # the expression is single-quoted so that it stays one command-line token
        quoted_expression = "'{0}'".format(expression)

        option_pairs = (
            ('-s', name),
            ('-e', quoted_expression),
            ('-o', filtered_vcf),
            ('-O', 'z'),
        )

        runner = RunProgram(path=self.bcftools_folder,
                            program='bcftools filter',
                            args=[Arg(opt, val) for opt, val in option_pairs],
                            parameters=[self.vcf])

        if verbose is True:
            print("Command line is: {0}".format(runner.cmd_line))

        runner.run_checkoutput()

        return filtered_vcf
Пример #12
0
    def number_variants_in_region(self, region, outprefix, verbose=None):
        '''
        Count the variants of self.vcf falling in each region of a BED file
        by running 'bedtools coverage -counts'

        Parameters
        ----------
        region : str
                 String with path to the BED file containing the regions for
                 which the number will be calculated (passed to -a).
        outprefix : str
                    Prefix for outfile. '.counts' is appended to it.
        verbose : bool, optional
                  If True, print the command line before running it.

        Returns
        -------
        filename
                File with the number of variants for each particular region.
        '''

        Arg = namedtuple('Argument', 'option value')

        counts_file = outprefix + ".counts"

        runner = RunProgram(path=self.bedtools_folder,
                            program='bedtools coverage',
                            args=[Arg('-a', region), Arg('-b', self.vcf)],
                            # the output is redirected to the counts file
                            parameters=['-counts', '>', counts_file])

        if verbose is True:
            print("Command line is: {0}".format(runner.cmd_line))

        runner.run_checkoutput()

        return counts_file
Пример #13
0
    def convert_PL2GL(self, outfile, threads=1, verbose=False):
        '''
        Function to convert PL fields into GL.
        This function makes use of Bcftools +tag2tag plugin, which is run on
        self.vcf with the plugin options '-r --pl-to-gl'

        Parameters
        ----------
        outfile : filename
                  File where the output VCF will be written (passed to -o)
        threads : int, optional
                  Number of threads to use. Default=1
        verbose : bool, optional
                  If True, print the command line before running it.
                  Default=False

        Returns
        -------
        filename
                 Path to the vcf.gz file with the PL fields converted
                 (the same value received in 'outfile')

        '''

        Arg = namedtuple('Argument', 'option value')

        tag2tag_args = [Arg('--threads', threads), Arg('-o', outfile)]

        # everything after '--' is handed over to the plugin itself
        tag2tag_params = [self.vcf, '-Oz', '--', '-r', '--pl-to-gl']

        runner = RunProgram(path=self.bcftools_folder,
                            program='bcftools +tag2tag',
                            args=tag2tag_args,
                            parameters=tag2tag_params)

        if verbose is True:
            print("Command line is: {0}".format(runner.cmd_line))

        runner.run_checkoutput()

        return outfile
Пример #14
0
    def run_variantrecalibrator(self,
                                resources,
                                mode,
                                max_gaussians=None,
                                intervals=None,
                                annotations=None,
                                tranches=None,
                                outprefix="recalibrate",
                                verbose=None,
                                log_file=None):
        '''
        Run GATK's VariantRecalibrator on a VcfFilter object

        Parameters
        ----------
        resources : filename
                    JSON file with resources to add using the -resource option, Required.
                    It must contain a 'resources' list whose items provide the keys
                    'resource', 'known', 'training', 'truth', 'prior' and 'path'
        mode : {'SNP','INDEL'}
               Recalibration mode to employ
        max_gaussians : int, optional
                        Max number of Gaussians for the positive model
                        (passed to --maxGaussians when not None)
        intervals :  chr1:1-1000, optional
                     One or more genomic intervals over which to operate
                     (passed to -L when not None)
        annotations : list, optional
                      List of annotations to be used. Default=['DP', 'QD', 'FS', 'SOR',
                      'MQ', 'MQRankSum', 'ReadPosRankSum', 'InbreedingCoeff']
        tranches : list, optional
                   Each element in the list will correspond to the  levels of truth
                   sensitivity at which to slice the data. (in percent, that is 1.0
                   for 1 percent). A string holding a Python list literal is
                   also accepted. Default=[100.0,99.9,99.0,90.0]
        outprefix : str, optional
                    out prefix used for -recalFile, -tranchesFile, -rscriptFile.
                    '_<mode>' is appended to it. Default= recalibrate
        verbose : bool, optional
                  If True, print the command line before running it
        log_file : filename, optional
                   Path to file that will used for logging the GATK stderr and stdout

        Returns
        -------
        dict
            Dictionary with location of tranches and recal files, under the
            keys 'tranches_f' and 'recal_f'

        Raises
        ------
        Exception
            If self.caller is not 'UG', if 'mode' is not valid, if
            self.gatk_folder is None, or if the run does not leave exactly
            one *.recal and one *.tranches file for the output prefix
        '''

        if annotations is None:
            annotations = [
                'DP', 'QD', 'FS', 'SOR', 'MQ', 'MQRankSum', 'ReadPosRankSum',
                'InbreedingCoeff'
            ]

        if tranches is None:
            tranches = [100.0, 99.9, 99.0, 90.0]

        if self.caller != 'UG':
            raise Exception("VCF caller %s is incompatible" % self.caller)

        if mode not in ('SNP', 'INDEL'):
            # the message now has a placeholder for the offending value: the
            # former '%' formatting had none and failed with a TypeError
            raise Exception("VariantRecalibrator mode '{0}' is not valid. "
                            "Valid values are 'SNP','INDEL'".format(mode))

        if self.gatk_folder is None:
            raise Exception("Error. I need that the folder containing the GATK "
                            "jar file is defined!")

        # prepare the prefix used for output files
        outprefix += "_%s" % mode

        Arg = namedtuple('Argument', 'option value')

        args = [
            Arg('-T', 'VariantRecalibrator'),
            Arg('-R', self.reference),
            Arg('-input', self.vcf),
            Arg('-mode', mode),
            Arg('-recalFile', "{0}.recal".format(outprefix)),
            Arg('-tranchesFile', "{0}.tranches".format(outprefix)),
            Arg('-rscriptFile', "{0}_plots.R".format(outprefix))
        ]

        # these 2 options were accepted and documented but never forwarded
        if max_gaussians is not None:
            args.append(Arg('--maxGaussians', max_gaussians))

        if intervals is not None:
            args.append(Arg('-L', intervals))

        # Read-in the different resources from the resource JSON file
        with open(resources) as data_file:
            data = json.load(data_file)

        for dic in data['resources']:
            resource_opt = ("-resource:%s,known=%s,training=%s,"
                            "truth=%s,prior=%.1f" %
                            (dic['resource'], str(dic['known']).lower(),
                             str(dic['training']).lower(),
                             str(dic['truth']).lower(), dic['prior']))
            args.append(Arg(resource_opt, dic['path']))

        # prepare the -an options
        for elt in annotations:
            args.append(Arg('-an', elt))

        # prepare the list of -tranche option
        if isinstance(tranches, str):
            tranches = ast.literal_eval(tranches)

        for elt in tranches:
            args.append(Arg('-tranche', elt))

        runner = RunProgram(
            program='java -jar {0}/GenomeAnalysisTK.jar'.format(
                self.gatk_folder),
            args=args,
            log_file=log_file)

        if verbose is True:
            print("Command line is: {0}".format(runner.cmd_line))

        # the outcome is checked through the files left on disk, so the
        # return value of run_popen is not needed
        runner.run_popen()

        recal_f = glob.glob("{0}*.recal".format(outprefix))
        tranches_f = glob.glob("{0}*.tranches".format(outprefix))

        if not recal_f:
            raise Exception(
                "No *.recal files were retrieved after running VariantRecalibrator"
            )
        elif not tranches_f:
            raise Exception(
                "No *.tranches files were retrieved after running VariantRecalibrator"
            )

        if len(recal_f) > 1:
            raise Exception("More than one *.recal file was retrieved")
        elif len(tranches_f) > 1:
            raise Exception("More than one *.tranches file was retrieved")

        return {'recal_f': recal_f[0], 'tranches_f': tranches_f[0]}
Пример #15
0
    def filter_by_variant_type(self,
                               outprefix,
                               v_type="snps",
                               compress=True,
                               biallelic=False,
                               action="select",
                               verbose=None):
        '''
        Method to filter a VCF file by variant type. For example, to extract only the SNPs

        Parameters
        ----------
        outprefix : str
                    Prefix used for the output files
        v_type : {'snps','indels','mnps','other','both'}
                 Default=snps
                 Extract/Filter (depending on the value of the 'action'
                 argument) a certain variant type. With 'both' no variant
                 type option is passed to bcftools
        compress : bool, optional
                   If True then generate a vcf.gz file, if False generate an
                   uncompressed vcf file. Default=True
        biallelic : bool, optional
                    Select only biallelic variants. Default=False
        action : {'select', 'exclude'}
                 Default=select
        verbose : bool, optional
                  If True, print the command line before running it

        Returns
        -------
        filename
                 Path to the filtered VCF. It is built as
                 <outprefix>.[<v_type>.|no<v_type>.][biallelic.]vcf[.gz]

        Raises
        ------
        Exception
            If 'v_type', 'action' or 'compress' do not have a valid value
        '''

        if v_type not in ("snps", "indels", "mnps", "other", "both"):
            raise Exception("type value is not valid. Valid values are 'snps'/"
                            "'indels'/'mnps'/'other'/'both'")
        if action not in ("select", "exclude"):
            raise Exception(
                "action value is not valid. Valid values are 'select' or 'exclude'"
            )
        if compress is None:
            raise Exception("'compress' parameter can't be None")
        if compress is not True and compress is not False:
            # any other value used to produce a command line with neither an
            # output nor an input file
            raise Exception("'compress' parameter must be True or False")

        Arg = namedtuple('Argument', 'option value')

        args = []
        params = []

        # The '.' separator is always added after the prefix. It used to be
        # added only together with the variant type, so v_type='both'
        # produced names such as 'outvcf.gz' or 'outbiallelic.vcf.gz'
        outfile = outprefix + "."

        if v_type != 'both':
            if action == "select":
                outfile += "{0}.".format(v_type)
                args.append(Arg('-v', v_type))
            else:
                outfile += "no{0}.".format(v_type)
                args.append(Arg('-V', v_type))

        if biallelic is True:
            outfile += "biallelic."
            params.extend(['-m2', '-M2'])

        if compress is True:
            outfile += "vcf.gz"
            output_type = 'z'
        else:
            outfile += "vcf"
            output_type = 'v'

        args.extend([Arg('-o', outfile), Arg('-O', output_type)])
        params.append(self.vcf)

        runner = RunProgram(path=self.bcftools_folder,
                            program='bcftools view',
                            args=args,
                            parameters=params)

        if verbose is True:
            print("Command line is: {0}".format(runner.cmd_line))

        runner.run_checkoutput()

        return outfile
Пример #16
0
    def run_bcftools(self, outprefix, E=False, p=False, annots=('DP', 'SP', 'AD'), P="ILLUMINA", F=0.002,
                     C=50, m_pileup=1, m_call=False, d=250, v=False, O='z', ploidy="GRCh38", threads=1, S=None,
                     r=None, verbose=True):
        '''
        Run BCFTools mpileup and pipe its output to BCFTools call in order to do the variant calling

        Parameters
        ----------
        outprefix : str, Required
                    Prefix for output VCF file. i.e. /path/to/file/test
        E : bool, Optional
            mpileup parameter
            Recalculate BAQ on the fly, ignore existing BQ tags. Default=False
        p : bool, Optional
            mpileup parameter
            Apply -m and -F thresholds per sample to increase sensitivity of calling.
            By default both options are applied to reads pooled from all samples.
        annots : sequence of str, Optional
                 mpileup parameter
                 Annotations used to decorate the VCF. Each element is passed to
                 mpileup with its own -a option. Default=('DP', 'SP', 'AD')
        P : str, Optional
            mpileup parameter
            Comma-delimited list of platforms (determined by @RG-PL) from which indel
            candidates are obtained. Default= ILLUMINA
        F : float, Optional
            mpileup parameter
            Minimum fraction of gapped reads. Default=0.002
        C : int, Optional
            mpileup parameter
            Coefficient for downgrading mapping quality for reads containing excessive
            mismatches. Default=50
        m_pileup : int, Optional
                   mpileup parameter
                   Minimum number gapped reads for indel candidates. Default=1
        m_call : bool, Optional
                 call parameter
                 Alternative model for multiallelic and rare-variant calling designed to
                 overcome known limitations in -c calling model. Default=False
        d : int, Optional
            mpileup parameter
            At a position, read maximally INT reads per input file. Default=250
        v : bool, Optional
            call parameter
            Output variant sites only. Default=False
        O : str, Optional
            call parameter
            Output type. Default= 'z'
            Possible values are: BCF (b), uncompressed BCF (u), compressed VCF (z),
            uncompressed VCF (v)
        ploidy : str, Optional
                 call parameter
                 Predefined ploidy. Default: GRCh38
        threads : int, Optional
                  mpileup parameter
                  Number of extra output compression threads. Default=1
        S : str, Optional
            call parameter
            File of sample names to include or exclude if prefixed with "^"
        r : str, Optional
            mpileup parameter
            Region used for doing the variant calling in the format chr20:10000-20000.
            When set, the region (with ':' and '-' replaced by '_') is added to the
            output file name
        verbose : bool, Optional
                  Print the command line before running it. Default= True

        Returns
        -------
        str
            Path passed to 'bcftools call -o': outprefix, plus the region string if
            'r' was given, plus '.vcf.gz' (this suffix is appended whatever the value of O)
        '''

        Arg = namedtuple('Argument', 'option value')

        # ---- bcftools mpileup: options that take a value
        arguments_mpileup = [Arg('-f', self.reference)]
        arguments_mpileup.extend(Arg('-a', a) for a in annots)
        arguments_mpileup.extend([
            Arg('-P', P),
            Arg('-F', F),
            Arg('-C', C),
            Arg('-d', d),
            Arg('-m', m_pileup),
            Arg('--threads', threads)
        ])

        if r is not None:
            # make the region part of the file name, without ':' or '-'
            region_str = re.sub(':|-', '_', r)
            outprefix += ".{0}".format(region_str)
            arguments_mpileup.append(Arg('-r', r))

        # ---- bcftools mpileup: flags and the positional BAM file
        params_mpileup = []
        if E is True:
            params_mpileup.append('-E')
        if p is True:
            params_mpileup.append('-p')
        params_mpileup.append(self.bam)

        # ---- bcftools call: flags
        params_call = []
        if m_call is True:
            params_call.append('-m')
        if v is True:
            params_call.append('-v')

        # ---- bcftools call: options that take a value
        arguments_call = [Arg('-O', O), Arg('--ploidy', ploidy)]
        if S is not None:
            arguments_call.append(Arg('-S', S))

        outprefix += ".vcf.gz"
        arguments_call.append(Arg('-o', outprefix))

        bcftools_callRunner = RunProgram(program='bcftools call',
                                         args=arguments_call,
                                         parameters=params_call)

        # 'bcftools call' is attached as the downstream end of the pipe
        runner = RunProgram(program='{0}/bcftools mpileup'.format(
            self.bcftools_folder),
                            args=arguments_mpileup,
                            parameters=params_mpileup,
                            downpipe=[bcftools_callRunner])

        if verbose is True:
            print("Command line is: {0}".format(runner.cmd_line))

        # NOTE(review): the error flag returned here is not inspected, so a failed
        # run still returns the output path -- confirm this is intended
        stdout, stderr, is_error = runner.run_popen(raise_exc=False)

        return outprefix
Пример #17
0
    def run_ug(self,
               outprefix,
               glm='SNP',
               compress=True,
               nt=1,
               verbose=None,
               intervals=None,
               log_file=None,
               **kwargs):
        '''
        Run GATK UnifiedGenotyper

        Parameters
        ----------
        outprefix : str, Required
                    Prefix for output VCF file. i.e. /path/to/file/test
        glm : str, Required
              Genotype likelihoods calculation model to employ -- SNP is the default option,
              while INDEL is also available for calling indels and BOTH is available for
              calling both together. Default= SNP
        compress : boolean, Default= True
                   Compress the output VCF. If True, the GATK output is piped to
                   bgzip and written to <outprefix>.vcf.gz; otherwise GATK writes
                   <outprefix>.vcf through its -o option
        nt : int, Optional
             Number of data threads to allocate to UG
        intervals : list, Optional
                    List in which each of the elements is a path to file with genomic intervals
                    to operate with. Also coordinates can be set directly on the command line.
                    For example: ['chr1:100-200', 'chr2:200-300']. If the list contains
                    more than one interval, then it is useful to set the
                    --interval_set_rule option
        verbose : bool, optional
                  if true, then print the command line used for running this program
        log_file : str, Optional
                   Path to file that will used for logging the GATK stderr and stdout
        **kwargs
                 Any other GATK option. Each key=value pair whose value is not None
                 is passed as '--key value'. Options used so far are:
        alleles: str, Optional
                 Path to VCF.
                 When --genotyping_mode is set to
                 GENOTYPE_GIVEN_ALLELES mode, the caller will genotype the samples
                 using only the alleles provide in this callset
        genotyping_mode: str, Optional
                         Specifies how to determine the alternate alleles to use for genotyping
                         Possible values are: DISCOVERY, GENOTYPE_GIVEN_ALLELES
        output_mode: str, Optional
                     Which type of calls we should output.
                     Possible values are: EMIT_VARIANTS_ONLY,
                     EMIT_ALL_CONFIDENT_SITES, EMIT_ALL_SITES
                     Default: EMIT_VARIANTS_ONLY

        Returns
        -------
        str
            Path to the output VCF file

        Raises
        ------
        Exception
            With the GATK stderr as message, if GATK fails for a reason other than
            the empty intervals intersection
        IndexError
            If GATK fails with the empty intervals intersection error and no
            'alleles' option was passed
        '''

        Arg = namedtuple('Argument', 'option value')

        arguments = [
            Arg('-T', 'UnifiedGenotyper'),
            Arg('-R', self.reference),
            Arg('-I', self.bam),
            Arg('-glm', glm),
            Arg('-nt', nt)
        ]

        if intervals is not None:
            for i in intervals:
                arguments.append(Arg('--intervals', i))

        for k, v in kwargs.items():
            if v is not None:
                arguments.append(Arg("--{0}".format(k), v))

        pipelist = None
        if compress is True:
            outprefix += ".vcf.gz"
            compressRunner = RunProgram(path=self.bgzip_folder,
                                        program='bgzip',
                                        parameters=['-c', '>', outprefix])
            pipelist = [compressRunner]
        else:
            outprefix += ".vcf"
            arguments.append(Arg('-o', outprefix))

        runner = RunProgram(
            program='java -jar {0}/GenomeAnalysisTK.jar'.format(
                self.gatk_folder),
            args=arguments,
            downpipe=pipelist,
            log_file=log_file)

        if verbose is True:
            print("Command line is: {0}".format(runner.cmd_line))

        stdout, stderr, is_error = runner.run_popen(raise_exc=False)

        if is_error is True:
            # This piece of code is necessary as GATK crashes when the intersection
            # between the genomic chunk and the alleles passed in the VCF
            # are calculated and there are no sites.
            #
            # If that's the case then GATK will be run without the interval
            # intersection
            patt = re.compile(
                '##### ERROR MESSAGE: Bad input: '
                'The INTERSECTION of your -L options produced no intervals.')
            interval_error_seen = any(
                patt.match(l) for l in stderr.split('\n'))

            if interval_error_seen is False:
                raise Exception(stderr)

            alleles = ([
                arg.value for arg in arguments if arg.option == '--alleles'
            ])[0]

            # Build a filtered copy instead of deleting while iterating: deleting
            # in place shifts the remaining items and makes the loop skip the
            # element that follows each deleted one
            arguments = [
                arg for arg in arguments
                if not ((arg.option == '--intervals' and arg.value == alleles)
                        or arg.option == '--interval_set_rule')
            ]

            runner = RunProgram(
                program='java -jar {0}/GenomeAnalysisTK.jar'.format(
                    self.gatk_folder),
                downpipe=pipelist,
                args=arguments,
                log_file=log_file)
            if verbose is True:
                print("Command line is: {0}".format(runner.cmd_line))

            # second attempt: this time any failure is raised
            stdout, stderr, is_error = runner.run_popen(raise_exc=True)

        return outprefix
Пример #18
0
    def convert2vcf(self,
                    input_prefix,
                    output_prefix,
                    compress=False,
                    verbose=False,
                    logfile=None):
        '''
        Function to use SHAPEIT's -convert in order to convert the *.haps.gz & *.haps.sample files into VCF

        Parameters
        ----------
        input_prefix : str
                       Prefix for the files in HAPS/SAMPLE format. The files
                       <input_prefix>.gz and <input_prefix>.sample are passed to SHAPEIT
        output_prefix : str
                        String with the output prefix for the VCF file
        compress : bool, optional
                   If True, then compress the VCF with bgzip and remove the
                   uncompressed file. Default=False
        verbose : bool, optional
                  if true, then print the command line used for running this program
        logfile : filename, optional
                  Path to log file

        Returns
        -------
        filename
                <output_prefix>.vcf, or <output_prefix>.vcf.gz if compress is True

        Raises
        ------
        Exception
            If self.shapeit_folder is None
        '''

        if self.shapeit_folder is None:
            raise Exception("shapeit_folder must be defined")

        outfile = "{0}.vcf".format(output_prefix)

        Arg = namedtuple('Argument', 'option value')

        args = [
            Arg('--input-haps', '{0}.gz {0}.sample'.format(input_prefix)),
            Arg('--output-vcf', outfile)
        ]

        if logfile is not None:
            args.append(Arg('--output-log', logfile))

        # the conversion itself is the same whether or not the result is compressed
        runner = RunProgram(path=self.shapeit_folder,
                            program='shapeit -convert',
                            args=args)
        if verbose is True:
            print("Command line is: {0}".format(runner.cmd_line))
        runner.run_checkoutput()

        if compress is True:
            compressRunner = RunProgram(
                path=self.bgzip_folder,
                program='bgzip',
                parameters=['-c', outfile, '>', outfile + ".gz"])
            compressRunner.run_checkoutput()
            # keep only the compressed copy
            os.remove(outfile)
            outfile = outfile + ".gz"

        return outfile
Пример #19
0
    def calc_concordance(self,
                         truth_vcf,
                         truth_sample,
                         call_sample,
                         outprefix,
                         outdir=None,
                         intervals=None,
                         verbose=None):
        '''
        Compute the genotype concordance between self.vcf (the call set) and a
        truth VCF by running Picard's GenotypeConcordance

        Parameters
        ----------
        truth_vcf : str
                    VCF holding the truth sample.
        truth_sample : str
                       Name of the truth sample inside the truth VCF.
        call_sample : str
                      Name of the call sample inside the call VCF.
        outprefix : str
                    Prefix used for the output files.
        outdir : str, optional
                 If given, the output files are placed in this folder.
        intervals : str, optional
                    One or more interval list files used to restrict the
                    genotype concordance.
        verbose : bool, optional
                  If True, then print the command line used for running this program.

        Returns
        -------
        GTPconcordance object
            Built from the '<prefix>.genotype_concordance_summary_metrics' file

        Raises
        ------
        Exception
            If self.picard_folder is None
        '''

        if self.picard_folder is None:
            raise Exception("Folder containing Picard jar file is required")

        # final prefix, optionally relocated into outdir
        out_base = "%s/%s" % (outdir, outprefix) if outdir else outprefix

        Arg = namedtuple('Argument', 'option value')

        option_pairs = [
            ('TRUTH_VCF', truth_vcf),
            ('CALL_VCF', self.vcf),
            ('TRUTH_SAMPLE', truth_sample),
            ('CALL_SAMPLE', call_sample),
            ('O', out_base),
        ]
        if intervals:
            option_pairs.append(('INTERVALS', intervals))

        picard_args = [Arg(name, val) for name, val in option_pairs]

        picard_cmd = 'java -jar {0}/picard.jar GenotypeConcordance'.format(
            self.picard_folder)

        # Picard options are written as OPTION=value
        runner = RunProgram(program=picard_cmd, args=picard_args, arg_sep='=')

        if verbose is True:
            print("Command line is: {0}".format(runner.cmd_line))

        runner.run_checkoutput()

        metrics_file = out_base + ".genotype_concordance_summary_metrics"
        return GTPconcordance(summary_metrics_file=metrics_file)
Пример #20
0
    def stats(self,
              outpath,
              filter_str=None,
              region=None,
              region_file=None,
              verbose=None):
        '''
        Run bcftools stats on the VCF file and parse the resulting report

        Parameters
        ----------
        outpath : str
                  output path. The region (if any), '.filter_str' (if a filter
                  is used) and '.stats' are appended to it
        filter_str : str, optional.
                     Example:  PASS,.
                     Apply filters when calculating the stats.
        region : str, optional
                 Example: chr20
                 Region used for calculating the stats.
        region_file : filename, optional
                      BED file with the regions that will be analyzed.
        verbose : bool, optional
                  If True, then print the command line used for running this program

        Returns
        -------
        BcftoolsStats object
            With 'summary_numbers' (dict built from the 'SN' lines) and, when the
            corresponding lines are present in the report, 'ts_tv', 'ts_tv_1stalt'
            (from 'TSTV' lines) and 'no_singleton_snps' (from 'SiS' lines)
        '''

        Arg = namedtuple('Argument', 'option value')

        args = []

        if region is not None:
            outpath = "{0}.{1}".format(outpath, region)
            args.append(Arg('-r', region))

        if region_file is not None:
            args.append(Arg('-R', region_file))

        if filter_str is not None:
            # NOTE(review): the literal text '.filter_str' is appended here, not the
            # value of filter_str -- confirm this naming is intended
            outpath = outpath + ".filter_str"
            args.append(Arg('-f', filter_str))

        outpath = outpath + ".stats"

        # the report is written to outpath through a redirection
        params = [self.vcf, '>', outpath]

        runner = RunProgram(path=self.bcftools_folder,
                            program='bcftools stats',
                            args=args,
                            parameters=params)

        if verbose is True:
            print("Command line is: {0}".format(runner.cmd_line))

        runner.run_checkoutput()

        stats = BcftoolsStats(filename=outpath)

        with open(outpath) as fi:
            summary = {}
            for line in fi:
                line = line.rstrip('\n')
                # split once per line; the record type is the first column
                fields = line.split('\t')
                if line.startswith('SN\t'):
                    # column 2 is the label, column 3 the (integer) count
                    summary[fields[2]] = int(fields[3])
                elif line.startswith('TSTV\t'):
                    # values are stored as the strings found in the report
                    stats.ts_tv = fields[4]
                    stats.ts_tv_1stalt = fields[7]
                elif line.startswith('SiS\t'):
                    stats.no_singleton_snps = fields[3]

            stats.summary_numbers = summary
        return stats
Пример #21
0
    def run_applyrecalibration(self,
                               mode,
                               recal_file,
                               tranches_file,
                               outprefix,
                               ts_filter_level=99.0,
                               num_threads=1,
                               compress=True,
                               verbose=None,
                               log_file=None):
        '''
        Run GATK's ApplyRecalibration on a VcfFilter object

        Parameters
        ----------
        mode : {'SNP','INDEL'}
               Recalibration mode to employ
        recal_file : filename
                     The input recal file used by ApplyRecalibration
        tranches_file : filename
                        The input tranches file describing where to cut the data
        outprefix : str
                    Prefix used for the output
        ts_filter_level : float, optional
                          The truth sensitivity level at which to start filtering. Default=99.0
        num_threads : int, optional
                      Number of data threads to allocate to this analysis. Default=1
        compress : bool
                   Compress the recalibrated VCF. If True, the GATK output is piped
                   to bgzip and the result is indexed with tabix. Default= True
        verbose : bool, optional
                  If True, then print the command line used for running this program
        log_file : filename, optional
                   Path to file that will used for logging the GATK stderr and stdout

        Returns
        -------
        filename
                 Path to filtered VCF file:
                 <outprefix>.recalibrated_snps_raw_indels.vcf for mode 'SNP',
                 <outprefix>.recalibrated_variants.vcf for mode 'INDEL',
                 with '.gz' appended if compress is True

        Raises
        ------
        Exception
            If self.caller is not 'UG' or if mode is not 'SNP' or 'INDEL'
        '''

        if self.caller != 'UG':
            raise Exception("VCF type %s is incompatible" % self.caller)

        if mode != 'SNP' and mode != 'INDEL':
            # the message needs a placeholder for the offending mode; without it
            # the % operator raises TypeError instead of this Exception
            raise Exception("ApplyRecalibration mode %s is not valid. "
                            "Valid values are 'SNP','INDEL'" % mode)

        # generate output file name
        if mode == 'SNP':
            outfile = "%s.recalibrated_snps_raw_indels.vcf" % outprefix
        else:
            outfile = "%s.recalibrated_variants.vcf" % outprefix

        Arg = namedtuple('Argument', 'option value')

        # the jar is passed as an argument because the program string is plain
        # 'java' (optionally with a -Djava.io.tmpdir setting)
        args = [
            Arg('-jar', '{0}/GenomeAnalysisTK.jar'.format(self.gatk_folder)),
            Arg('-T', 'ApplyRecalibration'),
            Arg('-R', self.reference),
            Arg('-input', self.vcf),
            Arg('-mode', mode),
            Arg('--ts_filter_level', ts_filter_level),
            Arg('-recalFile', recal_file),
            Arg('--num_threads', num_threads),
            Arg('-tranchesFile', tranches_file)
        ]

        pipelist = None
        if compress is True:
            outfile += ".gz"
            compressRunner = RunProgram(path=self.bgzip_folder,
                                        program='bgzip',
                                        parameters=['-c', '>', outfile])
            pipelist = [compressRunner]
        else:
            args.append(Arg('-o', outfile))

        if self.tmp_dir is not None:
            program_str = "java -Djava.io.tmpdir={0}".format(self.tmp_dir)
        else:
            program_str = "java"

        runner = RunProgram(program=program_str,
                            args=args,
                            downpipe=pipelist,
                            log_file=log_file)

        if verbose is True:
            print("Command line is: {0}".format(runner.cmd_line))

        stdout, stderr, is_error = runner.run_popen()

        # create an index for the recalibrated file
        if compress is True:
            tabixRunner = RunProgram(path=self.tabix_folder,
                                     program='tabix',
                                     parameters=[outfile])
            tabixRunner.run_checkoutput()

        return outfile
Пример #22
0
    def get_chros(self, filter_str=None, chr_f=None, verbose=None):
        '''
        Method to get a list of chromosomes present in a file

        Parameters
        ----------
        filter_str : str, optional
                     If defined, apply this filter string so bcftools view
                     apply it before fetching the chros.
        chr_f : str, optional
                Path to file with a list of chromosomes (one per line).
                If provided, the chros in the file will be compared with the
                chromosomes in self.vcf.
        verbose : bool, optional
                  If True, then print the command line used for running this program.

        Returns
        -------
        dict
            Dict with 4 keys:
                 'in_vcf' whose value is the list of chros that are present in self.vcf
                 'both' whose values will be the chros present in self.vcf and in 'chr_f'
                 'in_A' whose values will be the chros PRESENT in self.vcf and NOT in 'chr_f'
                 'in_B' whose values will be the chros NOT present in self.vcf and PRESENT in 'chr_f'.

            If 'chr_f' is not provided, it is treated as an empty list of chros, so
            'both' and 'in_B' are empty and 'in_A' holds every chro in self.vcf.
        '''

        # first column of the header-less VCF, with consecutive duplicates collapsed
        params = ['--no-header', self.vcf, "|cut -f1 |uniq"]

        Arg = namedtuple('Argument', 'option value')

        args = []

        if filter_str is not None:
            args.append(Arg('-f', filter_str))

        runner = RunProgram(path=self.bcftools_folder,
                            program='bcftools view',
                            args=args,
                            parameters=params)

        if verbose is True:
            print("Command line is: {0}".format(runner.cmd_line))

        out = runner.run_checkoutput()
        out_str = out.decode("utf-8").rstrip('\n')

        # splitting an empty string would give [''], i.e. one chro with an empty name
        list_of_chros = out_str.split("\n") if out_str else []

        chr_list_f = []
        if chr_f is not None:
            # parse file with chros, making sure the file handle gets closed
            with open(chr_f, 'r') as chr_file:
                chr_list_f = chr_file.read().splitlines()

        both = set(list_of_chros) & set(chr_list_f)
        in_a = set(list_of_chros) - set(chr_list_f)
        in_b = set(chr_list_f) - set(list_of_chros)

        return {
            'in_vcf': list_of_chros,
            'both': list(both),
            'in_A': list(in_a),
            'in_B': list(in_b)
        }
Пример #23
0
    def subset_vcf(self,
                   outprefix,
                   bed=None,
                   region=None,
                   outdir=None,
                   create_index=False,
                   verbose=None,
                   action='exclude',
                   apply_filters=None,
                   threads=1):
        '''
        Produce a subset of self.vcf with bcftools view, keeping or dropping the
        variants that fall in a BED file or in a region

        Parameters
        ----------
        outprefix : str
                    Name for the output file. When 'region' is used it must contain
                    a 'vcf' component between dots (i.e. out.vcf.gz), as the region
                    is inserted in the component placed before it
        bed : str, optional
              BED file with coordinates to exclude/include. Takes precedence
              over 'region' when selecting the variants
        region : str, optional
                 String with region to consider: chr1, chr1:1000-1500, etc...
        outdir : str, optional
                 If provided, then put output files in this folder
        create_index : bool, optional
                       Not used by this method. Default=False
        verbose : bool, optional
                  If True, then print the command line used for running this program
        action : str, optional
                 Either 'exclude' or 'include' the variants selected through
                 'bed' or 'region'. Default= exclude
        apply_filters : str, optional
                        Apply a filter string: i.e. "PASS,."
        threads : int, optional
                  Number of output compression threads to use in addition to main thread. Default=1

        Returns
        -------
        filename
                 Path to gzipped VCF file that will have the desired variants excluded/included

        Raises
        ------
        Exception
            If action is not 'include' or 'exclude'
        ValueError
            If 'region' is given and outprefix has no 'vcf' component
        '''

        if action not in ('include', 'exclude'):
            raise Exception(
                "action argument should be either include or exclude")

        if region:
            # tag the file name component sitting before 'vcf' with the region
            pieces = outprefix.split(".")
            target = pieces.index("vcf") - 1
            tag = "_" + region
            if apply_filters is not None:
                tag += ".filt"
            pieces[target] = pieces[target] + tag
            outprefix = ".".join(pieces)

        if outdir:
            outprefix = "%s/%s" % (outdir, outprefix)

        Arg = namedtuple('Argument', 'option value')

        excluding = action == 'exclude'

        view_args = []
        if bed:
            # a leading '^' tells bcftools to drop the listed coordinates
            bed_value = '^{0}'.format(bed) if excluding else '{0}'.format(bed)
            view_args.append(Arg('-T', bed_value))
        elif region:
            if excluding:
                view_args.append(Arg('-t', '^{0}'.format(region)))
            else:
                view_args.append(Arg('-r', '{0}'.format(region)))

        view_args += [
            Arg('-o', outprefix),
            Arg('-O', 'z'),
            Arg('--threads', threads)
        ]

        if apply_filters is not None:
            # filter string goes between double quotes on the command line
            view_args.append(Arg('-f', '\"{0}\"'.format(apply_filters)))

        runner = RunProgram(path=self.bcftools_folder,
                            program='bcftools view',
                            args=view_args,
                            parameters=[self.vcf])

        if verbose is True:
            print("Command line is: {0}".format(runner.cmd_line))

        runner.run_checkoutput()

        return outprefix
Пример #24
0
    def run_hc(self,
               outprefix,
               compress=True,
               verbose=None,
               log_file=None,
               intervals=None,
               **kwargs):
        '''
        Run GATK HaplotypeCaller

        Parameters
        ----------
        outprefix : str, Required
                    Prefix for output VCF file. i.e. /path/to/file/test
        compress : boolean, Default= True
                   Compress the output VCF. If True, the GATK output is piped to
                   bgzip and written to <outprefix>.vcf.gz; otherwise GATK writes
                   <outprefix>.vcf through its -o option
        verbose : bool, optional
                  if true, then print the command line used for running this program
        log_file : str, Optional
                   Path to file that will used for logging the GATK stderr and stdout
        intervals : list, Optional
                    List in which each of the elements is a path to file with genomic intervals to
                    operate with. Also coordinates can be set directly on the command line.
                    For example: ['chr1:100-200', 'chr2:200-300']. If the list contains
                    more than one interval, then it is useful to set the --interval_set_rule option
        **kwargs
                 Any other GATK option. Each key=value pair whose value is not None
                 is passed as '--key value'. Options used so far are:
        num_cpu_threads_per_data_thread : int, Optional
                   controls the number of CPU threads allocated to each data thread
        standard_min_confidence_threshold_for_calling : int, Optional
                                                        The minimum phred-scaled confidence threshold
                                                        at which variants should be called
                                                        Default: 10
        genotyping_mode: str, Optional
                         Specifies how to determine the alternate alleles to use for genotyping
                         Possible values are: DISCOVERY, GENOTYPE_GIVEN_ALLELES
        alleles: str, Optional
                 Path to VCF.
                 When --genotyping_mode is set to
                 GENOTYPE_GIVEN_ALLELES mode, the caller will genotype the samples
                 using only the alleles provide in this callset
        emitRefConfidence: str, Optional
                           Mode for emitting reference confidence scores
                           Possible values are: NONE, BP_RESOLUTION, GVCF

        Returns
        -------
        str
            Path to the output VCF file

        Raises
        ------
        Exception
            With the GATK stderr as message, if GATK fails for a reason other than
            the empty intervals intersection
        IndexError
            If GATK fails with the empty intervals intersection error and no
            'alleles' option was passed
        '''
        Arg = namedtuple('Argument', 'option value')

        arguments = [
            Arg('-T', 'HaplotypeCaller'),
            Arg('-R', self.reference),
            Arg('-I', self.bam)
        ]

        if intervals is not None:
            for i in intervals:
                arguments.append(Arg('--intervals', i))

        for k, v in kwargs.items():
            if v is not None:
                arguments.append(Arg("--{0}".format(k), v))

        pipelist = None
        if compress is True:
            outprefix += ".vcf.gz"
            compressRunner = RunProgram(path=self.bgzip_folder,
                                        program='bgzip',
                                        parameters=['-c', '>', outprefix])
            pipelist = [compressRunner]
        else:
            outprefix += ".vcf"
            arguments.append(Arg('-o', outprefix))

        runner = RunProgram(
            program='java -jar {0}/GenomeAnalysisTK.jar'.format(
                self.gatk_folder),
            downpipe=pipelist,
            args=arguments,
            log_file=log_file)

        if verbose is True:
            print("Command line is: {0}".format(runner.cmd_line))

        stdout, stderr, is_error = runner.run_popen(raise_exc=False)

        if is_error is True:
            # This piece of code is necessary as GATK crashes when the intersection
            # between the genomic chunk and the alleles passed in the VCF are
            # calculated and there are no sites.
            #
            # If that's the case then GATK will be run without the interval
            # intersection
            patt = re.compile(
                '##### ERROR MESSAGE: Bad input: The INTERSECTION of your'
                ' -L options produced no intervals.')
            interval_error_seen = any(
                patt.match(l) for l in stderr.split('\n'))

            if interval_error_seen is False:
                raise Exception(stderr)

            alleles = ([
                arg.value for arg in arguments if arg.option == '--alleles'
            ])[0]

            # Build a filtered copy instead of deleting while iterating: deleting
            # in place shifts the remaining items and makes the loop skip the
            # element that follows each deleted one
            arguments = [
                arg for arg in arguments
                if not ((arg.option == '--intervals' and arg.value == alleles)
                        or arg.option == '--interval_set_rule')
            ]

            runner = RunProgram(
                program='java -jar {0}/GenomeAnalysisTK.jar'.format(
                    self.gatk_folder),
                downpipe=pipelist,
                args=arguments,
                log_file=log_file)
            if verbose is True:
                print("Command line is: {0}".format(runner.cmd_line))

            # second attempt: this time any failure is raised
            stdout, stderr, is_error = runner.run_popen(raise_exc=True)

        return outprefix
Пример #25
0
    def run_beagle(self,
                   outprefix,
                   outdir=None,
                   region=None,
                   verbose=False,
                   correct=False,
                   **kwargs):
        '''
        Run Beagle (see https://faculty.washington.edu/browning/beagle/beagle.html)
        to call genotypes on self.vcf, a VCF file containing GT likelihoods

        The Beagle command line is assembled as 'key=value' pairs: 'chrom' (only
        when region is given), 'gl' (self.vcf), 'out' (the output prefix) and
        then every extra keyword argument received in kwargs.

        The output prefix passed to Beagle is built as:
        [<outdir>/]<outprefix>.[<region>.]beagle
        where ':' and '-' in region are replaced by '.'

        Parameters
        ----------
        outprefix: str, required
              Prefix used for output file
        outdir : str, optional
                 outdir for output files
        region : str, optional
                 chr or chr interval that will be analyzed. i.e. chr20 or chr20:10000000-11000000
        verbose : bool, optional
                  if true, then print the command line used for running Beagle
        correct : bool, optional
                  Note: that it seems there is an incompatibility between zlib libraries used in Beagle4 and in BOOST on some platforms.
                  This involves either the last line of the file being skipped or a segfault. If correct=True, then this function will fix this issue
                  by recompressing the Beagle4 output files (zcat | gzip into a temporary file
                  that is then moved over the Beagle output). Default=False
        window: int, optional
                number of markers to include in each sliding
                window. Default: 50000
        overlap: int, optional
                 specifies the number of markers of overlap between sliding
                 windows. Default: 3000
        niterations: int, optional
                     specifies the number of phasing iterations. Default:
                     niterations=5
        nthreads : int, optional
                   number of threads. If not specified then the nthreads parameter
                   will be set equal to the number of CPU cores on the host machine

        window, overlap, niterations and nthreads are not handled explicitly:
        they (and any other keyword argument) are forwarded verbatim to Beagle
        as 'name=value', so the defaults listed above are Beagle's own.

        Returns
        -------
        str
            Path to the compressed VCF file with the genotype calls
            (the output prefix followed by '.vcf.gz')

        Raises
        ------
        Exception
            If self.beagle_folder or self.beagle_jar is None
        '''

        if self.beagle_folder is None or self.beagle_jar is None:
            raise Exception(
                "Provide the folder containing the Beagle jar file and the Beagle jar file name"
            )

        Arg = namedtuple('Argument', 'option value')
        args = []

        if outdir is not None:
            outfile = "{0}/{1}.".format(outdir, outprefix)
        else:
            outfile = "{0}.".format(outprefix)

        if region is not None:
            # make the region safe to embed in a file name
            region_str = re.sub(":|-", ".", region)
            outfile += "{0}.".format(region_str)
            args.append(Arg('chrom', region))

        outfile += "beagle"

        args.extend([Arg('gl', self.vcf), Arg('out', outfile)])

        # any additional keyword argument is passed through to Beagle
        args.extend(Arg(k, v) for k, v in kwargs.items())

        runner = RunProgram(program='java -jar {0}/{1}'.format(
            self.beagle_folder, self.beagle_jar),
                            args=args,
                            arg_sep="=")

        if verbose is True:
            print("Command line is: {0}".format(runner.cmd_line))

        runner.run_checkoutput()

        outfile = outfile + ".vcf.gz"

        if correct is True:
            # Reserve a temp file name for the correction. Only the name is
            # needed (the shell redirect below does the writing), so the Python
            # handle is closed right away instead of being leaked.
            with tempfile.NamedTemporaryFile(delete=False) as temp:
                temp_name = temp.name

            gzipRunner = RunProgram(program='gzip',
                                    parameters=['-c', '>', temp_name])
            zcatRunner = RunProgram(program='zcat',
                                    parameters=[outfile],
                                    downpipe=[gzipRunner])

            if verbose is True:
                print(
                    "Command line for vcf.gz correction partA is: {0}".format(
                        zcatRunner.cmd_line))

            # run zcat file | gzip -c > tmp.file
            zcatRunner.run_checkoutput()

            # mv tmp.file back to outfile
            mvRunner = RunProgram(program='mv',
                                  parameters=[temp_name, outfile])

            if verbose is True:
                print(
                    "Command line for vcf.gz correction partB is: {0}".format(
                        mvRunner.cmd_line))

            mvRunner.run_checkoutput()

        return outfile
Пример #26
0
    def run_vcfallelicprimitives(self, outprefix, compress=True, outdir=None,
                                 keepinfo=True, keepgeno=True, downstream_pipe=None, verbose=None):
        '''
        Run vcfallelicprimitives on self.vcf

        This program is used to decompose complex variants into a canonical SNP and
        indel representation,generating phased genotypes for available samples.

        The output file is named [<outdir>/]<outprefix>.aprimitives.vcf, with
        '.gz' appended when compress is set.

        Parameters
        ----------

        outprefix : str, required
              prefix for outputfiles
        compress : boolean, optional
              If true (the default), pipe the output through bgzip and write
              a '.gz' file. Any false value (False, None, 0...) writes the
              uncompressed VCF instead
        outdir : str, optional
            If provided, then put output files in this folder
        keepinfo : bool, optional. Default=True
            Maintain site and allele-level annotations when decomposing.
            Note that in many cases, such as multisample VCFs, these won't
            be valid post-decomposition.  For biallelic loci in single-sample
            VCFs, they should be usable with caution
        keepgeno : bool, optional. Default=True
            Maintain genotype-level annotations when decomposing.  Similar
            caution should be used for this as for keep-info.
        downstream_pipe : str, optional
            If defined, then pipe the output VCF to other tools.
            i.e. "~/bin/vt/vt sort - | ~/bin/vt/vt uniq -"
        verbose : bool, optional
            if true, then print the command line before running it

        Returns
        -------
        A string with path to decomposed file
        '''

        if outdir:
            outprefix = "{0}/{1}".format(outdir, outprefix)

        outprefix = outprefix + ".aprimitives.vcf"

        params = [self.vcf]

        if keepinfo is True:
            params.append('--keep-info')

        if keepgeno is True:
            params.append('--keep-geno')

        if downstream_pipe is not None:
            # appended as a literal '| <commands>' fragment of the command line
            params.append("| {0}".format(downstream_pipe))

        pipelist = None
        if compress:
            outprefix += ".gz"
            compressRunner = RunProgram(path=self.bgzip_folder,
                                        program='bgzip',
                                        parameters=['-c', '>', outprefix])
            pipelist = [compressRunner]
        else:
            # Every non-compressing call must still redirect to the output
            # file; otherwise the returned path would point to a file that
            # was never written.
            params.extend(['>', outprefix])

        runner = RunProgram(path=self.vcflib_folder,
                            program='vcfallelicprimitives',
                            parameters=params,
                            downpipe=pipelist)

        if verbose is True:
            print("Command line for running vcfallelicprimitives is: {0}".format(runner.cmd_line))

        runner.run_checkoutput()

        return outprefix
Пример #27
0
    def run_shapeit(self,
                    output_prefix,
                    input_gen=None,
                    input_init=None,
                    input_scaffold=None,
                    input_bed=None,
                    duohmm=False,
                    input_map=None,
                    verbose=False,
                    **kwargs):
        '''
        Build and execute a SHAPEIT command line

        Exactly one primary input is used: input_gen (passed as
        '-call --input-gen') when it is given, otherwise input_bed
        (passed as '--input-bed').

        Parameters
        ----------
        output_prefix : str
                        Prefix for the files produced by SHAPEIT:
                        '<output_prefix>.haps.gz', '<output_prefix>.haps.sample'
                        and '<output_prefix>.log'
        input_gen : str, optional
                    Genotype/GL input data obtained from Beagle4,
                    i.e. 'input.shapeit.20.gen.gz input.shapeit.20.gen.sample'
        input_init : str, optional
                     Haplotypes obtained from Beagle4,
                     i.e. 'input.shapeit.20.hap.gz input.shapeit.20.hap.sample'
        input_scaffold : str, optional
                         SNP-array derived haplotype scaffold in Impute2 format,
                         i.e. 'scaffold.haps.gz scaffold.haps.sample'
        input_bed : str, optional
                    Unphased genotypes in Plink Binary BED/BIM/FAM format,
                    i.e. 'file.bed file.bim file.fam'. Ignored if input_gen is set
        duohmm : bool, optional
                 If True, add the --duohmm flag. Default: False
        input_map : filename, optional
                    Path to the file with the genetic map
        verbose : bool, optional
                  If True, print the command line before running it
        **kwargs
                  Every extra keyword argument is forwarded as '--<name> <value>'.
                  The previous documentation listed i_from / i_to (region to be
                  phased) here; as written they would be emitted as '--i_from' /
                  '--i_to' -- NOTE(review): confirm these match SHAPEIT's option names

        Returns
        -------
        dict
            {'hap_gz': '<output_prefix>.haps.gz',
             'hap_sample': '<output_prefix>.haps.sample'}

        Raises
        ------
        Exception
            If neither input_gen nor input_bed is provided
        '''

        if input_gen is None and input_bed is None:
            raise Exception(
                "Error! Either --input-gen or --input-bed "
                "need to be specified as input for SHAPEIT")

        Argument = namedtuple('Argument', 'option value')

        haps_gz = '{0}.haps.gz'.format(output_prefix)
        haps_sample = '{0}.haps.sample'.format(output_prefix)

        # primary input: genotype likelihoods win over the BED triplet
        if input_gen is not None:
            cli_args = [Argument('-call --input-gen', input_gen)]
        else:
            cli_args = [Argument('--input-bed', input_bed)]

        # optional inputs, emitted in a fixed order and only when provided
        optional_inputs = (
            ('--input-init', input_init),
            ('--input-scaffold', input_scaffold),
            ('--input-map', input_map),
        )
        cli_args += [
            Argument(flag, value) for flag, value in optional_inputs
            if value is not None
        ]

        # pass-through options supplied by the caller
        cli_args += [
            Argument('--{0}'.format(name), value)
            for name, value in kwargs.items()
        ]

        cli_args.append(
            Argument('--output-max', '{0} {1}'.format(haps_gz, haps_sample)))
        cli_args.append(
            Argument('--output-log', '{0}.log'.format(output_prefix)))

        flags = ['--duohmm'] if duohmm is True else []

        shapeit_runner = RunProgram(path=self.shapeit_folder,
                                    program='shapeit',
                                    args=cli_args,
                                    parameters=flags)

        if verbose is True:
            print("Command line is: {0}".format(shapeit_runner.cmd_line))

        shapeit_runner.run_checkoutput()

        return {'hap_gz': haps_gz, 'hap_sample': haps_sample}
Пример #28
0
    def combine(self,
                labels,
                reference,
                outprefix,
                compress=False,
                outdir=None,
                ginterval=None,
                genotypemergeoption=None,
                filteredrecordsmergetype=None,
                threads=1,
                options=None,
                verbose=False):
        '''
        Combine VCFs using GATK's CombineVariants into a single VCF

        Each VCF in self.vcflist is passed to GATK as '-V:<label> <path>'.
        The output file is named [<outdir>/]<outprefix>.vcf. When compress is
        True no '-o' is given to GATK; instead a bgzip runner writing to
        '<output>.vcf.gz' is attached as a downstream pipe.

        Parameters
        ----------
        labels : list
                 List of labels used for each of the VCFs in self.vcflist. The order of the labels
                 should be the same that the VCFs in the list, and there must be
                 exactly one label per VCF
        reference : str
                    Path to Fasta file with reference
        outprefix : str
                    Prefix used for output file
        compress : bool, optional
                   Compress the output VCF with bgzip. Default=False
        outdir : str, optional
                 Path to folder used to write the results to
        ginterval : str, optional
                    Genomic interval used to restrict the analysis. i.e. chr20:1000-2000
        genotypemergeoption : {'UNIQUIFY', 'PRIORITIZE', 'UNSORTED', 'REQUIRE_UNIQUE'}, optional
                              Determines how we should merge genotype records for samples shared across the ROD files
        filteredrecordsmergetype : {'KEEP_IF_ANY_UNFILTERED', 'KEEP_IF_ALL_UNFILTERED', 'KEEP_UNCONDITIONAL'}, optional
                                   Determines how we should handle records seen at the same site in the VCF, but with different FILTER fields
        threads : int, optional
                  Number of threads to use (passed as -nt). Default=1
        options : list, optional
                  List of options. i.e. ['-env','--filteredAreUncalled']
        verbose : bool, optional
                  increase the verbosity, default=False

        Returns
        -------
        filename
                Path to the merged VCF

        Raises
        ------
        ValueError
            If the number of labels differs from the number of VCFs
        Exception
            If any of the VCFs in self.vcflist is not an existing file
        '''

        vcfs = list(self.vcflist)
        labels = list(labels)

        # zip() would silently drop the unmatched VCFs/labels, producing an
        # incomplete merge, so a mismatch is rejected up front
        if len(labels) != len(vcfs):
            raise ValueError(
                "Number of labels ({0}) does not match the number of VCFs ({1})"
                .format(len(labels), len(vcfs)))

        Arg = namedtuple('Argument', 'option value')

        args = [
            Arg('-T', 'CombineVariants'),
            Arg('-R', reference),
            Arg('-nt', threads)
        ]

        for path, label in zip(vcfs, labels):
            if not os.path.isfile(path):
                print("Error reading from {0}".format(path))
                raise Exception("File does not exist: {0}".format(path))
            args.append(Arg('-V:{0}'.format(label), path))

        outfile = ""
        if outdir:
            outfile = "{0}/".format(outdir)
        outfile += "{0}.vcf".format(outprefix)

        if ginterval is not None:
            args.append(Arg('-L', ginterval))

        if genotypemergeoption is not None:
            args.append(Arg('--genotypemergeoption', genotypemergeoption))

        if filteredrecordsmergetype is not None:
            args.append(
                Arg('--filteredrecordsmergetype', filteredrecordsmergetype))

        # free-form flags passed through to GATK unchanged
        params = list(options) if options else []

        pipelist = None
        if compress is True:
            # no '-o' here: the VCF is handed to bgzip, which writes the file
            outfile += ".gz"
            compressRunner = RunProgram(path=self.bgzip_folder,
                                        program='bgzip',
                                        parameters=['-c', '>', outfile])
            pipelist = [compressRunner]
        else:
            args.append(Arg('-o', outfile))

        runner = RunProgram(
            path=self.java_folder,
            program='java -jar {0}/GenomeAnalysisTK.jar'.format(
                self.gatk_folder),
            args=args,
            parameters=params,
            downpipe=pipelist)

        if verbose is True:
            print("Command line is: {0}".format(runner.cmd_line))

        runner.run_popen()

        return outfile